diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 50aea6a469c..c3d9387942a 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -50,6 +50,7 @@ set(WEBGPU_SRCS
     runtime/ops/slice/Slice.cpp
     runtime/ops/permute/Permute.cpp
     runtime/ops/cat/Cat.cpp
+    runtime/ops/index/Index.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
@@ -193,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
     target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
     set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
   endif()
+  add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
 endif()
diff --git a/backends/webgpu/README.md b/backends/webgpu/README.md
index 8c3928c5f49..a0711d862d5 100644
--- a/backends/webgpu/README.md
+++ b/backends/webgpu/README.md
@@ -140,11 +140,10 @@ backends/webgpu/
     ├── test_webgpu_native.cpp     # C++ native test runner
     ├── test_wgsl_codegen.py       # Shader codegen check
     ├── native/                    # C++ operator tests
-    └── ops/                       # Python export tests
-        ├── add/
-        │   └── test_add.py        # add export tests
-        └── rms_norm/
-            └── test_rms_norm.py   # rms_norm export tests
+    └── ops/                       # Python op test suites (flat: test_<op>.py)
+        ├── test_add.py
+        ├── test_rms_norm.py
+        └── ...                    # one test_<op>.py per op
 ```
 
 ## Requirements
diff --git a/backends/webgpu/runtime/ops/index/Index.cpp b/backends/webgpu/runtime/ops/index/Index.cpp
new file mode 100644
index 00000000000..0c33d616c03
--- /dev/null
+++ b/backends/webgpu/runtime/ops/index/Index.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/index/index_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+struct IndexParams {
+  uint32_t numel;
+  uint32_t _pad[3]; // pad to 16 bytes
+};
+
+// aten.index.Tensor 1D-self gather out[i]=self[index[i]] (mirrors Vulkan).
+void index_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, indices (Tensor?[] -> ValueList), out].
+  const int self_id = args.at(0);
+  const int list_id = args.at(1);
+  const int out_id = args.at(args.size() - 1);
+
+  if (graph.get_value_type(self_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("index: self arg is not a tensor");
+  }
+  if (graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("index: out arg is not a tensor");
+  }
+  if (graph.get_value_type(list_id) != WebGPUGraph::ValueType::ValueList) {
+    throw std::runtime_error("index: indices arg is not a ValueList");
+  }
+
+  // Exactly one non-Null index tensor (mirror Vulkan IndexTensor.cpp:67-69).
+  const std::vector<int>& ids = graph.get_value_list(list_id);
+  int index_id = -1;
+  for (int id : ids) {
+    if (graph.get_value_type(id) == WebGPUGraph::ValueType::Null) {
+      continue;
+    }
+    if (graph.get_value_type(id) != WebGPUGraph::ValueType::Tensor) {
+      throw std::runtime_error("index: index list element is not a tensor");
+    }
+    if (index_id != -1) {
+      throw std::runtime_error("index: expected exactly one index tensor");
+    }
+    index_id = id;
+  }
+  if (index_id == -1) {
+    throw std::runtime_error("index: no index tensor provided");
+  }
+
+  WGPUDevice device = graph.device();
+
+  const auto& self_tensor = graph.get_tensor(self_id);
+  const auto& index_tensor = graph.get_tensor(index_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+
+  if (self_tensor.buffer == nullptr || index_tensor.buffer == nullptr ||
+      out_tensor.buffer == nullptr) {
+    throw std::runtime_error("index: null buffer binding");
+  }
+  // 1D-self gather: the kernel flat-indexes self by a scalar; fail loud on a
+  // higher-rank self (mirrors Vulkan index_tensor_buffer's 1D-self contract).
+  if (self_tensor.dims.size() != 1) {
+    throw std::runtime_error("index: only 1D self is supported");
+  }
+
+  const size_t out_numel = out_tensor.nbytes / sizeof(float);
+  if (out_tensor.nbytes != out_numel * sizeof(float) ||
+      self_tensor.nbytes % sizeof(float) != 0) {
+    throw std::runtime_error("index: non-fp32 self/out (nbytes != numel * 4)");
+  }
+  // Index is the int32 downcast of the int64 advanced index (downcast_64_bit).
+  const size_t index_numel = index_tensor.nbytes / sizeof(int32_t);
+  if (index_tensor.nbytes != index_numel * sizeof(int32_t)) {
+    throw std::runtime_error("index: index buffer is not int32 (nbytes % 4)");
+  }
+  // out is one self element per index element (row_width == 1, 1D self).
+  if (out_numel != index_numel) {
+    throw std::runtime_error("index: out numel != index numel");
+  }
+
+  uint32_t num_elements = static_cast<uint32_t>(out_numel);
+  uint32_t wg_size = utils::clamp_workgroup_size(device, kIndexWorkgroupSizeX);
+  uint32_t workgroup_count =
+      utils::compute_1d_workgroup_count(device, num_elements, wg_size, "index");
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  IndexParams params = {};
+  params.numel = num_elements;
+
+  WGPUBuffer uniform_buffer =
+      utils::make_uniform(device, &params, sizeof(IndexParams));
+  graph.add_uniform_buffer_bytes(sizeof(IndexParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kIndexWGSL, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // self (read), out (read_write), index (read i32), params (uniform).
+  WGPUBindGroupLayoutEntry entries[4] = {};
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[3].binding = 3;
+  entries[3].visibility = WGPUShaderStage_Compute;
+  entries[3].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 4;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[4] = {};
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = self_tensor.buffer;
+  bg_entries[0].size = self_tensor.nbytes;
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = index_tensor.buffer;
+  bg_entries[2].size = index_tensor.nbytes;
+  bg_entries[3].binding = 3;
+  bg_entries[3].buffer = uniform_buffer;
+  bg_entries[3].size = sizeof(IndexParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 4;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // The bind group keeps the uniform buffer alive until release.
+  wgpuBufferRelease(uniform_buffer);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.index.Tensor, index_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/index/index.wgsl b/backends/webgpu/runtime/ops/index/index.wgsl
new file mode 100644
index 00000000000..b0fd6df81bf
--- /dev/null
+++ b/backends/webgpu/runtime/ops/index/index.wgsl
@@ -0,0 +1,22 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+@group(0) @binding(2) var<storage, read> index: array<i32>;
+
+struct Params {
+  numel: u32,
+}
+@group(0) @binding(3) var<uniform> params: Params;
+
+override wg_size: u32 = 64;
+
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= params.numel) {
+        return;
+    }
+
+    // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl).
+    let i = index[out_bufi];
+    output[out_bufi] = input[u32(i)];
+}
diff --git a/backends/webgpu/runtime/ops/index/index_wgsl.h b/backends/webgpu/runtime/ops/index/index_wgsl.h
new file mode 100644
index 00000000000..839a3b164bb
--- /dev/null
+++ b/backends/webgpu/runtime/ops/index/index_wgsl.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from index.wgsl - DO NOT EDIT.
+// wgsl-sha256: daed48e60bfcf2b7420d277576d794137d3bff383aef4f68464c98c8a7235c8e
+inline constexpr const char* kIndexWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+@group(0) @binding(2) var<storage, read> index: array<i32>;
+
+struct Params {
+  numel: u32,
+}
+@group(0) @binding(3) var<uniform> params: Params;
+
+override wg_size: u32 = 64;
+
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= params.numel) {
+        return;
+    }
+
+    // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl).
+    let i = index[out_bufi];
+    output[out_bufi] = input[u32(i)];
+}
+)";
+
+inline constexpr uint32_t kIndexWorkgroupSizeX = 64;
+inline constexpr uint32_t kIndexWorkgroupSizeY = 1;
+inline constexpr uint32_t kIndexWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
index d56c721ce3e..67119472643 100644
--- a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
+++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
@@ -53,10 +53,17 @@ void view_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
 }
 
+// clone = flat copy; survives Vulkan RemoveRedundantOpsTransform in Llama 1B.
+void clone_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, memory_format?, out]; out = last value-id.
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
 } // namespace
 
 WEBGPU_REGISTER_OPERATORS {
   WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl);
+  WEBGPU_REGISTER_OP(aten.clone.default, clone_impl);
 }
 
 } // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
index 84b5349ef2d..38195535732 100644
--- a/backends/webgpu/scripts/test_webgpu_native_ci.sh
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 DISPATCH_ORDER_OK=1
 UPDATE_CACHE_DIR="/tmp/update_cache"
 UPDATE_CACHE_OK=1
+INDEX_DIR="/tmp/index"
+INDEX_OK=1
 EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
 EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
 EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
@@ -65,36 +67,36 @@ PREPACK_TIED_MODEL="/tmp/webgpu_prepack_tied_const.pte"
 PREPACK_TIED_GOLDEN="/tmp/webgpu_prepack_tied_const_golden.bin"
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models
+from executorch.backends.webgpu.test.ops.test_quantized_linear import export_all_quantized_linear_models
 export_all_quantized_linear_models('/tmp')
 " || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test"
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.embedding_q4gsw.test_embedding_q4gsw import export_embedding_q4gsw_model
+from executorch.backends.webgpu.test.ops.test_embedding_q4gsw import export_embedding_q4gsw_model
 export_embedding_q4gsw_model('${EMBEDDING_MODEL}', '${EMBEDDING_GOLDEN}', '${EMBEDDING_INDICES}')
 export_embedding_q4gsw_model('${EMBEDDING_LLAMA1B_MODEL}', '${EMBEDDING_LLAMA1B_GOLDEN}', '${EMBEDDING_LLAMA1B_INDICES}', 'llama1b')
 " || echo "WARN: embedding_q4gsw export failed; embedding configs will FAIL in webgpu_native_test"
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.rope.test_rope import export_rope_model
+from executorch.backends.webgpu.test.ops.test_rope import export_rope_model
 export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}')
 export_rope_model('${ROPE_DECODE_MODEL}', '${ROPE_DECODE_XQ_GOLDEN}', '${ROPE_DECODE_XK_GOLDEN}', 'decode')
 " || echo "WARN: rope export failed; apply_rotary_emb configs will FAIL in webgpu_native_test"
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model
+from executorch.backends.webgpu.test.ops.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model
 export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}')
 export_prepack_two_const_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}')
 export_prepack_tied_const_model('${PREPACK_TIED_MODEL}', '${PREPACK_TIED_GOLDEN}')
 " || echo "WARN: prepack export failed; prepack configs will FAIL in webgpu_native_test"
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
+from executorch.backends.webgpu.test.ops.test_dispatch_order import export_dispatch_order_cases
 export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
 " || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; }
 
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.sdpa.test_update_cache import (
+from executorch.backends.webgpu.test.ops.test_update_cache import (
     export_update_cache_cases,
     export_update_cache_replay,
     export_update_cache_negative,
@@ -104,10 +106,15 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}')
 export_update_cache_negative('${UPDATE_CACHE_DIR}')
 " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; }
 
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models
+export_all_index_models('${INDEX_DIR}')
+" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; }
+
 # Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in
 # webgpu_native_test below (precise per-config error), so don't exit/mask here.
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import (
+from executorch.backends.webgpu.test.ops.test_sdpa import (
     export_all_sdpa_models,
     export_replay_sequences,
     export_dynamic_decode,
@@ -136,7 +143,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 # ── Build + run every native test target that exists in this tree ────────────
-TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
+TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test)
 BIN_DIR="${BUILD_DIR}/backends/webgpu"
 
 # Which targets are defined depends on which diffs are landed (native_test +
@@ -201,6 +208,9 @@ fi
 if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
   "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
 fi
+if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then
+  "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}"
+fi
 [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
 
 echo "=== WebGPU native tests on Dawn: all run targets passed ==="
diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS
index 9008f32cd2c..d4f7046f0bd 100644
--- a/backends/webgpu/test/TARGETS
+++ b/backends/webgpu/test/TARGETS
@@ -7,7 +7,20 @@ oncall("executorch")
 python_unittest(
     name = "test_add",
     srcs = [
-        "ops/add/test_add.py",
+        "ops/test_add.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/vulkan/partitioner:vulkan_partitioner",
+        "//executorch/backends/vulkan:vulkan_preprocess",
+        "//executorch/exir:lib",
+    ],
+)
+
+python_unittest(
+    name = "test_index",
+    srcs = [
+        "ops/index/test_index.py",
     ],
     deps = [
         "//caffe2:torch",
diff --git a/backends/webgpu/test/native/test_index.cpp b/backends/webgpu/test/native/test_index.cpp
new file mode 100644
index 00000000000..aed24c0a796
--- /dev/null
+++ b/backends/webgpu/test/native/test_index.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace executorch::backends::webgpu;
+using namespace executorch::extension;
+using namespace executorch::runtime;
+
+namespace {
+
+// Names mirror test_index.py CONFIGS (self/idx/golden bins written per case).
+constexpr const char* kIndexCases[] = {
+    "index_n16_m5",
+    "index_n8_rev",
+    "index_n32_m3",
+    "index_n4_rep",
+};
+
+std::vector<float> read_f32_bin(const std::string& path) {
+  std::ifstream f(path, std::ios::binary | std::ios::ate);
+  if (!f) {
+    return {};
+  }
+  const size_t bytes =
+      static_cast<size_t>(f.tellg()) / sizeof(float) * sizeof(float);
+  f.seekg(0);
+  std::vector<float> data(bytes / sizeof(float));
+  f.read(
+      reinterpret_cast<char*>(data.data()),
+      static_cast<std::streamsize>(bytes));
+  return data;
+}
+
+std::vector<int32_t> read_i32_bin(const std::string& path) {
+  std::ifstream f(path, std::ios::binary | std::ios::ate);
+  if (!f) {
+    return {};
+  }
+  const size_t bytes =
+      static_cast<size_t>(f.tellg()) / sizeof(int32_t) * sizeof(int32_t);
+  f.seekg(0);
+  std::vector<int32_t> data(bytes / sizeof(int32_t));
+  f.read(
+      reinterpret_cast<char*>(data.data()),
+      static_cast<std::streamsize>(bytes));
+  return data;
+}
+
+bool run_case(const std::string& dir, const char* name) {
+  printf("\n--- Test: %s ---\n", name);
+  const std::string base = dir + "/" + name;
+  std::vector<float> self_data = read_f32_bin(base + ".self.bin");
+  std::vector<int32_t> idx32 = read_i32_bin(base + ".idx.bin");
+  std::vector<float> golden = read_f32_bin(base + ".golden.bin");
+  if (self_data.empty() || idx32.empty() || golden.empty()) {
+    printf("FAIL: could not read self/idx/golden for %s\n", name);
+    return false;
+  }
+
+  Module module(base + ".pte");
+  if (module.load_forward() != Error::Ok) {
+    printf("FAIL: could not load %s.pte\n", name);
+    return false;
+  }
+
+  const int32_t n = static_cast<int32_t>(self_data.size());
+  const int32_t m = static_cast<int32_t>(idx32.size());
+  auto x = make_tensor_ptr({n}, std::vector<float>(self_data));
+  // int64 at the program boundary; copy_inputs narrows to the int32 buffer.
+  std::vector<int64_t> idx64(idx32.begin(), idx32.end());
+  auto idx = make_tensor_ptr({m}, std::vector<int64_t>(idx64));
+
+  auto result = module.forward({EValue(x), EValue(idx)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+
+  const auto& outputs = result.get();
+  // index.Tensor has exactly one output of shape [num_indices]; fail loud else.
+  if (outputs.size() != 1 || !outputs[0].isTensor()) {
+    printf("FAIL: expected exactly one tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.dim() != 1 || out_tensor.size(0) != m) {
+    printf(
+        "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n",
+        (int)out_tensor.dim(),
+        (int)(out_tensor.dim() == 1 ? out_tensor.size(0) : -1),
+        m);
+    return false;
+  }
+  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
+    printf(
+        "FAIL: output numel %zu != golden %zu\n",
+        (size_t)out_tensor.numel(),
+        golden.size());
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_abs_err = 0.0f;
+  float max_rel_err = 0.0f;
+  for (size_t i = 0; i < golden.size(); i++) {
+    const float abs_err = std::abs(out_data[i] - golden[i]);
+    max_abs_err = std::max(max_abs_err, abs_err);
+    const float denom = std::max(std::abs(golden[i]), 1e-6f);
+    max_rel_err = std::max(max_rel_err, abs_err / denom);
+  }
+  printf(
+      "Max abs error: %e   Max rel error: %e (%zu elements)\n",
+      max_abs_err,
+      max_rel_err,
+      golden.size());
+  if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) {
+    printf("FAIL: %s exceeds tolerance 1e-3\n", name);
+    return false;
+  }
+  printf("PASS: %s\n", name);
+  return true;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  std::string dir = "/tmp/index";
+  if (argc > 1) {
+    dir = argv[1];
+  }
+  if (const char* env = std::getenv("WEBGPU_INDEX_DIR")) {
+    dir = env;
+  }
+
+  WebGPUContext ctx;
+  try {
+    ctx = create_webgpu_context();
+  } catch (const std::exception& e) {
+    printf("SKIP: %s\n", e.what());
+    return 0;
+  }
+  set_default_webgpu_context(&ctx);
+  printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());
+
+  bool ok = true;
+  for (const char* name : kIndexCases) {
+    ok = run_case(dir, name) && ok;
+  }
+
+  set_default_webgpu_context(nullptr);
+  destroy_webgpu_context(ctx);
+
+  if (!ok) {
+    return 1;
+  }
+  printf("\nAll index tests passed\n");
+  return 0;
+}
diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index f1ab0c6cca4..1b6b0fa7085 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -25,16 +25,14 @@
     WebGPUTestSuite,
     XS,
 )
-from executorch.backends.webgpu.test.ops.add.test_add import (
+from executorch.backends.webgpu.test.ops.test_add import (
     AddChainedModule,
     AddModule,
     AddSelfModule,
 )
-from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import (
-    _CASES,
-    _linspace_weight,
-    _ramp,
-    RmsNormModule,
+from executorch.backends.webgpu.test.ops.test_cat import (
+    CatModule,
+    CONFIGS as _CAT_CONFIGS,
 )
 from executorch.backends.webgpu.test.ops.test_mul import (
     CONFIGS as _MUL_CONFIGS,
@@ -44,6 +42,12 @@
     CONFIGS as _PERMUTE_CONFIGS,
     PermuteModule,
 )
+from executorch.backends.webgpu.test.ops.test_rms_norm import (
+    _CASES,
+    _linspace_weight,
+    _ramp,
+    RmsNormModule,
+)
 from executorch.backends.webgpu.test.ops.test_select import (
     CONFIGS as _SELECT_CONFIGS,
     SelectModule,
@@ -88,7 +92,7 @@ def _add_factory(variant: str = "regular") -> torch.nn.Module:
 @register_op_test("add")
 def _add_suite() -> WebGPUTestSuite:
     # Same-shape numeric coverage only: broadcast adds stay export-smoke in
-    # ops/add/test_add.py because the kernel can't broadcast.
+    # ops/test_add.py because the kernel can't broadcast.
     return WebGPUTestSuite(
         module_factory=_add_factory,
         cases=[
@@ -247,3 +251,16 @@ def _permute_suite() -> WebGPUTestSuite:
         ],
         golden_dtype="float32",  # permutation reorders values; fp64 bit-identical
     )
+
+
+@register_op_test("cat")
+def _cat_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (list_of_input_shapes, dim). Variadic input count per case.
+    return WebGPUTestSuite(
+        module_factory=lambda dim: CatModule(dim),
+        cases=[
+            Case(name=n, construct={"dim": dim}, inputs=tuple(shapes))
+            for n, (shapes, dim) in _CAT_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # concatenation copies values; fp64 bit-identical
+    )
diff --git a/backends/webgpu/test/ops/embedding_q4gsw/__init__.py b/backends/webgpu/test/ops/embedding_q4gsw/__init__.py
deleted file mode 100644
index 2e41cd717f6..00000000000
--- a/backends/webgpu/test/ops/embedding_q4gsw/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/dispatch_order/__init__.py b/backends/webgpu/test/ops/index/__init__.py
similarity index 100%
rename from backends/webgpu/test/ops/dispatch_order/__init__.py
rename to backends/webgpu/test/ops/index/__init__.py
diff --git a/backends/webgpu/test/ops/index/test_index.py b/backends/webgpu/test/ops/index/test_index.py
new file mode 100644
index 00000000000..b5e83dcf329
--- /dev/null
+++ b/backends/webgpu/test/ops/index/test_index.py
@@ -0,0 +1,109 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.index.Tensor` export + goldens for the WebGPU backend.
+
+Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner --
+the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see
+op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index
+serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat
+indices make a wrong-gather bug visible. Each config writes `index_<name>.pte`,
+`index_<name>.self.bin` (fp32 self), `index_<name>.idx.bin` (int32 index), and
+`index_<name>.golden.bin` so the native `test_index` self-discovers them.
+"""
+
+import os
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (self_len, index_values)
+CONFIGS = {
+    "n16_m5": (16, [0, 15, 7, 7, 2]),
+    "n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]),
+    "n32_m3": (32, [31, 0, 16]),
+    "n4_rep": (4, [2, 2, 2, 2, 0, 1]),
+}
+
+
+class IndexModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+        return x[idx]
+
+
+def _inputs(self_len, index_values):
+    # Distinct self values so a wrong-index gather is visible.
+    x = torch.arange(self_len, dtype=torch.float32) * 3.0 + 0.5
+    idx = torch.tensor(index_values, dtype=torch.int64)
+    return x, idx
+
+
+def _lower(x, idx):
+    ep = torch.export.export(IndexModule().eval(), (x, idx))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _export(x, idx):
+    return _lower(x, idx).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class TestIndex(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (n, iv) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(*_inputs(n, iv))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (index {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "index.Tensor"),
+                    f"index.Tensor not delegated (fell back to CPU) for {name}",
+                )
+
+    def test_golden_matches_eager(self) -> None:
+        for name, (n, iv) in CONFIGS.items():
+            with self.subTest(name=name):
+                x, idx = _inputs(n, iv)
+                torch.testing.assert_close(IndexModule()(x, idx), x[idx])
+
+
+def export_all_index_models(out_dir: str) -> None:
+    """Write index_<name>.pte + .self/.idx/.golden.bin for every config."""
+    os.makedirs(out_dir, exist_ok=True)
+    for name, (n, iv) in CONFIGS.items():
+        x, idx = _inputs(n, iv)
+        golden = x[idx].contiguous().detach().numpy().astype("<f4")
+        et = _export(x, idx)
+        base = os.path.join(out_dir, f"index_{name}")
+        with open(base + ".pte", "wb") as f:
+            f.write(et.buffer)
+        x.numpy().astype("<f4").tofile(base + ".self.bin")
+        idx.numpy().astype("<i4").tofile(base + ".idx.bin")
+        golden.tofile(base + ".golden.bin")
+        print(f"Exported {base}.pte; self {n} -> golden {golden.size} floats")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/webgpu/test/ops/prepack/__init__.py b/backends/webgpu/test/ops/prepack/__init__.py
deleted file mode 100644
index 2e41cd717f6..00000000000
--- a/backends/webgpu/test/ops/prepack/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/quantized_linear/__init__.py b/backends/webgpu/test/ops/quantized_linear/__init__.py
deleted file mode 100644
index 2e41cd717f6..00000000000
--- a/backends/webgpu/test/ops/quantized_linear/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/rms_norm/__init__.py b/backends/webgpu/test/ops/rms_norm/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/backends/webgpu/test/ops/rope/__init__.py b/backends/webgpu/test/ops/rope/__init__.py
deleted file mode 100644
index 2e41cd717f6..00000000000
--- a/backends/webgpu/test/ops/rope/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/sdpa/__init__.py b/backends/webgpu/test/ops/sdpa/__init__.py
deleted file mode 100644
index 2e41cd717f6..00000000000
--- a/backends/webgpu/test/ops/sdpa/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/test_add.py
similarity index 100%
rename from backends/webgpu/test/ops/add/test_add.py
rename to backends/webgpu/test/ops/test_add.py
diff --git a/backends/webgpu/test/ops/test_cat.py b/backends/webgpu/test/ops/test_cat.py
new file mode 100644
index 00000000000..d0d714fd934
--- /dev/null
+++ b/backends/webgpu/test/ops/test_cat.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.cat.default` module + configs for the WebGPU op-test framework.
+
+`CatModule` + `CONFIGS` are imported by `cases.py` to drive the declarative op-test
+suite. `CatTest` is the export-delegation smoke test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (list_of_input_shapes, dim)
+CONFIGS = {
+    "dim0_3": ([(2, 3), (2, 3), (2, 3)], 0),
+    "dim1_2": ([(1, 4, 8), (1, 4, 8)], 1),
+    "dim2_3": ([(2, 3, 4), (2, 3, 4), (2, 3, 4)], 2),
+    "uneven": ([(2, 1, 4), (2, 3, 4), (2, 2, 4)], 1),
+}
+
+
+class CatModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, *xs: torch.Tensor) -> torch.Tensor:
+        return torch.cat(xs, self.dim)
+
+
+def _det_inputs(shapes):
+    # Distinct value range per input so a cross-contamination bug is visible.
+    inputs = []
+    base = 0.0
+    for sh in shapes:
+        n = 1
+        for s in sh:
+            n *= s
+        inputs.append(torch.arange(base, base + n, dtype=torch.float32).reshape(sh))
+        base += 1000.0
+    return tuple(inputs)
+
+
+def _lower(dim, xs):
+    ep = torch.export.export(CatModule(dim).eval(), xs)
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class CatTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shapes, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_inputs(shapes))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et), f"Expected a VulkanBackend delegate (cat {name})"
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "cat"),
+                    f"cat not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py b/backends/webgpu/test/ops/test_dispatch_order.py
similarity index 97%
rename from backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py
rename to backends/webgpu/test/ops/test_dispatch_order.py
index fbb13ff6426..08454ef0a31 100644
--- a/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py
+++ b/backends/webgpu/test/ops/test_dispatch_order.py
@@ -17,7 +17,7 @@
 
 import torch
 from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
-from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import RmsNormModule
+from executorch.backends.webgpu.test.ops.test_rms_norm import RmsNormModule
 from executorch.backends.webgpu.test.tester import WEBGPU_SUPPORTED_OPS
 from executorch.exir import to_edge_transform_and_lower
 
diff --git a/backends/webgpu/test/ops/embedding_q4gsw/test_embedding_q4gsw.py b/backends/webgpu/test/ops/test_embedding_q4gsw.py
similarity index 100%
rename from backends/webgpu/test/ops/embedding_q4gsw/test_embedding_q4gsw.py
rename to backends/webgpu/test/ops/test_embedding_q4gsw.py
diff --git a/backends/webgpu/test/ops/prepack/test_prepack.py b/backends/webgpu/test/ops/test_prepack.py
similarity index 100%
rename from backends/webgpu/test/ops/prepack/test_prepack.py
rename to backends/webgpu/test/ops/test_prepack.py
diff --git a/backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py b/backends/webgpu/test/ops/test_quantized_linear.py
similarity index 100%
rename from backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py
rename to backends/webgpu/test/ops/test_quantized_linear.py
diff --git a/backends/webgpu/test/ops/rms_norm/test_rms_norm.py b/backends/webgpu/test/ops/test_rms_norm.py
similarity index 100%
rename from backends/webgpu/test/ops/rms_norm/test_rms_norm.py
rename to backends/webgpu/test/ops/test_rms_norm.py
diff --git a/backends/webgpu/test/ops/rope/test_rope.py b/backends/webgpu/test/ops/test_rope.py
similarity index 100%
rename from backends/webgpu/test/ops/rope/test_rope.py
rename to backends/webgpu/test/ops/test_rope.py
diff --git a/backends/webgpu/test/ops/sdpa/test_sdpa.py b/backends/webgpu/test/ops/test_sdpa.py
similarity index 100%
rename from backends/webgpu/test/ops/sdpa/test_sdpa.py
rename to backends/webgpu/test/ops/test_sdpa.py
diff --git a/backends/webgpu/test/ops/sdpa/test_update_cache.py b/backends/webgpu/test/ops/test_update_cache.py
similarity index 100%
rename from backends/webgpu/test/ops/sdpa/test_update_cache.py
rename to backends/webgpu/test/ops/test_update_cache.py
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 5ea465e853b..2fd1dea1a52 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -25,8 +25,8 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v
 # ── Step 1: Python export tests ──────────────────────────────────────────────
 
 echo "=== Step 1: Run Python export tests ==="
-$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v
-$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v
+$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/test_add.py" -v
+$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/test_rms_norm.py" -v
 
 # ── Step 2: Export .pte model ─────────────────────────────────────────────────
 
@@ -35,38 +35,38 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 PTE_UPDATE_CACHE_MODEL="/tmp/webgpu_update_cache_test.pte"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
+from executorch.backends.webgpu.test.ops.test_dispatch_order import export_dispatch_order_cases
 export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
 "
 
 echo "=== Export update_cache model ==="
 UPDATE_CACHE_OK=1
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.sdpa.test_update_cache import export_update_cache_model
+from executorch.backends.webgpu.test.ops.test_update_cache import export_update_cache_model
 export_update_cache_model('${PTE_UPDATE_CACHE_MODEL}')
 " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; }
 
 echo "=== Export SDPA sweep models (sdpa_<name>.pte + .golden.bin to /tmp) ==="
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_all_sdpa_models
+from executorch.backends.webgpu.test.ops.test_sdpa import export_all_sdpa_models
 export_all_sdpa_models('/tmp')
 " || echo "WARN: sdpa export failed; the native test self-skips configs whose .pte is absent"
 
 echo "=== Export SDPA replay sequences (sdpa_<seq>_step<t>_S<S>_pos<p>.* to /tmp) ==="
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_replay_sequences
+from executorch.backends.webgpu.test.ops.test_sdpa import export_replay_sequences
 export_replay_sequences('/tmp')
 " || echo "WARN: sdpa replay export failed; the native test self-skips absent sequences"
 
 echo "=== Export SDPA dynamic-input_pos decode (sdpa_dyn_<name>.* to /tmp) ==="
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_dynamic_decode
+from executorch.backends.webgpu.test.ops.test_sdpa import export_dynamic_decode
 export_dynamic_decode('/tmp')
 " || echo "WARN: sdpa dynamic export failed; the native test self-skips when absent"
 
 echo "=== Export SDPA in-graph-cache decode (sdpa_incache_<name>.* to /tmp) ==="
 $PYTHON_EXECUTABLE -c "
-from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_incache_decode
+from executorch.backends.webgpu.test.ops.test_sdpa import export_incache_decode
 export_incache_decode('/tmp')
 " || echo "WARN: sdpa in-graph-cache export failed; the native test self-skips when absent"
 
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index b7f38246f8c..ed23f01ef8e 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -29,6 +29,7 @@
     exir_ops.edge.aten.unsqueeze_copy.default,
     exir_ops.edge.aten.slice_copy.Tensor,
     exir_ops.edge.aten.permute_copy.default,
+    exir_ops.edge.aten.cat.default,
 ]