From 5168df781b1b6be9211e2358c5429b3dc8bbe092 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Sat, 27 Jun 2026 13:49:03 -0700
Subject: [PATCH 1/3] [ExecuTorch][WebGPU] Add clone op (aten.clone.default)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20463

`aten.clone.default` is a pure flat copy on the buffer-only WebGPU backend, identical to `view_copy`: `clone_impl` reuses the existing `add_flat_copy` helper (`output[i] = input[i]`) and registers a handler under `aten.clone.default`. No new shader, generated WGSL header, or CMake source — it shares the `view_copy` flat-copy compute pipeline. Required for end-to-end Llama 3.2 1B (4-bit, KV cache): the exported model serializes 2 `aten.clone.default` ops into its runtime operator chain (the RoPE-frequency clones reused across all 16 transformer layers), so without a handler the partition graph-breaks at those nodes. Mirrors the Vulkan delegate, which registers the same op and routes a buffer clone to a flat view-copy.
ghstack-source-id: 397534700
@exported-using-ghexport

@diff-train-skip-merge

Differential Revision: [D109477717](https://our.internmc.facebook.com/intern/diff/D109477717/)
---
 backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
index d56c721ce3e..67119472643 100644
--- a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
+++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
@@ -53,10 +53,17 @@ void view_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
   add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
 }
 
+// clone = flat copy; survives Vulkan RemoveRedundantOpsTransform in Llama 1B.
+void clone_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, memory_format?, out]; out = last value-id.
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
 } // namespace
 
 WEBGPU_REGISTER_OPERATORS {
   WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl);
+  WEBGPU_REGISTER_OP(aten.clone.default, clone_impl);
 }
 
 } // namespace executorch::backends::webgpu

From a47bb14f1b4d2d6014cd01cc3c1293639c3bb2bb Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Sat, 27 Jun 2026 13:49:04 -0700
Subject: [PATCH 2/3] [ExecuTorch][WebGPU] Add aten.index.Tensor (1D-self
 gather)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20464

Adds the WebGPU delegate handler for aten.index.Tensor, the 1D-self advanced-index
gather out[i] = self[index[i]] (output shape == index shape). This is the form the
VulkanPartitioner delegates -- it requires a 1D self and exactly one non-None index
(op_registry.py); 2D mask/freqs gathers stay on CPU. It mirrors the Vulkan delegate's
index_tensor op (IndexTensor.cpp + index_tensor_buffer.glsl) as a single compute
dispatch over the output elements, each reading the int32 index and gathering the
corresponding fp32 self element.

The op is composed as:
- index.wgsl: one workgroup-strided pass, out[i] = self[u32(index[i])], guarded by a
  numel bound; buffer-only, fp32 self/out, int32 index, 1D dispatch via the shared
  WebGPUUtils helpers (clamp workgroup size + 1D count).
- Index.cpp: validates the args (self/out tensors; indices ValueList with exactly one
  index tensor; fp32 self/out; int32 index; out numel == index numel), failing loud on
  any violation, then records the dispatch. row_width is dropped (always 1 for 1D self).
ghstack-source-id: 397756251
@exported-using-ghexport

@diff-train-skip-merge

Differential Revision: [D109478967](https://our.internmc.facebook.com/intern/diff/D109478967/)
---
 backends/webgpu/CMakeLists.txt                |   1 +
 backends/webgpu/runtime/ops/index/Index.cpp   | 189 ++++++++++++++++++
 backends/webgpu/runtime/ops/index/index.wgsl  |  22 ++
 .../webgpu/runtime/ops/index/index_wgsl.h     |  46 +++++
 4 files changed, 258 insertions(+)
 create mode 100644 backends/webgpu/runtime/ops/index/Index.cpp
 create mode 100644 backends/webgpu/runtime/ops/index/index.wgsl
 create mode 100644 backends/webgpu/runtime/ops/index/index_wgsl.h

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 50aea6a469c..cbe36dade70 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -50,6 +50,7 @@ set(WEBGPU_SRCS
     runtime/ops/slice/Slice.cpp
     runtime/ops/permute/Permute.cpp
     runtime/ops/cat/Cat.cpp
+    runtime/ops/index/Index.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/ops/index/Index.cpp b/backends/webgpu/runtime/ops/index/Index.cpp
new file mode 100644
index 00000000000..0c33d616c03
--- /dev/null
+++ b/backends/webgpu/runtime/ops/index/Index.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/index/index_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+struct IndexParams {
+  uint32_t numel;
+  uint32_t _pad[3]; // pad to 16 bytes
+};
+
+// aten.index.Tensor 1D-self gather out[i]=self[index[i]] (mirrors Vulkan).
+void index_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, indices (Tensor?[] -> ValueList), out].
+  const int self_id = args.at(0);
+  const int list_id = args.at(1);
+  const int out_id = args.at(args.size() - 1);
+
+  if (graph.get_value_type(self_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("index: self arg is not a tensor");
+  }
+  if (graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("index: out arg is not a tensor");
+  }
+  if (graph.get_value_type(list_id) != WebGPUGraph::ValueType::ValueList) {
+    throw std::runtime_error("index: indices arg is not a ValueList");
+  }
+
+  // Exactly one non-Null index tensor (mirror Vulkan IndexTensor.cpp:67-69).
+  const std::vector<int>& ids = graph.get_value_list(list_id);
+  int index_id = -1;
+  for (int id : ids) {
+    if (graph.get_value_type(id) == WebGPUGraph::ValueType::Null) {
+      continue;
+    }
+    if (graph.get_value_type(id) != WebGPUGraph::ValueType::Tensor) {
+      throw std::runtime_error("index: index list element is not a tensor");
+    }
+    if (index_id != -1) {
+      throw std::runtime_error("index: expected exactly one index tensor");
+    }
+    index_id = id;
+  }
+  if (index_id == -1) {
+    throw std::runtime_error("index: no index tensor provided");
+  }
+
+  WGPUDevice device = graph.device();
+
+  const auto& self_tensor = graph.get_tensor(self_id);
+  const auto& index_tensor = graph.get_tensor(index_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+
+  if (self_tensor.buffer == nullptr || index_tensor.buffer == nullptr ||
+      out_tensor.buffer == nullptr) {
+    throw std::runtime_error("index: null buffer binding");
+  }
+  // 1D-self gather: the kernel flat-indexes self by a scalar; fail loud on a
+  // higher-rank self (mirrors Vulkan index_tensor_buffer's 1D-self contract).
+  if (self_tensor.dims.size() != 1) {
+    throw std::runtime_error("index: only 1D self is supported");
+  }
+
+  const size_t out_numel = out_tensor.nbytes / sizeof(float);
+  if (out_tensor.nbytes != out_numel * sizeof(float) ||
+      self_tensor.nbytes % sizeof(float) != 0) {
+    throw std::runtime_error("index: non-fp32 self/out (nbytes != numel * 4)");
+  }
+  // Index is the int32 downcast of the int64 advanced index (downcast_64_bit).
+  const size_t index_numel = index_tensor.nbytes / sizeof(int32_t);
+  if (index_tensor.nbytes != index_numel * sizeof(int32_t)) {
+    throw std::runtime_error("index: index buffer is not int32 (nbytes % 4)");
+  }
+  // out is one self element per index element (row_width == 1, 1D self).
+  if (out_numel != index_numel) {
+    throw std::runtime_error("index: out numel != index numel");
+  }
+
+  uint32_t num_elements = static_cast<uint32_t>(out_numel);
+  uint32_t wg_size = utils::clamp_workgroup_size(device, kIndexWorkgroupSizeX);
+  uint32_t workgroup_count =
+      utils::compute_1d_workgroup_count(device, num_elements, wg_size, "index");
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  IndexParams params = {};
+  params.numel = num_elements;
+
+  WGPUBuffer uniform_buffer =
+      utils::make_uniform(device, &params, sizeof(IndexParams));
+  graph.add_uniform_buffer_bytes(sizeof(IndexParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kIndexWGSL, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // self (read), out (read_write), index (read i32), params (uniform).
+  WGPUBindGroupLayoutEntry entries[4] = {};
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[3].binding = 3;
+  entries[3].visibility = WGPUShaderStage_Compute;
+  entries[3].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 4;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[4] = {};
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = self_tensor.buffer;
+  bg_entries[0].size = self_tensor.nbytes;
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = index_tensor.buffer;
+  bg_entries[2].size = index_tensor.nbytes;
+  bg_entries[3].binding = 3;
+  bg_entries[3].buffer = uniform_buffer;
+  bg_entries[3].size = sizeof(IndexParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 4;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // The bind group keeps the uniform buffer alive until release.
+  wgpuBufferRelease(uniform_buffer);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.index.Tensor, index_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/index/index.wgsl b/backends/webgpu/runtime/ops/index/index.wgsl
new file mode 100644
index 00000000000..b0fd6df81bf
--- /dev/null
+++ b/backends/webgpu/runtime/ops/index/index.wgsl
@@ -0,0 +1,22 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+@group(0) @binding(2) var<storage, read> index: array<i32>;
+
+struct Params {
+  numel: u32,
+}
+@group(0) @binding(3) var<uniform> params: Params;
+
+override wg_size: u32 = 64;
+
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= params.numel) {
+        return;
+    }
+
+    // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl).
+    let i = index[out_bufi];
+    output[out_bufi] = input[u32(i)];
+}
diff --git a/backends/webgpu/runtime/ops/index/index_wgsl.h b/backends/webgpu/runtime/ops/index/index_wgsl.h
new file mode 100644
index 00000000000..839a3b164bb
--- /dev/null
+++ b/backends/webgpu/runtime/ops/index/index_wgsl.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from index.wgsl - DO NOT EDIT.
+// wgsl-sha256: daed48e60bfcf2b7420d277576d794137d3bff383aef4f68464c98c8a7235c8e
+inline constexpr const char* kIndexWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+@group(0) @binding(2) var<storage, read> index: array<i32>;
+
+struct Params {
+  numel: u32,
+}
+@group(0) @binding(3) var<uniform> params: Params;
+
+override wg_size: u32 = 64;
+
+@compute @workgroup_size(wg_size)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= params.numel) {
+        return;
+    }
+
+    // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl).
+    let i = index[out_bufi];
+    output[out_bufi] = input[u32(i)];
+}
+)";
+
+inline constexpr uint32_t kIndexWorkgroupSizeX = 64;
+inline constexpr uint32_t kIndexWorkgroupSizeY = 1;
+inline constexpr uint32_t kIndexWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu

From a598a84e8a0e1ab7185ceb9870e15677a1c0157a Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Sat, 27 Jun 2026 13:49:04 -0700
Subject: [PATCH 3/3] [ExecuTorch][WebGPU] aten.index.Tensor test suite (export
 + native golden)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20465

Adds the test suite for the aten.index.Tensor op (stacked on the op diff):
- test/ops/index/test_index.py: exports a module computing x[idx] through
  VulkanPartitioner for four configs (reorder/repeat indices over distinct self
  values, so a wrong-gather is visible), asserts a VulkanBackend delegate with
  index.Tensor absorbed (not a CPU fallback), and writes per-config .pte +
  .self/.idx/.golden.bin.
- test/native/test_index.cpp: a standalone Dawn binary that loads each .pte, feeds
  self (fp32) + index (int64 at the program boundary, narrowed to the int32 buffer)
  and compares the gather against the torch golden at 1e-3, with a single-output
  shape guard.
- Wired into CMake (webgpu_index_test), test/TARGETS (python_unittest test_index),
  and the Dawn native CI script.
ghstack-source-id: 397763261
@exported-using-ghexport

@diff-train-skip-merge

Differential Revision: [D109479000](https://our.internmc.facebook.com/intern/diff/D109479000/)
---
 backends/webgpu/CMakeLists.txt                |   1 +
 .../webgpu/scripts/test_webgpu_native_ci.sh   |  12 +-
 backends/webgpu/test/TARGETS                  |  13 ++
 backends/webgpu/test/native/test_index.cpp    | 174 ++++++++++++++++++
 backends/webgpu/test/ops/index/__init__.py    |   0
 backends/webgpu/test/ops/index/test_index.py  | 109 +++++++++++
 6 files changed, 308 insertions(+), 1 deletion(-)
 create mode 100644 backends/webgpu/test/native/test_index.cpp
 create mode 100644 backends/webgpu/test/ops/index/__init__.py
 create mode 100644 backends/webgpu/test/ops/index/test_index.py

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index cbe36dade70..c3d9387942a 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -194,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
     target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions)
     set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17)
   endif()
+  add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp)
 endif()
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
index b2eb56a505e..38195535732 100644
--- a/backends/webgpu/scripts/test_webgpu_native_ci.sh
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 DISPATCH_ORDER_OK=1
 UPDATE_CACHE_DIR="/tmp/update_cache"
 UPDATE_CACHE_OK=1
+INDEX_DIR="/tmp/index"
+INDEX_OK=1
 EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte"
 EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin"
 EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
@@ -104,6 +106,11 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}')
 export_update_cache_negative('${UPDATE_CACHE_DIR}')
 " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; }
 
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models
+export_all_index_models('${INDEX_DIR}')
+" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; }
+
 # Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in
 # webgpu_native_test below (precise per-config error), so don't exit/mask here.
 $PYTHON_EXECUTABLE -c "
@@ -136,7 +143,7 @@ cmake \
     "${EXECUTORCH_ROOT}"
 
 # ── Build + run every native test target that exists in this tree ────────────
-TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test)
+TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test)
 BIN_DIR="${BUILD_DIR}/backends/webgpu"
 
 # Which targets are defined depends on which diffs are landed (native_test +
@@ -201,6 +208,9 @@ fi
 if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
   "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
 fi
+if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then
+  "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}"
+fi
 [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
 
 echo "=== WebGPU native tests on Dawn: all run targets passed ==="
diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS
index 6bf76a525e9..d4f7046f0bd 100644
--- a/backends/webgpu/test/TARGETS
+++ b/backends/webgpu/test/TARGETS
@@ -17,6 +17,19 @@ python_unittest(
     ],
 )
 
+python_unittest(
+    name = "test_index",
+    srcs = [
+        "ops/index/test_index.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/vulkan/partitioner:vulkan_partitioner",
+        "//executorch/backends/vulkan:vulkan_preprocess",
+        "//executorch/exir:lib",
+    ],
+)
+
 runtime.python_library(
     name = "tester",
     srcs = ["tester.py"],
diff --git a/backends/webgpu/test/native/test_index.cpp b/backends/webgpu/test/native/test_index.cpp
new file mode 100644
index 00000000000..aed24c0a796
--- /dev/null
+++ b/backends/webgpu/test/native/test_index.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace executorch::backends::webgpu;
+using namespace executorch::extension;
+using namespace executorch::runtime;
+
+namespace {
+
+// Names mirror test_index.py CONFIGS (self/idx/golden bins written per case).
+constexpr const char* kIndexCases[] = {
+    "index_n16_m5",
+    "index_n8_rev",
+    "index_n32_m3",
+    "index_n4_rep",
+};
+
+std::vector<float> read_f32_bin(const std::string& path) {
+  std::ifstream f(path, std::ios::binary | std::ios::ate);
+  if (!f) {
+    return {};
+  }
+  const size_t bytes =
+      static_cast<size_t>(f.tellg()) / sizeof(float) * sizeof(float);
+  f.seekg(0);
+  std::vector<float> data(bytes / sizeof(float));
+  f.read(
+      reinterpret_cast<char*>(data.data()),
+      static_cast<std::streamsize>(bytes));
+  return data;
+}
+
+std::vector<int32_t> read_i32_bin(const std::string& path) {
+  std::ifstream f(path, std::ios::binary | std::ios::ate);
+  if (!f) {
+    return {};
+  }
+  const size_t bytes =
+      static_cast<size_t>(f.tellg()) / sizeof(int32_t) * sizeof(int32_t);
+  f.seekg(0);
+  std::vector<int32_t> data(bytes / sizeof(int32_t));
+  f.read(
+      reinterpret_cast<char*>(data.data()),
+      static_cast<std::streamsize>(bytes));
+  return data;
+}
+
+bool run_case(const std::string& dir, const char* name) {
+  printf("\n--- Test: %s ---\n", name);
+  const std::string base = dir + "/" + name;
+  std::vector<float> self_data = read_f32_bin(base + ".self.bin");
+  std::vector<int32_t> idx32 = read_i32_bin(base + ".idx.bin");
+  std::vector<float> golden = read_f32_bin(base + ".golden.bin");
+  if (self_data.empty() || idx32.empty() || golden.empty()) {
+    printf("FAIL: could not read self/idx/golden for %s\n", name);
+    return false;
+  }
+
+  Module module(base + ".pte");
+  if (module.load_forward() != Error::Ok) {
+    printf("FAIL: could not load %s.pte\n", name);
+    return false;
+  }
+
+  const int32_t n = static_cast<int32_t>(self_data.size());
+  const int32_t m = static_cast<int32_t>(idx32.size());
+  auto x = make_tensor_ptr({n}, std::vector<float>(self_data));
+  // int64 at the program boundary; copy_inputs narrows to the int32 buffer.
+  std::vector<int64_t> idx64(idx32.begin(), idx32.end());
+  auto idx = make_tensor_ptr({m}, std::vector<int64_t>(idx64));
+
+  auto result = module.forward({EValue(x), EValue(idx)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+
+  const auto& outputs = result.get();
+  // index.Tensor has exactly one output of shape [num_indices]; fail loud else.
+  if (outputs.size() != 1 || !outputs[0].isTensor()) {
+    printf("FAIL: expected exactly one tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.dim() != 1 || out_tensor.size(0) != m) {
+    printf(
+        "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n",
+        (int)out_tensor.dim(),
+        (int)(out_tensor.dim() == 1 ? out_tensor.size(0) : -1),
+        m);
+    return false;
+  }
+  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
+    printf(
+        "FAIL: output numel %zu != golden %zu\n",
+        (size_t)out_tensor.numel(),
+        golden.size());
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_abs_err = 0.0f;
+  float max_rel_err = 0.0f;
+  for (size_t i = 0; i < golden.size(); i++) {
+    const float abs_err = std::abs(out_data[i] - golden[i]);
+    max_abs_err = std::max(max_abs_err, abs_err);
+    const float denom = std::max(std::abs(golden[i]), 1e-6f);
+    max_rel_err = std::max(max_rel_err, abs_err / denom);
+  }
+  printf(
+      "Max abs error: %e   Max rel error: %e (%zu elements)\n",
+      max_abs_err,
+      max_rel_err,
+      golden.size());
+  if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) {
+    printf("FAIL: %s exceeds tolerance 1e-3\n", name);
+    return false;
+  }
+  printf("PASS: %s\n", name);
+  return true;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  std::string dir = "/tmp/index";
+  if (argc > 1) {
+    dir = argv[1];
+  }
+  if (const char* env = std::getenv("WEBGPU_INDEX_DIR")) {
+    dir = env;
+  }
+
+  WebGPUContext ctx;
+  try {
+    ctx = create_webgpu_context();
+  } catch (const std::exception& e) {
+    printf("SKIP: %s\n", e.what());
+    return 0;
+  }
+  set_default_webgpu_context(&ctx);
+  printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());
+
+  bool ok = true;
+  for (const char* name : kIndexCases) {
+    ok = run_case(dir, name) && ok;
+  }
+
+  set_default_webgpu_context(nullptr);
+  destroy_webgpu_context(ctx);
+
+  if (!ok) {
+    return 1;
+  }
+  printf("\nAll index tests passed\n");
+  return 0;
+}
diff --git a/backends/webgpu/test/ops/index/__init__.py b/backends/webgpu/test/ops/index/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/backends/webgpu/test/ops/index/test_index.py b/backends/webgpu/test/ops/index/test_index.py
new file mode 100644
index 00000000000..b5e83dcf329
--- /dev/null
+++ b/backends/webgpu/test/ops/index/test_index.py
@@ -0,0 +1,109 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.index.Tensor` export + goldens for the WebGPU backend.
+
+Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner --
+the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see
+op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index
+serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat
+indices make a wrong-gather bug visible. Each config writes `index_<name>.pte`,
+`index_<name>.self.bin` (fp32 self), `index_<name>.idx.bin` (int32 index), and
+`index_<name>.golden.bin` so the native `test_index` self-discovers them.
+"""
+
+import os
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (self_len, index_values)
+CONFIGS = {
+    "n16_m5": (16, [0, 15, 7, 7, 2]),
+    "n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]),
+    "n32_m3": (32, [31, 0, 16]),
+    "n4_rep": (4, [2, 2, 2, 2, 0, 1]),
+}
+
+
+class IndexModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
+        return x[idx]
+
+
+def _inputs(self_len, index_values):
+    # Distinct self values so a wrong-index gather is visible.
+    x = torch.arange(self_len, dtype=torch.float32) * 3.0 + 0.5
+    idx = torch.tensor(index_values, dtype=torch.int64)
+    return x, idx
+
+
+def _lower(x, idx):
+    ep = torch.export.export(IndexModule().eval(), (x, idx))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _export(x, idx):
+    return _lower(x, idx).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class TestIndex(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (n, iv) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(*_inputs(n, iv))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (index {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "index.Tensor"),
+                    f"index.Tensor not delegated (fell back to CPU) for {name}",
+                )
+
+    def test_golden_matches_eager(self) -> None:
+        for name, (n, iv) in CONFIGS.items():
+            with self.subTest(name=name):
+                x, idx = _inputs(n, iv)
+                torch.testing.assert_close(IndexModule()(x, idx), x[idx])
+
+
+def export_all_index_models(out_dir: str) -> None:
+    """Write index_<name>.pte + .self/.idx/.golden.bin for every config."""
+    os.makedirs(out_dir, exist_ok=True)
+    for name, (n, iv) in CONFIGS.items():
+        x, idx = _inputs(n, iv)
+        golden = x[idx].contiguous().detach().numpy().astype("<f4")
+        et = _export(x, idx)
+        base = os.path.join(out_dir, f"index_{name}")
+        with open(base + ".pte", "wb") as f:
+            f.write(et.buffer)
+        x.numpy().astype("<f4").tofile(base + ".self.bin")
+        idx.numpy().astype("<i4").tofile(base + ".idx.bin")
+        golden.tofile(base + ".golden.bin")
+        print(f"Exported {base}.pte; self {n} -> golden {golden.size} floats")
+
+
+if __name__ == "__main__":
+    unittest.main()