From 5168df781b1b6be9211e2358c5429b3dc8bbe092 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Sat, 27 Jun 2026 13:49:03 -0700 Subject: [PATCH 1/3] [ExecuTorch][WebGPU] Add clone op (aten.clone.default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/20463 `aten.clone.default` is a pure flat copy on the buffer-only WebGPU backend, identical to `view_copy`: `clone_impl` reuses the existing `add_flat_copy` helper (`output[i] = input[i]`) and registers a handler under `aten.clone.default`. No new shader, generated WGSL header, or CMake source — it shares the `view_copy` flat-copy compute pipeline. Required for end-to-end Llama 3.2 1B (4-bit, KV cache): the exported model serializes 2 `aten.clone.default` ops into its runtime operator chain (the RoPE-frequency clones reused across all 16 transformer layers), so without a handler the partition graph-breaks at those nodes. Mirrors the Vulkan delegate, which registers the same op and routes a buffer clone to a flat view-copy. ghstack-source-id: 397534700 @exported-using-ghexport @diff-train-skip-merge Differential Revision: [D109477717](https://our.internmc.facebook.com/intern/diff/D109477717/) --- backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp index d56c721ce3e..67119472643 100644 --- a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp +++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp @@ -53,10 +53,17 @@ void view_copy_impl(WebGPUGraph& graph, const std::vector& args) { add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); } +// clone = flat copy; survives Vulkan RemoveRedundantOpsTransform in Llama 1B. +void clone_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, memory_format?, out]; out = last value-id. + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + } // namespace WEBGPU_REGISTER_OPERATORS { WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl); + WEBGPU_REGISTER_OP(aten.clone.default, clone_impl); } } // namespace executorch::backends::webgpu From a47bb14f1b4d2d6014cd01cc3c1293639c3bb2bb Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Sat, 27 Jun 2026 13:49:04 -0700 Subject: [PATCH 2/3] [ExecuTorch][WebGPU] Add aten.index.Tensor (1D-self gather) Pull Request resolved: https://github.com/pytorch/executorch/pull/20464 Adds the WebGPU delegate handler for aten.index.Tensor, the 1D-self advanced-index gather out[i] = self[index[i]] (output shape == index shape). This is the form the VulkanPartitioner delegates -- it requires a 1D self and exactly one non-None index (op_registry.py); 2D mask/freqs gathers stay on CPU. It mirrors the Vulkan delegate's index_tensor op (IndexTensor.cpp + index_tensor_buffer.glsl) as a single compute dispatch over the output elements, each reading the int32 index and gathering the corresponding fp32 self element. The op is composed as: - index.wgsl: one workgroup-strided pass, out[i] = self[u32(index[i])], guarded by a numel bound; buffer-only, fp32 self/out, int32 index, 1D dispatch via the shared WebGPUUtils helpers (clamp workgroup size + 1D count). - Index.cpp: validates the args (self/out tensors; indices ValueList with exactly one index tensor; fp32 self/out; int32 index; out numel == index numel), failing loud on any violation, then records the dispatch. row_width is dropped (always 1 for 1D self). ghstack-source-id: 397756251 @exported-using-ghexport @diff-train-skip-merge Differential Revision: [D109478967](https://our.internmc.facebook.com/intern/diff/D109478967/) --- backends/webgpu/CMakeLists.txt | 1 + backends/webgpu/runtime/ops/index/Index.cpp | 189 ++++++++++++++++++ backends/webgpu/runtime/ops/index/index.wgsl | 22 ++ .../webgpu/runtime/ops/index/index_wgsl.h | 46 +++++ 4 files changed, 258 insertions(+) create mode 100644 backends/webgpu/runtime/ops/index/Index.cpp create mode 100644 backends/webgpu/runtime/ops/index/index.wgsl create mode 100644 backends/webgpu/runtime/ops/index/index_wgsl.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 50aea6a469c..cbe36dade70 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -50,6 +50,7 @@ set(WEBGPU_SRCS runtime/ops/slice/Slice.cpp runtime/ops/permute/Permute.cpp runtime/ops/cat/Cat.cpp + runtime/ops/index/Index.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/ops/index/Index.cpp b/backends/webgpu/runtime/ops/index/Index.cpp new file mode 100644 index 00000000000..0c33d616c03 --- /dev/null +++ b/backends/webgpu/runtime/ops/index/Index.cpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct IndexParams { + uint32_t numel; + uint32_t _pad[3]; // pad to 16 bytes +}; + +// aten.index.Tensor 1D-self gather out[i]=self[index[i]] (mirrors Vulkan). +void index_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, indices (Tensor?[] -> ValueList), out]. + const int self_id = args.at(0); + const int list_id = args.at(1); + const int out_id = args.at(args.size() - 1); + + if (graph.get_value_type(self_id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("index: self arg is not a tensor"); + } + if (graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("index: out arg is not a tensor"); + } + if (graph.get_value_type(list_id) != WebGPUGraph::ValueType::ValueList) { + throw std::runtime_error("index: indices arg is not a ValueList"); + } + + // Exactly one non-Null index tensor (mirror Vulkan IndexTensor.cpp:67-69). + const std::vector& ids = graph.get_value_list(list_id); + int index_id = -1; + for (int id : ids) { + if (graph.get_value_type(id) == WebGPUGraph::ValueType::Null) { + continue; + } + if (graph.get_value_type(id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("index: index list element is not a tensor"); + } + if (index_id != -1) { + throw std::runtime_error("index: expected exactly one index tensor"); + } + index_id = id; + } + if (index_id == -1) { + throw std::runtime_error("index: no index tensor provided"); + } + + WGPUDevice device = graph.device(); + + const auto& self_tensor = graph.get_tensor(self_id); + const auto& index_tensor = graph.get_tensor(index_id); + const auto& out_tensor = graph.get_tensor(out_id); + + if (self_tensor.buffer == nullptr || index_tensor.buffer == nullptr || + out_tensor.buffer == nullptr) { + throw std::runtime_error("index: null buffer binding"); + } + // 1D-self gather: the kernel flat-indexes self by a scalar; fail loud on a + // higher-rank self (mirrors Vulkan index_tensor_buffer's 1D-self contract). + if (self_tensor.dims.size() != 1) { + throw std::runtime_error("index: only 1D self is supported"); + } + + const size_t out_numel = out_tensor.nbytes / sizeof(float); + if (out_tensor.nbytes != out_numel * sizeof(float) || + self_tensor.nbytes % sizeof(float) != 0) { + throw std::runtime_error("index: non-fp32 self/out (nbytes != numel * 4)"); + } + // Index is the int32 downcast of the int64 advanced index (downcast_64_bit). + const size_t index_numel = index_tensor.nbytes / sizeof(int32_t); + if (index_tensor.nbytes != index_numel * sizeof(int32_t)) { + throw std::runtime_error("index: index buffer is not int32 (nbytes % 4)"); + } + // out is one self element per index element (row_width == 1, 1D self). + if (out_numel != index_numel) { + throw std::runtime_error("index: out numel != index numel"); + } + + uint32_t num_elements = static_cast(out_numel); + uint32_t wg_size = utils::clamp_workgroup_size(device, kIndexWorkgroupSizeX); + uint32_t workgroup_count = + utils::compute_1d_workgroup_count(device, num_elements, wg_size, "index"); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + IndexParams params = {}; + params.numel = num_elements; + + WGPUBuffer uniform_buffer = + utils::make_uniform(device, ¶ms, sizeof(IndexParams)); + graph.add_uniform_buffer_bytes(sizeof(IndexParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kIndexWGSL, WGPU_STRLEN}; + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // self (read), out (read_write), index (read i32), params (uniform). + WGPUBindGroupLayoutEntry entries[4] = {}; + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 4; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[4] = {}; + bg_entries[0].binding = 0; + bg_entries[0].buffer = self_tensor.buffer; + bg_entries[0].size = self_tensor.nbytes; + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + bg_entries[2].binding = 2; + bg_entries[2].buffer = index_tensor.buffer; + bg_entries[2].size = index_tensor.nbytes; + bg_entries[3].binding = 3; + bg_entries[3].buffer = uniform_buffer; + bg_entries[3].size = sizeof(IndexParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 4; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // The bind group keeps the uniform buffer alive until release. + wgpuBufferRelease(uniform_buffer); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.index.Tensor, index_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/index/index.wgsl b/backends/webgpu/runtime/ops/index/index.wgsl new file mode 100644 index 00000000000..b0fd6df81bf --- /dev/null +++ b/backends/webgpu/runtime/ops/index/index.wgsl @@ -0,0 +1,22 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; +@group(0) @binding(2) var index: array; + +struct Params { + numel: u32, +} +@group(0) @binding(3) var params: Params; + +override wg_size: u32 = 64; + +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= params.numel) { + return; + } + + // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl). + let i = index[out_bufi]; + output[out_bufi] = input[u32(i)]; +} diff --git a/backends/webgpu/runtime/ops/index/index_wgsl.h b/backends/webgpu/runtime/ops/index/index_wgsl.h new file mode 100644 index 00000000000..839a3b164bb --- /dev/null +++ b/backends/webgpu/runtime/ops/index/index_wgsl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from index.wgsl - DO NOT EDIT. +// wgsl-sha256: daed48e60bfcf2b7420d277576d794137d3bff383aef4f68464c98c8a7235c8e +inline constexpr const char* kIndexWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; +@group(0) @binding(2) var index: array; + +struct Params { + numel: u32, +} +@group(0) @binding(3) var params: Params; + +override wg_size: u32 = 64; + +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= params.numel) { + return; + } + + // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl). + let i = index[out_bufi]; + output[out_bufi] = input[u32(i)]; +} +)"; + +inline constexpr uint32_t kIndexWorkgroupSizeX = 64; +inline constexpr uint32_t kIndexWorkgroupSizeY = 1; +inline constexpr uint32_t kIndexWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu From a598a84e8a0e1ab7185ceb9870e15677a1c0157a Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Sat, 27 Jun 2026 13:49:04 -0700 Subject: [PATCH 3/3] [ExecuTorch][WebGPU] aten.index.Tensor test suite (export + native golden) Pull Request resolved: https://github.com/pytorch/executorch/pull/20465 Adds the test suite for the aten.index.Tensor op (stacked on the op diff): - test/ops/index/test_index.py: exports a module computing x[idx] through VulkanPartitioner for four configs (reorder/repeat indices over distinct self values, so a wrong-gather is visible), asserts a VulkanBackend delegate with index.Tensor absorbed (not a CPU fallback), and writes per-config .pte + .self/.idx/.golden.bin. - test/native/test_index.cpp: a standalone Dawn binary that loads each .pte, feeds self (fp32) + index (int64 at the program boundary, narrowed to the int32 buffer) and compares the gather against the torch golden at 1e-3, with a single-output shape guard. - Wired into CMake (webgpu_index_test), test/TARGETS (python_unittest test_index), and the Dawn native CI script. ghstack-source-id: 397763261 @exported-using-ghexport @diff-train-skip-merge Differential Revision: [D109479000](https://our.internmc.facebook.com/intern/diff/D109479000/) --- backends/webgpu/CMakeLists.txt | 1 + .../webgpu/scripts/test_webgpu_native_ci.sh | 12 +- backends/webgpu/test/TARGETS | 13 ++ backends/webgpu/test/native/test_index.cpp | 174 ++++++++++++++++++ backends/webgpu/test/ops/index/__init__.py | 0 backends/webgpu/test/ops/index/test_index.py | 109 +++++++++++ 6 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 backends/webgpu/test/native/test_index.cpp create mode 100644 backends/webgpu/test/ops/index/__init__.py create mode 100644 backends/webgpu/test/ops/index/test_index.py diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index cbe36dade70..c3d9387942a 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -194,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST) target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions) set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17) endif() + add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp) endif() diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index b2eb56a505e..38195535732 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order" DISPATCH_ORDER_OK=1 UPDATE_CACHE_DIR="/tmp/update_cache" UPDATE_CACHE_OK=1 +INDEX_DIR="/tmp/index" +INDEX_OK=1 EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte" EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin" EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin" @@ -104,6 +106,11 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}') export_update_cache_negative('${UPDATE_CACHE_DIR}') " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; } +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models +export_all_index_models('${INDEX_DIR}') +" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; } + # Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in # webgpu_native_test below (precise per-config error), so don't exit/mask here. $PYTHON_EXECUTABLE -c " @@ -136,7 +143,7 @@ cmake \ "${EXECUTORCH_ROOT}" # ── Build + run every native test target that exists in this tree ──────────── -TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test) +TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test) BIN_DIR="${BUILD_DIR}/backends/webgpu" # Which targets are defined depends on which diffs are landed (native_test + @@ -201,6 +208,9 @@ fi if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}" fi +if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then + "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}" +fi [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test" echo "=== WebGPU native tests on Dawn: all run targets passed ===" diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS index 6bf76a525e9..d4f7046f0bd 100644 --- a/backends/webgpu/test/TARGETS +++ b/backends/webgpu/test/TARGETS @@ -17,6 +17,19 @@ python_unittest( ], ) +python_unittest( + name = "test_index", + srcs = [ + "ops/index/test_index.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan/partitioner:vulkan_partitioner", + "//executorch/backends/vulkan:vulkan_preprocess", + "//executorch/exir:lib", + ], +) + runtime.python_library( name = "tester", srcs = ["tester.py"], diff --git a/backends/webgpu/test/native/test_index.cpp b/backends/webgpu/test/native/test_index.cpp new file mode 100644 index 00000000000..aed24c0a796 --- /dev/null +++ b/backends/webgpu/test/native/test_index.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::webgpu; +using namespace executorch::extension; +using namespace executorch::runtime; + +namespace { + +// Names mirror test_index.py CONFIGS (self/idx/golden bins written per case). +constexpr const char* kIndexCases[] = { + "index_n16_m5", + "index_n8_rev", + "index_n32_m3", + "index_n4_rep", +}; + +std::vector read_f32_bin(const std::string& path) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + return {}; + } + const size_t bytes = + static_cast(f.tellg()) / sizeof(float) * sizeof(float); + f.seekg(0); + std::vector data(bytes / sizeof(float)); + f.read( + reinterpret_cast(data.data()), + static_cast(bytes)); + return data; +} + +std::vector read_i32_bin(const std::string& path) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + return {}; + } + const size_t bytes = + static_cast(f.tellg()) / sizeof(int32_t) * sizeof(int32_t); + f.seekg(0); + std::vector data(bytes / sizeof(int32_t)); + f.read( + reinterpret_cast(data.data()), + static_cast(bytes)); + return data; +} + +bool run_case(const std::string& dir, const char* name) { + printf("\n--- Test: %s ---\n", name); + const std::string base = dir + "/" + name; + std::vector self_data = read_f32_bin(base + ".self.bin"); + std::vector idx32 = read_i32_bin(base + ".idx.bin"); + std::vector golden = read_f32_bin(base + ".golden.bin"); + if (self_data.empty() || idx32.empty() || golden.empty()) { + printf("FAIL: could not read self/idx/golden for %s\n", name); + return false; + } + + Module module(base + ".pte"); + if (module.load_forward() != Error::Ok) { + printf("FAIL: could not load %s.pte\n", name); + return false; + } + + const int32_t n = static_cast(self_data.size()); + const int32_t m = static_cast(idx32.size()); + auto x = make_tensor_ptr({n}, std::vector(self_data)); + // int64 at the program boundary; copy_inputs narrows to the int32 buffer. + std::vector idx64(idx32.begin(), idx32.end()); + auto idx = make_tensor_ptr({m}, std::vector(idx64)); + + auto result = module.forward({EValue(x), EValue(idx)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + // index.Tensor has exactly one output of shape [num_indices]; fail loud else. + if (outputs.size() != 1 || !outputs[0].isTensor()) { + printf("FAIL: expected exactly one tensor output\n"); + return false; + } + const auto& out_tensor = outputs[0].toTensor(); + if (out_tensor.dim() != 1 || out_tensor.size(0) != m) { + printf( + "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n", + (int)out_tensor.dim(), + (int)(out_tensor.dim() == 1 ? out_tensor.size(0) : -1), + m); + return false; + } + if (static_cast(out_tensor.numel()) != golden.size()) { + printf( + "FAIL: output numel %zu != golden %zu\n", + (size_t)out_tensor.numel(), + golden.size()); + return false; + } + const float* out_data = out_tensor.const_data_ptr(); + + float max_abs_err = 0.0f; + float max_rel_err = 0.0f; + for (size_t i = 0; i < golden.size(); i++) { + const float abs_err = std::abs(out_data[i] - golden[i]); + max_abs_err = std::max(max_abs_err, abs_err); + const float denom = std::max(std::abs(golden[i]), 1e-6f); + max_rel_err = std::max(max_rel_err, abs_err / denom); + } + printf( + "Max abs error: %e Max rel error: %e (%zu elements)\n", + max_abs_err, + max_rel_err, + golden.size()); + if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) { + printf("FAIL: %s exceeds tolerance 1e-3\n", name); + return false; + } + printf("PASS: %s\n", name); + return true; +} + +} // namespace + +int main(int argc, char** argv) { + std::string dir = "/tmp/index"; + if (argc > 1) { + dir = argv[1]; + } + if (const char* env = std::getenv("WEBGPU_INDEX_DIR")) { + dir = env; + } + + WebGPUContext ctx; + try { + ctx = create_webgpu_context(); + } catch (const std::exception& e) { + printf("SKIP: %s\n", e.what()); + return 0; + } + set_default_webgpu_context(&ctx); + printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str()); + + bool ok = true; + for (const char* name : kIndexCases) { + ok = run_case(dir, name) && ok; + } + + set_default_webgpu_context(nullptr); + destroy_webgpu_context(ctx); + + if (!ok) { + return 1; + } + printf("\nAll index tests passed\n"); + return 0; +} diff --git a/backends/webgpu/test/ops/index/__init__.py b/backends/webgpu/test/ops/index/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backends/webgpu/test/ops/index/test_index.py b/backends/webgpu/test/ops/index/test_index.py new file mode 100644 index 00000000000..b5e83dcf329 --- /dev/null +++ b/backends/webgpu/test/ops/index/test_index.py @@ -0,0 +1,109 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.index.Tensor` export + goldens for the WebGPU backend. + +Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner -- +the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see +op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index +serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat +indices make a wrong-gather bug visible. Each config writes `index_.pte`, +`index_.self.bin` (fp32 self), `index_.idx.bin` (int32 index), and +`index_.golden.bin` so the native `test_index` self-discovers them. +""" + +import os +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (self_len, index_values) +CONFIGS = { + "n16_m5": (16, [0, 15, 7, 7, 2]), + "n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]), + "n32_m3": (32, [31, 0, 16]), + "n4_rep": (4, [2, 2, 2, 2, 0, 1]), +} + + +class IndexModule(torch.nn.Module): + def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: + return x[idx] + + +def _inputs(self_len, index_values): + # Distinct self values so a wrong-index gather is visible. + x = torch.arange(self_len, dtype=torch.float32) * 3.0 + 0.5 + idx = torch.tensor(index_values, dtype=torch.int64) + return x, idx + + +def _lower(x, idx): + ep = torch.export.export(IndexModule().eval(), (x, idx)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _export(x, idx): + return _lower(x, idx).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a top-level CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class TestIndex(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (n, iv) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(*_inputs(n, iv)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (index {name})", + ) + self.assertTrue( + _op_delegated(edge, "index.Tensor"), + f"index.Tensor not delegated (fell back to CPU) for {name}", + ) + + def test_golden_matches_eager(self) -> None: + for name, (n, iv) in CONFIGS.items(): + with self.subTest(name=name): + x, idx = _inputs(n, iv) + torch.testing.assert_close(IndexModule()(x, idx), x[idx]) + + +def export_all_index_models(out_dir: str) -> None: + """Write index_.pte + .self/.idx/.golden.bin for every config.""" + os.makedirs(out_dir, exist_ok=True) + for name, (n, iv) in CONFIGS.items(): + x, idx = _inputs(n, iv) + golden = x[idx].contiguous().detach().numpy().astype(" golden {golden.size} floats") + + +if __name__ == "__main__": + unittest.main()