diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 50aea6a469c..c3d9387942a 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -50,6 +50,7 @@ set(WEBGPU_SRCS runtime/ops/slice/Slice.cpp runtime/ops/permute/Permute.cpp runtime/ops/cat/Cat.cpp + runtime/ops/index/Index.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) @@ -193,4 +194,5 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST) target_compile_options(webgpu_op_test_util_test PRIVATE -fexceptions) set_property(TARGET webgpu_op_test_util_test PROPERTY CXX_STANDARD 17) endif() + add_webgpu_native_test(webgpu_index_test test/native/test_index.cpp) endif() diff --git a/backends/webgpu/README.md b/backends/webgpu/README.md index 8c3928c5f49..a0711d862d5 100644 --- a/backends/webgpu/README.md +++ b/backends/webgpu/README.md @@ -140,11 +140,10 @@ backends/webgpu/ ├── test_webgpu_native.cpp # C++ native test runner ├── test_wgsl_codegen.py # Shader codegen check ├── native/ # C++ operator tests - └── ops/ # Python export tests - ├── add/ - │ └── test_add.py # add export tests - └── rms_norm/ - └── test_rms_norm.py # rms_norm export tests + └── ops/ # Python op test suites (flat: test_.py) + ├── test_add.py + ├── test_rms_norm.py + └── ... # one test_.py per op ``` ## Requirements diff --git a/backends/webgpu/runtime/ops/index/Index.cpp b/backends/webgpu/runtime/ops/index/Index.cpp new file mode 100644 index 00000000000..0c33d616c03 --- /dev/null +++ b/backends/webgpu/runtime/ops/index/Index.cpp @@ -0,0 +1,189 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct IndexParams { + uint32_t numel; + uint32_t _pad[3]; // pad to 16 bytes +}; + +// aten.index.Tensor 1D-self gather out[i]=self[index[i]] (mirrors Vulkan). +void index_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, indices (Tensor?[] -> ValueList), out]. + const int self_id = args.at(0); + const int list_id = args.at(1); + const int out_id = args.at(args.size() - 1); + + if (graph.get_value_type(self_id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("index: self arg is not a tensor"); + } + if (graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("index: out arg is not a tensor"); + } + if (graph.get_value_type(list_id) != WebGPUGraph::ValueType::ValueList) { + throw std::runtime_error("index: indices arg is not a ValueList"); + } + + // Exactly one non-Null index tensor (mirror Vulkan IndexTensor.cpp:67-69). + const std::vector& ids = graph.get_value_list(list_id); + int index_id = -1; + for (int id : ids) { + if (graph.get_value_type(id) == WebGPUGraph::ValueType::Null) { + continue; + } + if (graph.get_value_type(id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("index: index list element is not a tensor"); + } + if (index_id != -1) { + throw std::runtime_error("index: expected exactly one index tensor"); + } + index_id = id; + } + if (index_id == -1) { + throw std::runtime_error("index: no index tensor provided"); + } + + WGPUDevice device = graph.device(); + + const auto& self_tensor = graph.get_tensor(self_id); + const auto& index_tensor = graph.get_tensor(index_id); + const auto& out_tensor = graph.get_tensor(out_id); + + if (self_tensor.buffer == nullptr || index_tensor.buffer == nullptr || + out_tensor.buffer == nullptr) { + throw std::runtime_error("index: null buffer binding"); + } + // 1D-self gather: the kernel flat-indexes self by a scalar; fail loud on a + // higher-rank self (mirrors Vulkan index_tensor_buffer's 1D-self contract). + if (self_tensor.dims.size() != 1) { + throw std::runtime_error("index: only 1D self is supported"); + } + + const size_t out_numel = out_tensor.nbytes / sizeof(float); + if (out_tensor.nbytes != out_numel * sizeof(float) || + self_tensor.nbytes % sizeof(float) != 0) { + throw std::runtime_error("index: non-fp32 self/out (nbytes != numel * 4)"); + } + // Index is the int32 downcast of the int64 advanced index (downcast_64_bit). + const size_t index_numel = index_tensor.nbytes / sizeof(int32_t); + if (index_tensor.nbytes != index_numel * sizeof(int32_t)) { + throw std::runtime_error("index: index buffer is not int32 (nbytes % 4)"); + } + // out is one self element per index element (row_width == 1, 1D self). + if (out_numel != index_numel) { + throw std::runtime_error("index: out numel != index numel"); + } + + uint32_t num_elements = static_cast(out_numel); + uint32_t wg_size = utils::clamp_workgroup_size(device, kIndexWorkgroupSizeX); + uint32_t workgroup_count = + utils::compute_1d_workgroup_count(device, num_elements, wg_size, "index"); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + IndexParams params = {}; + params.numel = num_elements; + + WGPUBuffer uniform_buffer = + utils::make_uniform(device, ¶ms, sizeof(IndexParams)); + graph.add_uniform_buffer_bytes(sizeof(IndexParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kIndexWGSL, WGPU_STRLEN}; + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // self (read), out (read_write), index (read i32), params (uniform). + WGPUBindGroupLayoutEntry entries[4] = {}; + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 4; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[4] = {}; + bg_entries[0].binding = 0; + bg_entries[0].buffer = self_tensor.buffer; + bg_entries[0].size = self_tensor.nbytes; + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + bg_entries[2].binding = 2; + bg_entries[2].buffer = index_tensor.buffer; + bg_entries[2].size = index_tensor.nbytes; + bg_entries[3].binding = 3; + bg_entries[3].buffer = uniform_buffer; + bg_entries[3].size = sizeof(IndexParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 4; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // The bind group keeps the uniform buffer alive until release. + wgpuBufferRelease(uniform_buffer); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.index.Tensor, index_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/index/index.wgsl b/backends/webgpu/runtime/ops/index/index.wgsl new file mode 100644 index 00000000000..b0fd6df81bf --- /dev/null +++ b/backends/webgpu/runtime/ops/index/index.wgsl @@ -0,0 +1,22 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; +@group(0) @binding(2) var index: array; + +struct Params { + numel: u32, +} +@group(0) @binding(3) var params: Params; + +override wg_size: u32 = 64; + +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= params.numel) { + return; + } + + // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl). + let i = index[out_bufi]; + output[out_bufi] = input[u32(i)]; +} diff --git a/backends/webgpu/runtime/ops/index/index_wgsl.h b/backends/webgpu/runtime/ops/index/index_wgsl.h new file mode 100644 index 00000000000..839a3b164bb --- /dev/null +++ b/backends/webgpu/runtime/ops/index/index_wgsl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from index.wgsl - DO NOT EDIT. +// wgsl-sha256: daed48e60bfcf2b7420d277576d794137d3bff383aef4f68464c98c8a7235c8e +inline constexpr const char* kIndexWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; +@group(0) @binding(2) var index: array; + +struct Params { + numel: u32, +} +@group(0) @binding(3) var params: Params; + +override wg_size: u32 = 64; + +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= params.numel) { + return; + } + + // 1D-self gather out[i]=self[index[i]] (mirrors Vulkan index_tensor_buffer.glsl). + let i = index[out_bufi]; + output[out_bufi] = input[u32(i)]; +} +)"; + +inline constexpr uint32_t kIndexWorkgroupSizeX = 64; +inline constexpr uint32_t kIndexWorkgroupSizeY = 1; +inline constexpr uint32_t kIndexWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp index d56c721ce3e..67119472643 100644 --- a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp +++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp @@ -53,10 +53,17 @@ void view_copy_impl(WebGPUGraph& graph, const std::vector& args) { add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); } +// clone = flat copy; survives Vulkan RemoveRedundantOpsTransform in Llama 1B. +void clone_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, memory_format?, out]; out = last value-id. + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + } // namespace WEBGPU_REGISTER_OPERATORS { WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl); + WEBGPU_REGISTER_OP(aten.clone.default, clone_impl); } } // namespace executorch::backends::webgpu diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index 84b5349ef2d..38195535732 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -45,6 +45,8 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order" DISPATCH_ORDER_OK=1 UPDATE_CACHE_DIR="/tmp/update_cache" UPDATE_CACHE_OK=1 +INDEX_DIR="/tmp/index" +INDEX_OK=1 EMBEDDING_MODEL="/tmp/webgpu_embedding_q4gsw.pte" EMBEDDING_INDICES="/tmp/webgpu_embedding_q4gsw_indices.bin" EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin" @@ -65,36 +67,36 @@ PREPACK_TIED_MODEL="/tmp/webgpu_prepack_tied_const.pte" PREPACK_TIED_GOLDEN="/tmp/webgpu_prepack_tied_const_golden.bin" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models +from executorch.backends.webgpu.test.ops.test_quantized_linear import export_all_quantized_linear_models export_all_quantized_linear_models('/tmp') " || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.embedding_q4gsw.test_embedding_q4gsw import export_embedding_q4gsw_model +from executorch.backends.webgpu.test.ops.test_embedding_q4gsw import export_embedding_q4gsw_model export_embedding_q4gsw_model('${EMBEDDING_MODEL}', '${EMBEDDING_GOLDEN}', '${EMBEDDING_INDICES}') export_embedding_q4gsw_model('${EMBEDDING_LLAMA1B_MODEL}', '${EMBEDDING_LLAMA1B_GOLDEN}', '${EMBEDDING_LLAMA1B_INDICES}', 'llama1b') " || echo "WARN: embedding_q4gsw export failed; embedding configs will FAIL in webgpu_native_test" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.rope.test_rope import export_rope_model +from executorch.backends.webgpu.test.ops.test_rope import export_rope_model export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}') export_rope_model('${ROPE_DECODE_MODEL}', '${ROPE_DECODE_XQ_GOLDEN}', '${ROPE_DECODE_XK_GOLDEN}', 'decode') " || echo "WARN: rope export failed; apply_rotary_emb configs will FAIL in webgpu_native_test" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model +from executorch.backends.webgpu.test.ops.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}') export_prepack_two_const_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}') export_prepack_tied_const_model('${PREPACK_TIED_MODEL}', '${PREPACK_TIED_GOLDEN}') " || echo "WARN: prepack export failed; prepack configs will FAIL in webgpu_native_test" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases +from executorch.backends.webgpu.test.ops.test_dispatch_order import export_dispatch_order_cases export_dispatch_order_cases('${DISPATCH_ORDER_DIR}') " || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; } $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.sdpa.test_update_cache import ( +from executorch.backends.webgpu.test.ops.test_update_cache import ( export_update_cache_cases, export_update_cache_replay, export_update_cache_negative, @@ -104,10 +106,15 @@ export_update_cache_replay('${UPDATE_CACHE_DIR}') export_update_cache_negative('${UPDATE_CACHE_DIR}') " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; } +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.index.test_index import export_all_index_models +export_all_index_models('${INDEX_DIR}') +" || { echo "WARN: index export failed; skipping index native test"; INDEX_OK=0; } + # Non-fatal: a failed sdpa export makes the required 4k/8k configs hard-fail in # webgpu_native_test below (precise per-config error), so don't exit/mask here. $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import ( +from executorch.backends.webgpu.test.ops.test_sdpa import ( export_all_sdpa_models, export_replay_sequences, export_dynamic_decode, @@ -136,7 +143,7 @@ cmake \ "${EXECUTORCH_ROOT}" # ── Build + run every native test target that exists in this tree ──────────── -TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test) +TARGETS=(webgpu_native_test webgpu_dispatch_order_test webgpu_scratch_buffer_test webgpu_update_cache_test webgpu_index_test) BIN_DIR="${BUILD_DIR}/backends/webgpu" # Which targets are defined depends on which diffs are landed (native_test + @@ -201,6 +208,9 @@ fi if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}" fi +if [[ "${INDEX_OK}" == "1" && -x "${BIN_DIR}/webgpu_index_test" ]]; then + "${BIN_DIR}/webgpu_index_test" "${INDEX_DIR}" +fi [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test" echo "=== WebGPU native tests on Dawn: all run targets passed ===" diff --git a/backends/webgpu/test/TARGETS b/backends/webgpu/test/TARGETS index 9008f32cd2c..d4f7046f0bd 100644 --- a/backends/webgpu/test/TARGETS +++ b/backends/webgpu/test/TARGETS @@ -7,7 +7,20 @@ oncall("executorch") python_unittest( name = "test_add", srcs = [ - "ops/add/test_add.py", + "ops/test_add.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan/partitioner:vulkan_partitioner", + "//executorch/backends/vulkan:vulkan_preprocess", + "//executorch/exir:lib", + ], +) + +python_unittest( + name = "test_index", + srcs = [ + "ops/index/test_index.py", ], deps = [ "//caffe2:torch", diff --git a/backends/webgpu/test/native/test_index.cpp b/backends/webgpu/test/native/test_index.cpp new file mode 100644 index 00000000000..aed24c0a796 --- /dev/null +++ b/backends/webgpu/test/native/test_index.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::webgpu; +using namespace executorch::extension; +using namespace executorch::runtime; + +namespace { + +// Names mirror test_index.py CONFIGS (self/idx/golden bins written per case). +constexpr const char* kIndexCases[] = { + "index_n16_m5", + "index_n8_rev", + "index_n32_m3", + "index_n4_rep", +}; + +std::vector read_f32_bin(const std::string& path) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + return {}; + } + const size_t bytes = + static_cast(f.tellg()) / sizeof(float) * sizeof(float); + f.seekg(0); + std::vector data(bytes / sizeof(float)); + f.read( + reinterpret_cast(data.data()), + static_cast(bytes)); + return data; +} + +std::vector read_i32_bin(const std::string& path) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + return {}; + } + const size_t bytes = + static_cast(f.tellg()) / sizeof(int32_t) * sizeof(int32_t); + f.seekg(0); + std::vector data(bytes / sizeof(int32_t)); + f.read( + reinterpret_cast(data.data()), + static_cast(bytes)); + return data; +} + +bool run_case(const std::string& dir, const char* name) { + printf("\n--- Test: %s ---\n", name); + const std::string base = dir + "/" + name; + std::vector self_data = read_f32_bin(base + ".self.bin"); + std::vector idx32 = read_i32_bin(base + ".idx.bin"); + std::vector golden = read_f32_bin(base + ".golden.bin"); + if (self_data.empty() || idx32.empty() || golden.empty()) { + printf("FAIL: could not read self/idx/golden for %s\n", name); + return false; + } + + Module module(base + ".pte"); + if (module.load_forward() != Error::Ok) { + printf("FAIL: could not load %s.pte\n", name); + return false; + } + + const int32_t n = static_cast(self_data.size()); + const int32_t m = static_cast(idx32.size()); + auto x = make_tensor_ptr({n}, std::vector(self_data)); + // int64 at the program boundary; copy_inputs narrows to the int32 buffer. + std::vector idx64(idx32.begin(), idx32.end()); + auto idx = make_tensor_ptr({m}, std::vector(idx64)); + + auto result = module.forward({EValue(x), EValue(idx)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + // index.Tensor has exactly one output of shape [num_indices]; fail loud else. + if (outputs.size() != 1 || !outputs[0].isTensor()) { + printf("FAIL: expected exactly one tensor output\n"); + return false; + } + const auto& out_tensor = outputs[0].toTensor(); + if (out_tensor.dim() != 1 || out_tensor.size(0) != m) { + printf( + "FAIL: output shape mismatch (dim %d size0 %d, expected [%d])\n", + (int)out_tensor.dim(), + (int)(out_tensor.dim() == 1 ? out_tensor.size(0) : -1), + m); + return false; + } + if (static_cast(out_tensor.numel()) != golden.size()) { + printf( + "FAIL: output numel %zu != golden %zu\n", + (size_t)out_tensor.numel(), + golden.size()); + return false; + } + const float* out_data = out_tensor.const_data_ptr(); + + float max_abs_err = 0.0f; + float max_rel_err = 0.0f; + for (size_t i = 0; i < golden.size(); i++) { + const float abs_err = std::abs(out_data[i] - golden[i]); + max_abs_err = std::max(max_abs_err, abs_err); + const float denom = std::max(std::abs(golden[i]), 1e-6f); + max_rel_err = std::max(max_rel_err, abs_err / denom); + } + printf( + "Max abs error: %e Max rel error: %e (%zu elements)\n", + max_abs_err, + max_rel_err, + golden.size()); + if (max_abs_err > 1e-3f || max_rel_err > 1e-3f) { + printf("FAIL: %s exceeds tolerance 1e-3\n", name); + return false; + } + printf("PASS: %s\n", name); + return true; +} + +} // namespace + +int main(int argc, char** argv) { + std::string dir = "/tmp/index"; + if (argc > 1) { + dir = argv[1]; + } + if (const char* env = std::getenv("WEBGPU_INDEX_DIR")) { + dir = env; + } + + WebGPUContext ctx; + try { + ctx = create_webgpu_context(); + } catch (const std::exception& e) { + printf("SKIP: %s\n", e.what()); + return 0; + } + set_default_webgpu_context(&ctx); + printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str()); + + bool ok = true; + for (const char* name : kIndexCases) { + ok = run_case(dir, name) && ok; + } + + set_default_webgpu_context(nullptr); + destroy_webgpu_context(ctx); + + if (!ok) { + return 1; + } + printf("\nAll index tests passed\n"); + return 0; +} diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index f1ab0c6cca4..1b6b0fa7085 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -25,16 +25,14 @@ WebGPUTestSuite, XS, ) -from executorch.backends.webgpu.test.ops.add.test_add import ( +from executorch.backends.webgpu.test.ops.test_add import ( AddChainedModule, AddModule, AddSelfModule, ) -from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import ( - _CASES, - _linspace_weight, - _ramp, - RmsNormModule, +from executorch.backends.webgpu.test.ops.test_cat import ( + CatModule, + CONFIGS as _CAT_CONFIGS, ) from executorch.backends.webgpu.test.ops.test_mul import ( CONFIGS as _MUL_CONFIGS, @@ -44,6 +42,12 @@ CONFIGS as _PERMUTE_CONFIGS, PermuteModule, ) +from executorch.backends.webgpu.test.ops.test_rms_norm import ( + _CASES, + _linspace_weight, + _ramp, + RmsNormModule, +) from executorch.backends.webgpu.test.ops.test_select import ( CONFIGS as _SELECT_CONFIGS, SelectModule, @@ -88,7 +92,7 @@ def _add_factory(variant: str = "regular") -> torch.nn.Module: @register_op_test("add") def _add_suite() -> WebGPUTestSuite: # Same-shape numeric coverage only: broadcast adds stay export-smoke in - # ops/add/test_add.py because the kernel can't broadcast. + # ops/test_add.py because the kernel can't broadcast. return WebGPUTestSuite( module_factory=_add_factory, cases=[ @@ -247,3 +251,16 @@ def _permute_suite() -> WebGPUTestSuite: ], golden_dtype="float32", # permutation reorders values; fp64 bit-identical ) + + +@register_op_test("cat") +def _cat_suite() -> WebGPUTestSuite: + # CONFIGS: name -> (list_of_input_shapes, dim). Variadic input count per case. + return WebGPUTestSuite( + module_factory=lambda dim: CatModule(dim), + cases=[ + Case(name=n, construct={"dim": dim}, inputs=tuple(shapes)) + for n, (shapes, dim) in _CAT_CONFIGS.items() + ], + golden_dtype="float32", # concatenation copies values; fp64 bit-identical + ) diff --git a/backends/webgpu/test/ops/embedding_q4gsw/__init__.py b/backends/webgpu/test/ops/embedding_q4gsw/__init__.py deleted file mode 100644 index 2e41cd717f6..00000000000 --- a/backends/webgpu/test/ops/embedding_q4gsw/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/dispatch_order/__init__.py b/backends/webgpu/test/ops/index/__init__.py similarity index 100% rename from backends/webgpu/test/ops/dispatch_order/__init__.py rename to backends/webgpu/test/ops/index/__init__.py diff --git a/backends/webgpu/test/ops/index/test_index.py b/backends/webgpu/test/ops/index/test_index.py new file mode 100644 index 00000000000..b5e83dcf329 --- /dev/null +++ b/backends/webgpu/test/ops/index/test_index.py @@ -0,0 +1,109 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.index.Tensor` export + goldens for the WebGPU backend. + +Exports the 1D-self advanced-index form `self[idx]` through VulkanPartitioner -- +the only delegated index.Tensor (the 2D mask/freqs gathers are CPU fallbacks; see +op_registry.py:1427). It is a flat gather out[i]=self[index[i]]; the int64 index +serializes as int32 (downcast_64_bit). Distinct self values + reorder/repeat +indices make a wrong-gather bug visible. Each config writes `index_.pte`, +`index_.self.bin` (fp32 self), `index_.idx.bin` (int32 index), and +`index_.golden.bin` so the native `test_index` self-discovers them. +""" + +import os +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (self_len, index_values) +CONFIGS = { + "n16_m5": (16, [0, 15, 7, 7, 2]), + "n8_rev": (8, [7, 6, 5, 4, 3, 2, 1, 0]), + "n32_m3": (32, [31, 0, 16]), + "n4_rep": (4, [2, 2, 2, 2, 0, 1]), +} + + +class IndexModule(torch.nn.Module): + def forward(self, x: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: + return x[idx] + + +def _inputs(self_len, index_values): + # Distinct self values so a wrong-index gather is visible. + x = torch.arange(self_len, dtype=torch.float32) * 3.0 + 0.5 + idx = torch.tensor(index_values, dtype=torch.int64) + return x, idx + + +def _lower(x, idx): + ep = torch.export.export(IndexModule().eval(), (x, idx)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _export(x, idx): + return _lower(x, idx).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a top-level CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class TestIndex(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (n, iv) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(*_inputs(n, iv)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (index {name})", + ) + self.assertTrue( + _op_delegated(edge, "index.Tensor"), + f"index.Tensor not delegated (fell back to CPU) for {name}", + ) + + def test_golden_matches_eager(self) -> None: + for name, (n, iv) in CONFIGS.items(): + with self.subTest(name=name): + x, idx = _inputs(n, iv) + torch.testing.assert_close(IndexModule()(x, idx), x[idx]) + + +def export_all_index_models(out_dir: str) -> None: + """Write index_.pte + .self/.idx/.golden.bin for every config.""" + os.makedirs(out_dir, exist_ok=True) + for name, (n, iv) in CONFIGS.items(): + x, idx = _inputs(n, iv) + golden = x[idx].contiguous().detach().numpy().astype(" golden {golden.size} floats") + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/webgpu/test/ops/prepack/__init__.py b/backends/webgpu/test/ops/prepack/__init__.py deleted file mode 100644 index 2e41cd717f6..00000000000 --- a/backends/webgpu/test/ops/prepack/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/quantized_linear/__init__.py b/backends/webgpu/test/ops/quantized_linear/__init__.py deleted file mode 100644 index 2e41cd717f6..00000000000 --- a/backends/webgpu/test/ops/quantized_linear/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/rms_norm/__init__.py b/backends/webgpu/test/ops/rms_norm/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/backends/webgpu/test/ops/rope/__init__.py b/backends/webgpu/test/ops/rope/__init__.py deleted file mode 100644 index 2e41cd717f6..00000000000 --- a/backends/webgpu/test/ops/rope/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/sdpa/__init__.py b/backends/webgpu/test/ops/sdpa/__init__.py deleted file mode 100644 index 2e41cd717f6..00000000000 --- a/backends/webgpu/test/ops/sdpa/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/test_add.py similarity index 100% rename from backends/webgpu/test/ops/add/test_add.py rename to backends/webgpu/test/ops/test_add.py diff --git a/backends/webgpu/test/ops/test_cat.py b/backends/webgpu/test/ops/test_cat.py new file mode 100644 index 00000000000..d0d714fd934 --- /dev/null +++ b/backends/webgpu/test/ops/test_cat.py @@ -0,0 +1,82 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.cat.default` module + configs for the WebGPU op-test framework. + +`CatModule` + `CONFIGS` are imported by `cases.py` to drive the declarative op-test +suite. `CatTest` is the export-delegation smoke test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (list_of_input_shapes, dim) +CONFIGS = { + "dim0_3": ([(2, 3), (2, 3), (2, 3)], 0), + "dim1_2": ([(1, 4, 8), (1, 4, 8)], 1), + "dim2_3": ([(2, 3, 4), (2, 3, 4), (2, 3, 4)], 2), + "uneven": ([(2, 1, 4), (2, 3, 4), (2, 2, 4)], 1), +} + + +class CatModule(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, *xs: torch.Tensor) -> torch.Tensor: + return torch.cat(xs, self.dim) + + +def _det_inputs(shapes): + # Distinct value range per input so a cross-contamination bug is visible. + inputs = [] + base = 0.0 + for sh in shapes: + n = 1 + for s in sh: + n *= s + inputs.append(torch.arange(base, base + n, dtype=torch.float32).reshape(sh)) + base += 1000.0 + return tuple(inputs) + + +def _lower(dim, xs): + ep = torch.export.export(CatModule(dim).eval(), xs) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a top-level CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class CatTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shapes, dim) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(dim, _det_inputs(shapes)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), f"Expected a VulkanBackend delegate (cat {name})" + ) + self.assertTrue( + _op_delegated(edge, "cat"), + f"cat not delegated (fell back to CPU) for {name}", + ) diff --git a/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py b/backends/webgpu/test/ops/test_dispatch_order.py similarity index 97% rename from backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py rename to backends/webgpu/test/ops/test_dispatch_order.py index fbb13ff6426..08454ef0a31 100644 --- a/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py +++ b/backends/webgpu/test/ops/test_dispatch_order.py @@ -17,7 +17,7 @@ import torch from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner -from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import RmsNormModule +from executorch.backends.webgpu.test.ops.test_rms_norm import RmsNormModule from executorch.backends.webgpu.test.tester import WEBGPU_SUPPORTED_OPS from executorch.exir import to_edge_transform_and_lower diff --git a/backends/webgpu/test/ops/embedding_q4gsw/test_embedding_q4gsw.py b/backends/webgpu/test/ops/test_embedding_q4gsw.py similarity index 100% rename from backends/webgpu/test/ops/embedding_q4gsw/test_embedding_q4gsw.py rename to backends/webgpu/test/ops/test_embedding_q4gsw.py diff --git a/backends/webgpu/test/ops/prepack/test_prepack.py b/backends/webgpu/test/ops/test_prepack.py similarity index 100% rename from backends/webgpu/test/ops/prepack/test_prepack.py rename to backends/webgpu/test/ops/test_prepack.py diff --git a/backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py b/backends/webgpu/test/ops/test_quantized_linear.py similarity index 100% rename from backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py rename to backends/webgpu/test/ops/test_quantized_linear.py diff --git a/backends/webgpu/test/ops/rms_norm/test_rms_norm.py b/backends/webgpu/test/ops/test_rms_norm.py similarity index 100% rename from backends/webgpu/test/ops/rms_norm/test_rms_norm.py rename to backends/webgpu/test/ops/test_rms_norm.py diff --git a/backends/webgpu/test/ops/rope/test_rope.py b/backends/webgpu/test/ops/test_rope.py similarity index 100% rename from backends/webgpu/test/ops/rope/test_rope.py rename to backends/webgpu/test/ops/test_rope.py diff --git a/backends/webgpu/test/ops/sdpa/test_sdpa.py b/backends/webgpu/test/ops/test_sdpa.py similarity index 100% rename from backends/webgpu/test/ops/sdpa/test_sdpa.py rename to backends/webgpu/test/ops/test_sdpa.py diff --git a/backends/webgpu/test/ops/sdpa/test_update_cache.py b/backends/webgpu/test/ops/test_update_cache.py similarity index 100% rename from backends/webgpu/test/ops/sdpa/test_update_cache.py rename to backends/webgpu/test/ops/test_update_cache.py diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh index 5ea465e853b..2fd1dea1a52 100755 --- a/backends/webgpu/test/test_build_webgpu.sh +++ b/backends/webgpu/test/test_build_webgpu.sh @@ -25,8 +25,8 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/test_wgsl_codegen.py" -v # ── Step 1: Python export tests ────────────────────────────────────────────── echo "=== Step 1: Run Python export tests ===" -$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v -$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/rms_norm/test_rms_norm.py" -v +$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/test_add.py" -v +$PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/test_rms_norm.py" -v # ── Step 2: Export .pte model ───────────────────────────────────────────────── @@ -35,38 +35,38 @@ DISPATCH_ORDER_DIR="/tmp/dispatch_order" PTE_UPDATE_CACHE_MODEL="/tmp/webgpu_update_cache_test.pte" cd "${EXECUTORCH_ROOT}" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases +from executorch.backends.webgpu.test.ops.test_dispatch_order import export_dispatch_order_cases export_dispatch_order_cases('${DISPATCH_ORDER_DIR}') " echo "=== Export update_cache model ===" UPDATE_CACHE_OK=1 $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.sdpa.test_update_cache import export_update_cache_model +from executorch.backends.webgpu.test.ops.test_update_cache import export_update_cache_model export_update_cache_model('${PTE_UPDATE_CACHE_MODEL}') " || { echo "WARN: update_cache export failed; skipping update_cache native test"; UPDATE_CACHE_OK=0; } echo "=== Export SDPA sweep models (sdpa_.pte + .golden.bin to /tmp) ===" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_all_sdpa_models +from executorch.backends.webgpu.test.ops.test_sdpa import export_all_sdpa_models export_all_sdpa_models('/tmp') " || echo "WARN: sdpa export failed; the native test self-skips configs whose .pte is absent" echo "=== Export SDPA replay sequences (sdpa__step_S_pos

.* to /tmp) ===" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_replay_sequences +from executorch.backends.webgpu.test.ops.test_sdpa import export_replay_sequences export_replay_sequences('/tmp') " || echo "WARN: sdpa replay export failed; the native test self-skips absent sequences" echo "=== Export SDPA dynamic-input_pos decode (sdpa_dyn_.* to /tmp) ===" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_dynamic_decode +from executorch.backends.webgpu.test.ops.test_sdpa import export_dynamic_decode export_dynamic_decode('/tmp') " || echo "WARN: sdpa dynamic export failed; the native test self-skips when absent" echo "=== Export SDPA in-graph-cache decode (sdpa_incache_.* to /tmp) ===" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.sdpa.test_sdpa import export_incache_decode +from executorch.backends.webgpu.test.ops.test_sdpa import export_incache_decode export_incache_decode('/tmp') " || echo "WARN: sdpa in-graph-cache export failed; the native test self-skips when absent" diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index b7f38246f8c..ed23f01ef8e 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -29,6 +29,7 @@ exir_ops.edge.aten.unsqueeze_copy.default, exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.permute_copy.default, + exir_ops.edge.aten.cat.default, ]