From 476353a07b3a1057cd19af521a4682997acfc322 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:14 -0700 Subject: [PATCH 1/8] [ExecuTorch][WebGPU] Add view_copy op (aten.view_copy.default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/20360 **Add `aten.view_copy.default` as a native buffer-to-buffer DMA** — a contiguous reshape on the dense row-major buffer backend is a flat copy, so it needs no shader. **Problem:** a reshape only relabels shape metadata; the bytes are unchanged. Launching a compute dispatch (shader module + pipeline + bind group + uniform) just to run `output[i] = input[i]` is wasted setup for every view/clone/squeeze/unsqueeze in the graph. **Solution:** - Before: a `view_copy.wgsl` compute kernel dispatched over `num_elements`, with its own pipeline/bind-group/uniform per copy. - After: `add_flat_copy` records a `wgpuCommandEncoderCopyBufferToBuffer` DMA in graph order — no shader, no pipeline, no uniform. **Implementation:** - `WebGPUGraph` gains a `WebGPUDispatch::Kind::Copy` command + `add_buffer_copy(src, dst, nbytes)`; `execute()` emits the encoder-level copy between compute passes (both single-shot and chunked paths), preserving the existing per-pass read-after-write ordering. - `add_flat_copy` (declared in `view_copy.h`, reused by the stacked clone/squeeze/unsqueeze) keeps the fail-loud guards (both tensors, fp32 4-byte alignment, equal `nbytes`) and treats an aliased in/out buffer as a no-op. - Tensor buffers already carry `CopySrc | CopyDst`, so no usage-flag change is needed. - Mirrors Vulkan `add_view_copy_node` (`backends/vulkan/runtime/graph/ops/impl/View.cpp`): Vulkan always dispatches `view_buffer.glsl` only to remap non-contiguous layouts, which the buffer-only WebGPU backend never produces — so the contiguous DMA is the equivalent path. ghstack-source-id: 397026498 @exported-using-ghexport Differential Revision: [D108793164](https://our.internmc.facebook.com/intern/diff/D108793164/) --- backends/webgpu/CMakeLists.txt | 1 + backends/webgpu/runtime/WebGPUGraph.cpp | 20 ++++++ backends/webgpu/runtime/WebGPUGraph.h | 17 +++++ .../webgpu/runtime/ops/view_copy/ViewCopy.cpp | 62 +++++++++++++++++++ .../webgpu/runtime/ops/view_copy/view_copy.h | 18 ++++++ 5 files changed, 118 insertions(+) create mode 100644 backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp create mode 100644 backends/webgpu/runtime/ops/view_copy/view_copy.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index f7cd85f9758..88c7b1f73a3 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -42,6 +42,7 @@ set(WEBGPU_SRCS runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp runtime/ops/rope/RotaryEmbedding.cpp runtime/ops/prepack/Prepack.cpp + runtime/ops/view_copy/ViewCopy.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index b7fb4313400..0e00b2cb42b 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -679,6 +679,16 @@ void WebGPUGraph::execute() { // One pass per dispatch: enforces storage RAW ordering across deps. for (size_t i = 0; i < n; i++) { const auto& dispatch = dispatches_[i]; + if (dispatch.kind == WebGPUDispatch::Kind::Copy) { + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + dispatch.copy_src, + 0, + dispatch.copy_dst, + 0, + dispatch.copy_nbytes); + continue; + } WGPUComputePassDescriptor pass_desc = {}; #ifdef WGPU_BACKEND_ENABLE_PROFILING // tw must outlive BeginComputePass (the descriptor points at it). @@ -757,6 +767,16 @@ void WebGPUGraph::execute() { wgpuDeviceCreateCommandEncoder(device_, &enc_desc); for (size_t i = start; i < end; i++) { + if (dispatches_[i].kind == WebGPUDispatch::Kind::Copy) { + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + dispatches_[i].copy_src, + 0, + dispatches_[i].copy_dst, + 0, + dispatches_[i].copy_nbytes); + continue; + } WGPUComputePassDescriptor pass_desc = {}; WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index 3572f751a06..b9326cf016c 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -42,6 +42,12 @@ struct WebGPUDispatch { WGPUBindGroup bind_group = nullptr; uint32_t workgroup_count_x = 1; std::string kernel_name; // bench label + // DMA copy command; default Compute keeps existing positional inits valid. + enum class Kind { Compute, Copy }; + Kind kind = Kind::Compute; + WGPUBuffer copy_src = nullptr; + WGPUBuffer copy_dst = nullptr; + size_t copy_nbytes = 0; }; struct OutputCopy { @@ -189,6 +195,17 @@ class WebGPUGraph { dispatches_.push_back(dispatch); } + // Record an in-graph-order buffer-to-buffer DMA (e.g. a flat copy). + void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) { + WebGPUDispatch d; + d.kind = WebGPUDispatch::Kind::Copy; + d.copy_src = src; + d.copy_dst = dst; + d.copy_nbytes = nbytes; + d.kernel_name = "flat_copy"; + dispatches_.push_back(d); + } + // Materialize a recorded prepack-routed constant into dst via one CPU->GPU // transfer. Build-time only (the .pte bytes are freed after build()). // Mirrors Vulkan prepack_standard. diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp new file mode 100644 index 00000000000..d56c721ce3e --- /dev/null +++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include + +namespace executorch::backends::webgpu { + +void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id) { + // get_tensor doesn't type-check; assert both args are tensors (fail loud). + if (graph.get_value_type(in_id) != WebGPUGraph::ValueType::Tensor || + graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("flat_copy: in/out arg is not a tensor"); + } + + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + // Contiguous reshape = flat byte copy; mirrors Vulkan view_buffer (no-remap). + + // 4-byte alignment guard (fp32 element size); does not verify dtype. + if (in_tensor.nbytes % sizeof(float) != 0 || + out_tensor.nbytes % sizeof(float) != 0) { + throw std::runtime_error("flat_copy: operand not 4-byte aligned"); + } + + // view preserves numel; this guard also prevents an OOB copy. + if (in_tensor.nbytes != out_tensor.nbytes) { + throw std::runtime_error("flat_copy: input/output size mismatch"); + } + + // Aliased in/out already in place; CopyBufferToBuffer rejects src == dst. + if (in_tensor.buffer == out_tensor.buffer) { + return; + } + + graph.add_buffer_copy(in_tensor.buffer, out_tensor.buffer, out_tensor.nbytes); +} + +namespace { + +// view_copy = contiguous reshape = flat copy (mirrors Vulkan view_buffer). +void view_copy_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, size, out]; out = last value-id (shape from out_tensor.dims). + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/view_copy/view_copy.h b/backends/webgpu/runtime/ops/view_copy/view_copy.h new file mode 100644 index 00000000000..bfa81174ba9 --- /dev/null +++ b/backends/webgpu/runtime/ops/view_copy/view_copy.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// Flat copy output[i]=input[i]; mirrors Vulkan add_view_copy_node (View.h). +void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id); + +} // namespace executorch::backends::webgpu From ffc99c6ce195f90c984de969dea0b3203e580cbd Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:15 -0700 Subject: [PATCH 2/8] [ExecuTorch][WebGPU] view_copy op test suite (cases.py op-test framework) Pull Request resolved: https://github.com/pytorch/executorch/pull/20361 Registers `aten.view_copy.default` in the `cases.py` op-test framework: a `_view_copy_suite` of 3 configs (rank-reducing reshape `(2,3,4)->(6,4)`, flatten `->(-1,)`, rank-increasing reshape `->(1,2,3,4)`) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/view_copy/test_view_copy.py` (`ViewModule` + `CONFIGS` + export-delegation/eager smoke test) and the `aten.view_copy.default` partitioner-allowlist entry in `tester.py`. ghstack-source-id: 397026501 @exported-using-ghexport Differential Revision: [D108793155](https://our.internmc.facebook.com/intern/diff/D108793155/) --- backends/webgpu/test/op_tests/cases.py | 23 ++++++++ backends/webgpu/test/ops/test_view_copy.py | 66 ++++++++++++++++++++++ backends/webgpu/test/tester.py | 1 + 3 files changed, 90 insertions(+) create mode 100644 backends/webgpu/test/ops/test_view_copy.py diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index 4d01517bedb..c0fbade9bbf 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -40,6 +40,10 @@ CONFIGS as _MUL_CONFIGS, MulModule, ) +from executorch.backends.webgpu.test.ops.test_view_copy import ( + CONFIGS as _VIEW_CONFIGS, + ViewModule, +) # rms_norm coverage is exactly the 15 cases the native test covered. RMS_NORM_CASES = _CASES @@ -121,3 +125,22 @@ def _mul_suite() -> WebGPUTestSuite: Case(name=name, inputs=(sa, sb)) for name, (sa, sb) in _MUL_CONFIGS.items() ], ) + + +def _fn_config_suite(module_cls, configs) -> WebGPUTestSuite: + """Builder for ops whose per-case spec is a (shape, fn) pair (view/select/slice). + The fn is a `construct` kwarg baked into the .pte module, never a serialized input. + """ + return WebGPUTestSuite( + module_factory=lambda fn: module_cls(fn), + cases=[ + Case(name=n, construct={"fn": fn}, inputs=(shape,)) + for n, (shape, fn) in configs.items() + ], + golden_dtype="float32", # gather/copy: fp64 bit-identical, skip dual-oracle + ) + + +@register_op_test("view_copy") +def _view_copy_suite() -> WebGPUTestSuite: + return _fn_config_suite(ViewModule, _VIEW_CONFIGS) diff --git a/backends/webgpu/test/ops/test_view_copy.py b/backends/webgpu/test/ops/test_view_copy.py new file mode 100644 index 00000000000..a9c28f2b631 --- /dev/null +++ b/backends/webgpu/test/ops/test_view_copy.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.view_copy.default` module + configs for the WebGPU op-test framework. + +`ViewModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `ViewCopyTest` is the export-delegation +smoke test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, view_fn) +CONFIGS = { + "reshape": ((2, 3, 4), lambda x: x.reshape(6, 4)), + "flatten": ((2, 3, 4), lambda x: x.reshape(-1)), + "reshape4d": ((2, 3, 4), lambda x: x.reshape(1, 2, 3, 4)), +} + + +class ViewModule(torch.nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fn(x) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _export(fn, x: torch.Tensor): + ep = torch.export.export(ViewModule(fn).eval(), (x,)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +class ViewCopyTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, fn) in CONFIGS.items(): + with self.subTest(name=name): + et = _export(fn, _det_input(shape)) + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (view_copy {name})", + ) diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index eafb64c6961..bb09fffb43e 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -22,6 +22,7 @@ exir_ops.edge.aten.add.Tensor, exir_ops.edge.et_vk.rms_norm.default, exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.view_copy.default, ] From 9418cd6a8b35d45fe4a1389da284221178a700e5 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:15 -0700 Subject: [PATCH 3/8] [ExecuTorch][WebGPU] Add select_copy op (aten.select_copy.int) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/20362 Adds `aten.select_copy.int` to the WebGPU delegate as a gather: picks a fixed index along one dim, producing an output of rank (input rank - 1). Composition (single dispatch): - `select/Select.cpp` — reads `[self, dim, index, out]` (static `Int` via `read_scalar`; throws on dynamic `SymInt`), normalizes + bounds-checks dim/index, builds 2 `TensorMeta` UBOs + a `SelectParams{dim,index}`, fp32 guard, 1D-dispatch over `numel`, releases uniforms after the bind group. - `select/select.wgsl` — seeds the input offset with `index * in.strides[dim]`, delinearizes the output index, maps each out dim to its in dim (shifted past the selected dim), relinearizes on input strides. ghstack-source-id: 397026510 @exported-using-ghexport Differential Revision: [D108793166](https://our.internmc.facebook.com/intern/diff/D108793166/) --- backends/webgpu/CMakeLists.txt | 1 + backends/webgpu/runtime/ops/select/Select.cpp | 184 ++++++++++++++++++ .../webgpu/runtime/ops/select/select.wgsl | 41 ++++ .../webgpu/runtime/ops/select/select_wgsl.h | 65 +++++++ 4 files changed, 291 insertions(+) create mode 100644 backends/webgpu/runtime/ops/select/Select.cpp create mode 100644 backends/webgpu/runtime/ops/select/select.wgsl create mode 100644 backends/webgpu/runtime/ops/select/select_wgsl.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 88c7b1f73a3..c774fd27845 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -43,6 +43,7 @@ set(WEBGPU_SRCS runtime/ops/rope/RotaryEmbedding.cpp runtime/ops/prepack/Prepack.cpp runtime/ops/view_copy/ViewCopy.cpp + runtime/ops/select/Select.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/ops/select/Select.cpp b/backends/webgpu/runtime/ops/select/Select.cpp new file mode 100644 index 00000000000..5686bbc79c0 --- /dev/null +++ b/backends/webgpu/runtime/ops/select/Select.cpp @@ -0,0 +1,184 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct SelectParams { + uint32_t dim; + uint32_t index; + uint32_t _pad[2]; +}; + +// dim/index are required Ints (SymInt throws); no Null default unlike slice. +int64_t read_scalar(WebGPUGraph& graph, int id, const char* what) { + if (graph.get_value_type(id) == WebGPUGraph::ValueType::Int) { + return graph.get_int(id); + } + throw std::runtime_error(std::string("select: dynamic/unsupported ") + what); +} + +void select_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dim, index, out]; output rank = in rank - 1. + const int in_id = args.at(0); + const int out_id = args.at(3); + + WGPUDevice device = graph.device(); + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) { + throw std::runtime_error("select: null buffer binding"); + } + + const int in_ndim = static_cast(in_tensor.dims.size()); + int64_t dim = read_scalar(graph, args.at(1), "dim"); + if (dim < 0) { + dim += in_ndim; + } + if (dim < 0 || dim >= in_ndim) { + throw std::runtime_error("select: dim out of range"); + } + const int64_t in_size = in_tensor.dims[dim]; + int64_t index = read_scalar(graph, args.at(2), "index"); + if (index < 0) { + index += in_size; + } + if (index < 0 || index >= in_size) { + throw std::runtime_error("select: index out of range"); + } + + TensorMeta out_meta; + TensorMeta in_meta; + fill_tensor_meta(out_tensor, &out_meta); + fill_tensor_meta(in_tensor, &in_meta); + if (out_tensor.nbytes != + static_cast(out_meta.numel) * sizeof(float) || + in_tensor.nbytes != static_cast(in_meta.numel) * sizeof(float)) { + throw std::runtime_error("select: non-fp32 operand (nbytes != numel * 4)"); + } + + SelectParams params = {}; + params.dim = static_cast(dim); + params.index = static_cast(index); + + uint32_t wg_size = utils::clamp_workgroup_size(device, kSelectWorkgroupSizeX); + uint32_t workgroup_count = utils::compute_1d_workgroup_count( + device, out_meta.numel, wg_size, "select"); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + WGPUBuffer out_meta_buf = + utils::make_uniform(device, &out_meta, sizeof(TensorMeta)); + WGPUBuffer in_meta_buf = + utils::make_uniform(device, &in_meta, sizeof(TensorMeta)); + WGPUBuffer params_buf = + utils::make_uniform(device, ¶ms, sizeof(SelectParams)); + graph.add_uniform_buffer_bytes(2 * sizeof(TensorMeta) + sizeof(SelectParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kSelectWGSL, WGPU_STRLEN}; + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms). + WGPUBindGroupLayoutEntry entries[5] = {}; + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Uniform; + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + entries[4].binding = 4; + entries[4].visibility = WGPUShaderStage_Compute; + entries[4].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 5; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[5] = {}; + bg_entries[0].binding = 0; + bg_entries[0].buffer = in_tensor.buffer; + bg_entries[0].size = in_tensor.nbytes; + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + bg_entries[2].binding = 2; + bg_entries[2].buffer = out_meta_buf; + bg_entries[2].size = sizeof(TensorMeta); + bg_entries[3].binding = 3; + bg_entries[3].buffer = in_meta_buf; + bg_entries[3].size = sizeof(TensorMeta); + bg_entries[4].binding = 4; + bg_entries[4].buffer = params_buf; + bg_entries[4].size = sizeof(SelectParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 5; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // Drop our refs; the bind group keeps the uniforms alive until release. + wgpuBufferRelease(out_meta_buf); + wgpuBufferRelease(in_meta_buf); + wgpuBufferRelease(params_buf); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.select_copy.int, select_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/select/select.wgsl b/backends/webgpu/runtime/ops/select/select.wgsl new file mode 100644 index 00000000000..84938b2c840 --- /dev/null +++ b/backends/webgpu/runtime/ops/select/select.wgsl @@ -0,0 +1,41 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + dim: u32, + index: u32, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index. + var rem = out_bufi; + var in_bufi: u32 = params.index * in_meta.strides[params.dim]; + for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) { + let coord = rem / out_meta.strides[od]; + rem = rem % out_meta.strides[od]; + var id = od; + if (od >= params.dim) { + id = od + 1u; + } + in_bufi = in_bufi + coord * in_meta.strides[id]; + } + output[out_bufi] = input[in_bufi]; +} diff --git a/backends/webgpu/runtime/ops/select/select_wgsl.h b/backends/webgpu/runtime/ops/select/select_wgsl.h new file mode 100644 index 00000000000..e66edde240d --- /dev/null +++ b/backends/webgpu/runtime/ops/select/select_wgsl.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from select.wgsl - DO NOT EDIT. +// wgsl-sha256: 200cf5a8190045aa0562e782f01c1cfaf9681f30f679f5112ccc3d347a0ed8df +inline constexpr const char* kSelectWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + dim: u32, + index: u32, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index. + var rem = out_bufi; + var in_bufi: u32 = params.index * in_meta.strides[params.dim]; + for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) { + let coord = rem / out_meta.strides[od]; + rem = rem % out_meta.strides[od]; + var id = od; + if (od >= params.dim) { + id = od + 1u; + } + in_bufi = in_bufi + coord * in_meta.strides[id]; + } + output[out_bufi] = input[in_bufi]; +} +)"; + +inline constexpr uint32_t kSelectWorkgroupSizeX = 64; +inline constexpr uint32_t kSelectWorkgroupSizeY = 1; +inline constexpr uint32_t kSelectWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu From 5f7ab1542e0097e48f3ec4ab1ebd71fb1e0b55a4 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:16 -0700 Subject: [PATCH 4/8] [ExecuTorch][WebGPU] select_copy op test suite (cases.py op-test framework) Pull Request resolved: https://github.com/pytorch/executorch/pull/20363 Registers `aten.select_copy.int` in the `cases.py` op-test framework: a `_select_suite` of 4 configs (leading/middle/last dim + negative index) that `generate_op_tests` exports and compares to a torch golden on Dawn. Also adds `test/ops/select/test_select.py` (`SelectModule` + `CONFIGS` + an export-delegation/eager smoke test) and the `aten.select_copy.int` partitioner-allowlist entry in `tester.py`. ghstack-source-id: 397026513 @exported-using-ghexport Differential Revision: [D108793161](https://our.internmc.facebook.com/intern/diff/D108793161/) --- backends/webgpu/test/op_tests/cases.py | 9 ++++ backends/webgpu/test/ops/test_select.py | 67 +++++++++++++++++++++++++ backends/webgpu/test/tester.py | 1 + 3 files changed, 77 insertions(+) create mode 100644 backends/webgpu/test/ops/test_select.py diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index c0fbade9bbf..be5276cc57a 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -40,6 +40,10 @@ CONFIGS as _MUL_CONFIGS, MulModule, ) +from executorch.backends.webgpu.test.ops.test_select import ( + CONFIGS as _SELECT_CONFIGS, + SelectModule, +) from executorch.backends.webgpu.test.ops.test_view_copy import ( CONFIGS as _VIEW_CONFIGS, ViewModule, @@ -144,3 +148,8 @@ def _fn_config_suite(module_cls, configs) -> WebGPUTestSuite: @register_op_test("view_copy") def _view_copy_suite() -> WebGPUTestSuite: return _fn_config_suite(ViewModule, _VIEW_CONFIGS) + + +@register_op_test("select") +def _select_suite() -> WebGPUTestSuite: + return _fn_config_suite(SelectModule, _SELECT_CONFIGS) diff --git a/backends/webgpu/test/ops/test_select.py b/backends/webgpu/test/ops/test_select.py new file mode 100644 index 00000000000..739ebe4d190 --- /dev/null +++ b/backends/webgpu/test/ops/test_select.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.select_copy.int` module + configs for the WebGPU op-test framework. + +`SelectModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `SelectTest` is the export-delegation smoke test. +Configs cover the leading, middle, and last dim plus a negative index (output rank = +input rank - 1). +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, select_fn) +CONFIGS = { + "dim0": ((3, 8, 4), lambda x: x[1]), + "middle": ((3, 8, 4), lambda x: x[:, 2]), + "last": ((3, 8, 4), lambda x: x[..., 3]), + "neg_idx": ((3, 8, 4), lambda x: x[:, -1]), +} + + +class SelectModule(torch.nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fn(x) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _export(fn, x: torch.Tensor): + ep = torch.export.export(SelectModule(fn).eval(), (x,)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +class SelectTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, fn) in CONFIGS.items(): + with self.subTest(name=name): + et = _export(fn, _det_input(shape)) + self.assertTrue( + _delegated(et), f"Expected a VulkanBackend delegate (select {name})" + ) diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index bb09fffb43e..9ba9a4d9ad4 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -23,6 +23,7 @@ exir_ops.edge.et_vk.rms_norm.default, exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten.view_copy.default, + exir_ops.edge.aten.select_copy.int, ] From 369802ad96d7f1bb3b7cdd1ffb377a596a3f7b6b Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:16 -0700 Subject: [PATCH 5/8] [ExecuTorch][WebGPU] Add sigmoid op (aten.sigmoid.default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/20390 Adds `aten.sigmoid.default` to the WebGPU delegate: element-wise `1/(1+exp(-x))` over a flat fp32 buffer. On the Llama critical path (`F.silu` -> `sigmoid` + `mul`). Composition (single dispatch): - `sigmoid/UnaryOp.cpp` — binds input (storage, read-only) + output (storage) + a `Params{num_elements}` uniform, 1D-dispatches over `num_elements` with `override wg_size` (clamped to the device limit); mirrors the `add` op (uniform mapped-at-creation, released after the bind group). - `sigmoid/sigmoid.wgsl` — guards `idx >= num_elements` and writes the logistic of each element. ghstack-source-id: 397026515 @exported-using-ghexport Differential Revision: [D108793157](https://our.internmc.facebook.com/intern/diff/D108793157/) --- backends/webgpu/CMakeLists.txt | 1 + .../webgpu/runtime/ops/sigmoid/UnaryOp.cpp | 165 ++++++++++++++++++ .../webgpu/runtime/ops/sigmoid/sigmoid.wgsl | 18 ++ .../webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h | 42 +++++ 4 files changed, 226 insertions(+) create mode 100644 backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp create mode 100644 backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl create mode 100644 backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index c774fd27845..c3b6ef4e706 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -44,6 +44,7 @@ set(WEBGPU_SRCS runtime/ops/prepack/Prepack.cpp runtime/ops/view_copy/ViewCopy.cpp runtime/ops/select/Select.cpp + runtime/ops/sigmoid/UnaryOp.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp new file mode 100644 index 00000000000..4d1a087cae5 --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +// Uniform buffer layout matching the WGSL Params struct; 16-byte aligned. +struct UnaryParams { + uint32_t num_elements; + uint32_t _pad[3]; +}; + +// Generic elementwise unary op; mirrors Vulkan add_unary_op_node (UnaryOp.cpp). +void add_unary_op( + WebGPUGraph& graph, + int in_id, + int out_id, + const char* wgsl_source, + uint32_t wg_size_x, + const char* op_name) { + WGPUDevice device = graph.device(); + + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) { + throw std::runtime_error(std::string(op_name) + ": null buffer binding"); + } + + // 4-byte (fp32) alignment guard on both operands; also the dtype guard. + if (in_tensor.nbytes % sizeof(float) != 0 || + out_tensor.nbytes % sizeof(float) != 0) { + throw std::runtime_error( + std::string(op_name) + ": operand not 4-byte aligned"); + } + if (in_tensor.nbytes != out_tensor.nbytes) { + throw std::runtime_error( + std::string(op_name) + ": input/output size mismatch"); + } + + uint32_t num_elements = + static_cast(out_tensor.nbytes / sizeof(float)); + + uint32_t wg_size = utils::clamp_workgroup_size(device, wg_size_x); + uint32_t workgroup_count = + utils::compute_1d_workgroup_count(device, num_elements, wg_size, op_name); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + UnaryParams params = {}; + params.num_elements = num_elements; + + WGPUBuffer uniform_buffer = + utils::make_uniform(device, ¶ms, sizeof(UnaryParams)); + graph.add_uniform_buffer_bytes(sizeof(UnaryParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {wgsl_source, WGPU_STRLEN}; + + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Bind group layout: input (read storage) + output (storage) + params. + WGPUBindGroupLayoutEntry entries[3] = {}; + + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 3; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[3] = {}; + + bg_entries[0].binding = 0; + bg_entries[0].buffer = in_tensor.buffer; + bg_entries[0].size = in_tensor.nbytes; + + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + + bg_entries[2].binding = 2; + bg_entries[2].buffer = uniform_buffer; + bg_entries[2].size = sizeof(UnaryParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 3; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + // Release intermediates (pipeline + bind_group are kept by dispatch). + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // Drop our ref; the bind group keeps the uniform buffer alive until release. + wgpuBufferRelease(uniform_buffer); +} + +void sigmoid_impl(WebGPUGraph& graph, const std::vector& args) { + // aten.sigmoid.default args: [in, out] + add_unary_op( + graph, + args.at(0), + args.at(1), + kSigmoidWGSL, + kSigmoidWorkgroupSizeX, + "sigmoid"); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.sigmoid.default, sigmoid_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl new file mode 100644 index 00000000000..09b3e5457b3 --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl @@ -0,0 +1,18 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct Params { + num_elements: u32, +} +@group(0) @binding(2) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = 1.0 / (1.0 + exp(-input[idx])); +} diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h new file mode 100644 index 00000000000..48e6efc607a --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from sigmoid.wgsl - DO NOT EDIT. +// wgsl-sha256: 70395dbb107b8b95ae13c0a6fb12a8415c561c645da0347294c92904314ae84c +inline constexpr const char* kSigmoidWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct Params { + num_elements: u32, +} +@group(0) @binding(2) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = 1.0 / (1.0 + exp(-input[idx])); +} +)"; + +inline constexpr uint32_t kSigmoidWorkgroupSizeX = 64; +inline constexpr uint32_t kSigmoidWorkgroupSizeY = 1; +inline constexpr uint32_t kSigmoidWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu From 3e8b396e6f796358cd05530a71dc56818072c41b Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:17 -0700 Subject: [PATCH 6/8] [ExecuTorch][WebGPU] sigmoid op test suite (cases.py op-test framework) Pull Request resolved: https://github.com/pytorch/executorch/pull/20391 Registers `aten.sigmoid.default` in the `cases.py` op-test framework: a `_sigmoid_suite` (hard-coded shapes + a saturation case over a `linspace(-12, 12)` input) that `generate_op_tests` exports and compares to an fp64 torch golden on Dawn. Also adds `test/ops/sigmoid/test_sigmoid.py` (`SigmoidModule` + `N` + `_det_input` + an export-delegation/eager smoke test) and the `aten.sigmoid.default` partitioner-allowlist entry in `tester.py`. ghstack-source-id: 397026520 @exported-using-ghexport Differential Revision: [D108793159](https://our.internmc.facebook.com/intern/diff/D108793159/) --- backends/webgpu/test/op_tests/cases.py | 31 ++++++++++++++ backends/webgpu/test/ops/test_sigmoid.py | 51 ++++++++++++++++++++++++ backends/webgpu/test/tester.py | 1 + 3 files changed, 83 insertions(+) create mode 100644 backends/webgpu/test/ops/test_sigmoid.py diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index be5276cc57a..7df3ee11f11 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -44,6 +44,11 @@ CONFIGS as _SELECT_CONFIGS, SelectModule, ) +from executorch.backends.webgpu.test.ops.test_sigmoid import ( + _det_input as _sigmoid_det_input, + N as _SIGMOID_N, + SigmoidModule, +) from executorch.backends.webgpu.test.ops.test_view_copy import ( CONFIGS as _VIEW_CONFIGS, ViewModule, @@ -153,3 +158,29 @@ def _view_copy_suite() -> WebGPUTestSuite: @register_op_test("select") def _select_suite() -> WebGPUTestSuite: return _fn_config_suite(SelectModule, _SELECT_CONFIGS) + + +def _sigmoid_full_range(_shape) -> torch.Tensor: + # Reuses the monolith's saturation-tail input (linspace(-12, 12)). + return _sigmoid_det_input() + + +@register_op_test("sigmoid") +def _sigmoid_suite() -> WebGPUTestSuite: + # sigmoid has no CONFIGS table; cover unary shapes directly (tol 1e-4). + return WebGPUTestSuite( + module_factory=lambda: SigmoidModule(), + cases=[ + Case(name="vec", inputs=((M1,),)), + Case(name="mat", inputs=((M1, M2),)), + Case(name="rank3", inputs=((S1, M1, M2),)), + Case(name="rank4", inputs=((S1, S2, S2, M2),)), + # Saturation tails sigmoid(+-12) (~6e-6 / 0.999994) that randn shapes miss. + Case( + name="saturation", + inputs=(InputSpec(shape=(_SIGMOID_N,), gen=_sigmoid_full_range),), + ), + ], + atol=1e-4, + rtol=1e-4, + ) diff --git a/backends/webgpu/test/ops/test_sigmoid.py b/backends/webgpu/test/ops/test_sigmoid.py new file mode 100644 index 00000000000..0ba8c435a9a --- /dev/null +++ b/backends/webgpu/test/ops/test_sigmoid.py @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.sigmoid.default` module + input for the WebGPU op-test framework. + +`SigmoidModule`, `N`, and `_det_input` are imported by `cases.py` to drive the +declarative op-test suite. `SigmoidTest` is the export-delegation +smoke test. Sigmoid is on the Llama critical path (`F.silu` -> `sigmoid` + `mul`); the +deterministic input spans the saturation tails. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# Input length; the deterministic input spans the saturation tails. +N = 64 + + +class SigmoidModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.sigmoid(x) + + +def _det_input() -> torch.Tensor: + """Deterministic fp32 input spanning negatives, zero, and large magnitudes.""" + return torch.linspace(-12.0, 12.0, N, dtype=torch.float32) + + +def _export(m: torch.nn.Module, x: torch.Tensor): + ep = torch.export.export(m, (x,)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +class SigmoidTest(unittest.TestCase): + def test_export_delegates(self) -> None: + et = _export(SigmoidModule().eval(), _det_input()) + found = any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + self.assertTrue(found, "Expected a VulkanBackend delegate (sigmoid)") diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index 9ba9a4d9ad4..e5dd510d49b 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -24,6 +24,7 @@ exir_ops.edge.aten.mul.Tensor, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.select_copy.int, + exir_ops.edge.aten.sigmoid.default, ] From 1150e19f4e2191dde8c57d933132aac2a06f7574 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:17 -0700 Subject: [PATCH 7/8] [ExecuTorch][WebGPU] Add squeeze_copy + unsqueeze_copy (flat copies) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/20392 Adds `aten.squeeze_copy.dims` and `aten.unsqueeze_copy.default` to the WebGPU delegate. Both are numel-preserving shape ops; on a dense row-major buffer backend they are the same flat copy as `view_copy` — only the shape metadata differs (mirrors the Vulkan delegate, which routes both through `add_view_copy_node`). Composition (no new kernel): - `squeeze/Squeeze.cpp` — reads `args = [self, dims, out]`, ignores the AOT-fixed `dims`, calls `add_flat_copy(graph, in, out)` from `runtime/ops/view_copy/view_copy.h`. - `unsqueeze/Unsqueeze.cpp` — reads `args = [self, dim, out]`, ignores the AOT-fixed `dim`, calls `add_flat_copy(graph, in, out)`. ghstack-source-id: 397026523 @exported-using-ghexport Differential Revision: [D108793153](https://our.internmc.facebook.com/intern/diff/D108793153/) --- backends/webgpu/CMakeLists.txt | 2 ++ .../webgpu/runtime/ops/squeeze/Squeeze.cpp | 31 +++++++++++++++++++ .../runtime/ops/unsqueeze/Unsqueeze.cpp | 31 +++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 backends/webgpu/runtime/ops/squeeze/Squeeze.cpp create mode 100644 backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index c3b6ef4e706..01bb5236a44 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -45,6 +45,8 @@ set(WEBGPU_SRCS runtime/ops/view_copy/ViewCopy.cpp runtime/ops/select/Select.cpp runtime/ops/sigmoid/UnaryOp.cpp + runtime/ops/squeeze/Squeeze.cpp + runtime/ops/unsqueeze/Unsqueeze.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp new file mode 100644 index 00000000000..12b0fe561f1 --- /dev/null +++ b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace executorch::backends::webgpu { + +namespace { + +// squeeze_copy.dims = numel-preserving flat copy (Vulkan Squeeze.cpp:102-104). +void squeeze_copy_dims_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dims, out]; dims ignored (out shape fixed AOT). + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp new file mode 100644 index 00000000000..27d2c52e708 --- /dev/null +++ b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace executorch::backends::webgpu { + +namespace { + +// unsqueeze_copy = numel-preserving flat copy (Vulkan Unsqueeze.cpp:101-103). +void unsqueeze_copy_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dim, out]; dim ignored (out shape fixed AOT, like view_copy). + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze_copy_impl); +} + +} // namespace executorch::backends::webgpu From ae8e7e895ac663e8917c092ae98ace3a519239dd Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:18 -0700 Subject: [PATCH 8/8] [ExecuTorch][WebGPU] squeeze_copy + unsqueeze_copy test suites (cases.py op-test framework) Pull Request resolved: https://github.com/pytorch/executorch/pull/20393 Registers `aten.squeeze_copy.dims` and `aten.unsqueeze_copy.default` in the `cases.py` op-test framework: a `_squeeze_suite` of 3 configs (squeeze leading/middle/multiple size-1 dims) and a `_unsqueeze_suite` of 3 configs (insert dim at front/middle/last) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/squeeze/test_squeeze.py` (`SqueezeModule` + `CONFIGS` + `_op_delegated` smoke test), `test/ops/unsqueeze/test_unsqueeze.py` (`UnsqueezeModule` + `CONFIGS` + `_op_delegated` smoke test), and the two partitioner-allowlist entries in `tester.py`. ghstack-source-id: 397026525 @exported-using-ghexport Differential Revision: [D108793152](https://our.internmc.facebook.com/intern/diff/D108793152/) --- backends/webgpu/test/op_tests/cases.py | 36 +++++++++++ backends/webgpu/test/ops/test_squeeze.py | 75 ++++++++++++++++++++++ backends/webgpu/test/ops/test_unsqueeze.py | 75 ++++++++++++++++++++++ backends/webgpu/test/tester.py | 2 + 4 files changed, 188 insertions(+) create mode 100644 backends/webgpu/test/ops/test_squeeze.py create mode 100644 backends/webgpu/test/ops/test_unsqueeze.py diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index 7df3ee11f11..0db8685fa18 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -49,6 +49,16 @@ N as _SIGMOID_N, SigmoidModule, ) + +from executorch.backends.webgpu.test.ops.test_squeeze import ( + CONFIGS as _SQUEEZE_CONFIGS, + SqueezeModule, +) + +from executorch.backends.webgpu.test.ops.test_unsqueeze import ( + CONFIGS as _UNSQUEEZE_CONFIGS, + UnsqueezeModule, +) from executorch.backends.webgpu.test.ops.test_view_copy import ( CONFIGS as _VIEW_CONFIGS, ViewModule, @@ -184,3 +194,29 @@ def _sigmoid_suite() -> WebGPUTestSuite: atol=1e-4, rtol=1e-4, ) + + +@register_op_test("squeeze") +def _squeeze_suite() -> WebGPUTestSuite: + # CONFIGS: name -> (shape, dim) where dim is an int or a tuple. + return WebGPUTestSuite( + module_factory=lambda dim: SqueezeModule(dim), + cases=[ + Case(name=n, construct={"dim": dim}, inputs=(shape,)) + for n, (shape, dim) in _SQUEEZE_CONFIGS.items() + ], + golden_dtype="float32", # reshape copies values; fp64 bit-identical + ) + + +@register_op_test("unsqueeze") +def _unsqueeze_suite() -> WebGPUTestSuite: + # CONFIGS: name -> (shape, dim). + return WebGPUTestSuite( + module_factory=lambda dim: UnsqueezeModule(dim), + cases=[ + Case(name=n, construct={"dim": dim}, inputs=(shape,)) + for n, (shape, dim) in _UNSQUEEZE_CONFIGS.items() + ], + golden_dtype="float32", # reshape copies values; fp64 bit-identical + ) diff --git a/backends/webgpu/test/ops/test_squeeze.py b/backends/webgpu/test/ops/test_squeeze.py new file mode 100644 index 00000000000..b55a5143538 --- /dev/null +++ b/backends/webgpu/test/ops/test_squeeze.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.squeeze_copy.dims` module + configs for the WebGPU op-test framework. + +`SqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `SqueezeTest` is the export-delegation smoke +test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, squeeze_dim) +CONFIGS = { + "dim0": ((1, 3, 4), 0), + "mid": ((2, 1, 4), 1), + "multi": ((1, 3, 1, 4), (0, 2)), +} + + +class SqueezeModule(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.squeeze(x, self.dim) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _lower(dim, x: torch.Tensor): + ep = torch.export.export(SqueezeModule(dim).eval(), (x,)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class SqueezeTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, dim) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(dim, _det_input(shape)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (squeeze {name})", + ) + self.assertTrue( + _op_delegated(edge, "squeeze_copy"), + f"squeeze_copy not delegated (fell back to CPU) for {name}", + ) diff --git a/backends/webgpu/test/ops/test_unsqueeze.py b/backends/webgpu/test/ops/test_unsqueeze.py new file mode 100644 index 00000000000..dcddf4faa51 --- /dev/null +++ b/backends/webgpu/test/ops/test_unsqueeze.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.unsqueeze_copy.default` module + configs for the WebGPU op-test framework. + +`UnsqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `UnsqueezeTest` is the export-delegation smoke +test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, unsqueeze_dim) +CONFIGS = { + "front": ((3, 4), 0), + "mid": ((2, 4), 1), + "last": ((3, 4), 2), +} + + +class UnsqueezeModule(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.unsqueeze(x, self.dim) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _lower(dim, x: torch.Tensor): + ep = torch.export.export(UnsqueezeModule(dim).eval(), (x,)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a top-level CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class UnsqueezeTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, dim) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(dim, _det_input(shape)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (unsqueeze {name})", + ) + self.assertTrue( + _op_delegated(edge, "unsqueeze_copy"), + f"unsqueeze_copy not delegated (fell back to CPU) for {name}", + ) diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index e5dd510d49b..53a745a16df 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -25,6 +25,8 @@ exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.select_copy.int, exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.squeeze_copy.dims, + exir_ops.edge.aten.unsqueeze_copy.default, ]