From 2ea4836c1c9b0f8a5e2a9a508612a6610f926886 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:19 -0700 Subject: [PATCH 1/4] [ExecuTorch][WebGPU] Add slice_copy op (aten.slice_copy.Tensor) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/20394 Adds `aten.slice_copy.Tensor` to the WebGPU delegate as a gather: each output element is mapped back to its source input element along the sliced dim via `start + coord * step`. Composition (single compute dispatch): - `runtime/ops/slice/Slice.cpp` — reads `args = [self, dim, start, end, step, out]` via `read_scalar` (static `Int`/`Null`-sentinel default; throws on dynamic `SymInt`); normalizes negative `dim`/`start`, clamps `start` to `[0, in_size]`; builds two `TensorMeta` UBOs + a `SliceParams{dim, start, step}` uniform; guards fp32; dispatches over `compute_1d_workgroup_count(out.numel)` with `override wg_size`; releases all uniforms after the bind group. - `runtime/ops/slice/slice.wgsl` — delinearizes the output index over the contiguous output strides, maps the sliced-dim coordinate back to the input (`start + coord*step`), relinearizes over the input strides. ghstack-source-id: 397026527 @exported-using-ghexport Differential Revision: [D108793168](https://our.internmc.facebook.com/intern/diff/D108793168/) --- backends/webgpu/CMakeLists.txt | 1 + backends/webgpu/runtime/ops/slice/Slice.cpp | 194 ++++++++++++++++++ backends/webgpu/runtime/ops/slice/slice.wgsl | 42 ++++ .../webgpu/runtime/ops/slice/slice_wgsl.h | 66 ++++++ 4 files changed, 303 insertions(+) create mode 100644 backends/webgpu/runtime/ops/slice/Slice.cpp create mode 100644 backends/webgpu/runtime/ops/slice/slice.wgsl create mode 100644 backends/webgpu/runtime/ops/slice/slice_wgsl.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 01bb5236a44..25b7bb565dc 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -47,6 +47,7 @@ set(WEBGPU_SRCS runtime/ops/sigmoid/UnaryOp.cpp runtime/ops/squeeze/Squeeze.cpp runtime/ops/unsqueeze/Unsqueeze.cpp + runtime/ops/slice/Slice.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/ops/slice/Slice.cpp b/backends/webgpu/runtime/ops/slice/Slice.cpp new file mode 100644 index 00000000000..1d4406bbd1a --- /dev/null +++ b/backends/webgpu/runtime/ops/slice/Slice.cpp @@ -0,0 +1,194 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct SliceParams { + uint32_t dim; + uint32_t start; + uint32_t step; + uint32_t _pad; +}; + +// Read scalar arg: Int->value (INT64_MAX->default), Null->default, else throw. +int64_t +read_scalar(WebGPUGraph& graph, int id, int64_t dflt, const char* what) { + switch (graph.get_value_type(id)) { + case WebGPUGraph::ValueType::Int: { + const int64_t v = graph.get_int(id); + return v == INT64_MAX ? dflt : v; + } + case WebGPUGraph::ValueType::Null: + return dflt; + default: + throw std::runtime_error( + std::string("slice: dynamic/unsupported ") + what); + } +} + +void slice_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dim, start, end, step, out]; end unread (out shape is AOT). + const int in_id = args.at(0); + const int out_id = args.at(5); + + WGPUDevice device = graph.device(); + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + + const int in_ndim = static_cast(in_tensor.dims.size()); + int64_t dim = read_scalar(graph, args.at(1), 0, "dim"); + if (dim < 0) { + dim += in_ndim; + } + if (dim < 0 || dim >= in_ndim) { + throw std::runtime_error("slice: dim out of range"); + } + const int64_t in_size = in_tensor.dims[dim]; + int64_t start = read_scalar(graph, args.at(2), 0, "start"); + if (start < 0) { + start += in_size; + } + // Clamp start to [0, in_size] (guards the gather offset; out size is AOT). + start = start < 0 ? 0 : (start > in_size ? in_size : start); + const int64_t step = read_scalar(graph, args.at(4), 1, "step"); + if (step < 1) { + throw std::runtime_error("slice: step must be >= 1"); + } + + TensorMeta out_meta; + TensorMeta in_meta; + fill_tensor_meta(out_tensor, &out_meta); + fill_tensor_meta(in_tensor, &in_meta); + if (out_tensor.nbytes != + static_cast(out_meta.numel) * sizeof(float) || + in_tensor.nbytes != static_cast(in_meta.numel) * sizeof(float)) { + throw std::runtime_error("slice: non-fp32 operand (nbytes != numel * 4)"); + } + + SliceParams params = {}; + params.dim = static_cast(dim); + params.start = static_cast(start); + params.step = static_cast(step); + + uint32_t wg_size = utils::clamp_workgroup_size(device, kSliceWorkgroupSizeX); + uint32_t workgroup_count = utils::compute_1d_workgroup_count( + device, out_meta.numel, wg_size, "slice"); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + WGPUBuffer out_meta_buf = + utils::make_uniform(device, &out_meta, sizeof(TensorMeta)); + WGPUBuffer in_meta_buf = + utils::make_uniform(device, &in_meta, sizeof(TensorMeta)); + WGPUBuffer params_buf = + utils::make_uniform(device, ¶ms, sizeof(SliceParams)); + graph.add_uniform_buffer_bytes(2 * sizeof(TensorMeta) + sizeof(SliceParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kSliceWGSL, WGPU_STRLEN}; + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms). + WGPUBindGroupLayoutEntry entries[5] = {}; + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Uniform; + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + entries[4].binding = 4; + entries[4].visibility = WGPUShaderStage_Compute; + entries[4].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 5; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[5] = {}; + bg_entries[0].binding = 0; + bg_entries[0].buffer = in_tensor.buffer; + bg_entries[0].size = in_tensor.nbytes; + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + bg_entries[2].binding = 2; + bg_entries[2].buffer = out_meta_buf; + bg_entries[2].size = sizeof(TensorMeta); + bg_entries[3].binding = 3; + bg_entries[3].buffer = in_meta_buf; + bg_entries[3].size = sizeof(TensorMeta); + bg_entries[4].binding = 4; + bg_entries[4].buffer = params_buf; + bg_entries[4].size = sizeof(SliceParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 5; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // Drop our refs; the bind group keeps the uniforms alive until release. + wgpuBufferRelease(out_meta_buf); + wgpuBufferRelease(in_meta_buf); + wgpuBufferRelease(params_buf); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.slice_copy.Tensor, slice_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/slice/slice.wgsl b/backends/webgpu/runtime/ops/slice/slice.wgsl new file mode 100644 index 00000000000..7ed718b2ca1 --- /dev/null +++ b/backends/webgpu/runtime/ops/slice/slice.wgsl @@ -0,0 +1,42 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + dim: u32, + start: u32, + step: u32, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out_bufi -> in_bufi, sliced dim coord = start + coord*step. + var rem = out_bufi; + var in_bufi: u32 = 0u; + for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) { + let coord = rem / out_meta.strides[d]; + rem = rem % out_meta.strides[d]; + var in_coord = coord; + if (d == params.dim) { + in_coord = params.start + coord * params.step; + } + in_bufi = in_bufi + in_coord * in_meta.strides[d]; + } + output[out_bufi] = input[in_bufi]; +} diff --git a/backends/webgpu/runtime/ops/slice/slice_wgsl.h b/backends/webgpu/runtime/ops/slice/slice_wgsl.h new file mode 100644 index 00000000000..52693f3665b --- /dev/null +++ b/backends/webgpu/runtime/ops/slice/slice_wgsl.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from slice.wgsl - DO NOT EDIT. +// wgsl-sha256: 6639d985420d43a67de0847749918ab6216e0785399bdcae737d49b81c773526 +inline constexpr const char* kSliceWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + dim: u32, + start: u32, + step: u32, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out_bufi -> in_bufi, sliced dim coord = start + coord*step. + var rem = out_bufi; + var in_bufi: u32 = 0u; + for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) { + let coord = rem / out_meta.strides[d]; + rem = rem % out_meta.strides[d]; + var in_coord = coord; + if (d == params.dim) { + in_coord = params.start + coord * params.step; + } + in_bufi = in_bufi + in_coord * in_meta.strides[d]; + } + output[out_bufi] = input[in_bufi]; +} +)"; + +inline constexpr uint32_t kSliceWorkgroupSizeX = 64; +inline constexpr uint32_t kSliceWorkgroupSizeY = 1; +inline constexpr uint32_t kSliceWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu From b61512e512d9dfdae74b8efa74604f69a189b23a Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:19 -0700 Subject: [PATCH 2/4] [ExecuTorch][WebGPU] slice_copy op test suite (cases.py op-test framework) Pull Request resolved: https://github.com/pytorch/executorch/pull/20395 Registers `aten.slice_copy.Tensor` in the `cases.py` op-test framework: a `_slice_suite` of 4 configs (leading-dim slice `[:,1:5]`, last-dim slice `[...,1:3]`, step-2 `[:,0:8:2]`, negative-end `[:,1:-1]`) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/slice/test_slice.py` (`SliceModule` + `CONFIGS` + export-delegation/eager smoke test) and the `aten.slice_copy.Tensor` partitioner-allowlist entry in `tester.py`. ghstack-source-id: 397026537 @exported-using-ghexport Differential Revision: [D108793151](https://our.internmc.facebook.com/intern/diff/D108793151/) --- backends/webgpu/test/op_tests/cases.py | 10 ++++ backends/webgpu/test/ops/test_slice.py | 66 ++++++++++++++++++++++++++ backends/webgpu/test/tester.py | 1 + 3 files changed, 77 insertions(+) create mode 100644 backends/webgpu/test/ops/test_slice.py diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index 0db8685fa18..4422dfa08f6 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -50,6 +50,11 @@ SigmoidModule, ) +from executorch.backends.webgpu.test.ops.test_slice import ( + CONFIGS as _SLICE_CONFIGS, + SliceModule, +) + from executorch.backends.webgpu.test.ops.test_squeeze import ( CONFIGS as _SQUEEZE_CONFIGS, SqueezeModule, @@ -220,3 +225,8 @@ def _unsqueeze_suite() -> WebGPUTestSuite: ], golden_dtype="float32", # reshape copies values; fp64 bit-identical ) + + +@register_op_test("slice") +def _slice_suite() -> WebGPUTestSuite: + return _fn_config_suite(SliceModule, _SLICE_CONFIGS) diff --git a/backends/webgpu/test/ops/test_slice.py b/backends/webgpu/test/ops/test_slice.py new file mode 100644 index 00000000000..416f880cd42 --- /dev/null +++ b/backends/webgpu/test/ops/test_slice.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.slice_copy.Tensor` module + configs for the WebGPU op-test framework. + +`SliceModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `SliceTest` is the export-delegation smoke +test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, slice_fn) +CONFIGS = { + "dim1_1_5": ((3, 8, 4), lambda x: x[:, 1:5]), + "lastdim": ((3, 8, 4), lambda x: x[..., 1:3]), + "step2": ((3, 8, 4), lambda x: x[:, 0:8:2]), + "neg_end": ((3, 8, 4), lambda x: x[:, 1:-1]), +} + + +class SliceModule(torch.nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fn(x) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _export(fn, x: torch.Tensor): + ep = torch.export.export(SliceModule(fn).eval(), (x,)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +class SliceTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, fn) in CONFIGS.items(): + with self.subTest(name=name): + et = _export(fn, _det_input(shape)) + self.assertTrue( + _delegated(et), f"Expected a VulkanBackend delegate (slice {name})" + ) diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index 53a745a16df..5668a783e21 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -27,6 +27,7 @@ exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.slice_copy.Tensor, ] From 09eb42cdb84eeff77472b88496865668a4b6fafd Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:20 -0700 Subject: [PATCH 3/4] [ExecuTorch][WebGPU] Add permute_copy + IntList graph support (aten.permute_copy.default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pull Request resolved: https://github.com/pytorch/executorch/pull/20396 Adds `aten.permute_copy.default` (a coordinate-reorder gather) to the WebGPU delegate, and the `IntList` graph value type it needs to read its `dims` argument. Composition: - `runtime/WebGPUGraph.{h,cpp}` — adds `ValueType::IntList` backed by `std::vector> int_lists_` + `get_int_list(int)`; `build()` deserializes `vkgraph::GraphTypes::IntList` via `value_as_IntList()->items()` (int64, matching the FlatBuffer `[long]`); mirrors the existing scalar value plumbing. - `runtime/ops/permute/Permute.cpp` — reads the permutation via `get_int_list`, normalizes negative dims, validates it is a permutation of `[0, ndim)`, builds two `TensorMeta` UBOs + a `PermuteParams{perm: vec4}` uniform, guards fp32 + rank≤4, dispatches over `compute_1d_workgroup_count(out.numel)` with `override wg_size`; releases all uniforms after the bind group. - `runtime/ops/permute/permute.wgsl` — delinearizes the output index over the contiguous output strides, reads `input` at `in.strides[perm[d]]` per dim (mirrors Vulkan `permute_buffer.glsl`). - Registers both `aten.permute_copy.default` and `aten.permute.default` to the same handler. ghstack-source-id: 397026548 @exported-using-ghexport Differential Revision: [D108793162](https://our.internmc.facebook.com/intern/diff/D108793162/) --- backends/webgpu/CMakeLists.txt | 1 + backends/webgpu/runtime/WebGPUGraph.cpp | 9 + backends/webgpu/runtime/WebGPUGraph.h | 9 +- .../webgpu/runtime/ops/permute/Permute.cpp | 197 ++++++++++++++++++ .../webgpu/runtime/ops/permute/permute.wgsl | 36 ++++ .../webgpu/runtime/ops/permute/permute_wgsl.h | 60 ++++++ 6 files changed, 311 insertions(+), 1 deletion(-) create mode 100644 backends/webgpu/runtime/ops/permute/Permute.cpp create mode 100644 backends/webgpu/runtime/ops/permute/permute.wgsl create mode 100644 backends/webgpu/runtime/ops/permute/permute_wgsl.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 25b7bb565dc..c24ffb81ece 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -48,6 +48,7 @@ set(WEBGPU_SRCS runtime/ops/squeeze/Squeeze.cpp runtime/ops/unsqueeze/Unsqueeze.cpp runtime/ops/slice/Slice.cpp + runtime/ops/permute/Permute.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index 0e00b2cb42b..aeaf42f1390 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -245,6 +245,7 @@ void WebGPUGraph::build( tensors_.resize(num_vals); tensor_mem_obj_ids_.resize(num_vals, -1); ints_.resize(num_vals, 0); + int_lists_.resize(num_vals); doubles_.resize(num_vals, 0.0); bools_.resize(num_vals, false); value_lists_.resize(num_vals); @@ -375,6 +376,14 @@ void WebGPUGraph::build( ints_[i] = val->value_as_Int()->int_val(); break; } + case vkgraph::GraphTypes::IntList: { + value_types_[i] = ValueType::IntList; + const auto* items = val->value_as_IntList()->items(); + if (items) { + int_lists_[i].assign(items->cbegin(), items->cend()); + } + break; + } case vkgraph::GraphTypes::Double: { value_types_[i] = ValueType::Double; doubles_[i] = val->value_as_Double()->double_val(); diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index b9326cf016c..755ea7503be 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -131,6 +131,11 @@ class WebGPUGraph { int64_t get_int(int id) const { return ints_[id]; } + // Int values of a serialized IntList (e.g. permute dims). int64 (FlatBuffer + // [long]) to match the schema and the get_int convention. + const std::vector& get_int_list(int id) const { + return int_lists_[id]; + } bool get_bool(int id) const { return bools_[id]; } @@ -258,7 +263,8 @@ class WebGPUGraph { Null, String, SymInt, - ValueList + ValueList, + IntList }; ValueType get_value_type(int id) const { @@ -275,6 +281,7 @@ class WebGPUGraph { std::vector value_types_; std::vector tensors_; std::vector ints_; + std::vector> int_lists_; std::vector doubles_; std::vector bools_; std::vector> value_lists_; diff --git a/backends/webgpu/runtime/ops/permute/Permute.cpp b/backends/webgpu/runtime/ops/permute/Permute.cpp new file mode 100644 index 00000000000..5062c33cec1 --- /dev/null +++ b/backends/webgpu/runtime/ops/permute/Permute.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct PermuteParams { + uint32_t perm[kTensorMetaMaxNdim]; +}; +static_assert( + sizeof(PermuteParams) == 16, + "PermuteParams must match the WGSL Params vec4 (16 bytes)"); + +// permute: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl, NCHW). +void permute_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dims, out]; out is the last value-id. + const int in_id = args.at(0); + const int dims_id = args.at(1); + const int out_id = args.at(args.size() - 1); + + if (graph.get_value_type(in_id) != WebGPUGraph::ValueType::Tensor || + graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("permute: in/out arg is not a tensor"); + } + if (graph.get_value_type(dims_id) != WebGPUGraph::ValueType::IntList) { + throw std::runtime_error("permute: dims arg is not an IntList"); + } + + WGPUDevice device = graph.device(); + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + const int ndim = static_cast(in_tensor.dims.size()); + + const std::vector& dims = graph.get_int_list(dims_id); + if (static_cast(dims.size()) != ndim || + static_cast(out_tensor.dims.size()) != ndim) { + throw std::runtime_error("permute: perm length != input/output rank"); + } + + // Normalize negative dims and verify perm is a permutation of [0, ndim). + uint32_t perm[kTensorMetaMaxNdim]; + bool seen[kTensorMetaMaxNdim] = {}; + if (ndim > static_cast(kTensorMetaMaxNdim)) { + throw std::runtime_error("permute: tensor rank exceeds 4 (MAX_NDIM)"); + } + for (int d = 0; d < ndim; d++) { + int64_t p = dims[d]; + if (p < 0) { + p += ndim; + } + if (p < 0 || p >= ndim || seen[p]) { + throw std::runtime_error("permute: dims is not a valid permutation"); + } + seen[p] = true; + perm[d] = static_cast(p); + } + for (int d = ndim; d < static_cast(kTensorMetaMaxNdim); d++) { + perm[d] = static_cast(d); + } + + TensorMeta out_meta; + TensorMeta in_meta; + fill_tensor_meta(out_tensor, &out_meta); + fill_tensor_meta(in_tensor, &in_meta); + if (out_tensor.nbytes != + static_cast(out_meta.numel) * sizeof(float) || + in_tensor.nbytes != static_cast(in_meta.numel) * sizeof(float)) { + throw std::runtime_error("permute: non-fp32 operand (nbytes != numel * 4)"); + } + + PermuteParams params = {}; + std::memcpy(params.perm, perm, sizeof(perm)); + + uint32_t wg_size = + utils::clamp_workgroup_size(device, kPermuteWorkgroupSizeX); + uint32_t workgroup_count = utils::compute_1d_workgroup_count( + device, out_meta.numel, wg_size, "permute"); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + WGPUBuffer out_meta_buf = + utils::make_uniform(device, &out_meta, sizeof(TensorMeta)); + WGPUBuffer in_meta_buf = + utils::make_uniform(device, &in_meta, sizeof(TensorMeta)); + WGPUBuffer params_buf = + utils::make_uniform(device, ¶ms, sizeof(PermuteParams)); + graph.add_uniform_buffer_bytes( + 2 * sizeof(TensorMeta) + sizeof(PermuteParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kPermuteWGSL, WGPU_STRLEN}; + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms). + WGPUBindGroupLayoutEntry entries[5] = {}; + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Uniform; + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + entries[4].binding = 4; + entries[4].visibility = WGPUShaderStage_Compute; + entries[4].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 5; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[5] = {}; + bg_entries[0].binding = 0; + bg_entries[0].buffer = in_tensor.buffer; + bg_entries[0].size = in_tensor.nbytes; + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + bg_entries[2].binding = 2; + bg_entries[2].buffer = out_meta_buf; + bg_entries[2].size = sizeof(TensorMeta); + bg_entries[3].binding = 3; + bg_entries[3].buffer = in_meta_buf; + bg_entries[3].size = sizeof(TensorMeta); + bg_entries[4].binding = 4; + bg_entries[4].buffer = params_buf; + bg_entries[4].size = sizeof(PermuteParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 5; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // Drop our refs; the bind group keeps the uniforms alive until release. + wgpuBufferRelease(out_meta_buf); + wgpuBufferRelease(in_meta_buf); + wgpuBufferRelease(params_buf); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.permute_copy.default, permute_impl); + WEBGPU_REGISTER_OP(aten.permute.default, permute_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/permute/permute.wgsl b/backends/webgpu/runtime/ops/permute/permute.wgsl new file mode 100644 index 00000000000..521cfac1e66 --- /dev/null +++ b/backends/webgpu/runtime/ops/permute/permute.wgsl @@ -0,0 +1,36 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + perm: vec4, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl). + var rem = out_bufi; + var in_bufi: u32 = 0u; + for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) { + let coord = rem / out_meta.strides[d]; + rem = rem % out_meta.strides[d]; + in_bufi = in_bufi + coord * in_meta.strides[params.perm[d]]; + } + output[out_bufi] = input[in_bufi]; +} diff --git a/backends/webgpu/runtime/ops/permute/permute_wgsl.h b/backends/webgpu/runtime/ops/permute/permute_wgsl.h new file mode 100644 index 00000000000..6ec41cc8446 --- /dev/null +++ b/backends/webgpu/runtime/ops/permute/permute_wgsl.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from permute.wgsl - DO NOT EDIT. +// wgsl-sha256: d34f59730cda7317589b6ed5691a1ccab8666b9c94e17ac2cb3658b036300197 +inline constexpr const char* kPermuteWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + perm: vec4, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl). + var rem = out_bufi; + var in_bufi: u32 = 0u; + for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) { + let coord = rem / out_meta.strides[d]; + rem = rem % out_meta.strides[d]; + in_bufi = in_bufi + coord * in_meta.strides[params.perm[d]]; + } + output[out_bufi] = input[in_bufi]; +} +)"; + +inline constexpr uint32_t kPermuteWorkgroupSizeX = 64; +inline constexpr uint32_t kPermuteWorkgroupSizeY = 1; +inline constexpr uint32_t kPermuteWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu From cdbc3020f4db3caaec9f39e554537840013aa6e5 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Thu, 25 Jun 2026 10:24:20 -0700 Subject: [PATCH 4/4] [ExecuTorch][WebGPU] permute_copy op test suite (cases.py op-test framework) Pull Request resolved: https://github.com/pytorch/executorch/pull/20397 Registers `aten.permute_copy.default` in the `cases.py` op-test framework: a `_permute_suite` of 4 configs (3D rotation, 4D middle-dim transpose, 2D transpose, full 4D shuffle) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/permute/test_permute.py` (`PermuteModule` + `CONFIGS` + `_op_delegated` smoke test) and the `aten.permute_copy.default` partitioner-allowlist entry in `tester.py`. ghstack-source-id: 397026550 @exported-using-ghexport Differential Revision: [D108793156](https://our.internmc.facebook.com/intern/diff/D108793156/) --- backends/webgpu/test/op_tests/cases.py | 17 ++++++ backends/webgpu/test/ops/test_permute.py | 76 ++++++++++++++++++++++++ backends/webgpu/test/tester.py | 1 + 3 files changed, 94 insertions(+) create mode 100644 backends/webgpu/test/ops/test_permute.py diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index 4422dfa08f6..f1ab0c6cca4 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -40,6 +40,10 @@ CONFIGS as _MUL_CONFIGS, MulModule, ) +from executorch.backends.webgpu.test.ops.test_permute import ( + CONFIGS as _PERMUTE_CONFIGS, + PermuteModule, +) from executorch.backends.webgpu.test.ops.test_select import ( CONFIGS as _SELECT_CONFIGS, SelectModule, @@ -230,3 +234,16 @@ def _unsqueeze_suite() -> WebGPUTestSuite: @register_op_test("slice") def _slice_suite() -> WebGPUTestSuite: return _fn_config_suite(SliceModule, _SLICE_CONFIGS) + + +@register_op_test("permute") +def _permute_suite() -> WebGPUTestSuite: + # CONFIGS: name -> (shape, perm-tuple). + return WebGPUTestSuite( + module_factory=lambda perm: PermuteModule(perm), + cases=[ + Case(name=n, construct={"perm": perm}, inputs=(shape,)) + for n, (shape, perm) in _PERMUTE_CONFIGS.items() + ], + golden_dtype="float32", # permutation reorders values; fp64 bit-identical + ) diff --git a/backends/webgpu/test/ops/test_permute.py b/backends/webgpu/test/ops/test_permute.py new file mode 100644 index 00000000000..ef194c58db8 --- /dev/null +++ b/backends/webgpu/test/ops/test_permute.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.permute_copy.default` module + configs for the WebGPU op-test framework. + +`PermuteModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `PermuteTest` is the export-delegation smoke +test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, perm) +CONFIGS = { + "rot3d": ((2, 3, 4), (2, 0, 1)), + "mid4d": ((1, 8, 4, 16), (0, 2, 1, 3)), + "t2d": ((3, 5), (1, 0)), + "shuffle4d": ((2, 3, 4, 5), (3, 1, 0, 2)), +} + + +class PermuteModule(torch.nn.Module): + def __init__(self, perm): + super().__init__() + self.perm = perm + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.permute(x, self.perm).contiguous() + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _lower(perm, x: torch.Tensor): + ep = torch.export.export(PermuteModule(perm).eval(), (x,)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a top-level CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class PermuteTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, perm) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(perm, _det_input(shape)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (permute {name})", + ) + self.assertTrue( + _op_delegated(edge, "permute"), + f"permute not delegated (fell back to CPU) for {name}", + ) diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index 5668a783e21..b7f38246f8c 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -28,6 +28,7 @@ exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.unsqueeze_copy.default, exir_ops.edge.aten.slice_copy.Tensor, + exir_ops.edge.aten.permute_copy.default, ]