From 2ea4836c1c9b0f8a5e2a9a508612a6610f926886 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:19 -0700
Subject: [PATCH 1/4] [ExecuTorch][WebGPU] Add slice_copy op
 (aten.slice_copy.Tensor)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20394

Adds `aten.slice_copy.Tensor` to the WebGPU delegate as a gather: each output element is mapped back to its source input element along the sliced dim via `start + coord * step`.

Composition (single compute dispatch):
- `runtime/ops/slice/Slice.cpp` — reads `args = [self, dim, start, end, step, out]` via `read_scalar` (static `Int`/`Null`-sentinel default; throws on dynamic `SymInt`); normalizes negative `dim`/`start`, clamps `start` to `[0, in_size]`; builds two `TensorMeta` UBOs + a `SliceParams{dim, start, step}` uniform; guards fp32; dispatches over `compute_1d_workgroup_count(out.numel)` with `override wg_size`; releases all uniforms after the bind group.
- `runtime/ops/slice/slice.wgsl` — delinearizes the output index over the contiguous output strides, maps the sliced-dim coordinate back to the input (`start + coord*step`), relinearizes over the input strides.
ghstack-source-id: 397026527
@exported-using-ghexport

Differential Revision: [D108793168](https://our.internmc.facebook.com/intern/diff/D108793168/)
---
 backends/webgpu/CMakeLists.txt                |   1 +
 backends/webgpu/runtime/ops/slice/Slice.cpp   | 194 ++++++++++++++++++
 backends/webgpu/runtime/ops/slice/slice.wgsl  |  42 ++++
 .../webgpu/runtime/ops/slice/slice_wgsl.h     |  66 ++++++
 4 files changed, 303 insertions(+)
 create mode 100644 backends/webgpu/runtime/ops/slice/Slice.cpp
 create mode 100644 backends/webgpu/runtime/ops/slice/slice.wgsl
 create mode 100644 backends/webgpu/runtime/ops/slice/slice_wgsl.h
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 01bb5236a44..25b7bb565dc 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -47,6 +47,7 @@ set(WEBGPU_SRCS
     runtime/ops/sigmoid/UnaryOp.cpp
     runtime/ops/squeeze/Squeeze.cpp
     runtime/ops/unsqueeze/Unsqueeze.cpp
+    runtime/ops/slice/Slice.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/ops/slice/Slice.cpp b/backends/webgpu/runtime/ops/slice/Slice.cpp
new file mode 100644
index 00000000000..1d4406bbd1a
--- /dev/null
+++ b/backends/webgpu/runtime/ops/slice/Slice.cpp
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/TensorMeta.h>
+#include <executorch/backends/webgpu/runtime/ops/slice/slice_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+struct SliceParams {
+  uint32_t dim;
+  uint32_t start;
+  uint32_t step;
+  uint32_t _pad;
+};
+
+// Read scalar arg: Int->value (INT64_MAX->default), Null->default, else throw.
+int64_t
+read_scalar(WebGPUGraph& graph, int id, int64_t dflt, const char* what) {
+  switch (graph.get_value_type(id)) {
+    case WebGPUGraph::ValueType::Int: {
+      const int64_t v = graph.get_int(id);
+      return v == INT64_MAX ? dflt : v;
+    }
+    case WebGPUGraph::ValueType::Null:
+      return dflt;
+    default:
+      throw std::runtime_error(
+          std::string("slice: dynamic/unsupported ") + what);
+  }
+}
+
+void slice_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dim, start, end, step, out]; end unread (out shape is AOT).
+  const int in_id = args.at(0);
+  const int out_id = args.at(5);
+
+  WGPUDevice device = graph.device();
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+
+  const int in_ndim = static_cast<int>(in_tensor.dims.size());
+  int64_t dim = read_scalar(graph, args.at(1), 0, "dim");
+  if (dim < 0) {
+    dim += in_ndim;
+  }
+  if (dim < 0 || dim >= in_ndim) {
+    throw std::runtime_error("slice: dim out of range");
+  }
+  const int64_t in_size = in_tensor.dims[dim];
+  int64_t start = read_scalar(graph, args.at(2), 0, "start");
+  if (start < 0) {
+    start += in_size;
+  }
+  // Clamp start to [0, in_size] (guards the gather offset; out size is AOT).
+  start = start < 0 ? 0 : (start > in_size ? in_size : start);
+  const int64_t step = read_scalar(graph, args.at(4), 1, "step");
+  if (step < 1) {
+    throw std::runtime_error("slice: step must be >= 1");
+  }
+
+  TensorMeta out_meta;
+  TensorMeta in_meta;
+  fill_tensor_meta(out_tensor, &out_meta);
+  fill_tensor_meta(in_tensor, &in_meta);
+  if (out_tensor.nbytes !=
+          static_cast<size_t>(out_meta.numel) * sizeof(float) ||
+      in_tensor.nbytes != static_cast<size_t>(in_meta.numel) * sizeof(float)) {
+    throw std::runtime_error("slice: non-fp32 operand (nbytes != numel * 4)");
+  }
+
+  SliceParams params = {};
+  params.dim = static_cast<uint32_t>(dim);
+  params.start = static_cast<uint32_t>(start);
+  params.step = static_cast<uint32_t>(step);
+
+  uint32_t wg_size = utils::clamp_workgroup_size(device, kSliceWorkgroupSizeX);
+  uint32_t workgroup_count = utils::compute_1d_workgroup_count(
+      device, out_meta.numel, wg_size, "slice");
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  WGPUBuffer out_meta_buf =
+      utils::make_uniform(device, &out_meta, sizeof(TensorMeta));
+  WGPUBuffer in_meta_buf =
+      utils::make_uniform(device, &in_meta, sizeof(TensorMeta));
+  WGPUBuffer params_buf =
+      utils::make_uniform(device, &params, sizeof(SliceParams));
+  graph.add_uniform_buffer_bytes(2 * sizeof(TensorMeta) + sizeof(SliceParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kSliceWGSL, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms).
+  WGPUBindGroupLayoutEntry entries[5] = {};
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[3].binding = 3;
+  entries[3].visibility = WGPUShaderStage_Compute;
+  entries[3].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[4].binding = 4;
+  entries[4].visibility = WGPUShaderStage_Compute;
+  entries[4].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 5;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[5] = {};
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = in_tensor.buffer;
+  bg_entries[0].size = in_tensor.nbytes;
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = out_meta_buf;
+  bg_entries[2].size = sizeof(TensorMeta);
+  bg_entries[3].binding = 3;
+  bg_entries[3].buffer = in_meta_buf;
+  bg_entries[3].size = sizeof(TensorMeta);
+  bg_entries[4].binding = 4;
+  bg_entries[4].buffer = params_buf;
+  bg_entries[4].size = sizeof(SliceParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 5;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our refs; the bind group keeps the uniforms alive until release.
+  wgpuBufferRelease(out_meta_buf);
+  wgpuBufferRelease(in_meta_buf);
+  wgpuBufferRelease(params_buf);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.slice_copy.Tensor, slice_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/slice/slice.wgsl b/backends/webgpu/runtime/ops/slice/slice.wgsl
new file mode 100644
index 00000000000..7ed718b2ca1
--- /dev/null
+++ b/backends/webgpu/runtime/ops/slice/slice.wgsl
@@ -0,0 +1,42 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  dim: u32,
+  start: u32,
+  step: u32,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out_bufi -> in_bufi, sliced dim coord = start + coord*step.
+    var rem = out_bufi;
+    var in_bufi: u32 = 0u;
+    for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) {
+        let coord = rem / out_meta.strides[d];
+        rem = rem % out_meta.strides[d];
+        var in_coord = coord;
+        if (d == params.dim) {
+            in_coord = params.start + coord * params.step;
+        }
+        in_bufi = in_bufi + in_coord * in_meta.strides[d];
+    }
+    output[out_bufi] = input[in_bufi];
+}
diff --git a/backends/webgpu/runtime/ops/slice/slice_wgsl.h b/backends/webgpu/runtime/ops/slice/slice_wgsl.h
new file mode 100644
index 00000000000..52693f3665b
--- /dev/null
+++ b/backends/webgpu/runtime/ops/slice/slice_wgsl.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from slice.wgsl - DO NOT EDIT.
+// wgsl-sha256: 6639d985420d43a67de0847749918ab6216e0785399bdcae737d49b81c773526
+inline constexpr const char* kSliceWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  dim: u32,
+  start: u32,
+  step: u32,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out_bufi -> in_bufi, sliced dim coord = start + coord*step.
+    var rem = out_bufi;
+    var in_bufi: u32 = 0u;
+    for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) {
+        let coord = rem / out_meta.strides[d];
+        rem = rem % out_meta.strides[d];
+        var in_coord = coord;
+        if (d == params.dim) {
+            in_coord = params.start + coord * params.step;
+        }
+        in_bufi = in_bufi + in_coord * in_meta.strides[d];
+    }
+    output[out_bufi] = input[in_bufi];
+}
+)";
+
+inline constexpr uint32_t kSliceWorkgroupSizeX = 64;
+inline constexpr uint32_t kSliceWorkgroupSizeY = 1;
+inline constexpr uint32_t kSliceWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu

From b61512e512d9dfdae74b8efa74604f69a189b23a Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:19 -0700
Subject: [PATCH 2/4] [ExecuTorch][WebGPU] slice_copy op test suite (cases.py
 op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20395

Registers `aten.slice_copy.Tensor` in the `cases.py` op-test framework: a `_slice_suite` of 4 configs (leading-dim slice `[:,1:5]`, last-dim slice `[...,1:3]`, step-2 `[:,0:8:2]`, negative-end `[:,1:-1]`) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/slice/test_slice.py` (`SliceModule` + `CONFIGS` + export-delegation/eager smoke test) and the `aten.slice_copy.Tensor` partitioner-allowlist entry in `tester.py`.
ghstack-source-id: 397026537
@exported-using-ghexport

Differential Revision: [D108793151](https://our.internmc.facebook.com/intern/diff/D108793151/)
---
 backends/webgpu/test/op_tests/cases.py | 10 ++++
 backends/webgpu/test/ops/test_slice.py | 66 ++++++++++++++++++++++++++
 backends/webgpu/test/tester.py         |  1 +
 3 files changed, 77 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_slice.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index 0db8685fa18..4422dfa08f6 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -50,6 +50,11 @@
     SigmoidModule,
 )
 
+from executorch.backends.webgpu.test.ops.test_slice import (
+    CONFIGS as _SLICE_CONFIGS,
+    SliceModule,
+)
+
 from executorch.backends.webgpu.test.ops.test_squeeze import (
     CONFIGS as _SQUEEZE_CONFIGS,
     SqueezeModule,
@@ -220,3 +225,8 @@ def _unsqueeze_suite() -> WebGPUTestSuite:
         ],
         golden_dtype="float32",  # reshape copies values; fp64 bit-identical
     )
+
+
+@register_op_test("slice")
+def _slice_suite() -> WebGPUTestSuite:
+    return _fn_config_suite(SliceModule, _SLICE_CONFIGS)
diff --git a/backends/webgpu/test/ops/test_slice.py b/backends/webgpu/test/ops/test_slice.py
new file mode 100644
index 00000000000..416f880cd42
--- /dev/null
+++ b/backends/webgpu/test/ops/test_slice.py
@@ -0,0 +1,66 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.slice_copy.Tensor` module + configs for the WebGPU op-test framework.
+
+`SliceModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `SliceTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, slice_fn)
+CONFIGS = {
+    "dim1_1_5": ((3, 8, 4), lambda x: x[:, 1:5]),
+    "lastdim": ((3, 8, 4), lambda x: x[..., 1:3]),
+    "step2": ((3, 8, 4), lambda x: x[:, 0:8:2]),
+    "neg_end": ((3, 8, 4), lambda x: x[:, 1:-1]),
+}
+
+
+class SliceModule(torch.nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fn(x)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _export(fn, x: torch.Tensor):
+    ep = torch.export.export(SliceModule(fn).eval(), (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+class SliceTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, fn) in CONFIGS.items():
+            with self.subTest(name=name):
+                et = _export(fn, _det_input(shape))
+                self.assertTrue(
+                    _delegated(et), f"Expected a VulkanBackend delegate (slice {name})"
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index 53a745a16df..5668a783e21 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -27,6 +27,7 @@
     exir_ops.edge.aten.sigmoid.default,
     exir_ops.edge.aten.squeeze_copy.dims,
     exir_ops.edge.aten.unsqueeze_copy.default,
+    exir_ops.edge.aten.slice_copy.Tensor,
 ]
 
 

From 09eb42cdb84eeff77472b88496865668a4b6fafd Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:20 -0700
Subject: [PATCH 3/4] [ExecuTorch][WebGPU] Add permute_copy + IntList graph
 support (aten.permute_copy.default)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20396

Adds `aten.permute_copy.default` (a coordinate-reorder gather) to the WebGPU delegate, and the `IntList` graph value type it needs to read its `dims` argument.

Composition:
- `runtime/WebGPUGraph.{h,cpp}` — adds `ValueType::IntList` backed by `std::vector<std::vector<int64_t>> int_lists_` + `get_int_list(int)`; `build()` deserializes `vkgraph::GraphTypes::IntList` via `value_as_IntList()->items()` (int64, matching the FlatBuffer `[long]`); mirrors the existing scalar value plumbing.
- `runtime/ops/permute/Permute.cpp` — reads the permutation via `get_int_list`, normalizes negative dims, validates it is a permutation of `[0, ndim)`, builds two `TensorMeta` UBOs + a `PermuteParams{perm: vec4<u32>}` uniform, guards fp32 + rank≤4, dispatches over `compute_1d_workgroup_count(out.numel)` with `override wg_size`; releases all uniforms after the bind group.
- `runtime/ops/permute/permute.wgsl` — delinearizes the output index over the contiguous output strides, reads `input` at `in.strides[perm[d]]` per dim (mirrors Vulkan `permute_buffer.glsl`).
- Registers both `aten.permute_copy.default` and `aten.permute.default` to the same handler.
ghstack-source-id: 397026548
@exported-using-ghexport

Differential Revision: [D108793162](https://our.internmc.facebook.com/intern/diff/D108793162/)
---
 backends/webgpu/CMakeLists.txt                |   1 +
 backends/webgpu/runtime/WebGPUGraph.cpp       |   9 +
 backends/webgpu/runtime/WebGPUGraph.h         |   9 +-
 .../webgpu/runtime/ops/permute/Permute.cpp    | 197 ++++++++++++++++++
 .../webgpu/runtime/ops/permute/permute.wgsl   |  36 ++++
 .../webgpu/runtime/ops/permute/permute_wgsl.h |  60 ++++++
 6 files changed, 311 insertions(+), 1 deletion(-)
 create mode 100644 backends/webgpu/runtime/ops/permute/Permute.cpp
 create mode 100644 backends/webgpu/runtime/ops/permute/permute.wgsl
 create mode 100644 backends/webgpu/runtime/ops/permute/permute_wgsl.h

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 25b7bb565dc..c24ffb81ece 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -48,6 +48,7 @@ set(WEBGPU_SRCS
     runtime/ops/squeeze/Squeeze.cpp
     runtime/ops/unsqueeze/Unsqueeze.cpp
     runtime/ops/slice/Slice.cpp
+    runtime/ops/permute/Permute.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 0e00b2cb42b..aeaf42f1390 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -245,6 +245,7 @@ void WebGPUGraph::build(
   tensors_.resize(num_vals);
   tensor_mem_obj_ids_.resize(num_vals, -1);
   ints_.resize(num_vals, 0);
+  int_lists_.resize(num_vals);
   doubles_.resize(num_vals, 0.0);
   bools_.resize(num_vals, false);
   value_lists_.resize(num_vals);
@@ -375,6 +376,14 @@ void WebGPUGraph::build(
         ints_[i] = val->value_as_Int()->int_val();
         break;
       }
+      case vkgraph::GraphTypes::IntList: {
+        value_types_[i] = ValueType::IntList;
+        const auto* items = val->value_as_IntList()->items();
+        if (items) {
+          int_lists_[i].assign(items->cbegin(), items->cend());
+        }
+        break;
+      }
       case vkgraph::GraphTypes::Double: {
         value_types_[i] = ValueType::Double;
         doubles_[i] = val->value_as_Double()->double_val();
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index b9326cf016c..755ea7503be 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -131,6 +131,11 @@ class WebGPUGraph {
   int64_t get_int(int id) const {
     return ints_[id];
   }
+  // Int values of a serialized IntList (e.g. permute dims). int64 (FlatBuffer
+  // [long]) to match the schema and the get_int convention.
+  const std::vector<int64_t>& get_int_list(int id) const {
+    return int_lists_[id];
+  }
   bool get_bool(int id) const {
     return bools_[id];
   }
@@ -258,7 +263,8 @@ class WebGPUGraph {
     Null,
     String,
     SymInt,
-    ValueList
+    ValueList,
+    IntList
   };
 
   ValueType get_value_type(int id) const {
@@ -275,6 +281,7 @@ class WebGPUGraph {
   std::vector<ValueType> value_types_;
   std::vector<WebGPUTensor> tensors_;
   std::vector<int64_t> ints_;
+  std::vector<std::vector<int64_t>> int_lists_;
   std::vector<double> doubles_;
   std::vector<bool> bools_;
   std::vector<std::vector<int>> value_lists_;
diff --git a/backends/webgpu/runtime/ops/permute/Permute.cpp b/backends/webgpu/runtime/ops/permute/Permute.cpp
new file mode 100644
index 00000000000..5062c33cec1
--- /dev/null
+++ b/backends/webgpu/runtime/ops/permute/Permute.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/TensorMeta.h>
+#include <executorch/backends/webgpu/runtime/ops/permute/permute_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+struct PermuteParams {
+  uint32_t perm[kTensorMetaMaxNdim];
+};
+static_assert(
+    sizeof(PermuteParams) == 16,
+    "PermuteParams must match the WGSL Params vec4<u32> (16 bytes)");
+
+// permute: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl, NCHW).
+void permute_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dims, out]; out is the last value-id.
+  const int in_id = args.at(0);
+  const int dims_id = args.at(1);
+  const int out_id = args.at(args.size() - 1);
+
+  if (graph.get_value_type(in_id) != WebGPUGraph::ValueType::Tensor ||
+      graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("permute: in/out arg is not a tensor");
+  }
+  if (graph.get_value_type(dims_id) != WebGPUGraph::ValueType::IntList) {
+    throw std::runtime_error("permute: dims arg is not an IntList");
+  }
+
+  WGPUDevice device = graph.device();
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  const int ndim = static_cast<int>(in_tensor.dims.size());
+
+  const std::vector<int64_t>& dims = graph.get_int_list(dims_id);
+  if (static_cast<int>(dims.size()) != ndim ||
+      static_cast<int>(out_tensor.dims.size()) != ndim) {
+    throw std::runtime_error("permute: perm length != input/output rank");
+  }
+
+  // Normalize negative dims and verify perm is a permutation of [0, ndim).
+  uint32_t perm[kTensorMetaMaxNdim];
+  bool seen[kTensorMetaMaxNdim] = {};
+  if (ndim > static_cast<int>(kTensorMetaMaxNdim)) {
+    throw std::runtime_error("permute: tensor rank exceeds 4 (MAX_NDIM)");
+  }
+  for (int d = 0; d < ndim; d++) {
+    int64_t p = dims[d];
+    if (p < 0) {
+      p += ndim;
+    }
+    if (p < 0 || p >= ndim || seen[p]) {
+      throw std::runtime_error("permute: dims is not a valid permutation");
+    }
+    seen[p] = true;
+    perm[d] = static_cast<uint32_t>(p);
+  }
+  for (int d = ndim; d < static_cast<int>(kTensorMetaMaxNdim); d++) {
+    perm[d] = static_cast<uint32_t>(d);
+  }
+
+  TensorMeta out_meta;
+  TensorMeta in_meta;
+  fill_tensor_meta(out_tensor, &out_meta);
+  fill_tensor_meta(in_tensor, &in_meta);
+  if (out_tensor.nbytes !=
+          static_cast<size_t>(out_meta.numel) * sizeof(float) ||
+      in_tensor.nbytes != static_cast<size_t>(in_meta.numel) * sizeof(float)) {
+    throw std::runtime_error("permute: non-fp32 operand (nbytes != numel * 4)");
+  }
+
+  PermuteParams params = {};
+  std::memcpy(params.perm, perm, sizeof(perm));
+
+  uint32_t wg_size =
+      utils::clamp_workgroup_size(device, kPermuteWorkgroupSizeX);
+  uint32_t workgroup_count = utils::compute_1d_workgroup_count(
+      device, out_meta.numel, wg_size, "permute");
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  WGPUBuffer out_meta_buf =
+      utils::make_uniform(device, &out_meta, sizeof(TensorMeta));
+  WGPUBuffer in_meta_buf =
+      utils::make_uniform(device, &in_meta, sizeof(TensorMeta));
+  WGPUBuffer params_buf =
+      utils::make_uniform(device, &params, sizeof(PermuteParams));
+  graph.add_uniform_buffer_bytes(
+      2 * sizeof(TensorMeta) + sizeof(PermuteParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kPermuteWGSL, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms).
+  WGPUBindGroupLayoutEntry entries[5] = {};
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[3].binding = 3;
+  entries[3].visibility = WGPUShaderStage_Compute;
+  entries[3].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[4].binding = 4;
+  entries[4].visibility = WGPUShaderStage_Compute;
+  entries[4].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 5;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[5] = {};
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = in_tensor.buffer;
+  bg_entries[0].size = in_tensor.nbytes;
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = out_meta_buf;
+  bg_entries[2].size = sizeof(TensorMeta);
+  bg_entries[3].binding = 3;
+  bg_entries[3].buffer = in_meta_buf;
+  bg_entries[3].size = sizeof(TensorMeta);
+  bg_entries[4].binding = 4;
+  bg_entries[4].buffer = params_buf;
+  bg_entries[4].size = sizeof(PermuteParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 5;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our refs; the bind group keeps the uniforms alive until release.
+  wgpuBufferRelease(out_meta_buf);
+  wgpuBufferRelease(in_meta_buf);
+  wgpuBufferRelease(params_buf);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.permute_copy.default, permute_impl);
+  WEBGPU_REGISTER_OP(aten.permute.default, permute_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/permute/permute.wgsl b/backends/webgpu/runtime/ops/permute/permute.wgsl
new file mode 100644
index 00000000000..521cfac1e66
--- /dev/null
+++ b/backends/webgpu/runtime/ops/permute/permute.wgsl
@@ -0,0 +1,36 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  perm: vec4<u32>,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl).
+    var rem = out_bufi;
+    var in_bufi: u32 = 0u;
+    for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) {
+        let coord = rem / out_meta.strides[d];
+        rem = rem % out_meta.strides[d];
+        in_bufi = in_bufi + coord * in_meta.strides[params.perm[d]];
+    }
+    output[out_bufi] = input[in_bufi];
+}
diff --git a/backends/webgpu/runtime/ops/permute/permute_wgsl.h b/backends/webgpu/runtime/ops/permute/permute_wgsl.h
new file mode 100644
index 00000000000..6ec41cc8446
--- /dev/null
+++ b/backends/webgpu/runtime/ops/permute/permute_wgsl.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from permute.wgsl - DO NOT EDIT.
+// wgsl-sha256: d34f59730cda7317589b6ed5691a1ccab8666b9c94e17ac2cb3658b036300197
+inline constexpr const char* kPermuteWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  perm: vec4<u32>,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out coord d -> in coord perm[d] (Vulkan permute_buffer.glsl).
+    var rem = out_bufi;
+    var in_bufi: u32 = 0u;
+    for (var d: u32 = 0u; d < out_meta.ndim; d = d + 1u) {
+        let coord = rem / out_meta.strides[d];
+        rem = rem % out_meta.strides[d];
+        in_bufi = in_bufi + coord * in_meta.strides[params.perm[d]];
+    }
+    output[out_bufi] = input[in_bufi];
+}
+)";
+
+inline constexpr uint32_t kPermuteWorkgroupSizeX = 64;
+inline constexpr uint32_t kPermuteWorkgroupSizeY = 1;
+inline constexpr uint32_t kPermuteWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu

From cdbc3020f4db3caaec9f39e554537840013aa6e5 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:20 -0700
Subject: [PATCH 4/4] [ExecuTorch][WebGPU] permute_copy op test suite (cases.py
 op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20397

Registers `aten.permute_copy.default` in the `cases.py` op-test framework: a `_permute_suite` of 4 configs (3D rotation, 4D middle-dim transpose, 2D transpose, full 4D shuffle) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/permute/test_permute.py` (`PermuteModule` + `CONFIGS` + `_op_delegated` smoke test) and the `aten.permute_copy.default` partitioner-allowlist entry in `tester.py`.
ghstack-source-id: 397026550
@exported-using-ghexport

Differential Revision: [D108793156](https://our.internmc.facebook.com/intern/diff/D108793156/)
---
 backends/webgpu/test/op_tests/cases.py   | 17 ++++++
 backends/webgpu/test/ops/test_permute.py | 76 ++++++++++++++++++++++++
 backends/webgpu/test/tester.py           |  1 +
 3 files changed, 94 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_permute.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index 4422dfa08f6..f1ab0c6cca4 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -40,6 +40,10 @@
     CONFIGS as _MUL_CONFIGS,
     MulModule,
 )
+from executorch.backends.webgpu.test.ops.test_permute import (
+    CONFIGS as _PERMUTE_CONFIGS,
+    PermuteModule,
+)
 from executorch.backends.webgpu.test.ops.test_select import (
     CONFIGS as _SELECT_CONFIGS,
     SelectModule,
@@ -230,3 +234,16 @@ def _unsqueeze_suite() -> WebGPUTestSuite:
 @register_op_test("slice")
 def _slice_suite() -> WebGPUTestSuite:
     return _fn_config_suite(SliceModule, _SLICE_CONFIGS)
+
+
+@register_op_test("permute")
+def _permute_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (shape, perm-tuple).
+    return WebGPUTestSuite(
+        module_factory=lambda perm: PermuteModule(perm),
+        cases=[
+            Case(name=n, construct={"perm": perm}, inputs=(shape,))
+            for n, (shape, perm) in _PERMUTE_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # permutation reorders values; fp64 bit-identical
+    )
diff --git a/backends/webgpu/test/ops/test_permute.py b/backends/webgpu/test/ops/test_permute.py
new file mode 100644
index 00000000000..ef194c58db8
--- /dev/null
+++ b/backends/webgpu/test/ops/test_permute.py
@@ -0,0 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.permute_copy.default` module + configs for the WebGPU op-test framework.
+
+`PermuteModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `PermuteTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, perm)
+CONFIGS = {
+    "rot3d": ((2, 3, 4), (2, 0, 1)),
+    "mid4d": ((1, 8, 4, 16), (0, 2, 1, 3)),
+    "t2d": ((3, 5), (1, 0)),
+    "shuffle4d": ((2, 3, 4, 5), (3, 1, 0, 2)),
+}
+
+
+class PermuteModule(torch.nn.Module):
+    def __init__(self, perm):
+        super().__init__()
+        self.perm = perm
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.permute(x, self.perm).contiguous()
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _lower(perm, x: torch.Tensor):
+    ep = torch.export.export(PermuteModule(perm).eval(), (x,))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class PermuteTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, perm) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(perm, _det_input(shape))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (permute {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "permute"),
+                    f"permute not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index 5668a783e21..b7f38246f8c 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -28,6 +28,7 @@
     exir_ops.edge.aten.squeeze_copy.dims,
     exir_ops.edge.aten.unsqueeze_copy.default,
     exir_ops.edge.aten.slice_copy.Tensor,
+    exir_ops.edge.aten.permute_copy.default,
 ]