From 476353a07b3a1057cd19af521a4682997acfc322 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:14 -0700
Subject: [PATCH 1/8] [ExecuTorch][WebGPU] Add view_copy op
 (aten.view_copy.default)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20360

**Add `aten.view_copy.default` as a native buffer-to-buffer DMA** — a contiguous reshape on the dense row-major buffer backend is a flat copy, so it needs no shader.

**Problem:** a reshape only relabels shape metadata; the bytes are unchanged. Launching a compute dispatch (shader module + pipeline + bind group + uniform) just to run `output[i] = input[i]` is wasted setup for every view/clone/squeeze/unsqueeze in the graph.

**Solution:**
- Before: a `view_copy.wgsl` compute kernel dispatched over `num_elements`, with its own pipeline/bind-group/uniform per copy.
- After: `add_flat_copy` records a `wgpuCommandEncoderCopyBufferToBuffer` DMA in graph order — no shader, no pipeline, no uniform.

**Implementation:**
- `WebGPUGraph` gains a `WebGPUDispatch::Kind::Copy` command + `add_buffer_copy(src, dst, nbytes)`; `execute()` emits the encoder-level copy between compute passes (both single-shot and chunked paths), preserving the existing per-pass read-after-write ordering.
- `add_flat_copy` (declared in `view_copy.h`, reused by the stacked clone/squeeze/unsqueeze) keeps the fail-loud guards (both tensors, fp32 4-byte alignment, equal `nbytes`) and treats an aliased in/out buffer as a no-op.
- Tensor buffers already carry `CopySrc | CopyDst`, so no usage-flag change is needed.
- Mirrors Vulkan `add_view_copy_node` (`backends/vulkan/runtime/graph/ops/impl/View.cpp`): Vulkan always dispatches `view_buffer.glsl` only to remap non-contiguous layouts, which the buffer-only WebGPU backend never produces — so the contiguous DMA is the equivalent path.
ghstack-source-id: 397026498
@exported-using-ghexport

Differential Revision: [D108793164](https://our.internmc.facebook.com/intern/diff/D108793164/)
---
 backends/webgpu/CMakeLists.txt                |  1 +
 backends/webgpu/runtime/WebGPUGraph.cpp       | 20 ++++++
 backends/webgpu/runtime/WebGPUGraph.h         | 17 +++++
 .../webgpu/runtime/ops/view_copy/ViewCopy.cpp | 62 +++++++++++++++++++
 .../webgpu/runtime/ops/view_copy/view_copy.h  | 18 ++++++
 5 files changed, 118 insertions(+)
 create mode 100644 backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
 create mode 100644 backends/webgpu/runtime/ops/view_copy/view_copy.h

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index f7cd85f9758..88c7b1f73a3 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -42,6 +42,7 @@ set(WEBGPU_SRCS
     runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
     runtime/ops/rope/RotaryEmbedding.cpp
     runtime/ops/prepack/Prepack.cpp
+    runtime/ops/view_copy/ViewCopy.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index b7fb4313400..0e00b2cb42b 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -679,6 +679,16 @@ void WebGPUGraph::execute() {
     // One pass per dispatch: enforces storage RAW ordering across deps.
     for (size_t i = 0; i < n; i++) {
       const auto& dispatch = dispatches_[i];
+      if (dispatch.kind == WebGPUDispatch::Kind::Copy) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder,
+            dispatch.copy_src,
+            0,
+            dispatch.copy_dst,
+            0,
+            dispatch.copy_nbytes);
+        continue;
+      }
       WGPUComputePassDescriptor pass_desc = {};
 #ifdef WGPU_BACKEND_ENABLE_PROFILING
       // tw must outlive BeginComputePass (the descriptor points at it).
@@ -757,6 +767,16 @@ void WebGPUGraph::execute() {
         wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
 
     for (size_t i = start; i < end; i++) {
+      if (dispatches_[i].kind == WebGPUDispatch::Kind::Copy) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder,
+            dispatches_[i].copy_src,
+            0,
+            dispatches_[i].copy_dst,
+            0,
+            dispatches_[i].copy_nbytes);
+        continue;
+      }
       WGPUComputePassDescriptor pass_desc = {};
       WGPUComputePassEncoder pass =
           wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 3572f751a06..b9326cf016c 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -42,6 +42,12 @@ struct WebGPUDispatch {
   WGPUBindGroup bind_group = nullptr;
   uint32_t workgroup_count_x = 1;
   std::string kernel_name; // bench label
+  // DMA copy command; default Compute keeps existing positional inits valid.
+  enum class Kind { Compute, Copy };
+  Kind kind = Kind::Compute;
+  WGPUBuffer copy_src = nullptr;
+  WGPUBuffer copy_dst = nullptr;
+  size_t copy_nbytes = 0;
 };
 
 struct OutputCopy {
@@ -189,6 +195,17 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Record an in-graph-order buffer-to-buffer DMA (e.g. a flat copy).
+  void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
+    WebGPUDispatch d;
+    d.kind = WebGPUDispatch::Kind::Copy;
+    d.copy_src = src;
+    d.copy_dst = dst;
+    d.copy_nbytes = nbytes;
+    d.kernel_name = "flat_copy";
+    dispatches_.push_back(d);
+  }
+
   // Materialize a recorded prepack-routed constant into dst via one CPU->GPU
   // transfer. Build-time only (the .pte bytes are freed after build()).
   // Mirrors Vulkan prepack_standard.
diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
new file mode 100644
index 00000000000..d56c721ce3e
--- /dev/null
+++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id) {
+  // get_tensor doesn't type-check; assert both args are tensors (fail loud).
+  if (graph.get_value_type(in_id) != WebGPUGraph::ValueType::Tensor ||
+      graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("flat_copy: in/out arg is not a tensor");
+  }
+
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  // Contiguous reshape = flat byte copy; mirrors Vulkan view_buffer (no-remap).
+
+  // 4-byte alignment guard (fp32 element size); does not verify dtype.
+  if (in_tensor.nbytes % sizeof(float) != 0 ||
+      out_tensor.nbytes % sizeof(float) != 0) {
+    throw std::runtime_error("flat_copy: operand not 4-byte aligned");
+  }
+
+  // view preserves numel; this guard also prevents an OOB copy.
+  if (in_tensor.nbytes != out_tensor.nbytes) {
+    throw std::runtime_error("flat_copy: input/output size mismatch");
+  }
+
+  // Aliased in/out already in place; CopyBufferToBuffer rejects src == dst.
+  if (in_tensor.buffer == out_tensor.buffer) {
+    return;
+  }
+
+  graph.add_buffer_copy(in_tensor.buffer, out_tensor.buffer, out_tensor.nbytes);
+}
+
+namespace {
+
+// view_copy = contiguous reshape = flat copy (mirrors Vulkan view_buffer).
+void view_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, size, out]; out = last value-id (shape from out_tensor.dims).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/view_copy/view_copy.h b/backends/webgpu/runtime/ops/view_copy/view_copy.h
new file mode 100644
index 00000000000..bfa81174ba9
--- /dev/null
+++ b/backends/webgpu/runtime/ops/view_copy/view_copy.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+
+namespace executorch::backends::webgpu {
+
+// Flat copy output[i]=input[i]; mirrors Vulkan add_view_copy_node (View.h).
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id);
+
+} // namespace executorch::backends::webgpu

From ffc99c6ce195f90c984de969dea0b3203e580cbd Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:15 -0700
Subject: [PATCH 2/8] [ExecuTorch][WebGPU] view_copy op test suite (cases.py
 op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20361

Registers `aten.view_copy.default` in the `cases.py` op-test framework: a `_view_copy_suite` of 3 configs (rank-reducing reshape `(2,3,4)->(6,4)`, flatten `->(-1,)`, rank-increasing reshape `->(1,2,3,4)`) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/view_copy/test_view_copy.py` (`ViewModule` + `CONFIGS` + export-delegation/eager smoke test) and the `aten.view_copy.default` partitioner-allowlist entry in `tester.py`.
ghstack-source-id: 397026501
@exported-using-ghexport

Differential Revision: [D108793155](https://our.internmc.facebook.com/intern/diff/D108793155/)
---
 backends/webgpu/test/op_tests/cases.py     | 23 ++++++++
 backends/webgpu/test/ops/test_view_copy.py | 66 ++++++++++++++++++++++
 backends/webgpu/test/tester.py             |  1 +
 3 files changed, 90 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_view_copy.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index 4d01517bedb..c0fbade9bbf 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -40,6 +40,10 @@
     CONFIGS as _MUL_CONFIGS,
     MulModule,
 )
+from executorch.backends.webgpu.test.ops.test_view_copy import (
+    CONFIGS as _VIEW_CONFIGS,
+    ViewModule,
+)
 
 # rms_norm coverage is exactly the 15 cases the native test covered.
 RMS_NORM_CASES = _CASES
@@ -121,3 +125,22 @@ def _mul_suite() -> WebGPUTestSuite:
             Case(name=name, inputs=(sa, sb)) for name, (sa, sb) in _MUL_CONFIGS.items()
         ],
     )
+
+
+def _fn_config_suite(module_cls, configs) -> WebGPUTestSuite:
+    """Builder for ops whose per-case spec is a (shape, fn) pair (view/select/slice).
+    The fn is a `construct` kwarg baked into the .pte module, never a serialized input.
+    """
+    return WebGPUTestSuite(
+        module_factory=lambda fn: module_cls(fn),
+        cases=[
+            Case(name=n, construct={"fn": fn}, inputs=(shape,))
+            for n, (shape, fn) in configs.items()
+        ],
+        golden_dtype="float32",  # gather/copy: fp64 bit-identical, skip dual-oracle
+    )
+
+
+@register_op_test("view_copy")
+def _view_copy_suite() -> WebGPUTestSuite:
+    return _fn_config_suite(ViewModule, _VIEW_CONFIGS)
diff --git a/backends/webgpu/test/ops/test_view_copy.py b/backends/webgpu/test/ops/test_view_copy.py
new file mode 100644
index 00000000000..a9c28f2b631
--- /dev/null
+++ b/backends/webgpu/test/ops/test_view_copy.py
@@ -0,0 +1,66 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.view_copy.default` module + configs for the WebGPU op-test framework.
+
+`ViewModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `ViewCopyTest` is the export-delegation
+smoke test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, view_fn)
+CONFIGS = {
+    "reshape": ((2, 3, 4), lambda x: x.reshape(6, 4)),
+    "flatten": ((2, 3, 4), lambda x: x.reshape(-1)),
+    "reshape4d": ((2, 3, 4), lambda x: x.reshape(1, 2, 3, 4)),
+}
+
+
+class ViewModule(torch.nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fn(x)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _export(fn, x: torch.Tensor):
+    ep = torch.export.export(ViewModule(fn).eval(), (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+class ViewCopyTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, fn) in CONFIGS.items():
+            with self.subTest(name=name):
+                et = _export(fn, _det_input(shape))
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (view_copy {name})",
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index eafb64c6961..bb09fffb43e 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -22,6 +22,7 @@
     exir_ops.edge.aten.add.Tensor,
     exir_ops.edge.et_vk.rms_norm.default,
     exir_ops.edge.aten.mul.Tensor,
+    exir_ops.edge.aten.view_copy.default,
 ]
 
 

From 9418cd6a8b35d45fe4a1389da284221178a700e5 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:15 -0700
Subject: [PATCH 3/8] [ExecuTorch][WebGPU] Add select_copy op
 (aten.select_copy.int)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20362

Adds `aten.select_copy.int` to the WebGPU delegate as a gather: picks a fixed index along one dim, producing an output of rank (input rank - 1).

Composition (single dispatch):
- `select/Select.cpp` — reads `[self, dim, index, out]` (static `Int` via `read_scalar`; throws on dynamic `SymInt`), normalizes + bounds-checks dim/index, builds 2 `TensorMeta` UBOs + a `SelectParams{dim,index}`, fp32 guard, 1D-dispatch over `numel`, releases uniforms after the bind group.
- `select/select.wgsl` — seeds the input offset with `index * in.strides[dim]`, delinearizes the output index, maps each out dim to its in dim (shifted past the selected dim), relinearizes on input strides.
ghstack-source-id: 397026510
@exported-using-ghexport

Differential Revision: [D108793166](https://our.internmc.facebook.com/intern/diff/D108793166/)
---
 backends/webgpu/CMakeLists.txt                |   1 +
 backends/webgpu/runtime/ops/select/Select.cpp | 184 ++++++++++++++++++
 .../webgpu/runtime/ops/select/select.wgsl     |  41 ++++
 .../webgpu/runtime/ops/select/select_wgsl.h   |  65 +++++++
 4 files changed, 291 insertions(+)
 create mode 100644 backends/webgpu/runtime/ops/select/Select.cpp
 create mode 100644 backends/webgpu/runtime/ops/select/select.wgsl
 create mode 100644 backends/webgpu/runtime/ops/select/select_wgsl.h

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 88c7b1f73a3..c774fd27845 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -43,6 +43,7 @@ set(WEBGPU_SRCS
     runtime/ops/rope/RotaryEmbedding.cpp
     runtime/ops/prepack/Prepack.cpp
     runtime/ops/view_copy/ViewCopy.cpp
+    runtime/ops/select/Select.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/ops/select/Select.cpp b/backends/webgpu/runtime/ops/select/Select.cpp
new file mode 100644
index 00000000000..5686bbc79c0
--- /dev/null
+++ b/backends/webgpu/runtime/ops/select/Select.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/TensorMeta.h>
+#include <executorch/backends/webgpu/runtime/ops/select/select_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+struct SelectParams {
+  uint32_t dim;
+  uint32_t index;
+  uint32_t _pad[2];
+};
+
+// dim/index are required Ints (SymInt throws); no Null default unlike slice.
+int64_t read_scalar(WebGPUGraph& graph, int id, const char* what) {
+  if (graph.get_value_type(id) == WebGPUGraph::ValueType::Int) {
+    return graph.get_int(id);
+  }
+  throw std::runtime_error(std::string("select: dynamic/unsupported ") + what);
+}
+
+void select_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dim, index, out]; output rank = in rank - 1.
+  const int in_id = args.at(0);
+  const int out_id = args.at(3);
+
+  WGPUDevice device = graph.device();
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) {
+    throw std::runtime_error("select: null buffer binding");
+  }
+
+  const int in_ndim = static_cast<int>(in_tensor.dims.size());
+  int64_t dim = read_scalar(graph, args.at(1), "dim");
+  if (dim < 0) {
+    dim += in_ndim;
+  }
+  if (dim < 0 || dim >= in_ndim) {
+    throw std::runtime_error("select: dim out of range");
+  }
+  const int64_t in_size = in_tensor.dims[dim];
+  int64_t index = read_scalar(graph, args.at(2), "index");
+  if (index < 0) {
+    index += in_size;
+  }
+  if (index < 0 || index >= in_size) {
+    throw std::runtime_error("select: index out of range");
+  }
+
+  TensorMeta out_meta;
+  TensorMeta in_meta;
+  fill_tensor_meta(out_tensor, &out_meta);
+  fill_tensor_meta(in_tensor, &in_meta);
+  if (out_tensor.nbytes !=
+          static_cast<size_t>(out_meta.numel) * sizeof(float) ||
+      in_tensor.nbytes != static_cast<size_t>(in_meta.numel) * sizeof(float)) {
+    throw std::runtime_error("select: non-fp32 operand (nbytes != numel * 4)");
+  }
+
+  SelectParams params = {};
+  params.dim = static_cast<uint32_t>(dim);
+  params.index = static_cast<uint32_t>(index);
+
+  uint32_t wg_size = utils::clamp_workgroup_size(device, kSelectWorkgroupSizeX);
+  uint32_t workgroup_count = utils::compute_1d_workgroup_count(
+      device, out_meta.numel, wg_size, "select");
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  WGPUBuffer out_meta_buf =
+      utils::make_uniform(device, &out_meta, sizeof(TensorMeta));
+  WGPUBuffer in_meta_buf =
+      utils::make_uniform(device, &in_meta, sizeof(TensorMeta));
+  WGPUBuffer params_buf =
+      utils::make_uniform(device, &params, sizeof(SelectParams));
+  graph.add_uniform_buffer_bytes(2 * sizeof(TensorMeta) + sizeof(SelectParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kSelectWGSL, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms).
+  WGPUBindGroupLayoutEntry entries[5] = {};
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[3].binding = 3;
+  entries[3].visibility = WGPUShaderStage_Compute;
+  entries[3].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[4].binding = 4;
+  entries[4].visibility = WGPUShaderStage_Compute;
+  entries[4].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 5;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[5] = {};
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = in_tensor.buffer;
+  bg_entries[0].size = in_tensor.nbytes;
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = out_meta_buf;
+  bg_entries[2].size = sizeof(TensorMeta);
+  bg_entries[3].binding = 3;
+  bg_entries[3].buffer = in_meta_buf;
+  bg_entries[3].size = sizeof(TensorMeta);
+  bg_entries[4].binding = 4;
+  bg_entries[4].buffer = params_buf;
+  bg_entries[4].size = sizeof(SelectParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 5;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our refs; the bind group keeps the uniforms alive until release.
+  wgpuBufferRelease(out_meta_buf);
+  wgpuBufferRelease(in_meta_buf);
+  wgpuBufferRelease(params_buf);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.select_copy.int, select_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/select/select.wgsl b/backends/webgpu/runtime/ops/select/select.wgsl
new file mode 100644
index 00000000000..84938b2c840
--- /dev/null
+++ b/backends/webgpu/runtime/ops/select/select.wgsl
@@ -0,0 +1,41 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  dim: u32,
+  index: u32,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index.
+    var rem = out_bufi;
+    var in_bufi: u32 = params.index * in_meta.strides[params.dim];
+    for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) {
+        let coord = rem / out_meta.strides[od];
+        rem = rem % out_meta.strides[od];
+        var id = od;
+        if (od >= params.dim) {
+            id = od + 1u;
+        }
+        in_bufi = in_bufi + coord * in_meta.strides[id];
+    }
+    output[out_bufi] = input[in_bufi];
+}
diff --git a/backends/webgpu/runtime/ops/select/select_wgsl.h b/backends/webgpu/runtime/ops/select/select_wgsl.h
new file mode 100644
index 00000000000..e66edde240d
--- /dev/null
+++ b/backends/webgpu/runtime/ops/select/select_wgsl.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from select.wgsl - DO NOT EDIT.
+// wgsl-sha256: 200cf5a8190045aa0562e782f01c1cfaf9681f30f679f5112ccc3d347a0ed8df
+inline constexpr const char* kSelectWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  dim: u32,
+  index: u32,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index.
+    var rem = out_bufi;
+    var in_bufi: u32 = params.index * in_meta.strides[params.dim];
+    for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) {
+        let coord = rem / out_meta.strides[od];
+        rem = rem % out_meta.strides[od];
+        var id = od;
+        if (od >= params.dim) {
+            id = od + 1u;
+        }
+        in_bufi = in_bufi + coord * in_meta.strides[id];
+    }
+    output[out_bufi] = input[in_bufi];
+}
+)";
+
+inline constexpr uint32_t kSelectWorkgroupSizeX = 64;
+inline constexpr uint32_t kSelectWorkgroupSizeY = 1;
+inline constexpr uint32_t kSelectWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu

From 5f7ab1542e0097e48f3ec4ab1ebd71fb1e0b55a4 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:16 -0700
Subject: [PATCH 4/8] [ExecuTorch][WebGPU] select_copy op test suite (cases.py
 op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20363

Registers `aten.select_copy.int` in the `cases.py` op-test framework: a `_select_suite` of 4 configs (leading/middle/last dim + negative index) that `generate_op_tests` exports and compares to a torch golden on Dawn. Also adds `test/ops/select/test_select.py` (`SelectModule` + `CONFIGS` + an export-delegation/eager smoke test) and the `aten.select_copy.int` partitioner-allowlist entry in `tester.py`.
ghstack-source-id: 397026513
@exported-using-ghexport

Differential Revision: [D108793161](https://our.internmc.facebook.com/intern/diff/D108793161/)
---
 backends/webgpu/test/op_tests/cases.py  |  9 ++++
 backends/webgpu/test/ops/test_select.py | 67 +++++++++++++++++++++++++
 backends/webgpu/test/tester.py          |  1 +
 3 files changed, 77 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_select.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index c0fbade9bbf..be5276cc57a 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -40,6 +40,10 @@
     CONFIGS as _MUL_CONFIGS,
     MulModule,
 )
+from executorch.backends.webgpu.test.ops.test_select import (
+    CONFIGS as _SELECT_CONFIGS,
+    SelectModule,
+)
 from executorch.backends.webgpu.test.ops.test_view_copy import (
     CONFIGS as _VIEW_CONFIGS,
     ViewModule,
@@ -144,3 +148,8 @@ def _fn_config_suite(module_cls, configs) -> WebGPUTestSuite:
 @register_op_test("view_copy")
 def _view_copy_suite() -> WebGPUTestSuite:
     return _fn_config_suite(ViewModule, _VIEW_CONFIGS)
+
+
+@register_op_test("select")
+def _select_suite() -> WebGPUTestSuite:
+    return _fn_config_suite(SelectModule, _SELECT_CONFIGS)
diff --git a/backends/webgpu/test/ops/test_select.py b/backends/webgpu/test/ops/test_select.py
new file mode 100644
index 00000000000..739ebe4d190
--- /dev/null
+++ b/backends/webgpu/test/ops/test_select.py
@@ -0,0 +1,67 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.select_copy.int` module + configs for the WebGPU op-test framework.
+
+`SelectModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `SelectTest` is the export-delegation smoke test.
+Configs cover the leading, middle, and last dim plus a negative index (output rank =
+input rank - 1).
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, select_fn)
+CONFIGS = {
+    "dim0": ((3, 8, 4), lambda x: x[1]),
+    "middle": ((3, 8, 4), lambda x: x[:, 2]),
+    "last": ((3, 8, 4), lambda x: x[..., 3]),
+    "neg_idx": ((3, 8, 4), lambda x: x[:, -1]),
+}
+
+
+class SelectModule(torch.nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fn(x)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _export(fn, x: torch.Tensor):
+    ep = torch.export.export(SelectModule(fn).eval(), (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+class SelectTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, fn) in CONFIGS.items():
+            with self.subTest(name=name):
+                et = _export(fn, _det_input(shape))
+                self.assertTrue(
+                    _delegated(et), f"Expected a VulkanBackend delegate (select {name})"
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index bb09fffb43e..9ba9a4d9ad4 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -23,6 +23,7 @@
     exir_ops.edge.et_vk.rms_norm.default,
     exir_ops.edge.aten.mul.Tensor,
     exir_ops.edge.aten.view_copy.default,
+    exir_ops.edge.aten.select_copy.int,
 ]
 
 

From 369802ad96d7f1bb3b7cdd1ffb377a596a3f7b6b Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:16 -0700
Subject: [PATCH 5/8] [ExecuTorch][WebGPU] Add sigmoid op
 (aten.sigmoid.default)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20390

Adds `aten.sigmoid.default` to the WebGPU delegate: element-wise `1/(1+exp(-x))` over a flat fp32 buffer. On the Llama critical path (`F.silu` -> `sigmoid` + `mul`).

Composition (single dispatch):
- `sigmoid/UnaryOp.cpp` — binds input (storage, read-only) + output (storage) + a `Params{num_elements}` uniform, 1D-dispatches over `num_elements` with `override wg_size` (clamped to the device limit); mirrors the `add` op (uniform mapped-at-creation, released after the bind group).
- `sigmoid/sigmoid.wgsl` — guards `idx >= num_elements` and writes the logistic of each element.
ghstack-source-id: 397026515
@exported-using-ghexport

Differential Revision: [D108793157](https://our.internmc.facebook.com/intern/diff/D108793157/)
---
 backends/webgpu/CMakeLists.txt                |   1 +
 .../webgpu/runtime/ops/sigmoid/UnaryOp.cpp    | 165 ++++++++++++++++++
 .../webgpu/runtime/ops/sigmoid/sigmoid.wgsl   |  18 ++
 .../webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h |  42 +++++
 4 files changed, 226 insertions(+)
 create mode 100644 backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp
 create mode 100644 backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl
 create mode 100644 backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index c774fd27845..c3b6ef4e706 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -44,6 +44,7 @@ set(WEBGPU_SRCS
     runtime/ops/prepack/Prepack.cpp
     runtime/ops/view_copy/ViewCopy.cpp
     runtime/ops/select/Select.cpp
+    runtime/ops/sigmoid/UnaryOp.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp
new file mode 100644
index 00000000000..4d1a087cae5
--- /dev/null
+++ b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// Uniform buffer layout matching the WGSL Params struct; 16-byte aligned.
+struct UnaryParams {
+  uint32_t num_elements;
+  uint32_t _pad[3];
+};
+
+// Generic elementwise unary op; mirrors Vulkan add_unary_op_node (UnaryOp.cpp).
+void add_unary_op(
+    WebGPUGraph& graph,
+    int in_id,
+    int out_id,
+    const char* wgsl_source,
+    uint32_t wg_size_x,
+    const char* op_name) {
+  WGPUDevice device = graph.device();
+
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) {
+    throw std::runtime_error(std::string(op_name) + ": null buffer binding");
+  }
+
+  // 4-byte (fp32) alignment guard on both operands; also the dtype guard.
+  if (in_tensor.nbytes % sizeof(float) != 0 ||
+      out_tensor.nbytes % sizeof(float) != 0) {
+    throw std::runtime_error(
+        std::string(op_name) + ": operand not 4-byte aligned");
+  }
+  if (in_tensor.nbytes != out_tensor.nbytes) {
+    throw std::runtime_error(
+        std::string(op_name) + ": input/output size mismatch");
+  }
+
+  uint32_t num_elements =
+      static_cast<uint32_t>(out_tensor.nbytes / sizeof(float));
+
+  uint32_t wg_size = utils::clamp_workgroup_size(device, wg_size_x);
+  uint32_t workgroup_count =
+      utils::compute_1d_workgroup_count(device, num_elements, wg_size, op_name);
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  UnaryParams params = {};
+  params.num_elements = num_elements;
+
+  WGPUBuffer uniform_buffer =
+      utils::make_uniform(device, &params, sizeof(UnaryParams));
+  graph.add_uniform_buffer_bytes(sizeof(UnaryParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {wgsl_source, WGPU_STRLEN};
+
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group layout: input (read storage) + output (storage) + params.
+  WGPUBindGroupLayoutEntry entries[3] = {};
+
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 3;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[3] = {};
+
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = in_tensor.buffer;
+  bg_entries[0].size = in_tensor.nbytes;
+
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = uniform_buffer;
+  bg_entries[2].size = sizeof(UnaryParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 3;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  // Release intermediates (pipeline + bind_group are kept by dispatch).
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our ref; the bind group keeps the uniform buffer alive until release.
+  wgpuBufferRelease(uniform_buffer);
+}
+
+void sigmoid_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // aten.sigmoid.default args: [in, out]
+  add_unary_op(
+      graph,
+      args.at(0),
+      args.at(1),
+      kSigmoidWGSL,
+      kSigmoidWorkgroupSizeX,
+      "sigmoid");
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.sigmoid.default, sigmoid_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl
new file mode 100644
index 00000000000..09b3e5457b3
--- /dev/null
+++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl
@@ -0,0 +1,18 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct Params {
+  num_elements: u32,
+}
+@group(0) @binding(2) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x;
+    if (idx >= params.num_elements) {
+        return;
+    }
+    output[idx] = 1.0 / (1.0 + exp(-input[idx]));
+}
diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h
new file mode 100644
index 00000000000..48e6efc607a
--- /dev/null
+++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from sigmoid.wgsl - DO NOT EDIT.
+// wgsl-sha256: 70395dbb107b8b95ae13c0a6fb12a8415c561c645da0347294c92904314ae84c
+inline constexpr const char* kSigmoidWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct Params {
+  num_elements: u32,
+}
+@group(0) @binding(2) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x;
+    if (idx >= params.num_elements) {
+        return;
+    }
+    output[idx] = 1.0 / (1.0 + exp(-input[idx]));
+}
+)";
+
+inline constexpr uint32_t kSigmoidWorkgroupSizeX = 64;
+inline constexpr uint32_t kSigmoidWorkgroupSizeY = 1;
+inline constexpr uint32_t kSigmoidWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu

From 3e8b396e6f796358cd05530a71dc56818072c41b Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:17 -0700
Subject: [PATCH 6/8] [ExecuTorch][WebGPU] sigmoid op test suite (cases.py
 op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20391

Registers `aten.sigmoid.default` in the `cases.py` op-test framework: a `_sigmoid_suite` (hard-coded shapes + a saturation case over a `linspace(-12, 12)` input) that `generate_op_tests` exports and compares to an fp64 torch golden on Dawn. Also adds `test/ops/sigmoid/test_sigmoid.py` (`SigmoidModule` + `N` + `_det_input` + an export-delegation/eager smoke test) and the `aten.sigmoid.default` partitioner-allowlist entry in `tester.py`.
ghstack-source-id: 397026520
@exported-using-ghexport

Differential Revision: [D108793159](https://our.internmc.facebook.com/intern/diff/D108793159/)
---
 backends/webgpu/test/op_tests/cases.py   | 31 ++++++++++++++
 backends/webgpu/test/ops/test_sigmoid.py | 51 ++++++++++++++++++++++++
 backends/webgpu/test/tester.py           |  1 +
 3 files changed, 83 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_sigmoid.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index be5276cc57a..7df3ee11f11 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -44,6 +44,11 @@
     CONFIGS as _SELECT_CONFIGS,
     SelectModule,
 )
+from executorch.backends.webgpu.test.ops.test_sigmoid import (
+    _det_input as _sigmoid_det_input,
+    N as _SIGMOID_N,
+    SigmoidModule,
+)
 from executorch.backends.webgpu.test.ops.test_view_copy import (
     CONFIGS as _VIEW_CONFIGS,
     ViewModule,
@@ -153,3 +158,29 @@ def _view_copy_suite() -> WebGPUTestSuite:
 @register_op_test("select")
 def _select_suite() -> WebGPUTestSuite:
     return _fn_config_suite(SelectModule, _SELECT_CONFIGS)
+
+
+def _sigmoid_full_range(_shape) -> torch.Tensor:
+    # Reuses the monolith's saturation-tail input (linspace(-12, 12)).
+    return _sigmoid_det_input()
+
+
+@register_op_test("sigmoid")
+def _sigmoid_suite() -> WebGPUTestSuite:
+    # sigmoid has no CONFIGS table; cover unary shapes directly (tol 1e-4).
+    return WebGPUTestSuite(
+        module_factory=lambda: SigmoidModule(),
+        cases=[
+            Case(name="vec", inputs=((M1,),)),
+            Case(name="mat", inputs=((M1, M2),)),
+            Case(name="rank3", inputs=((S1, M1, M2),)),
+            Case(name="rank4", inputs=((S1, S2, S2, M2),)),
+            # Saturation tails sigmoid(+-12) (~6e-6 / 0.999994) that randn shapes miss.
+            Case(
+                name="saturation",
+                inputs=(InputSpec(shape=(_SIGMOID_N,), gen=_sigmoid_full_range),),
+            ),
+        ],
+        atol=1e-4,
+        rtol=1e-4,
+    )
diff --git a/backends/webgpu/test/ops/test_sigmoid.py b/backends/webgpu/test/ops/test_sigmoid.py
new file mode 100644
index 00000000000..0ba8c435a9a
--- /dev/null
+++ b/backends/webgpu/test/ops/test_sigmoid.py
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.sigmoid.default` module + input for the WebGPU op-test framework.
+
+`SigmoidModule`, `N`, and `_det_input` are imported by `cases.py` to drive the
+declarative op-test suite. `SigmoidTest` is the export-delegation
+smoke test. Sigmoid is on the Llama critical path (`F.silu` -> `sigmoid` + `mul`); the
+deterministic input spans the saturation tails.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# Input length; the deterministic input spans the saturation tails.
+N = 64
+
+
+class SigmoidModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.sigmoid(x)
+
+
+def _det_input() -> torch.Tensor:
+    """Deterministic fp32 input spanning negatives, zero, and large magnitudes."""
+    return torch.linspace(-12.0, 12.0, N, dtype=torch.float32)
+
+
+def _export(m: torch.nn.Module, x: torch.Tensor):
+    ep = torch.export.export(m, (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+class SigmoidTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        et = _export(SigmoidModule().eval(), _det_input())
+        found = any(
+            d.id == "VulkanBackend"
+            for plan in et.executorch_program.execution_plan
+            for d in plan.delegates
+        )
+        self.assertTrue(found, "Expected a VulkanBackend delegate (sigmoid)")
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index 9ba9a4d9ad4..e5dd510d49b 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -24,6 +24,7 @@
     exir_ops.edge.aten.mul.Tensor,
     exir_ops.edge.aten.view_copy.default,
     exir_ops.edge.aten.select_copy.int,
+    exir_ops.edge.aten.sigmoid.default,
 ]
 
 

From 1150e19f4e2191dde8c57d933132aac2a06f7574 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:17 -0700
Subject: [PATCH 7/8] [ExecuTorch][WebGPU] Add squeeze_copy + unsqueeze_copy
 (flat copies)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20392

Adds `aten.squeeze_copy.dims` and `aten.unsqueeze_copy.default` to the WebGPU delegate. Both are numel-preserving shape ops; on a dense row-major buffer backend they are the same flat copy as `view_copy` — only the shape metadata differs (mirrors the Vulkan delegate, which routes both through `add_view_copy_node`).

Composition (no new kernel):
- `squeeze/Squeeze.cpp` — reads `args = [self, dims, out]`, ignores the AOT-fixed `dims`, calls `add_flat_copy(graph, in, out)` from `runtime/ops/view_copy/view_copy.h`.
- `unsqueeze/Unsqueeze.cpp` — reads `args = [self, dim, out]`, ignores the AOT-fixed `dim`, calls `add_flat_copy(graph, in, out)`.
ghstack-source-id: 397026523
@exported-using-ghexport

Differential Revision: [D108793153](https://our.internmc.facebook.com/intern/diff/D108793153/)
---
 backends/webgpu/CMakeLists.txt                |  2 ++
 .../webgpu/runtime/ops/squeeze/Squeeze.cpp    | 31 +++++++++++++++++++
 .../runtime/ops/unsqueeze/Unsqueeze.cpp       | 31 +++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
 create mode 100644 backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index c3b6ef4e706..01bb5236a44 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -45,6 +45,8 @@ set(WEBGPU_SRCS
     runtime/ops/view_copy/ViewCopy.cpp
     runtime/ops/select/Select.cpp
     runtime/ops/sigmoid/UnaryOp.cpp
+    runtime/ops/squeeze/Squeeze.cpp
+    runtime/ops/unsqueeze/Unsqueeze.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
new file mode 100644
index 00000000000..12b0fe561f1
--- /dev/null
+++ b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// squeeze_copy.dims = numel-preserving flat copy (Vulkan Squeeze.cpp:102-104).
+void squeeze_copy_dims_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dims, out]; dims ignored (out shape fixed AOT).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp
new file mode 100644
index 00000000000..27d2c52e708
--- /dev/null
+++ b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// unsqueeze_copy = numel-preserving flat copy (Vulkan Unsqueeze.cpp:101-103).
+void unsqueeze_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dim, out]; dim ignored (out shape fixed AOT, like view_copy).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu

From ae8e7e895ac663e8917c092ae98ace3a519239dd Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:18 -0700
Subject: [PATCH 8/8] [ExecuTorch][WebGPU] squeeze_copy + unsqueeze_copy test
 suites (cases.py op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20393

Registers `aten.squeeze_copy.dims` and `aten.unsqueeze_copy.default` in the `cases.py` op-test framework: a `_squeeze_suite` of 3 configs (squeeze leading/middle/multiple size-1 dims) and a `_unsqueeze_suite` of 3 configs (insert dim at front/middle/last) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/squeeze/test_squeeze.py` (`SqueezeModule` + `CONFIGS` + `_op_delegated` smoke test), `test/ops/unsqueeze/test_unsqueeze.py` (`UnsqueezeModule` + `CONFIGS` + `_op_delegated` smoke test), and the two partitioner-allowlist entries in `tester.py`.
ghstack-source-id: 397026525
@exported-using-ghexport

Differential Revision: [D108793152](https://our.internmc.facebook.com/intern/diff/D108793152/)
---
 backends/webgpu/test/op_tests/cases.py     | 36 +++++++++++
 backends/webgpu/test/ops/test_squeeze.py   | 75 ++++++++++++++++++++++
 backends/webgpu/test/ops/test_unsqueeze.py | 75 ++++++++++++++++++++++
 backends/webgpu/test/tester.py             |  2 +
 4 files changed, 188 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_squeeze.py
 create mode 100644 backends/webgpu/test/ops/test_unsqueeze.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index 7df3ee11f11..0db8685fa18 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -49,6 +49,16 @@
     N as _SIGMOID_N,
     SigmoidModule,
 )
+
+from executorch.backends.webgpu.test.ops.test_squeeze import (
+    CONFIGS as _SQUEEZE_CONFIGS,
+    SqueezeModule,
+)
+
+from executorch.backends.webgpu.test.ops.test_unsqueeze import (
+    CONFIGS as _UNSQUEEZE_CONFIGS,
+    UnsqueezeModule,
+)
 from executorch.backends.webgpu.test.ops.test_view_copy import (
     CONFIGS as _VIEW_CONFIGS,
     ViewModule,
@@ -184,3 +194,29 @@ def _sigmoid_suite() -> WebGPUTestSuite:
         atol=1e-4,
         rtol=1e-4,
     )
+
+
+@register_op_test("squeeze")
+def _squeeze_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (shape, dim) where dim is an int or a tuple.
+    return WebGPUTestSuite(
+        module_factory=lambda dim: SqueezeModule(dim),
+        cases=[
+            Case(name=n, construct={"dim": dim}, inputs=(shape,))
+            for n, (shape, dim) in _SQUEEZE_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
+
+
+@register_op_test("unsqueeze")
+def _unsqueeze_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (shape, dim).
+    return WebGPUTestSuite(
+        module_factory=lambda dim: UnsqueezeModule(dim),
+        cases=[
+            Case(name=n, construct={"dim": dim}, inputs=(shape,))
+            for n, (shape, dim) in _UNSQUEEZE_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
diff --git a/backends/webgpu/test/ops/test_squeeze.py b/backends/webgpu/test/ops/test_squeeze.py
new file mode 100644
index 00000000000..b55a5143538
--- /dev/null
+++ b/backends/webgpu/test/ops/test_squeeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.squeeze_copy.dims` module + configs for the WebGPU op-test framework.
+
+`SqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `SqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, squeeze_dim)
+CONFIGS = {
+    "dim0": ((1, 3, 4), 0),
+    "mid": ((2, 1, 4), 1),
+    "multi": ((1, 3, 1, 4), (0, 2)),
+}
+
+
+class SqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.squeeze(x, self.dim)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    ep = torch.export.export(SqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class SqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (squeeze {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "squeeze_copy"),
+                    f"squeeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/ops/test_unsqueeze.py b/backends/webgpu/test/ops/test_unsqueeze.py
new file mode 100644
index 00000000000..dcddf4faa51
--- /dev/null
+++ b/backends/webgpu/test/ops/test_unsqueeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.unsqueeze_copy.default` module + configs for the WebGPU op-test framework.
+
+`UnsqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `UnsqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, unsqueeze_dim)
+CONFIGS = {
+    "front": ((3, 4), 0),
+    "mid": ((2, 4), 1),
+    "last": ((3, 4), 2),
+}
+
+
+class UnsqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.unsqueeze(x, self.dim)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    ep = torch.export.export(UnsqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class UnsqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (unsqueeze {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "unsqueeze_copy"),
+                    f"unsqueeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index e5dd510d49b..53a745a16df 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -25,6 +25,8 @@
     exir_ops.edge.aten.view_copy.default,
     exir_ops.edge.aten.select_copy.int,
     exir_ops.edge.aten.sigmoid.default,
+    exir_ops.edge.aten.squeeze_copy.dims,
+    exir_ops.edge.aten.unsqueeze_copy.default,
 ]