From b10992674cc71ee3a4d9675a438618bdd01b56aa Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:17 -0700
Subject: [PATCH 1/3] [ExecuTorch][WebGPU] sigmoid op test suite (cases.py
 op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20391

Registers `aten.sigmoid.default` in the `cases.py` op-test framework: a `_sigmoid_suite` (hard-coded shapes + a saturation case over a `linspace(-12, 12)` input) that `generate_op_tests` exports and compares to an fp64 torch golden on Dawn. Also adds `test/ops/test_sigmoid.py` (`SigmoidModule` + `N` + `_det_input` + an export-delegation smoke test) and the `aten.sigmoid.default` partitioner-allowlist entry in `tester.py`.
ghstack-source-id: 397026520
@exported-using-ghexport

Differential Revision: [D108793159](https://our.internmc.facebook.com/intern/diff/D108793159/)
---
 backends/webgpu/test/op_tests/cases.py   | 31 ++++++++++++++
 backends/webgpu/test/ops/test_sigmoid.py | 51 ++++++++++++++++++++++++
 backends/webgpu/test/tester.py           |  1 +
 3 files changed, 83 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_sigmoid.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index be5276cc57a..7df3ee11f11 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -44,6 +44,11 @@
     CONFIGS as _SELECT_CONFIGS,
     SelectModule,
 )
+from executorch.backends.webgpu.test.ops.test_sigmoid import (
+    _det_input as _sigmoid_det_input,
+    N as _SIGMOID_N,
+    SigmoidModule,
+)
 from executorch.backends.webgpu.test.ops.test_view_copy import (
     CONFIGS as _VIEW_CONFIGS,
     ViewModule,
@@ -153,3 +158,29 @@ def _view_copy_suite() -> WebGPUTestSuite:
 @register_op_test("select")
 def _select_suite() -> WebGPUTestSuite:
     return _fn_config_suite(SelectModule, _SELECT_CONFIGS)
+
+
+def _sigmoid_full_range(_shape) -> torch.Tensor:
+    # Shape is ignored: reuses test_sigmoid's linspace(-12, 12) saturation input.
+    return _sigmoid_det_input()
+
+
+@register_op_test("sigmoid")
+def _sigmoid_suite() -> WebGPUTestSuite:
+    # sigmoid has no CONFIGS table; cover unary shapes directly (tol 1e-4).
+    return WebGPUTestSuite(
+        module_factory=lambda: SigmoidModule(),
+        cases=[
+            Case(name="vec", inputs=((M1,),)),
+            Case(name="mat", inputs=((M1, M2),)),
+            Case(name="rank3", inputs=((S1, M1, M2),)),
+            Case(name="rank4", inputs=((S1, S2, S2, M2),)),
+            # Saturation tails sigmoid(+-12) (~6e-6 / 0.999994) that randn shapes miss.
+            Case(
+                name="saturation",
+                inputs=(InputSpec(shape=(_SIGMOID_N,), gen=_sigmoid_full_range),),
+            ),
+        ],
+        atol=1e-4,
+        rtol=1e-4,
+    )
diff --git a/backends/webgpu/test/ops/test_sigmoid.py b/backends/webgpu/test/ops/test_sigmoid.py
new file mode 100644
index 00000000000..0ba8c435a9a
--- /dev/null
+++ b/backends/webgpu/test/ops/test_sigmoid.py
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.sigmoid.default` module + input for the WebGPU op-test framework.
+
+`SigmoidModule`, `N`, and `_det_input` are imported by `cases.py` to drive the
+declarative op-test suite. `SigmoidTest` is the export-delegation
+smoke test. Sigmoid is on the Llama critical path (`F.silu` -> `sigmoid` + `mul`); the
+deterministic input spans the saturation tails.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# Input length; the deterministic input spans the saturation tails.
+N = 64
+
+
+class SigmoidModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.sigmoid(x)
+
+
+def _det_input() -> torch.Tensor:
+    """Deterministic fp32 ramp over [-12, 12], reaching both saturation tails."""
+    return torch.linspace(-12.0, 12.0, N, dtype=torch.float32)
+
+
+def _export(m: torch.nn.Module, x: torch.Tensor):
+    ep = torch.export.export(m, (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+class SigmoidTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        et = _export(SigmoidModule().eval(), _det_input())
+        found = any(
+            d.id == "VulkanBackend"
+            for plan in et.executorch_program.execution_plan
+            for d in plan.delegates
+        )
+        self.assertTrue(found, "Expected a VulkanBackend delegate (sigmoid)")
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index 9ba9a4d9ad4..e5dd510d49b 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -24,6 +24,7 @@
     exir_ops.edge.aten.mul.Tensor,
     exir_ops.edge.aten.view_copy.default,
     exir_ops.edge.aten.select_copy.int,
+    exir_ops.edge.aten.sigmoid.default,
 ]
 
 

From 621084c2912df140386f4bb2f83352316bbfdedd Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:17 -0700
Subject: [PATCH 2/3] [ExecuTorch][WebGPU] Add squeeze_copy + unsqueeze_copy
 (flat copies)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/executorch/pull/20392

Adds `aten.squeeze_copy.dims` and `aten.unsqueeze_copy.default` to the WebGPU delegate. Both are numel-preserving shape ops; on a dense row-major buffer backend they are the same flat copy as `view_copy` — only the shape metadata differs (mirrors the Vulkan delegate, which routes both through `add_view_copy_node`).

Composition (no new kernel):
- `squeeze/Squeeze.cpp` — reads `args = [self, dims, out]`, ignores the AOT-fixed `dims`, calls `add_flat_copy(graph, in, out)` from `runtime/ops/view_copy/view_copy.h`.
- `unsqueeze/Unsqueeze.cpp` — reads `args = [self, dim, out]`, ignores the AOT-fixed `dim`, calls `add_flat_copy(graph, in, out)`.
ghstack-source-id: 397026523
@exported-using-ghexport

Differential Revision: [D108793153](https://our.internmc.facebook.com/intern/diff/D108793153/)
---
 backends/webgpu/CMakeLists.txt                |  2 ++
 .../webgpu/runtime/ops/squeeze/Squeeze.cpp    | 31 +++++++++++++++++++
 .../runtime/ops/unsqueeze/Unsqueeze.cpp       | 31 +++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
 create mode 100644 backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index c3b6ef4e706..01bb5236a44 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -45,6 +45,8 @@ set(WEBGPU_SRCS
     runtime/ops/view_copy/ViewCopy.cpp
     runtime/ops/select/Select.cpp
     runtime/ops/sigmoid/UnaryOp.cpp
+    runtime/ops/squeeze/Squeeze.cpp
+    runtime/ops/unsqueeze/Unsqueeze.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
new file mode 100644
index 00000000000..12b0fe561f1
--- /dev/null
+++ b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// squeeze_copy.dims = numel-preserving flat copy (Vulkan Squeeze.cpp:102-104).
+void squeeze_copy_dims_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dims, out]; dims ignored (out shape fixed AOT).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp
new file mode 100644
index 00000000000..27d2c52e708
--- /dev/null
+++ b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// unsqueeze_copy = numel-preserving flat copy (Vulkan Unsqueeze.cpp:101-103).
+void unsqueeze_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dim, out]; dim ignored (out shape fixed AOT, like view_copy).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu

From c86418c4ec769c32ffafdcd3729dccce4ed55185 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Thu, 25 Jun 2026 10:24:18 -0700
Subject: [PATCH 3/3] [ExecuTorch][WebGPU] squeeze_copy + unsqueeze_copy test
 suites (cases.py op-test framework)

Pull Request resolved: https://github.com/pytorch/executorch/pull/20393

Registers `aten.squeeze_copy.dims` and `aten.unsqueeze_copy.default` in the `cases.py` op-test framework: a `_squeeze_suite` of 3 configs (squeeze leading/middle/multiple size-1 dims) and a `_unsqueeze_suite` of 3 configs (insert dim at front/middle/last) that `generate_op_tests` exports via `VulkanPartitioner` and compares to a torch golden on Dawn. Also adds `test/ops/test_squeeze.py` (`SqueezeModule` + `CONFIGS` + `_op_delegated` smoke test), `test/ops/test_unsqueeze.py` (`UnsqueezeModule` + `CONFIGS` + `_op_delegated` smoke test), and the two partitioner-allowlist entries in `tester.py`.
ghstack-source-id: 397026525
@exported-using-ghexport

Differential Revision: [D108793152](https://our.internmc.facebook.com/intern/diff/D108793152/)
---
 backends/webgpu/test/op_tests/cases.py     | 36 +++++++++++
 backends/webgpu/test/ops/test_squeeze.py   | 75 ++++++++++++++++++++++
 backends/webgpu/test/ops/test_unsqueeze.py | 75 ++++++++++++++++++++++
 backends/webgpu/test/tester.py             |  2 +
 4 files changed, 188 insertions(+)
 create mode 100644 backends/webgpu/test/ops/test_squeeze.py
 create mode 100644 backends/webgpu/test/ops/test_unsqueeze.py

diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index 7df3ee11f11..0db8685fa18 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -49,6 +49,16 @@
     N as _SIGMOID_N,
     SigmoidModule,
 )
+
+from executorch.backends.webgpu.test.ops.test_squeeze import (
+    CONFIGS as _SQUEEZE_CONFIGS,
+    SqueezeModule,
+)
+
+from executorch.backends.webgpu.test.ops.test_unsqueeze import (
+    CONFIGS as _UNSQUEEZE_CONFIGS,
+    UnsqueezeModule,
+)
 from executorch.backends.webgpu.test.ops.test_view_copy import (
     CONFIGS as _VIEW_CONFIGS,
     ViewModule,
@@ -184,3 +194,29 @@ def _sigmoid_suite() -> WebGPUTestSuite:
         atol=1e-4,
         rtol=1e-4,
     )
+
+
+@register_op_test("squeeze")
+def _squeeze_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (shape, dim) where dim is an int or a tuple.
+    return WebGPUTestSuite(
+        module_factory=lambda dim: SqueezeModule(dim),
+        cases=[
+            Case(name=n, construct={"dim": dim}, inputs=(shape,))
+            for n, (shape, dim) in _SQUEEZE_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
+
+
+@register_op_test("unsqueeze")
+def _unsqueeze_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (shape, dim).
+    return WebGPUTestSuite(
+        module_factory=lambda dim: UnsqueezeModule(dim),
+        cases=[
+            Case(name=n, construct={"dim": dim}, inputs=(shape,))
+            for n, (shape, dim) in _UNSQUEEZE_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
diff --git a/backends/webgpu/test/ops/test_squeeze.py b/backends/webgpu/test/ops/test_squeeze.py
new file mode 100644
index 00000000000..b55a5143538
--- /dev/null
+++ b/backends/webgpu/test/ops/test_squeeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.squeeze_copy.dims` module + configs for the WebGPU op-test framework.
+
+`SqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `SqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, squeeze_dim); squeeze_dim is an int or a tuple of ints
+CONFIGS = {
+    "dim0": ((1, 3, 4), 0),
+    "mid": ((2, 1, 4), 1),
+    "multi": ((1, 3, 1, 4), (0, 2)),
+}
+
+
+class SqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.squeeze(x, self.dim)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    ep = torch.export.export(SqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class SqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (squeeze {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "squeeze_copy"),
+                    f"squeeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/ops/test_unsqueeze.py b/backends/webgpu/test/ops/test_unsqueeze.py
new file mode 100644
index 00000000000..dcddf4faa51
--- /dev/null
+++ b/backends/webgpu/test/ops/test_unsqueeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.unsqueeze_copy.default` module + configs for the WebGPU op-test framework.
+
+`UnsqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `UnsqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, unsqueeze_dim)
+CONFIGS = {
+    "front": ((3, 4), 0),
+    "mid": ((2, 4), 1),
+    "last": ((3, 4), 2),
+}
+
+
+class UnsqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.unsqueeze(x, self.dim)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    ep = torch.export.export(UnsqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class UnsqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (unsqueeze {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "unsqueeze_copy"),
+                    f"unsqueeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index e5dd510d49b..53a745a16df 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -25,6 +25,8 @@
     exir_ops.edge.aten.view_copy.default,
     exir_ops.edge.aten.select_copy.int,
     exir_ops.edge.aten.sigmoid.default,
+    exir_ops.edge.aten.squeeze_copy.dims,
+    exir_ops.edge.aten.unsqueeze_copy.default,
 ]