pytorch · JulianCloudNTH · Jun 26, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
@@ -45,6 +45,8 @@ set(WEBGPU_SRCS
     runtime/ops/view_copy/ViewCopy.cpp
     runtime/ops/select/Select.cpp
     runtime/ops/sigmoid/UnaryOp.cpp
+    runtime/ops/squeeze/Squeeze.cpp
+    runtime/ops/unsqueeze/Unsqueeze.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})

diff --git a/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+// squeeze_copy.dims = numel-preserving flat copy (Vulkan Squeeze.cpp:102-104).
+// Bounds-checked .at() lookups: an empty `args` fails instead of reading OOB.
+void squeeze_copy_dims_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dims, out]; dims ignored (out shape fixed AOT).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+// unsqueeze_copy = numel-preserving flat copy (Vulkan Unsqueeze.cpp:101-103).
+// Bounds-checked .at() lookups: an empty `args` fails instead of reading OOB.
+void unsqueeze_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dim, out]; dim ignored (out shape fixed AOT, like view_copy).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
@@ -44,6 +44,21 @@
     CONFIGS as _SELECT_CONFIGS,
     SelectModule,
 )
+from executorch.backends.webgpu.test.ops.test_sigmoid import (
+    _det_input as _sigmoid_det_input,
+    N as _SIGMOID_N,
+    SigmoidModule,
+)
+
+from executorch.backends.webgpu.test.ops.test_squeeze import (
+    CONFIGS as _SQUEEZE_CONFIGS,
+    SqueezeModule,
+)
+
+from executorch.backends.webgpu.test.ops.test_unsqueeze import (
+    CONFIGS as _UNSQUEEZE_CONFIGS,
+    UnsqueezeModule,
+)
 from executorch.backends.webgpu.test.ops.test_view_copy import (
     CONFIGS as _VIEW_CONFIGS,
     ViewModule,
@@ -153,3 +168,55 @@ def _view_copy_suite() -> WebGPUTestSuite:
 @register_op_test("select")
 def _select_suite() -> WebGPUTestSuite:
     return _fn_config_suite(SelectModule, _SELECT_CONFIGS)
+
+
+def _sigmoid_full_range(_shape) -> torch.Tensor:
+    # Ignores _shape; returns test_sigmoid's saturation-tail linspace(-12, 12) input.
+    return _sigmoid_det_input()
+
+
+@register_op_test("sigmoid")
+def _sigmoid_suite() -> WebGPUTestSuite:
+    # test_sigmoid exports no CONFIGS table, so the unary shapes are listed here;
+    # a single tolerance is shared by atol and rtol.
+    tol = 1e-4
+    # Saturation tails sigmoid(+-12) (~6e-6 / 0.999994) that randn shapes miss.
+    saturation = InputSpec(shape=(_SIGMOID_N,), gen=_sigmoid_full_range)
+    return WebGPUTestSuite(
+        module_factory=lambda: SigmoidModule(),
+        cases=[
+            Case(name="vec", inputs=((M1,),)),
+            Case(name="mat", inputs=((M1, M2),)),
+            Case(name="rank3", inputs=((S1, M1, M2),)),
+            Case(name="rank4", inputs=((S1, S2, S2, M2),)),
+            Case(name="saturation", inputs=(saturation,)),
+        ],
+        atol=tol,
+        rtol=tol,
+    )
+
+
+@register_op_test("squeeze")
+def _squeeze_suite() -> WebGPUTestSuite:
+    cases = [  # one Case per CONFIGS entry: name -> (shape, dim), dim int or tuple
+        Case(name=label, construct={"dim": squeeze_dim}, inputs=(in_shape,))
+        for label, (in_shape, squeeze_dim) in _SQUEEZE_CONFIGS.items()
+    ]
+    return WebGPUTestSuite(
+        module_factory=lambda dim: SqueezeModule(dim),
+        cases=cases,
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
+
+
+@register_op_test("unsqueeze")
+def _unsqueeze_suite() -> WebGPUTestSuite:
+    cases = [  # one Case per CONFIGS entry: name -> (shape, dim)
+        Case(name=label, construct={"dim": insert_at}, inputs=(in_shape,))
+        for label, (in_shape, insert_at) in _UNSQUEEZE_CONFIGS.items()
+    ]
+    return WebGPUTestSuite(
+        module_factory=lambda dim: UnsqueezeModule(dim),
+        cases=cases,
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
diff --git a/backends/webgpu/test/ops/test_sigmoid.py b/backends/webgpu/test/ops/test_sigmoid.py
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.sigmoid.default` module + input for the WebGPU op-test framework.
+
+`SigmoidModule`, `N`, and `_det_input` are imported by `cases.py` to drive the
+declarative op-test suite. `SigmoidTest` is the export-delegation
+smoke test. Sigmoid is on the Llama critical path (`F.silu` -> `sigmoid` + `mul`); the
+deterministic input spans the saturation tails.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# Input length; the deterministic input spans the saturation tails.
+N = 64
+
+
+class SigmoidModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.sigmoid()  # method form of torch.sigmoid(x)
+
+
+def _det_input() -> torch.Tensor:
+    """Deterministic fp32 input: N evenly spaced points over [-12, 12] inclusive."""
+    return torch.linspace(-12.0, 12.0, N, dtype=torch.float32)
+
+
+def _export(m: torch.nn.Module, x: torch.Tensor):
+    # torch.export -> edge lowering with VulkanPartitioner -> to_executorch().
+    exported = torch.export.export(m, (x,))
+    edge = to_edge_transform_and_lower(exported, partitioner=[VulkanPartitioner()])
+    return edge.to_executorch()
+
+
+class SigmoidTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        # Smoke test: lowering sigmoid must produce at least one delegate whose
+        # id is "VulkanBackend" in some execution plan.
+        et = _export(SigmoidModule().eval(), _det_input())
+        found = False
+        for plan in et.executorch_program.execution_plan:
+            found = found or any(d.id == "VulkanBackend" for d in plan.delegates)
+        self.assertTrue(found, "Expected a VulkanBackend delegate (sigmoid)")
diff --git a/backends/webgpu/test/ops/test_squeeze.py b/backends/webgpu/test/ops/test_squeeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.squeeze_copy.dims` module + configs for the WebGPU op-test framework.
+
+`SqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `SqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, squeeze_dim); squeeze_dim is an int or a tuple of ints
+CONFIGS = {
+    "dim0": ((1, 3, 4), 0),
+    "mid": ((2, 1, 4), 1),
+    "multi": ((1, 3, 1, 4), (0, 2)),
+}
+
+
+class SqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim  # int or tuple of ints, forwarded to squeeze
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.squeeze(self.dim)
+
+
+def _det_input(shape):
+    seeded = torch.Generator().manual_seed(0)  # fixed seed -> reproducible input
+    return torch.randn(*shape, generator=seeded, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    exported = torch.export.export(SqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(exported, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    # True when any execution plan lists a delegate whose id is "VulkanBackend".
+    for plan in et.executorch_program.execution_plan:
+        if any(d.id == "VulkanBackend" for d in plan.delegates):
+            return True
+    return False
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # True when no top-level graph node's target mentions op_substr (no CPU fallback).
+    nodes = edge.exported_program().graph_module.graph.nodes
+    return not any(op_substr in str(getattr(node, "target", "")) for node in nodes)
+
+
+class SqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:  # one subTest per CONFIGS entry
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                delegated = _delegated(edge.to_executorch())
+                self.assertTrue(
+                    delegated, f"Expected a VulkanBackend delegate (squeeze {name})"
+                )
+                absorbed = _op_delegated(edge, "squeeze_copy")
+                self.assertTrue(
+                    absorbed,
+                    f"squeeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/ops/test_unsqueeze.py b/backends/webgpu/test/ops/test_unsqueeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.unsqueeze_copy.default` module + configs for the WebGPU op-test framework.
+
+`UnsqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `UnsqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, unsqueeze_dim)
+CONFIGS = {
+    "front": ((3, 4), 0),
+    "mid": ((2, 4), 1),
+    "last": ((3, 4), 2),
+}
+
+
+class UnsqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim  # insertion index forwarded to unsqueeze
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.unsqueeze(self.dim)
+
+
+def _det_input(shape):
+    seeded = torch.Generator().manual_seed(0)  # fixed seed -> reproducible input
+    return torch.randn(*shape, generator=seeded, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    exported = torch.export.export(UnsqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(exported, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    # True when any execution plan lists a delegate whose id is "VulkanBackend".
+    for plan in et.executorch_program.execution_plan:
+        if any(d.id == "VulkanBackend" for d in plan.delegates):
+            return True
+    return False
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # True when no top-level graph node's target mentions op_substr (no CPU fallback).
+    nodes = edge.exported_program().graph_module.graph.nodes
+    return not any(op_substr in str(getattr(node, "target", "")) for node in nodes)
+
+
+class UnsqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:  # one subTest per CONFIGS entry
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                delegated = _delegated(edge.to_executorch())
+                self.assertTrue(
+                    delegated, f"Expected a VulkanBackend delegate (unsqueeze {name})"
+                )
+                absorbed = _op_delegated(edge, "unsqueeze_copy")
+                self.assertTrue(
+                    absorbed,
+                    f"unsqueeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
@@ -24,6 +24,9 @@
     exir_ops.edge.aten.mul.Tensor,
     exir_ops.edge.aten.view_copy.default,
     exir_ops.edge.aten.select_copy.int,
+    exir_ops.edge.aten.sigmoid.default,
+    exir_ops.edge.aten.squeeze_copy.dims,
+    exir_ops.edge.aten.unsqueeze_copy.default,
 ]