diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index f7cd85f9758..01bb5236a44 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -42,6 +42,11 @@ set(WEBGPU_SRCS
     runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
     runtime/ops/rope/RotaryEmbedding.cpp
     runtime/ops/prepack/Prepack.cpp
+    runtime/ops/view_copy/ViewCopy.cpp
+    runtime/ops/select/Select.cpp
+    runtime/ops/sigmoid/UnaryOp.cpp
+    runtime/ops/squeeze/Squeeze.cpp
+    runtime/ops/unsqueeze/Unsqueeze.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index b7fb4313400..0e00b2cb42b 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -679,6 +679,16 @@ void WebGPUGraph::execute() {
     // One pass per dispatch: enforces storage RAW ordering across deps.
     for (size_t i = 0; i < n; i++) {
       const auto& dispatch = dispatches_[i];
+      if (dispatch.kind == WebGPUDispatch::Kind::Copy) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder,
+            dispatch.copy_src,
+            0,
+            dispatch.copy_dst,
+            0,
+            dispatch.copy_nbytes);
+        continue;
+      }
       WGPUComputePassDescriptor pass_desc = {};
 #ifdef WGPU_BACKEND_ENABLE_PROFILING
       // tw must outlive BeginComputePass (the descriptor points at it).
@@ -757,6 +767,16 @@ void WebGPUGraph::execute() {
         wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
 
     for (size_t i = start; i < end; i++) {
+      if (dispatches_[i].kind == WebGPUDispatch::Kind::Copy) {
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder,
+            dispatches_[i].copy_src,
+            0,
+            dispatches_[i].copy_dst,
+            0,
+            dispatches_[i].copy_nbytes);
+        continue;
+      }
       WGPUComputePassDescriptor pass_desc = {};
       WGPUComputePassEncoder pass =
           wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 3572f751a06..b9326cf016c 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -42,6 +42,12 @@ struct WebGPUDispatch {
   WGPUBindGroup bind_group = nullptr;
   uint32_t workgroup_count_x = 1;
   std::string kernel_name; // bench label
+  // DMA copy command; default Compute keeps existing positional inits valid.
+  enum class Kind { Compute, Copy };
+  Kind kind = Kind::Compute;
+  WGPUBuffer copy_src = nullptr;
+  WGPUBuffer copy_dst = nullptr;
+  size_t copy_nbytes = 0;
 };
 
 struct OutputCopy {
@@ -189,6 +195,17 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Record an in-graph-order buffer-to-buffer DMA (e.g. a flat copy).
+  void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
+    WebGPUDispatch d;
+    d.kind = WebGPUDispatch::Kind::Copy;
+    d.copy_src = src;
+    d.copy_dst = dst;
+    d.copy_nbytes = nbytes;
+    d.kernel_name = "flat_copy";
+    dispatches_.push_back(d);
+  }
+
   // Materialize a recorded prepack-routed constant into dst via one CPU->GPU
   // transfer. Build-time only (the .pte bytes are freed after build()).
   // Mirrors Vulkan prepack_standard.
diff --git a/backends/webgpu/runtime/ops/select/Select.cpp b/backends/webgpu/runtime/ops/select/Select.cpp
new file mode 100644
index 00000000000..5686bbc79c0
--- /dev/null
+++ b/backends/webgpu/runtime/ops/select/Select.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/TensorMeta.h>
+#include <executorch/backends/webgpu/runtime/ops/select/select_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+struct SelectParams {
+  uint32_t dim;
+  uint32_t index;
+  uint32_t _pad[2];
+};
+
+// dim/index are required Ints (SymInt throws); no Null default unlike slice.
+int64_t read_scalar(WebGPUGraph& graph, int id, const char* what) {
+  if (graph.get_value_type(id) == WebGPUGraph::ValueType::Int) {
+    return graph.get_int(id);
+  }
+  throw std::runtime_error(std::string("select: dynamic/unsupported ") + what);
+}
+
+void select_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dim, index, out]; output rank = in rank - 1.
+  const int in_id = args.at(0);
+  const int out_id = args.at(3);
+
+  WGPUDevice device = graph.device();
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) {
+    throw std::runtime_error("select: null buffer binding");
+  }
+
+  const int in_ndim = static_cast<int>(in_tensor.dims.size());
+  int64_t dim = read_scalar(graph, args.at(1), "dim");
+  if (dim < 0) {
+    dim += in_ndim;
+  }
+  if (dim < 0 || dim >= in_ndim) {
+    throw std::runtime_error("select: dim out of range");
+  }
+  const int64_t in_size = in_tensor.dims[dim];
+  int64_t index = read_scalar(graph, args.at(2), "index");
+  if (index < 0) {
+    index += in_size;
+  }
+  if (index < 0 || index >= in_size) {
+    throw std::runtime_error("select: index out of range");
+  }
+
+  TensorMeta out_meta;
+  TensorMeta in_meta;
+  fill_tensor_meta(out_tensor, &out_meta);
+  fill_tensor_meta(in_tensor, &in_meta);
+  if (out_tensor.nbytes !=
+          static_cast<size_t>(out_meta.numel) * sizeof(float) ||
+      in_tensor.nbytes != static_cast<size_t>(in_meta.numel) * sizeof(float)) {
+    throw std::runtime_error("select: non-fp32 operand (nbytes != numel * 4)");
+  }
+
+  SelectParams params = {};
+  params.dim = static_cast<uint32_t>(dim);
+  params.index = static_cast<uint32_t>(index);
+
+  uint32_t wg_size = utils::clamp_workgroup_size(device, kSelectWorkgroupSizeX);
+  uint32_t workgroup_count = utils::compute_1d_workgroup_count(
+      device, out_meta.numel, wg_size, "select");
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  WGPUBuffer out_meta_buf =
+      utils::make_uniform(device, &out_meta, sizeof(TensorMeta));
+  WGPUBuffer in_meta_buf =
+      utils::make_uniform(device, &in_meta, sizeof(TensorMeta));
+  WGPUBuffer params_buf =
+      utils::make_uniform(device, &params, sizeof(SelectParams));
+  graph.add_uniform_buffer_bytes(2 * sizeof(TensorMeta) + sizeof(SelectParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {kSelectWGSL, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms).
+  WGPUBindGroupLayoutEntry entries[5] = {};
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[3].binding = 3;
+  entries[3].visibility = WGPUShaderStage_Compute;
+  entries[3].buffer.type = WGPUBufferBindingType_Uniform;
+  entries[4].binding = 4;
+  entries[4].visibility = WGPUShaderStage_Compute;
+  entries[4].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 5;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[5] = {};
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = in_tensor.buffer;
+  bg_entries[0].size = in_tensor.nbytes;
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = out_meta_buf;
+  bg_entries[2].size = sizeof(TensorMeta);
+  bg_entries[3].binding = 3;
+  bg_entries[3].buffer = in_meta_buf;
+  bg_entries[3].size = sizeof(TensorMeta);
+  bg_entries[4].binding = 4;
+  bg_entries[4].buffer = params_buf;
+  bg_entries[4].size = sizeof(SelectParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 5;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our refs; the bind group keeps the uniforms alive until release.
+  wgpuBufferRelease(out_meta_buf);
+  wgpuBufferRelease(in_meta_buf);
+  wgpuBufferRelease(params_buf);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.select_copy.int, select_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/select/select.wgsl b/backends/webgpu/runtime/ops/select/select.wgsl
new file mode 100644
index 00000000000..84938b2c840
--- /dev/null
+++ b/backends/webgpu/runtime/ops/select/select.wgsl
@@ -0,0 +1,41 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  dim: u32,
+  index: u32,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index.
+    var rem = out_bufi;
+    var in_bufi: u32 = params.index * in_meta.strides[params.dim];
+    for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) {
+        let coord = rem / out_meta.strides[od];
+        rem = rem % out_meta.strides[od];
+        var id = od;
+        if (od >= params.dim) {
+            id = od + 1u;
+        }
+        in_bufi = in_bufi + coord * in_meta.strides[id];
+    }
+    output[out_bufi] = input[in_bufi];
+}
diff --git a/backends/webgpu/runtime/ops/select/select_wgsl.h b/backends/webgpu/runtime/ops/select/select_wgsl.h
new file mode 100644
index 00000000000..e66edde240d
--- /dev/null
+++ b/backends/webgpu/runtime/ops/select/select_wgsl.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from select.wgsl - DO NOT EDIT.
+// wgsl-sha256: 200cf5a8190045aa0562e782f01c1cfaf9681f30f679f5112ccc3d347a0ed8df
+inline constexpr const char* kSelectWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct TensorMeta {
+  ndim: u32,
+  numel: u32,
+  sizes: vec4<u32>,
+  strides: vec4<u32>,
+}
+@group(0) @binding(2) var<uniform> out_meta: TensorMeta;
+@group(0) @binding(3) var<uniform> in_meta: TensorMeta;
+
+struct Params {
+  dim: u32,
+  index: u32,
+}
+@group(0) @binding(4) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let out_bufi = gid.x;
+    if (out_bufi >= out_meta.numel) {
+        return;
+    }
+
+    // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index.
+    var rem = out_bufi;
+    var in_bufi: u32 = params.index * in_meta.strides[params.dim];
+    for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) {
+        let coord = rem / out_meta.strides[od];
+        rem = rem % out_meta.strides[od];
+        var id = od;
+        if (od >= params.dim) {
+            id = od + 1u;
+        }
+        in_bufi = in_bufi + coord * in_meta.strides[id];
+    }
+    output[out_bufi] = input[in_bufi];
+}
+)";
+
+inline constexpr uint32_t kSelectWorkgroupSizeX = 64;
+inline constexpr uint32_t kSelectWorkgroupSizeY = 1;
+inline constexpr uint32_t kSelectWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp
new file mode 100644
index 00000000000..4d1a087cae5
--- /dev/null
+++ b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/WebGPUUtils.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h>
+
+#include <webgpu/webgpu.h>
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// Uniform buffer layout matching the WGSL Params struct; 16-byte aligned.
+struct UnaryParams {
+  uint32_t num_elements;
+  uint32_t _pad[3];
+};
+
+// Generic elementwise unary op; mirrors Vulkan add_unary_op_node (UnaryOp.cpp).
+void add_unary_op(
+    WebGPUGraph& graph,
+    int in_id,
+    int out_id,
+    const char* wgsl_source,
+    uint32_t wg_size_x,
+    const char* op_name) {
+  WGPUDevice device = graph.device();
+
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) {
+    throw std::runtime_error(std::string(op_name) + ": null buffer binding");
+  }
+
+  // 4-byte (fp32) alignment guard on both operands; also the dtype guard.
+  if (in_tensor.nbytes % sizeof(float) != 0 ||
+      out_tensor.nbytes % sizeof(float) != 0) {
+    throw std::runtime_error(
+        std::string(op_name) + ": operand not 4-byte aligned");
+  }
+  if (in_tensor.nbytes != out_tensor.nbytes) {
+    throw std::runtime_error(
+        std::string(op_name) + ": input/output size mismatch");
+  }
+
+  uint32_t num_elements =
+      static_cast<uint32_t>(out_tensor.nbytes / sizeof(float));
+
+  uint32_t wg_size = utils::clamp_workgroup_size(device, wg_size_x);
+  uint32_t workgroup_count =
+      utils::compute_1d_workgroup_count(device, num_elements, wg_size, op_name);
+
+  WGPUConstantEntry wg_size_constant = {};
+  wg_size_constant.key = {"wg_size", WGPU_STRLEN};
+  wg_size_constant.value = static_cast<double>(wg_size);
+
+  UnaryParams params = {};
+  params.num_elements = num_elements;
+
+  WGPUBuffer uniform_buffer =
+      utils::make_uniform(device, &params, sizeof(UnaryParams));
+  graph.add_uniform_buffer_bytes(sizeof(UnaryParams));
+
+  WGPUShaderSourceWGSL wgsl_desc = {};
+  wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl_desc.code = {wgsl_source, WGPU_STRLEN};
+
+  WGPUShaderModuleDescriptor shader_desc = {};
+  shader_desc.nextInChain = &wgsl_desc.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc);
+
+  // Bind group layout: input (read storage) + output (storage) + params.
+  WGPUBindGroupLayoutEntry entries[3] = {};
+
+  entries[0].binding = 0;
+  entries[0].visibility = WGPUShaderStage_Compute;
+  entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage;
+
+  entries[1].binding = 1;
+  entries[1].visibility = WGPUShaderStage_Compute;
+  entries[1].buffer.type = WGPUBufferBindingType_Storage;
+
+  entries[2].binding = 2;
+  entries[2].visibility = WGPUShaderStage_Compute;
+  entries[2].buffer.type = WGPUBufferBindingType_Uniform;
+
+  WGPUBindGroupLayoutDescriptor bgl_desc = {};
+  bgl_desc.entryCount = 3;
+  bgl_desc.entries = entries;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc);
+
+  WGPUPipelineLayoutDescriptor pl_desc = {};
+  pl_desc.bindGroupLayoutCount = 1;
+  pl_desc.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pipeline_layout =
+      wgpuDeviceCreatePipelineLayout(device, &pl_desc);
+
+  WGPUComputePipelineDescriptor pipeline_desc = {};
+  pipeline_desc.layout = pipeline_layout;
+  pipeline_desc.compute.module = shader;
+  pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN};
+  pipeline_desc.compute.constantCount = 1;
+  pipeline_desc.compute.constants = &wg_size_constant;
+  WGPUComputePipeline pipeline =
+      wgpuDeviceCreateComputePipeline(device, &pipeline_desc);
+
+  WGPUBindGroupEntry bg_entries[3] = {};
+
+  bg_entries[0].binding = 0;
+  bg_entries[0].buffer = in_tensor.buffer;
+  bg_entries[0].size = in_tensor.nbytes;
+
+  bg_entries[1].binding = 1;
+  bg_entries[1].buffer = out_tensor.buffer;
+  bg_entries[1].size = out_tensor.nbytes;
+
+  bg_entries[2].binding = 2;
+  bg_entries[2].buffer = uniform_buffer;
+  bg_entries[2].size = sizeof(UnaryParams);
+
+  WGPUBindGroupDescriptor bg_desc = {};
+  bg_desc.layout = bgl;
+  bg_desc.entryCount = 3;
+  bg_desc.entries = bg_entries;
+  WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc);
+
+  graph.add_dispatch({pipeline, bind_group, workgroup_count});
+
+  // Release intermediates (pipeline + bind_group are kept by dispatch).
+  wgpuShaderModuleRelease(shader);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuPipelineLayoutRelease(pipeline_layout);
+  // Drop our ref; the bind group keeps the uniform buffer alive until release.
+  wgpuBufferRelease(uniform_buffer);
+}
+
+void sigmoid_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // aten.sigmoid.default args: [in, out]
+  add_unary_op(
+      graph,
+      args.at(0),
+      args.at(1),
+      kSigmoidWGSL,
+      kSigmoidWorkgroupSizeX,
+      "sigmoid");
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.sigmoid.default, sigmoid_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl
new file mode 100644
index 00000000000..09b3e5457b3
--- /dev/null
+++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl
@@ -0,0 +1,18 @@
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct Params {
+  num_elements: u32,
+}
+@group(0) @binding(2) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x;
+    if (idx >= params.num_elements) {
+        return;
+    }
+    output[idx] = 1.0 / (1.0 + exp(-input[idx]));
+}
diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h
new file mode 100644
index 00000000000..48e6efc607a
--- /dev/null
+++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// @generated from sigmoid.wgsl - DO NOT EDIT.
+// wgsl-sha256: 70395dbb107b8b95ae13c0a6fb12a8415c561c645da0347294c92904314ae84c
+inline constexpr const char* kSigmoidWGSL = R"(
+@group(0) @binding(0) var<storage, read> input: array<f32>;
+@group(0) @binding(1) var<storage, read_write> output: array<f32>;
+
+struct Params {
+  num_elements: u32,
+}
+@group(0) @binding(2) var<uniform> params: Params;
+
+override wg_size: u32 = 64u;
+
+@compute @workgroup_size(wg_size, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x;
+    if (idx >= params.num_elements) {
+        return;
+    }
+    output[idx] = 1.0 / (1.0 + exp(-input[idx]));
+}
+)";
+
+inline constexpr uint32_t kSigmoidWorkgroupSizeX = 64;
+inline constexpr uint32_t kSigmoidWorkgroupSizeY = 1;
+inline constexpr uint32_t kSigmoidWorkgroupSizeZ = 1;
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
new file mode 100644
index 00000000000..12b0fe561f1
--- /dev/null
+++ b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// squeeze_copy.dims = numel-preserving flat copy (Vulkan Squeeze.cpp:102-104).
+void squeeze_copy_dims_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dims, out]; dims ignored (out shape fixed AOT).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp
new file mode 100644
index 00000000000..27d2c52e708
--- /dev/null
+++ b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// unsqueeze_copy = numel-preserving flat copy (Vulkan Unsqueeze.cpp:101-103).
+void unsqueeze_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, dim, out]; dim ignored (out shape fixed AOT, like view_copy).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
new file mode 100644
index 00000000000..d56c721ce3e
--- /dev/null
+++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+#include <executorch/backends/webgpu/runtime/ops/view_copy/view_copy.h>
+
+#include <stdexcept>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id) {
+  // get_tensor doesn't type-check; assert both args are tensors (fail loud).
+  if (graph.get_value_type(in_id) != WebGPUGraph::ValueType::Tensor ||
+      graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) {
+    throw std::runtime_error("flat_copy: in/out arg is not a tensor");
+  }
+
+  const auto& in_tensor = graph.get_tensor(in_id);
+  const auto& out_tensor = graph.get_tensor(out_id);
+  // Contiguous reshape = flat byte copy; mirrors Vulkan view_buffer (no-remap).
+
+  // 4-byte alignment guard (fp32 element size); does not verify dtype.
+  if (in_tensor.nbytes % sizeof(float) != 0 ||
+      out_tensor.nbytes % sizeof(float) != 0) {
+    throw std::runtime_error("flat_copy: operand not 4-byte aligned");
+  }
+
+  // view preserves numel; this guard also prevents an OOB copy.
+  if (in_tensor.nbytes != out_tensor.nbytes) {
+    throw std::runtime_error("flat_copy: input/output size mismatch");
+  }
+
+  // Aliased in/out already in place; CopyBufferToBuffer rejects src == dst.
+  if (in_tensor.buffer == out_tensor.buffer) {
+    return;
+  }
+
+  graph.add_buffer_copy(in_tensor.buffer, out_tensor.buffer, out_tensor.nbytes);
+}
+
+namespace {
+
+// view_copy = contiguous reshape = flat copy (mirrors Vulkan view_buffer).
+void view_copy_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // args: [self, size, out]; out = last value-id (shape from out_tensor.dims).
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/view_copy/view_copy.h b/backends/webgpu/runtime/ops/view_copy/view_copy.h
new file mode 100644
index 00000000000..bfa81174ba9
--- /dev/null
+++ b/backends/webgpu/runtime/ops/view_copy/view_copy.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+
+namespace executorch::backends::webgpu {
+
+// Flat copy output[i]=input[i]; mirrors Vulkan add_view_copy_node (View.h).
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id);
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py
index febdbd507a8..0db8685fa18 100644
--- a/backends/webgpu/test/op_tests/cases.py
+++ b/backends/webgpu/test/op_tests/cases.py
@@ -36,6 +36,33 @@
     _ramp,
     RmsNormModule,
 )
+from executorch.backends.webgpu.test.ops.test_mul import (
+    CONFIGS as _MUL_CONFIGS,
+    MulModule,
+)
+from executorch.backends.webgpu.test.ops.test_select import (
+    CONFIGS as _SELECT_CONFIGS,
+    SelectModule,
+)
+from executorch.backends.webgpu.test.ops.test_sigmoid import (
+    _det_input as _sigmoid_det_input,
+    N as _SIGMOID_N,
+    SigmoidModule,
+)
+
+from executorch.backends.webgpu.test.ops.test_squeeze import (
+    CONFIGS as _SQUEEZE_CONFIGS,
+    SqueezeModule,
+)
+
+from executorch.backends.webgpu.test.ops.test_unsqueeze import (
+    CONFIGS as _UNSQUEEZE_CONFIGS,
+    UnsqueezeModule,
+)
+from executorch.backends.webgpu.test.ops.test_view_copy import (
+    CONFIGS as _VIEW_CONFIGS,
+    ViewModule,
+)
 
 # rms_norm coverage is exactly the 15 cases the native test covered.
 RMS_NORM_CASES = _CASES
@@ -106,3 +133,90 @@ def _rms_norm_suite() -> WebGPUTestSuite:
             )
         )
     return WebGPUTestSuite(module_factory=_rms_norm_factory, cases=cases)
+
+
+@register_op_test("mul")
+def _mul_suite() -> WebGPUTestSuite:
+    # Full numeric coverage incl. broadcast (binary_mul.wgsl over a TensorMeta UBO); fp64 golden.
+    return WebGPUTestSuite(
+        module_factory=lambda: MulModule(),
+        cases=[
+            Case(name=name, inputs=(sa, sb)) for name, (sa, sb) in _MUL_CONFIGS.items()
+        ],
+    )
+
+
+def _fn_config_suite(module_cls, configs) -> WebGPUTestSuite:
+    """Builder for ops whose per-case spec is a (shape, fn) pair (view/select/slice).
+    The fn is a `construct` kwarg baked into the .pte module, never a serialized input.
+    """
+    return WebGPUTestSuite(
+        module_factory=lambda fn: module_cls(fn),
+        cases=[
+            Case(name=n, construct={"fn": fn}, inputs=(shape,))
+            for n, (shape, fn) in configs.items()
+        ],
+        golden_dtype="float32",  # gather/copy: fp64 bit-identical, skip dual-oracle
+    )
+
+
+@register_op_test("view_copy")
+def _view_copy_suite() -> WebGPUTestSuite:
+    return _fn_config_suite(ViewModule, _VIEW_CONFIGS)
+
+
+@register_op_test("select")
+def _select_suite() -> WebGPUTestSuite:
+    return _fn_config_suite(SelectModule, _SELECT_CONFIGS)
+
+
+def _sigmoid_full_range(_shape) -> torch.Tensor:
+    # Reuses the monolith's saturation-tail input (linspace(-12, 12)).
+    return _sigmoid_det_input()
+
+
+@register_op_test("sigmoid")
+def _sigmoid_suite() -> WebGPUTestSuite:
+    # sigmoid has no CONFIGS table; cover unary shapes directly (tol 1e-4).
+    return WebGPUTestSuite(
+        module_factory=lambda: SigmoidModule(),
+        cases=[
+            Case(name="vec", inputs=((M1,),)),
+            Case(name="mat", inputs=((M1, M2),)),
+            Case(name="rank3", inputs=((S1, M1, M2),)),
+            Case(name="rank4", inputs=((S1, S2, S2, M2),)),
+            # Saturation tails sigmoid(+-12) (~6e-6 / 0.999994) that randn shapes miss.
+            Case(
+                name="saturation",
+                inputs=(InputSpec(shape=(_SIGMOID_N,), gen=_sigmoid_full_range),),
+            ),
+        ],
+        atol=1e-4,
+        rtol=1e-4,
+    )
+
+
+@register_op_test("squeeze")
+def _squeeze_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (shape, dim) where dim is an int or a tuple.
+    return WebGPUTestSuite(
+        module_factory=lambda dim: SqueezeModule(dim),
+        cases=[
+            Case(name=n, construct={"dim": dim}, inputs=(shape,))
+            for n, (shape, dim) in _SQUEEZE_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
+
+
+@register_op_test("unsqueeze")
+def _unsqueeze_suite() -> WebGPUTestSuite:
+    # CONFIGS: name -> (shape, dim).
+    return WebGPUTestSuite(
+        module_factory=lambda dim: UnsqueezeModule(dim),
+        cases=[
+            Case(name=n, construct={"dim": dim}, inputs=(shape,))
+            for n, (shape, dim) in _UNSQUEEZE_CONFIGS.items()
+        ],
+        golden_dtype="float32",  # reshape copies values; fp64 bit-identical
+    )
diff --git a/backends/webgpu/test/ops/test_mul.py b/backends/webgpu/test/ops/test_mul.py
new file mode 100644
index 00000000000..cca04bcdaa4
--- /dev/null
+++ b/backends/webgpu/test/ops/test_mul.py
@@ -0,0 +1,66 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.mul.Tensor` (full broadcast) module + configs for the WebGPU op-test framework.
+
+`MulModule` + `CONFIGS` are imported by `cases.py` to drive the declarative op-test
+suite (export via VulkanPartitioner + fp64 torch golden, run on Dawn). `MulTest` is
+the export-delegation smoke test. Configs span the same-shape
+fast path (SwiGLU), last-dim broadcast at LLM width, and a mixed-rank left-pad case.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (shape_a, shape_b). Output shape is the broadcast of the two.
+CONFIGS = {
+    "same": ((8, 32), (8, 32)),  # fast path (SwiGLU same-shape)
+    "bcast_lastdim": ((1, 1, 7, 896), (1, 1, 7, 1)),  # last-dim broadcast, LLM width
+    "mixedrank": ((4,), (3, 4)),  # right-aligned left-pad (in.ndim < out.ndim)
+}
+
+
+class MulModule(torch.nn.Module):
+    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+        return a * b
+
+
+def _det_inputs(shape_a, shape_b):
+    """Deterministic fp32 inputs (fixed seed) for a config."""
+    g = torch.Generator().manual_seed(0)
+    a = torch.randn(*shape_a, generator=g, dtype=torch.float32)
+    b = torch.randn(*shape_b, generator=g, dtype=torch.float32)
+    return a, b
+
+
+def _export(a: torch.Tensor, b: torch.Tensor):
+    ep = torch.export.export(MulModule().eval(), (a, b))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+class MulTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (sa, sb) in CONFIGS.items():
+            with self.subTest(name=name):
+                a, b = _det_inputs(sa, sb)
+                et = _export(a, b)
+                self.assertTrue(
+                    _delegated(et), f"Expected a VulkanBackend delegate (mul {name})"
+                )
diff --git a/backends/webgpu/test/ops/test_select.py b/backends/webgpu/test/ops/test_select.py
new file mode 100644
index 00000000000..739ebe4d190
--- /dev/null
+++ b/backends/webgpu/test/ops/test_select.py
@@ -0,0 +1,67 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.select_copy.int` module + configs for the WebGPU op-test framework.
+
+`SelectModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `SelectTest` is the export-delegation smoke test.
+Configs cover the leading, middle, and last dim plus a negative index (output rank =
+input rank - 1).
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, select_fn)
+CONFIGS = {
+    "dim0": ((3, 8, 4), lambda x: x[1]),
+    "middle": ((3, 8, 4), lambda x: x[:, 2]),
+    "last": ((3, 8, 4), lambda x: x[..., 3]),
+    "neg_idx": ((3, 8, 4), lambda x: x[:, -1]),
+}
+
+
+class SelectModule(torch.nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fn(x)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _export(fn, x: torch.Tensor):
+    ep = torch.export.export(SelectModule(fn).eval(), (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+class SelectTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, fn) in CONFIGS.items():
+            with self.subTest(name=name):
+                et = _export(fn, _det_input(shape))
+                self.assertTrue(
+                    _delegated(et), f"Expected a VulkanBackend delegate (select {name})"
+                )
diff --git a/backends/webgpu/test/ops/test_sigmoid.py b/backends/webgpu/test/ops/test_sigmoid.py
new file mode 100644
index 00000000000..0ba8c435a9a
--- /dev/null
+++ b/backends/webgpu/test/ops/test_sigmoid.py
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.sigmoid.default` module + input for the WebGPU op-test framework.
+
+`SigmoidModule`, `N`, and `_det_input` are imported by `cases.py` to drive the
+declarative op-test suite. `SigmoidTest` is the export-delegation
+smoke test. Sigmoid is on the Llama critical path (`F.silu` -> `sigmoid` + `mul`); the
+deterministic input spans the saturation tails.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# Input length; the deterministic input spans the saturation tails.
+N = 64
+
+
+class SigmoidModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.sigmoid(x)
+
+
+def _det_input() -> torch.Tensor:
+    """Deterministic fp32 input spanning negatives, zero, and large magnitudes."""
+    return torch.linspace(-12.0, 12.0, N, dtype=torch.float32)
+
+
+def _export(m: torch.nn.Module, x: torch.Tensor):
+    ep = torch.export.export(m, (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+class SigmoidTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        et = _export(SigmoidModule().eval(), _det_input())
+        found = any(
+            d.id == "VulkanBackend"
+            for plan in et.executorch_program.execution_plan
+            for d in plan.delegates
+        )
+        self.assertTrue(found, "Expected a VulkanBackend delegate (sigmoid)")
diff --git a/backends/webgpu/test/ops/test_squeeze.py b/backends/webgpu/test/ops/test_squeeze.py
new file mode 100644
index 00000000000..b55a5143538
--- /dev/null
+++ b/backends/webgpu/test/ops/test_squeeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.squeeze_copy.dims` module + configs for the WebGPU op-test framework.
+
+`SqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `SqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, squeeze_dim)
+CONFIGS = {
+    "dim0": ((1, 3, 4), 0),
+    "mid": ((2, 1, 4), 1),
+    "multi": ((1, 3, 1, 4), (0, 2)),
+}
+
+
+class SqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.squeeze(x, self.dim)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    ep = torch.export.export(SqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class SqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (squeeze {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "squeeze_copy"),
+                    f"squeeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/ops/test_unsqueeze.py b/backends/webgpu/test/ops/test_unsqueeze.py
new file mode 100644
index 00000000000..dcddf4faa51
--- /dev/null
+++ b/backends/webgpu/test/ops/test_unsqueeze.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.unsqueeze_copy.default` module + configs for the WebGPU op-test framework.
+
+`UnsqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `UnsqueezeTest` is the export-delegation smoke
+test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, unsqueeze_dim)
+CONFIGS = {
+    "front": ((3, 4), 0),
+    "mid": ((2, 4), 1),
+    "last": ((3, 4), 2),
+}
+
+
+class UnsqueezeModule(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.unsqueeze(x, self.dim)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _lower(dim, x: torch.Tensor):
+    ep = torch.export.export(UnsqueezeModule(dim).eval(), (x,))
+    return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()])
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+def _op_delegated(edge, op_substr: str) -> bool:
+    # op must be absorbed into the delegate, not left as a top-level CPU-fallback node.
+    gm = edge.exported_program().graph_module
+    return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes)
+
+
+class UnsqueezeTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, dim) in CONFIGS.items():
+            with self.subTest(name=name):
+                edge = _lower(dim, _det_input(shape))
+                et = edge.to_executorch()
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (unsqueeze {name})",
+                )
+                self.assertTrue(
+                    _op_delegated(edge, "unsqueeze_copy"),
+                    f"unsqueeze_copy not delegated (fell back to CPU) for {name}",
+                )
diff --git a/backends/webgpu/test/ops/test_view_copy.py b/backends/webgpu/test/ops/test_view_copy.py
new file mode 100644
index 00000000000..a9c28f2b631
--- /dev/null
+++ b/backends/webgpu/test/ops/test_view_copy.py
@@ -0,0 +1,66 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""`aten.view_copy.default` module + configs for the WebGPU op-test framework.
+
+`ViewModule` + `CONFIGS` are imported by `cases.py` to drive the declarative
+op-test suite. `ViewCopyTest` is the export-delegation
+smoke test.
+"""
+
+import unittest
+
+import torch
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# name -> (input_shape, view_fn)
+CONFIGS = {
+    "reshape": ((2, 3, 4), lambda x: x.reshape(6, 4)),
+    "flatten": ((2, 3, 4), lambda x: x.reshape(-1)),
+    "reshape4d": ((2, 3, 4), lambda x: x.reshape(1, 2, 3, 4)),
+}
+
+
+class ViewModule(torch.nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fn(x)
+
+
+def _det_input(shape):
+    g = torch.Generator().manual_seed(0)
+    return torch.randn(*shape, generator=g, dtype=torch.float32)
+
+
+def _export(fn, x: torch.Tensor):
+    ep = torch.export.export(ViewModule(fn).eval(), (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+def _delegated(et) -> bool:
+    return any(
+        d.id == "VulkanBackend"
+        for plan in et.executorch_program.execution_plan
+        for d in plan.delegates
+    )
+
+
+class ViewCopyTest(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        for name, (shape, fn) in CONFIGS.items():
+            with self.subTest(name=name):
+                et = _export(fn, _det_input(shape))
+                self.assertTrue(
+                    _delegated(et),
+                    f"Expected a VulkanBackend delegate (view_copy {name})",
+                )
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index 2e67df442e6..53a745a16df 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -21,6 +21,12 @@
 WEBGPU_SUPPORTED_OPS = [
     exir_ops.edge.aten.add.Tensor,
     exir_ops.edge.et_vk.rms_norm.default,
+    exir_ops.edge.aten.mul.Tensor,
+    exir_ops.edge.aten.view_copy.default,
+    exir_ops.edge.aten.select_copy.int,
+    exir_ops.edge.aten.sigmoid.default,
+    exir_ops.edge.aten.squeeze_copy.dims,
+    exir_ops.edge.aten.unsqueeze_copy.default,
 ]