diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index f7cd85f9758..01bb5236a44 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -42,6 +42,11 @@ set(WEBGPU_SRCS runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp runtime/ops/rope/RotaryEmbedding.cpp runtime/ops/prepack/Prepack.cpp + runtime/ops/view_copy/ViewCopy.cpp + runtime/ops/select/Select.cpp + runtime/ops/sigmoid/UnaryOp.cpp + runtime/ops/squeeze/Squeeze.cpp + runtime/ops/unsqueeze/Unsqueeze.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index b7fb4313400..0e00b2cb42b 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -679,6 +679,16 @@ void WebGPUGraph::execute() { // One pass per dispatch: enforces storage RAW ordering across deps. for (size_t i = 0; i < n; i++) { const auto& dispatch = dispatches_[i]; + if (dispatch.kind == WebGPUDispatch::Kind::Copy) { + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + dispatch.copy_src, + 0, + dispatch.copy_dst, + 0, + dispatch.copy_nbytes); + continue; + } WGPUComputePassDescriptor pass_desc = {}; #ifdef WGPU_BACKEND_ENABLE_PROFILING // tw must outlive BeginComputePass (the descriptor points at it). @@ -757,6 +767,16 @@ void WebGPUGraph::execute() { wgpuDeviceCreateCommandEncoder(device_, &enc_desc); for (size_t i = start; i < end; i++) { + if (dispatches_[i].kind == WebGPUDispatch::Kind::Copy) { + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + dispatches_[i].copy_src, + 0, + dispatches_[i].copy_dst, + 0, + dispatches_[i].copy_nbytes); + continue; + } WGPUComputePassDescriptor pass_desc = {}; WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index 3572f751a06..b9326cf016c 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -42,6 +42,12 @@ struct WebGPUDispatch { WGPUBindGroup bind_group = nullptr; uint32_t workgroup_count_x = 1; std::string kernel_name; // bench label + // DMA copy command; default Compute keeps existing positional inits valid. + enum class Kind { Compute, Copy }; + Kind kind = Kind::Compute; + WGPUBuffer copy_src = nullptr; + WGPUBuffer copy_dst = nullptr; + size_t copy_nbytes = 0; }; struct OutputCopy { @@ -189,6 +195,17 @@ class WebGPUGraph { dispatches_.push_back(dispatch); } + // Record an in-graph-order buffer-to-buffer DMA (e.g. a flat copy). + void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) { + WebGPUDispatch d; + d.kind = WebGPUDispatch::Kind::Copy; + d.copy_src = src; + d.copy_dst = dst; + d.copy_nbytes = nbytes; + d.kernel_name = "flat_copy"; + dispatches_.push_back(d); + } + // Materialize a recorded prepack-routed constant into dst via one CPU->GPU // transfer. Build-time only (the .pte bytes are freed after build()). // Mirrors Vulkan prepack_standard. diff --git a/backends/webgpu/runtime/ops/select/Select.cpp b/backends/webgpu/runtime/ops/select/Select.cpp new file mode 100644 index 00000000000..5686bbc79c0 --- /dev/null +++ b/backends/webgpu/runtime/ops/select/Select.cpp @@ -0,0 +1,184 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct SelectParams { + uint32_t dim; + uint32_t index; + uint32_t _pad[2]; +}; + +// dim/index are required Ints (SymInt throws); no Null default unlike slice. +int64_t read_scalar(WebGPUGraph& graph, int id, const char* what) { + if (graph.get_value_type(id) == WebGPUGraph::ValueType::Int) { + return graph.get_int(id); + } + throw std::runtime_error(std::string("select: dynamic/unsupported ") + what); +} + +void select_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dim, index, out]; output rank = in rank - 1. + const int in_id = args.at(0); + const int out_id = args.at(3); + + WGPUDevice device = graph.device(); + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) { + throw std::runtime_error("select: null buffer binding"); + } + + const int in_ndim = static_cast(in_tensor.dims.size()); + int64_t dim = read_scalar(graph, args.at(1), "dim"); + if (dim < 0) { + dim += in_ndim; + } + if (dim < 0 || dim >= in_ndim) { + throw std::runtime_error("select: dim out of range"); + } + const int64_t in_size = in_tensor.dims[dim]; + int64_t index = read_scalar(graph, args.at(2), "index"); + if (index < 0) { + index += in_size; + } + if (index < 0 || index >= in_size) { + throw std::runtime_error("select: index out of range"); + } + + TensorMeta out_meta; + TensorMeta in_meta; + fill_tensor_meta(out_tensor, &out_meta); + fill_tensor_meta(in_tensor, &in_meta); + if (out_tensor.nbytes != + static_cast(out_meta.numel) * sizeof(float) || + in_tensor.nbytes != static_cast(in_meta.numel) * sizeof(float)) { + throw std::runtime_error("select: non-fp32 operand (nbytes != numel * 4)"); + } + + SelectParams params = {}; + params.dim = static_cast(dim); + params.index = static_cast(index); + + uint32_t wg_size = utils::clamp_workgroup_size(device, kSelectWorkgroupSizeX); + uint32_t workgroup_count = utils::compute_1d_workgroup_count( + device, out_meta.numel, wg_size, "select"); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + WGPUBuffer out_meta_buf = + utils::make_uniform(device, &out_meta, sizeof(TensorMeta)); + WGPUBuffer in_meta_buf = + utils::make_uniform(device, &in_meta, sizeof(TensorMeta)); + WGPUBuffer params_buf = + utils::make_uniform(device, ¶ms, sizeof(SelectParams)); + graph.add_uniform_buffer_bytes(2 * sizeof(TensorMeta) + sizeof(SelectParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kSelectWGSL, WGPU_STRLEN}; + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Bind group: in, out (rw), out_meta, in_meta, params (3 uniforms). + WGPUBindGroupLayoutEntry entries[5] = {}; + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Uniform; + entries[3].binding = 3; + entries[3].visibility = WGPUShaderStage_Compute; + entries[3].buffer.type = WGPUBufferBindingType_Uniform; + entries[4].binding = 4; + entries[4].visibility = WGPUShaderStage_Compute; + entries[4].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 5; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[5] = {}; + bg_entries[0].binding = 0; + bg_entries[0].buffer = in_tensor.buffer; + bg_entries[0].size = in_tensor.nbytes; + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + bg_entries[2].binding = 2; + bg_entries[2].buffer = out_meta_buf; + bg_entries[2].size = sizeof(TensorMeta); + bg_entries[3].binding = 3; + bg_entries[3].buffer = in_meta_buf; + bg_entries[3].size = sizeof(TensorMeta); + bg_entries[4].binding = 4; + bg_entries[4].buffer = params_buf; + bg_entries[4].size = sizeof(SelectParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 5; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // Drop our refs; the bind group keeps the uniforms alive until release. + wgpuBufferRelease(out_meta_buf); + wgpuBufferRelease(in_meta_buf); + wgpuBufferRelease(params_buf); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.select_copy.int, select_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/select/select.wgsl b/backends/webgpu/runtime/ops/select/select.wgsl new file mode 100644 index 00000000000..84938b2c840 --- /dev/null +++ b/backends/webgpu/runtime/ops/select/select.wgsl @@ -0,0 +1,41 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + dim: u32, + index: u32, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index. + var rem = out_bufi; + var in_bufi: u32 = params.index * in_meta.strides[params.dim]; + for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) { + let coord = rem / out_meta.strides[od]; + rem = rem % out_meta.strides[od]; + var id = od; + if (od >= params.dim) { + id = od + 1u; + } + in_bufi = in_bufi + coord * in_meta.strides[id]; + } + output[out_bufi] = input[in_bufi]; +} diff --git a/backends/webgpu/runtime/ops/select/select_wgsl.h b/backends/webgpu/runtime/ops/select/select_wgsl.h new file mode 100644 index 00000000000..e66edde240d --- /dev/null +++ b/backends/webgpu/runtime/ops/select/select_wgsl.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from select.wgsl - DO NOT EDIT. +// wgsl-sha256: 200cf5a8190045aa0562e782f01c1cfaf9681f30f679f5112ccc3d347a0ed8df +inline constexpr const char* kSelectWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct TensorMeta { + ndim: u32, + numel: u32, + sizes: vec4, + strides: vec4, +} +@group(0) @binding(2) var out_meta: TensorMeta; +@group(0) @binding(3) var in_meta: TensorMeta; + +struct Params { + dim: u32, + index: u32, +} +@group(0) @binding(4) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let out_bufi = gid.x; + if (out_bufi >= out_meta.numel) { + return; + } + + // Gather: out dim od -> in dim (od if od < dim else od+1); sel dim = index. + var rem = out_bufi; + var in_bufi: u32 = params.index * in_meta.strides[params.dim]; + for (var od: u32 = 0u; od < out_meta.ndim; od = od + 1u) { + let coord = rem / out_meta.strides[od]; + rem = rem % out_meta.strides[od]; + var id = od; + if (od >= params.dim) { + id = od + 1u; + } + in_bufi = in_bufi + coord * in_meta.strides[id]; + } + output[out_bufi] = input[in_bufi]; +} +)"; + +inline constexpr uint32_t kSelectWorkgroupSizeX = 64; +inline constexpr uint32_t kSelectWorkgroupSizeY = 1; +inline constexpr uint32_t kSelectWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp new file mode 100644 index 00000000000..4d1a087cae5 --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/UnaryOp.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +// Uniform buffer layout matching the WGSL Params struct; 16-byte aligned. +struct UnaryParams { + uint32_t num_elements; + uint32_t _pad[3]; +}; + +// Generic elementwise unary op; mirrors Vulkan add_unary_op_node (UnaryOp.cpp). +void add_unary_op( + WebGPUGraph& graph, + int in_id, + int out_id, + const char* wgsl_source, + uint32_t wg_size_x, + const char* op_name) { + WGPUDevice device = graph.device(); + + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + if (in_tensor.buffer == nullptr || out_tensor.buffer == nullptr) { + throw std::runtime_error(std::string(op_name) + ": null buffer binding"); + } + + // 4-byte (fp32) alignment guard on both operands; also the dtype guard. + if (in_tensor.nbytes % sizeof(float) != 0 || + out_tensor.nbytes % sizeof(float) != 0) { + throw std::runtime_error( + std::string(op_name) + ": operand not 4-byte aligned"); + } + if (in_tensor.nbytes != out_tensor.nbytes) { + throw std::runtime_error( + std::string(op_name) + ": input/output size mismatch"); + } + + uint32_t num_elements = + static_cast(out_tensor.nbytes / sizeof(float)); + + uint32_t wg_size = utils::clamp_workgroup_size(device, wg_size_x); + uint32_t workgroup_count = + utils::compute_1d_workgroup_count(device, num_elements, wg_size, op_name); + + WGPUConstantEntry wg_size_constant = {}; + wg_size_constant.key = {"wg_size", WGPU_STRLEN}; + wg_size_constant.value = static_cast(wg_size); + + UnaryParams params = {}; + params.num_elements = num_elements; + + WGPUBuffer uniform_buffer = + utils::make_uniform(device, ¶ms, sizeof(UnaryParams)); + graph.add_uniform_buffer_bytes(sizeof(UnaryParams)); + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {wgsl_source, WGPU_STRLEN}; + + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + // Bind group layout: input (read storage) + output (storage) + params. + WGPUBindGroupLayoutEntry entries[3] = {}; + + entries[0].binding = 0; + entries[0].visibility = WGPUShaderStage_Compute; + entries[0].buffer.type = WGPUBufferBindingType_ReadOnlyStorage; + + entries[1].binding = 1; + entries[1].visibility = WGPUShaderStage_Compute; + entries[1].buffer.type = WGPUBufferBindingType_Storage; + + entries[2].binding = 2; + entries[2].visibility = WGPUShaderStage_Compute; + entries[2].buffer.type = WGPUBufferBindingType_Uniform; + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 3; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pipeline_layout = + wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = pipeline_layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + pipeline_desc.compute.constantCount = 1; + pipeline_desc.compute.constants = &wg_size_constant; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device, &pipeline_desc); + + WGPUBindGroupEntry bg_entries[3] = {}; + + bg_entries[0].binding = 0; + bg_entries[0].buffer = in_tensor.buffer; + bg_entries[0].size = in_tensor.nbytes; + + bg_entries[1].binding = 1; + bg_entries[1].buffer = out_tensor.buffer; + bg_entries[1].size = out_tensor.nbytes; + + bg_entries[2].binding = 2; + bg_entries[2].buffer = uniform_buffer; + bg_entries[2].size = sizeof(UnaryParams); + + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 3; + bg_desc.entries = bg_entries; + WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); + + graph.add_dispatch({pipeline, bind_group, workgroup_count}); + + // Release intermediates (pipeline + bind_group are kept by dispatch). + wgpuShaderModuleRelease(shader); + wgpuBindGroupLayoutRelease(bgl); + wgpuPipelineLayoutRelease(pipeline_layout); + // Drop our ref; the bind group keeps the uniform buffer alive until release. + wgpuBufferRelease(uniform_buffer); +} + +void sigmoid_impl(WebGPUGraph& graph, const std::vector& args) { + // aten.sigmoid.default args: [in, out] + add_unary_op( + graph, + args.at(0), + args.at(1), + kSigmoidWGSL, + kSigmoidWorkgroupSizeX, + "sigmoid"); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.sigmoid.default, sigmoid_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl new file mode 100644 index 00000000000..09b3e5457b3 --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid.wgsl @@ -0,0 +1,18 @@ +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct Params { + num_elements: u32, +} +@group(0) @binding(2) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = 1.0 / (1.0 + exp(-input[idx])); +} diff --git a/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h new file mode 100644 index 00000000000..48e6efc607a --- /dev/null +++ b/backends/webgpu/runtime/ops/sigmoid/sigmoid_wgsl.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// @generated from sigmoid.wgsl - DO NOT EDIT. +// wgsl-sha256: 70395dbb107b8b95ae13c0a6fb12a8415c561c645da0347294c92904314ae84c +inline constexpr const char* kSigmoidWGSL = R"( +@group(0) @binding(0) var input: array; +@group(0) @binding(1) var output: array; + +struct Params { + num_elements: u32, +} +@group(0) @binding(2) var params: Params; + +override wg_size: u32 = 64u; + +@compute @workgroup_size(wg_size, 1, 1) +fn main(@builtin(global_invocation_id) gid: vec3) { + let idx = gid.x; + if (idx >= params.num_elements) { + return; + } + output[idx] = 1.0 / (1.0 + exp(-input[idx])); +} +)"; + +inline constexpr uint32_t kSigmoidWorkgroupSizeX = 64; +inline constexpr uint32_t kSigmoidWorkgroupSizeY = 1; +inline constexpr uint32_t kSigmoidWorkgroupSizeZ = 1; + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp new file mode 100644 index 00000000000..12b0fe561f1 --- /dev/null +++ b/backends/webgpu/runtime/ops/squeeze/Squeeze.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace executorch::backends::webgpu { + +namespace { + +// squeeze_copy.dims = numel-preserving flat copy (Vulkan Squeeze.cpp:102-104). +void squeeze_copy_dims_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dims, out]; dims ignored (out shape fixed AOT). + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp new file mode 100644 index 00000000000..27d2c52e708 --- /dev/null +++ b/backends/webgpu/runtime/ops/unsqueeze/Unsqueeze.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include + +namespace executorch::backends::webgpu { + +namespace { + +// unsqueeze_copy = numel-preserving flat copy (Vulkan Unsqueeze.cpp:101-103). +void unsqueeze_copy_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, dim, out]; dim ignored (out shape fixed AOT, like view_copy). + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze_copy_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp new file mode 100644 index 00000000000..d56c721ce3e --- /dev/null +++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include + +namespace executorch::backends::webgpu { + +void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id) { + // get_tensor doesn't type-check; assert both args are tensors (fail loud). + if (graph.get_value_type(in_id) != WebGPUGraph::ValueType::Tensor || + graph.get_value_type(out_id) != WebGPUGraph::ValueType::Tensor) { + throw std::runtime_error("flat_copy: in/out arg is not a tensor"); + } + + const auto& in_tensor = graph.get_tensor(in_id); + const auto& out_tensor = graph.get_tensor(out_id); + // Contiguous reshape = flat byte copy; mirrors Vulkan view_buffer (no-remap). + + // 4-byte alignment guard (fp32 element size); does not verify dtype. + if (in_tensor.nbytes % sizeof(float) != 0 || + out_tensor.nbytes % sizeof(float) != 0) { + throw std::runtime_error("flat_copy: operand not 4-byte aligned"); + } + + // view preserves numel; this guard also prevents an OOB copy. + if (in_tensor.nbytes != out_tensor.nbytes) { + throw std::runtime_error("flat_copy: input/output size mismatch"); + } + + // Aliased in/out already in place; CopyBufferToBuffer rejects src == dst. + if (in_tensor.buffer == out_tensor.buffer) { + return; + } + + graph.add_buffer_copy(in_tensor.buffer, out_tensor.buffer, out_tensor.nbytes); +} + +namespace { + +// view_copy = contiguous reshape = flat copy (mirrors Vulkan view_buffer). +void view_copy_impl(WebGPUGraph& graph, const std::vector& args) { + // args: [self, size, out]; out = last value-id (shape from out_tensor.dims). + add_flat_copy(graph, args.at(0), args.at(args.size() - 1)); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/view_copy/view_copy.h b/backends/webgpu/runtime/ops/view_copy/view_copy.h new file mode 100644 index 00000000000..bfa81174ba9 --- /dev/null +++ b/backends/webgpu/runtime/ops/view_copy/view_copy.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch::backends::webgpu { + +// Flat copy output[i]=input[i]; mirrors Vulkan add_view_copy_node (View.h). +void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id); + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/test/op_tests/cases.py b/backends/webgpu/test/op_tests/cases.py index febdbd507a8..0db8685fa18 100644 --- a/backends/webgpu/test/op_tests/cases.py +++ b/backends/webgpu/test/op_tests/cases.py @@ -36,6 +36,33 @@ _ramp, RmsNormModule, ) +from executorch.backends.webgpu.test.ops.test_mul import ( + CONFIGS as _MUL_CONFIGS, + MulModule, +) +from executorch.backends.webgpu.test.ops.test_select import ( + CONFIGS as _SELECT_CONFIGS, + SelectModule, +) +from executorch.backends.webgpu.test.ops.test_sigmoid import ( + _det_input as _sigmoid_det_input, + N as _SIGMOID_N, + SigmoidModule, +) + +from executorch.backends.webgpu.test.ops.test_squeeze import ( + CONFIGS as _SQUEEZE_CONFIGS, + SqueezeModule, +) + +from executorch.backends.webgpu.test.ops.test_unsqueeze import ( + CONFIGS as _UNSQUEEZE_CONFIGS, + UnsqueezeModule, +) +from executorch.backends.webgpu.test.ops.test_view_copy import ( + CONFIGS as _VIEW_CONFIGS, + ViewModule, +) # rms_norm coverage is exactly the 15 cases the native test covered. RMS_NORM_CASES = _CASES @@ -106,3 +133,90 @@ def _rms_norm_suite() -> WebGPUTestSuite: ) ) return WebGPUTestSuite(module_factory=_rms_norm_factory, cases=cases) + + +@register_op_test("mul") +def _mul_suite() -> WebGPUTestSuite: + # Full numeric coverage incl. broadcast (binary_mul.wgsl over a TensorMeta UBO); fp64 golden. + return WebGPUTestSuite( + module_factory=lambda: MulModule(), + cases=[ + Case(name=name, inputs=(sa, sb)) for name, (sa, sb) in _MUL_CONFIGS.items() + ], + ) + + +def _fn_config_suite(module_cls, configs) -> WebGPUTestSuite: + """Builder for ops whose per-case spec is a (shape, fn) pair (view/select/slice). + The fn is a `construct` kwarg baked into the .pte module, never a serialized input. + """ + return WebGPUTestSuite( + module_factory=lambda fn: module_cls(fn), + cases=[ + Case(name=n, construct={"fn": fn}, inputs=(shape,)) + for n, (shape, fn) in configs.items() + ], + golden_dtype="float32", # gather/copy: fp64 bit-identical, skip dual-oracle + ) + + +@register_op_test("view_copy") +def _view_copy_suite() -> WebGPUTestSuite: + return _fn_config_suite(ViewModule, _VIEW_CONFIGS) + + +@register_op_test("select") +def _select_suite() -> WebGPUTestSuite: + return _fn_config_suite(SelectModule, _SELECT_CONFIGS) + + +def _sigmoid_full_range(_shape) -> torch.Tensor: + # Reuses the monolith's saturation-tail input (linspace(-12, 12)). + return _sigmoid_det_input() + + +@register_op_test("sigmoid") +def _sigmoid_suite() -> WebGPUTestSuite: + # sigmoid has no CONFIGS table; cover unary shapes directly (tol 1e-4). + return WebGPUTestSuite( + module_factory=lambda: SigmoidModule(), + cases=[ + Case(name="vec", inputs=((M1,),)), + Case(name="mat", inputs=((M1, M2),)), + Case(name="rank3", inputs=((S1, M1, M2),)), + Case(name="rank4", inputs=((S1, S2, S2, M2),)), + # Saturation tails sigmoid(+-12) (~6e-6 / 0.999994) that randn shapes miss. + Case( + name="saturation", + inputs=(InputSpec(shape=(_SIGMOID_N,), gen=_sigmoid_full_range),), + ), + ], + atol=1e-4, + rtol=1e-4, + ) + + +@register_op_test("squeeze") +def _squeeze_suite() -> WebGPUTestSuite: + # CONFIGS: name -> (shape, dim) where dim is an int or a tuple. + return WebGPUTestSuite( + module_factory=lambda dim: SqueezeModule(dim), + cases=[ + Case(name=n, construct={"dim": dim}, inputs=(shape,)) + for n, (shape, dim) in _SQUEEZE_CONFIGS.items() + ], + golden_dtype="float32", # reshape copies values; fp64 bit-identical + ) + + +@register_op_test("unsqueeze") +def _unsqueeze_suite() -> WebGPUTestSuite: + # CONFIGS: name -> (shape, dim). + return WebGPUTestSuite( + module_factory=lambda dim: UnsqueezeModule(dim), + cases=[ + Case(name=n, construct={"dim": dim}, inputs=(shape,)) + for n, (shape, dim) in _UNSQUEEZE_CONFIGS.items() + ], + golden_dtype="float32", # reshape copies values; fp64 bit-identical + ) diff --git a/backends/webgpu/test/ops/test_mul.py b/backends/webgpu/test/ops/test_mul.py new file mode 100644 index 00000000000..cca04bcdaa4 --- /dev/null +++ b/backends/webgpu/test/ops/test_mul.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.mul.Tensor` (full broadcast) module + configs for the WebGPU op-test framework. + +`MulModule` + `CONFIGS` are imported by `cases.py` to drive the declarative op-test +suite (export via VulkanPartitioner + fp64 torch golden, run on Dawn). `MulTest` is +the export-delegation smoke test. Configs span the same-shape +fast path (SwiGLU), last-dim broadcast at LLM width, and a mixed-rank left-pad case. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (shape_a, shape_b). Output shape is the broadcast of the two. +CONFIGS = { + "same": ((8, 32), (8, 32)), # fast path (SwiGLU same-shape) + "bcast_lastdim": ((1, 1, 7, 896), (1, 1, 7, 1)), # last-dim broadcast, LLM width + "mixedrank": ((4,), (3, 4)), # right-aligned left-pad (in.ndim < out.ndim) +} + + +class MulModule(torch.nn.Module): + def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: + return a * b + + +def _det_inputs(shape_a, shape_b): + """Deterministic fp32 inputs (fixed seed) for a config.""" + g = torch.Generator().manual_seed(0) + a = torch.randn(*shape_a, generator=g, dtype=torch.float32) + b = torch.randn(*shape_b, generator=g, dtype=torch.float32) + return a, b + + +def _export(a: torch.Tensor, b: torch.Tensor): + ep = torch.export.export(MulModule().eval(), (a, b)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +class MulTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (sa, sb) in CONFIGS.items(): + with self.subTest(name=name): + a, b = _det_inputs(sa, sb) + et = _export(a, b) + self.assertTrue( + _delegated(et), f"Expected a VulkanBackend delegate (mul {name})" + ) diff --git a/backends/webgpu/test/ops/test_select.py b/backends/webgpu/test/ops/test_select.py new file mode 100644 index 00000000000..739ebe4d190 --- /dev/null +++ b/backends/webgpu/test/ops/test_select.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.select_copy.int` module + configs for the WebGPU op-test framework. + +`SelectModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `SelectTest` is the export-delegation smoke test. +Configs cover the leading, middle, and last dim plus a negative index (output rank = +input rank - 1). +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, select_fn) +CONFIGS = { + "dim0": ((3, 8, 4), lambda x: x[1]), + "middle": ((3, 8, 4), lambda x: x[:, 2]), + "last": ((3, 8, 4), lambda x: x[..., 3]), + "neg_idx": ((3, 8, 4), lambda x: x[:, -1]), +} + + +class SelectModule(torch.nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fn(x) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _export(fn, x: torch.Tensor): + ep = torch.export.export(SelectModule(fn).eval(), (x,)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +class SelectTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, fn) in CONFIGS.items(): + with self.subTest(name=name): + et = _export(fn, _det_input(shape)) + self.assertTrue( + _delegated(et), f"Expected a VulkanBackend delegate (select {name})" + ) diff --git a/backends/webgpu/test/ops/test_sigmoid.py b/backends/webgpu/test/ops/test_sigmoid.py new file mode 100644 index 00000000000..0ba8c435a9a --- /dev/null +++ b/backends/webgpu/test/ops/test_sigmoid.py @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.sigmoid.default` module + input for the WebGPU op-test framework. + +`SigmoidModule`, `N`, and `_det_input` are imported by `cases.py` to drive the +declarative op-test suite. `SigmoidTest` is the export-delegation +smoke test. Sigmoid is on the Llama critical path (`F.silu` -> `sigmoid` + `mul`); the +deterministic input spans the saturation tails. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# Input length; the deterministic input spans the saturation tails. +N = 64 + + +class SigmoidModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.sigmoid(x) + + +def _det_input() -> torch.Tensor: + """Deterministic fp32 input spanning negatives, zero, and large magnitudes.""" + return torch.linspace(-12.0, 12.0, N, dtype=torch.float32) + + +def _export(m: torch.nn.Module, x: torch.Tensor): + ep = torch.export.export(m, (x,)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +class SigmoidTest(unittest.TestCase): + def test_export_delegates(self) -> None: + et = _export(SigmoidModule().eval(), _det_input()) + found = any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + self.assertTrue(found, "Expected a VulkanBackend delegate (sigmoid)") diff --git a/backends/webgpu/test/ops/test_squeeze.py b/backends/webgpu/test/ops/test_squeeze.py new file mode 100644 index 00000000000..b55a5143538 --- /dev/null +++ b/backends/webgpu/test/ops/test_squeeze.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.squeeze_copy.dims` module + configs for the WebGPU op-test framework. + +`SqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `SqueezeTest` is the export-delegation smoke +test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, squeeze_dim) +CONFIGS = { + "dim0": ((1, 3, 4), 0), + "mid": ((2, 1, 4), 1), + "multi": ((1, 3, 1, 4), (0, 2)), +} + + +class SqueezeModule(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.squeeze(x, self.dim) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _lower(dim, x: torch.Tensor): + ep = torch.export.export(SqueezeModule(dim).eval(), (x,)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class SqueezeTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, dim) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(dim, _det_input(shape)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (squeeze {name})", + ) + self.assertTrue( + _op_delegated(edge, "squeeze_copy"), + f"squeeze_copy not delegated (fell back to CPU) for {name}", + ) diff --git a/backends/webgpu/test/ops/test_unsqueeze.py b/backends/webgpu/test/ops/test_unsqueeze.py new file mode 100644 index 00000000000..dcddf4faa51 --- /dev/null +++ b/backends/webgpu/test/ops/test_unsqueeze.py @@ -0,0 +1,75 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.unsqueeze_copy.default` module + configs for the WebGPU op-test framework. + +`UnsqueezeModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `UnsqueezeTest` is the export-delegation smoke +test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, unsqueeze_dim) +CONFIGS = { + "front": ((3, 4), 0), + "mid": ((2, 4), 1), + "last": ((3, 4), 2), +} + + +class UnsqueezeModule(torch.nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.unsqueeze(x, self.dim) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _lower(dim, x: torch.Tensor): + ep = torch.export.export(UnsqueezeModule(dim).eval(), (x,)) + return to_edge_transform_and_lower(ep, partitioner=[VulkanPartitioner()]) + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +def _op_delegated(edge, op_substr: str) -> bool: + # op must be absorbed into the delegate, not left as a top-level CPU-fallback node. + gm = edge.exported_program().graph_module + return all(op_substr not in str(getattr(n, "target", "")) for n in gm.graph.nodes) + + +class UnsqueezeTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, dim) in CONFIGS.items(): + with self.subTest(name=name): + edge = _lower(dim, _det_input(shape)) + et = edge.to_executorch() + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (unsqueeze {name})", + ) + self.assertTrue( + _op_delegated(edge, "unsqueeze_copy"), + f"unsqueeze_copy not delegated (fell back to CPU) for {name}", + ) diff --git a/backends/webgpu/test/ops/test_view_copy.py b/backends/webgpu/test/ops/test_view_copy.py new file mode 100644 index 00000000000..a9c28f2b631 --- /dev/null +++ b/backends/webgpu/test/ops/test_view_copy.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""`aten.view_copy.default` module + configs for the WebGPU op-test framework. + +`ViewModule` + `CONFIGS` are imported by `cases.py` to drive the declarative +op-test suite. `ViewCopyTest` is the export-delegation +smoke test. +""" + +import unittest + +import torch + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# name -> (input_shape, view_fn) +CONFIGS = { + "reshape": ((2, 3, 4), lambda x: x.reshape(6, 4)), + "flatten": ((2, 3, 4), lambda x: x.reshape(-1)), + "reshape4d": ((2, 3, 4), lambda x: x.reshape(1, 2, 3, 4)), +} + + +class ViewModule(torch.nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fn(x) + + +def _det_input(shape): + g = torch.Generator().manual_seed(0) + return torch.randn(*shape, generator=g, dtype=torch.float32) + + +def _export(fn, x: torch.Tensor): + ep = torch.export.export(ViewModule(fn).eval(), (x,)) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +def _delegated(et) -> bool: + return any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + + +class ViewCopyTest(unittest.TestCase): + def test_export_delegates(self) -> None: + for name, (shape, fn) in CONFIGS.items(): + with self.subTest(name=name): + et = _export(fn, _det_input(shape)) + self.assertTrue( + _delegated(et), + f"Expected a VulkanBackend delegate (view_copy {name})", + ) diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py index 2e67df442e6..53a745a16df 100644 --- a/backends/webgpu/test/tester.py +++ b/backends/webgpu/test/tester.py @@ -21,6 +21,12 @@ WEBGPU_SUPPORTED_OPS = [ exir_ops.edge.aten.add.Tensor, exir_ops.edge.et_vk.rms_norm.default, + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.view_copy.default, + exir_ops.edge.aten.select_copy.int, + exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.squeeze_copy.dims, + exir_ops.edge.aten.unsqueeze_copy.default, ]