diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index f7cd85f9758..88c7b1f73a3 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -42,6 +42,7 @@ set(WEBGPU_SRCS
     runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
     runtime/ops/rope/RotaryEmbedding.cpp
     runtime/ops/prepack/Prepack.cpp
+    runtime/ops/view_copy/ViewCopy.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index b7fb4313400..0e00b2cb42b 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -679,6 +679,16 @@ void WebGPUGraph::execute() {
     // One pass per dispatch: enforces storage RAW ordering across deps.
     for (size_t i = 0; i < n; i++) {
       const auto& dispatch = dispatches_[i];
+      if (dispatch.kind == WebGPUDispatch::Kind::Copy) { // encoder-level DMA
+        wgpuCommandEncoderCopyBufferToBuffer(
+            encoder,
+            dispatch.copy_src,
+            0, // source offset
+            dispatch.copy_dst,
+            0, // destination offset
+            dispatch.copy_nbytes);
+        continue; // skip the compute-pass setup below
+      }
       WGPUComputePassDescriptor pass_desc = {};
 #ifdef WGPU_BACKEND_ENABLE_PROFILING
       // tw must outlive BeginComputePass (the descriptor points at it).
@@ -757,6 +767,16 @@ void WebGPUGraph::execute() {
           wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
 
       for (size_t i = start; i < end; i++) {
+        if (dispatches_[i].kind == WebGPUDispatch::Kind::Copy) { // DMA copy
+          wgpuCommandEncoderCopyBufferToBuffer(
+              encoder,
+              dispatches_[i].copy_src,
+              0, // source offset
+              dispatches_[i].copy_dst,
+              0, // destination offset
+              dispatches_[i].copy_nbytes);
+          continue; // skip BeginComputePass below
+        }
         WGPUComputePassDescriptor pass_desc = {};
         WGPUComputePassEncoder pass =
             wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 3572f751a06..b9326cf016c 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -42,6 +42,14 @@ struct WebGPUDispatch {
   WGPUBindGroup bind_group = nullptr;
   uint32_t workgroup_count_x = 1;
   std::string kernel_name; // bench label
+  // Kind selects how execute() encodes the entry: Compute (the default, so
+  // existing positional inits stay valid) or Copy, a buffer-to-buffer DMA
+  // described by the copy_* fields.
+  enum class Kind { Compute, Copy };
+  Kind kind = Kind::Compute;
+  WGPUBuffer copy_src = nullptr; // Copy only: source buffer
+  WGPUBuffer copy_dst = nullptr; // Copy only: destination buffer
+  size_t copy_nbytes = 0; // Copy only: number of bytes to transfer
 };
 
 struct OutputCopy {
@@ -189,6 +197,20 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Record a buffer-to-buffer DMA of nbytes from src to dst (e.g. a flat
+  // copy). The entry is appended to dispatches_ like any other dispatch, so
+  // it is encoded in graph order. Nothing is validated here; the arguments
+  // are forwarded as-is to wgpuCommandEncoderCopyBufferToBuffer by execute().
+  void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
+    WebGPUDispatch d;
+    d.kind = WebGPUDispatch::Kind::Copy;
+    d.copy_src = src;
+    d.copy_dst = dst;
+    d.copy_nbytes = nbytes;
+    d.kernel_name = "flat_copy"; // bench label
+    dispatches_.push_back(d);
+  }
+
   // Materialize a recorded prepack-routed constant into dst via one CPU->GPU
   // transfer. Build-time only (the .pte bytes are freed after build()).
   // Mirrors Vulkan prepack_standard.
diff --git a/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
new file mode 100644
index 00000000000..d56c721ce3e
--- /dev/null
+++ b/backends/webgpu/runtime/ops/view_copy/ViewCopy.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace executorch::backends::webgpu {
+
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id) {
+  // get_tensor() is unchecked, so refuse anything that is not a tensor first.
+  const auto is_not_tensor = [&graph](int id) {
+    return graph.get_value_type(id) != WebGPUGraph::ValueType::Tensor;
+  };
+  if (is_not_tensor(in_id) || is_not_tensor(out_id)) {
+    throw std::runtime_error("flat_copy: in/out arg is not a tensor");
+  }
+
+  // A contiguous reshape is a flat byte copy (mirrors Vulkan view_buffer, no
+  // remap), so only byte sizes and buffer handles are inspected below.
+  const auto& src = graph.get_tensor(in_id);
+  const auto& dst = graph.get_tensor(out_id);
+
+  // Both sizes must be multiples of 4 bytes (fp32 element size); the dtype
+  // itself is not verified.
+  const auto misaligned = [](const auto nbytes) {
+    return nbytes % sizeof(float) != 0;
+  };
+  if (misaligned(src.nbytes) || misaligned(dst.nbytes)) {
+    throw std::runtime_error("flat_copy: operand not 4-byte aligned");
+  }
+
+  // A view preserves numel, so the sizes must match; the same check keeps the
+  // copy in bounds.
+  if (src.nbytes != dst.nbytes) {
+    throw std::runtime_error("flat_copy: input/output size mismatch");
+  }
+
+  // Aliased in/out is already in place and CopyBufferToBuffer rejects
+  // src == dst, so a copy is recorded only for distinct buffers.
+  if (!(src.buffer == dst.buffer)) {
+    graph.add_buffer_copy(src.buffer, dst.buffer, dst.nbytes);
+  }
+}
+
+namespace {
+
+// view_copy = contiguous reshape = flat copy (mirrors Vulkan view_buffer).
+void view_copy_impl(WebGPUGraph& graph, const std::vector& args) {
+  // args: [self, size, out]; size is not read: out (last id) carries the dims.
+  add_flat_copy(graph, args.at(0), args.at(args.size() - 1));
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(aten.view_copy.default, view_copy_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/ops/view_copy/view_copy.h b/backends/webgpu/runtime/ops/view_copy/view_copy.h
new file mode 100644
index 00000000000..bfa81174ba9
--- /dev/null
+++ b/backends/webgpu/runtime/ops/view_copy/view_copy.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+namespace executorch::backends::webgpu {
+
+// Records a flat copy output[i] = input[i] from tensor in_id to tensor out_id;
+// mirrors Vulkan add_view_copy_node (View.h). Throws std::runtime_error if
+// either id is not a tensor, if a byte size is not a multiple of 4, or if the
+// two byte sizes differ. Records nothing when both tensors share one buffer.
+void add_flat_copy(WebGPUGraph& graph, int in_id, int out_id);
+
+} // namespace executorch::backends::webgpu