diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 8bf1674d872..f7cd85f9758 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -41,6 +41,7 @@ set(WEBGPU_SRCS runtime/ops/mul/BinaryOp.cpp runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp runtime/ops/rope/RotaryEmbedding.cpp + runtime/ops/prepack/Prepack.cpp ) add_library(webgpu_backend ${WEBGPU_SRCS}) diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index 65aaaf6c681..b7fb4313400 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -26,6 +26,10 @@ namespace executorch::backends::webgpu { namespace { +// Op name the AOT exporter emits for a prepacked constant (must match the +// serialized schema); compared in the prepack pre-scan below. +constexpr const char* kPrepackOpName = "et_vk.prepack.default"; + size_t vk_datatype_size(vkgraph::VkDataType dtype) { switch (dtype) { case vkgraph::VkDataType::BOOL: @@ -230,6 +234,10 @@ void WebGPUGraph::build( const auto* graph = vkgraph::GetVkGraph(flatbuffer_data); + // .pte byte sources for prepack-time constant materialization (build-only). + constant_data_ = constant_data; + named_data_map_ = named_data_map; + // Phase 1: Create all values const auto* values = graph->values(); const int num_vals = values ? values->size() : 0; @@ -241,6 +249,41 @@ void WebGPUGraph::build( bools_.resize(num_vals, false); value_lists_.resize(num_vals); + // Pre-scan the op chain: a constant may be DEFERRED (no eager GPU buffer; the + // prepack node materializes it once) only if it is a prepack source AND never + // a direct arg of a non-prepack op. ValueList args are expanded so a constant + // reached through a list still counts as a direct use. + std::unordered_set prepack_src_ids; + std::unordered_set direct_use_ids; + const auto* chain_prescan = graph->chain(); + if (chain_prescan) { + for (unsigned ci = 0; ci < chain_prescan->size(); ci++) { + const auto* oc = chain_prescan->Get(ci); + const bool is_prepack = oc->name()->str() == kPrepackOpName; + const auto* a = oc->args(); + if (!a) { + continue; + } + for (unsigned j = 0; j < a->size(); j++) { + int id = static_cast(a->Get(j)); + if (is_prepack && j == 0) { + prepack_src_ids.insert(id); + } else if (!is_prepack) { + direct_use_ids.insert(id); + const auto* v = values ? values->Get(id) : nullptr; + if (v && v->value_type() == vkgraph::GraphTypes::ValueList) { + const auto* items = v->value_as_ValueList()->items(); + if (items) { + for (unsigned k = 0; k < items->size(); k++) { + direct_use_ids.insert(static_cast(items->Get(k))); + } + } + } + } + } + } + } + for (int i = 0; i < num_vals; i++) { const auto* val = values->Get(i); if (!val || val->value_type() == vkgraph::GraphTypes::NONE) { @@ -269,60 +312,51 @@ void WebGPUGraph::build( int constant_id = vk_tensor->constant_id(); int mem_obj_id = vk_tensor->mem_obj_id(); - // Constants always get dedicated buffers regardless of mem_obj_id + // Constants are dedicated. Every constant is recorded as a + // ConstantSource and materialized via materialize_constant (one + // CPU->GPU write); a constant consumed ONLY via prepack is deferred + // (no eager buffer -- its prepack node performs that one write). if (constant_id >= 0 || mem_obj_id < 0) { tensor_mem_obj_ids_[i] = -1; - WGPUBufferDescriptor buf_desc = {}; - buf_desc.size = std::max(tensor.nbytes, size_t(4)); - buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | - WGPUBufferUsage_CopySrc; - buf_desc.mappedAtCreation = false; - tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); - - if (constant_id >= 0 && constant_data && tensor.nbytes > 0) { + + if (constant_id >= 0) { const auto* constants = graph->constants(); - if (constants && - constant_id < static_cast(constants->size())) { - const auto* vk_bytes = constants->Get(constant_id); - if (vk_bytes->offset() != UINT64_MAX) { - const uint8_t* src = constant_data + vk_bytes->offset(); - wgpuQueueWriteBuffer( - queue_, tensor.buffer, 0, src, tensor.nbytes); - } else if ( - vk_bytes->named_key() != nullptr && - named_data_map != nullptr) { - // Constant stored in the PTE named-data map. - auto buf = - named_data_map->get_data(vk_bytes->named_key()->c_str()); - if (!buf.ok()) { - throw std::runtime_error( - std::string("WebGPU: named constant '") + - vk_bytes->named_key()->c_str() + - "' not found in NamedDataMap"); - } - if (buf->size() < tensor.nbytes) { - throw std::runtime_error( - std::string("WebGPU: named constant '") + - vk_bytes->named_key()->c_str() + "' undersized: have " + - std::to_string(buf->size()) + " bytes, need " + - std::to_string(tensor.nbytes)); - } - wgpuQueueWriteBuffer( - queue_, tensor.buffer, 0, buf->data(), tensor.nbytes); - buf->Free(); - } else { - throw std::runtime_error( - "WebGPU: constant has no inline offset and no named-data key"); - } - } else { + if (!constants || + constant_id >= static_cast(constants->size())) { throw std::runtime_error( "WebGPU: constant_id set but the constants table is missing " "or the id is out of range"); } - } else if (constant_id >= 0 && tensor.nbytes > 0) { - // constant_id set but constant_data null -> fail loud. - throw std::runtime_error( - "WebGPU: constant_id set but constant_data is null"); + const auto* vk_bytes = constants->Get(constant_id); + ConstantSource cs; + cs.nbytes = tensor.nbytes; + if (vk_bytes->offset() != UINT64_MAX) { + cs.inline_offset = vk_bytes->offset(); + } else if (vk_bytes->named_key() != nullptr) { + cs.named_key = vk_bytes->named_key()->str(); + } else { + throw std::runtime_error( + "WebGPU: constant has no inline offset and no named-data key"); + } + constant_sources_[i] = std::move(cs); + } + + // Defer constants consumed solely via prepack: skip the eager buffer. + const bool defer = constant_id >= 0 && + prepack_src_ids.count(i) != 0 && direct_use_ids.count(i) == 0; + if (!defer) { + WGPUBufferDescriptor buf_desc = {}; + buf_desc.size = std::max(tensor.nbytes, size_t(4)); + buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); + + // Same single CPU->GPU write the prepack node uses (no + // duplication). + if (constant_id >= 0) { + materialize_constant(i, tensor.buffer); + } } } else { // Shared buffer: track required size, defer allocation to pass 2 @@ -458,6 +492,47 @@ void WebGPUGraph::build( webgpu_operator_registry().get_op_fn(op_name)(*this, args); } } + + // Prepack nodes (Phase 3) materialized their constants directly into the + // consumer buffers via materialize_constant; no separate copy pass needed. + // The .pte bytes are freed right after build() returns (WebGPUBackend + // processed->Free()), so clear the build-only source pointers. + constant_data_ = nullptr; + named_data_map_ = nullptr; +} + +void WebGPUGraph::materialize_constant(int const_value_id, WGPUBuffer dst) { + auto it = constant_sources_.find(const_value_id); + if (it == constant_sources_.end()) { + throw std::runtime_error( + "WebGPU: no source recorded for constant id " + + std::to_string(const_value_id)); + } + const ConstantSource& cs = it->second; + if (cs.nbytes == 0) { + return; + } + if (cs.inline_offset != UINT64_MAX) { + if (constant_data_ == nullptr) { + throw std::runtime_error("WebGPU: inline constant data is null"); + } + wgpuQueueWriteBuffer( + queue_, dst, 0, constant_data_ + cs.inline_offset, cs.nbytes); + } else if (!cs.named_key.empty() && named_data_map_ != nullptr) { + auto buf = named_data_map_->get_data(cs.named_key.c_str()); + if (!buf.ok()) { + throw std::runtime_error( + "WebGPU: named constant '" + cs.named_key + "' not found"); + } + if (buf->size() < cs.nbytes) { + throw std::runtime_error( + "WebGPU: named constant '" + cs.named_key + "' undersized"); + } + wgpuQueueWriteBuffer(queue_, dst, 0, buf->data(), cs.nbytes); + buf->Free(); + } else { + throw std::runtime_error("WebGPU: constant has no source"); + } } WGPUShaderModule WebGPUGraph::get_or_create_shader( @@ -780,10 +855,11 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const { for (size_t i = 0; i < value_types_.size(); i++) { if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) { stats.num_tensors++; - // Shared tensors are tracked via shared_buffer_sizes_ + // Shared tensors are tracked via shared_buffer_sizes_; a deferred + // prepack-routed constant has no buffer (no GPU memory) -> not counted. bool is_shared = i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0; - if (!is_shared) { + if (!is_shared && tensors_[i].buffer != nullptr) { stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes; } } diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index a914c8710ce..3572f751a06 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -50,6 +50,15 @@ struct OutputCopy { size_t nbytes = 0; }; +// CPU-side record for a prepack-routed constant; mirrors Vulkan's TensorRef +// (sizes + a data reference, not a live GPU tensor). The prepack node is the +// sole materialization, so the constant needs no eager GPU buffer. +struct ConstantSource { + uint64_t inline_offset = UINT64_MAX; // offset into constant_data_; else key + std::string named_key; // non-empty => fetch from named_data_map_ + size_t nbytes = 0; +}; + struct ExecuteConfig { size_t chunk_size = 0; size_t initial_chunk_size = 0; @@ -180,6 +189,11 @@ class WebGPUGraph { dispatches_.push_back(dispatch); } + // Materialize a recorded prepack-routed constant into dst via one CPU->GPU + // transfer. Build-time only (the .pte bytes are freed after build()). + // Mirrors Vulkan prepack_standard. + void materialize_constant(int const_value_id, WGPUBuffer dst); + void add_uniform_buffer_bytes(size_t bytes) { uniform_buffer_bytes_ += bytes; } @@ -286,6 +300,13 @@ class WebGPUGraph { std::vector dispatches_; + // Prepack-routed constant sources (offset/named-key + size); the prepack node + // materializes these once. constant_data_/named_data_map_ point at the .pte + // bytes and are valid only during build(). + const uint8_t* constant_data_ = nullptr; + const executorch::runtime::NamedDataMap* named_data_map_ = nullptr; + std::unordered_map constant_sources_; + ExecuteConfig execute_config_; // Caches for reusing GPU objects across dispatches. diff --git a/backends/webgpu/runtime/ops/prepack/Prepack.cpp b/backends/webgpu/runtime/ops/prepack/Prepack.cpp new file mode 100644 index 00000000000..71414f91787 --- /dev/null +++ b/backends/webgpu/runtime/ops/prepack/Prepack.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +namespace executorch::backends::webgpu { + +namespace { + +// Materialize a constant into the prepack-output buffer via one CPU->GPU write. +void prepack_impl(WebGPUGraph& graph, const std::vector& args) { + // et_vk.prepack.default args: [src (constant), out]. + if (args.size() != 2) { + throw std::runtime_error("WebGPU prepack: expected 2 args (src, out)"); + } + const auto& src = graph.get_tensor(args.at(0)); + const auto& out = graph.get_tensor(args.at(1)); + + if (src.dims != out.dims) { + throw std::runtime_error("WebGPU prepack: src/out shape mismatch"); + } + if (src.elem_size != out.elem_size) { + throw std::runtime_error( + "WebGPU prepack: src/out dtype mismatch (cast unsupported)"); + } + if (src.nbytes != out.nbytes) { + throw std::runtime_error("WebGPU prepack: src/out byte-size mismatch"); + } + if (out.buffer == nullptr) { + throw std::runtime_error("WebGPU prepack: null out buffer binding"); + } + + // Sole materialization: write the .pte bytes once, straight into the + // consumer's buffer (no eager src buffer, no buffer->buffer copy). + // Correctness of this write-once relies on `out` being a dedicated buffer + // (the partitioner gives prepack outputs mem_obj_id=-1, so it is never + // memory-plan aliased with a transient that execute() would later overwrite). + graph.materialize_constant(args.at(0), out.buffer); +} + +} // namespace + +WEBGPU_REGISTER_OPERATORS { + WEBGPU_REGISTER_OP(et_vk.prepack.default, prepack_impl); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index 100e48dfbfd..84b5349ef2d 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -57,6 +57,12 @@ ROPE_XK_GOLDEN="/tmp/webgpu_rope_xk_golden.bin" ROPE_DECODE_MODEL="/tmp/webgpu_rope_decode.pte" ROPE_DECODE_XQ_GOLDEN="/tmp/webgpu_rope_decode_xq_golden.bin" ROPE_DECODE_XK_GOLDEN="/tmp/webgpu_rope_decode_xk_golden.bin" +PREPACK_MODEL="/tmp/webgpu_prepack.pte" +PREPACK_GOLDEN="/tmp/webgpu_prepack_golden.bin" +PREPACK2_MODEL="/tmp/webgpu_prepack_two_const.pte" +PREPACK2_GOLDEN="/tmp/webgpu_prepack_two_const_golden.bin" +PREPACK_TIED_MODEL="/tmp/webgpu_prepack_tied_const.pte" +PREPACK_TIED_GOLDEN="/tmp/webgpu_prepack_tied_const_golden.bin" $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models @@ -75,6 +81,13 @@ export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}') export_rope_model('${ROPE_DECODE_MODEL}', '${ROPE_DECODE_XQ_GOLDEN}', '${ROPE_DECODE_XK_GOLDEN}', 'decode') " || echo "WARN: rope export failed; apply_rotary_emb configs will FAIL in webgpu_native_test" +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model +export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}') +export_prepack_two_const_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}') +export_prepack_tied_const_model('${PREPACK_TIED_MODEL}', '${PREPACK_TIED_GOLDEN}') +" || echo "WARN: prepack export failed; prepack configs will FAIL in webgpu_native_test" + $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases export_dispatch_order_cases('${DISPATCH_ORDER_DIR}') @@ -172,6 +185,12 @@ if [[ -x "${BIN_DIR}/webgpu_native_test" ]] && WEBGPU_TEST_ROPE_DECODE_MODEL="${ROPE_DECODE_MODEL}" \ WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN="${ROPE_DECODE_XQ_GOLDEN}" \ WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN="${ROPE_DECODE_XK_GOLDEN}" \ + WEBGPU_TEST_PREPACK_MODEL="${PREPACK_MODEL}" \ + WEBGPU_TEST_PREPACK_GOLDEN="${PREPACK_GOLDEN}" \ + WEBGPU_TEST_PREPACK2_MODEL="${PREPACK2_MODEL}" \ + WEBGPU_TEST_PREPACK2_GOLDEN="${PREPACK2_GOLDEN}" \ + WEBGPU_TEST_PREPACK_TIED_MODEL="${PREPACK_TIED_MODEL}" \ + WEBGPU_TEST_PREPACK_TIED_GOLDEN="${PREPACK_TIED_GOLDEN}" \ "${BIN_DIR}/webgpu_native_test" else echo "(skipping webgpu_native_test: executorch wheel absent — exports did not run)" diff --git a/backends/webgpu/test/ops/prepack/__init__.py b/backends/webgpu/test/ops/prepack/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/webgpu/test/ops/prepack/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/webgpu/test/ops/prepack/test_prepack.py b/backends/webgpu/test/ops/prepack/test_prepack.py new file mode 100644 index 00000000000..0769177143f --- /dev/null +++ b/backends/webgpu/test/ops/prepack/test_prepack.py @@ -0,0 +1,142 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Constant-tensor prepack (`et_vk.prepack`) export + golden for the WebGPU +backend. + +The VulkanPartitioner wraps every constant feeding a delegated op in an +`et_vk.prepack.default` node that materializes the constant into a GPU buffer at +init. Model `M(x) = x + w` (w a constant) routes `w` through prepack, so the +delegate must run the prepack copy for the output to equal `x + w` rather than +`x + 0 = x`. The input is a deterministic /16 ramp so the native binary +reconstructs it bit-for-bit; the torch-computed golden is written for the native +binary to compare (it has no ATen). +""" + +import unittest + +import executorch.backends.vulkan.custom_ops_lib # noqa: F401 + +import torch +from executorch.backends.vulkan import VulkanPartitioner +from executorch.exir import to_edge_transform_and_lower + +# 4x4 constant weight, small enough to dump and reason about by hand. +N = 4 + + +class _AddConst(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + # arange weight: non-zero everywhere so an unrun prepack (out = x + 0 = x) + # is unambiguously distinguishable from a correct one (out = x + w). + self.w = torch.nn.Parameter( + torch.arange(N * N, dtype=torch.float32).reshape(N, N) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.w + + +class _AddTwoConst(torch.nn.Module): + # Two constants => two prepack nodes (the multi-copy path E2E Llama needs); + # add-only so it stays delegated with just this stack's registered ops. + def __init__(self) -> None: + super().__init__() + self.w1 = torch.nn.Parameter( + torch.arange(N * N, dtype=torch.float32).reshape(N, N) + ) + self.w2 = torch.nn.Parameter( + torch.arange(N * N, dtype=torch.float32).reshape(N, N) * 0.5 - 3.0 + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.w1 + self.w2 + + +class _AddTiedConst(torch.nn.Module): + # Two BYTE-IDENTICAL constants => two prepack nodes sharing ONE SHA256 + # named-data key (tied/duplicate weights). Exercises the prepack handler + # materializing the same key twice (independent get_data + Free per call). + def __init__(self) -> None: + super().__init__() + self.w1 = torch.nn.Parameter( + torch.arange(N * N, dtype=torch.float32).reshape(N, N) + ) + self.w2 = torch.nn.Parameter( + torch.arange(N * N, dtype=torch.float32).reshape(N, N) + ) + # Pin the tied premise; the dedup to one key is assumed, not asserted. + assert torch.equal(self.w1, self.w2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.w1 + self.w2 + + +def _inputs() -> tuple[torch.Tensor]: + # ((i % 13) - 6) / 16: exact in fp32, matches test_webgpu_native.cpp. + idx = torch.arange(N * N, dtype=torch.int64) + x = (((idx % 13) - 6).to(torch.float32) / 16.0).reshape(N, N) + return (x,) + + +def _export(model, inputs): + ep = torch.export.export(model.eval(), inputs) + return to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + + +class TestPrepack(unittest.TestCase): + def test_export_delegates(self) -> None: + # Each model must fully delegate -- every constant wrapped in a prepack + # node inside a VulkanBackend delegate (single, multi-const, tied). + for name, model in ( + ("x + w", _AddConst()), + ("x + w1 + w2", _AddTwoConst()), + ("x + w + w (tied)", _AddTiedConst()), + ): + with self.subTest(model=name): + et = _export(model, _inputs()) + found = any( + d.id == "VulkanBackend" + for plan in et.executorch_program.execution_plan + for d in plan.delegates + ) + self.assertTrue(found, f"Expected a VulkanBackend delegate: {name}") + + +def _write(model, pte_path: str, golden_path: str) -> None: + (x,) = _inputs() + golden = model.eval()(x) + et = _export(model, (x,)) + with open(pte_path, "wb") as f: + f.write(et.buffer) + golden.detach().numpy().astype(" None: + """Write the x + w .pte + torch golden (raw LE fp32). One prepacked constant. + The input is a /16 ramp reconstructed in the native test.""" + _write(_AddConst(), pte_path, golden_path) + + +def export_prepack_two_const_model(pte_path: str, golden_path: str) -> None: + """Write the x + w1 + w2 .pte + golden. Two prepacked constants, exercising + the multi-copy path.""" + _write(_AddTwoConst(), pte_path, golden_path) + + +def export_prepack_tied_const_model(pte_path: str, golden_path: str) -> None: + """Write the x + w1 + w2 .pte + golden where w1 and w2 are BYTE-IDENTICAL, + so they share one named-data key -> two prepack nodes materialize the same + key (verifies per-call buffer ownership / no double-free on tied weights).""" + _write(_AddTiedConst(), pte_path, golden_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index 6a607bcab17..ad7ad2f2fc2 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -536,6 +536,74 @@ static bool test_rope( return true; } +static bool test_prepack( + const std::string& model_path, + const std::string& golden_path, + const std::string& label = "x + const w") { + // et_vk.prepack copy vs golden; unrun copy leaves zeros. See test_prepack.py. + constexpr int n = 4; + constexpr int numel = n * n; + printf("\n--- Test: prepack (%s, %dx%d) ---\n", label.c_str(), n, n); + + Module module(model_path); + auto err = module.load_forward(); + if (err != Error::Ok) { + printf("FAIL: could not load forward method (error %d)\n", (int)err); + return false; + } + printf("Model loaded: %s\n", model_path.c_str()); + + std::vector golden = load_golden(golden_path, numel); + if (golden.empty()) { + printf("FAIL: could not load golden %s\n", golden_path.c_str()); + return false; + } + + // ((i % 13) - 6) / 16: exact in fp32, matches test_prepack.py::_inputs. + std::vector x_data(numel); + for (int i = 0; i < numel; i++) { + x_data[i] = static_cast((i % 13) - 6) / 16.0f; + } + auto x = make_tensor_ptr({n, n}, std::vector(x_data)); + + auto result = module.forward({EValue(x)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + const auto& out_tensor = outputs[0].toTensor(); + if (out_tensor.numel() != numel) { + printf( + "FAIL: output numel %zu != expected %d\n", + (size_t)out_tensor.numel(), + numel); + return false; + } + const float* out_data = out_tensor.const_data_ptr(); + + float max_abs_err = 0.0f, max_rel_err = 0.0f; + // Per-element abs-OR-rel (quant_within_tol): a global rel gate spuriously + // fails near-zero outputs where rel error explodes. + const bool within = quant_within_tol( + out_data, golden.data(), numel, 1e-3f, 1e-3f, &max_abs_err, &max_rel_err); + printf( + "Max abs error: %e Max rel error: %e (checked %d elements)\n", + max_abs_err, + max_rel_err, + numel); + if (!within) { + printf("FAIL: prepack exceeds tolerance 1e-3\n"); + return false; + } + printf("PASS: prepack test\n"); + return true; +} + // Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden. static bool test_q4gsw_config( const Q4gswConfig& cfg, @@ -1614,6 +1682,30 @@ int main(int argc, char** argv) { 64}, }; + std::string prepack_model_path, prepack_golden_path; + if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_MODEL")) { + prepack_model_path = env; + } + if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_GOLDEN")) { + prepack_golden_path = env; + } + + std::string prepack2_model_path, prepack2_golden_path; + if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_MODEL")) { + prepack2_model_path = env; + } + if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_GOLDEN")) { + prepack2_golden_path = env; + } + + std::string prepack_tied_model_path, prepack_tied_golden_path; + if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_MODEL")) { + prepack_tied_model_path = env; + } + if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_GOLDEN")) { + prepack_tied_golden_path = env; + } + // SDPA sweep: configs self-discover their sdpa_.pte/.golden.bin under // this directory (default "" = the embedded-file root / cwd). Set // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/). @@ -1679,6 +1771,24 @@ int main(int argc, char** argv) { } } + if (!prepack_model_path.empty() && !prepack_golden_path.empty()) { + ok = test_prepack(prepack_model_path, prepack_golden_path) && ok; + } + + if (!prepack2_model_path.empty() && !prepack2_golden_path.empty()) { + ok = test_prepack( + prepack2_model_path, prepack2_golden_path, "x + w1 + w2") && + ok; + } + + if (!prepack_tied_model_path.empty() && !prepack_tied_golden_path.empty()) { + ok = test_prepack( + prepack_tied_model_path, + prepack_tied_golden_path, + "x + w + w (tied weights, shared key)") && + ok; + } + bool sdpa_ran = false; bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran); if (sdpa_ran) {