diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 8bf1674d872..f7cd85f9758 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -41,6 +41,7 @@ set(WEBGPU_SRCS
     runtime/ops/mul/BinaryOp.cpp
     runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
     runtime/ops/rope/RotaryEmbedding.cpp
+    runtime/ops/prepack/Prepack.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 65aaaf6c681..b7fb4313400 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -26,6 +26,10 @@ namespace executorch::backends::webgpu {
 
 namespace {
 
+// Op name the AOT exporter emits for a prepacked constant (must match the
+// serialized schema); compared in the prepack pre-scan below.
+constexpr const char* kPrepackOpName = "et_vk.prepack.default";
+
 size_t vk_datatype_size(vkgraph::VkDataType dtype) {
   switch (dtype) {
     case vkgraph::VkDataType::BOOL:
@@ -230,6 +234,10 @@ void WebGPUGraph::build(
 
   const auto* graph = vkgraph::GetVkGraph(flatbuffer_data);
 
+  // .pte byte sources for prepack-time constant materialization (build-only).
+  constant_data_ = constant_data;
+  named_data_map_ = named_data_map;
+
   // Phase 1: Create all values
   const auto* values = graph->values();
   const int num_vals = values ? values->size() : 0;
@@ -241,6 +249,41 @@ void WebGPUGraph::build(
   bools_.resize(num_vals, false);
   value_lists_.resize(num_vals);
 
+  // Pre-scan the op chain: a constant may be DEFERRED (no eager GPU buffer; the
+  // prepack node materializes it once) only if it is a prepack source AND never
+  // a direct arg of a non-prepack op. ValueList args are expanded so a constant
+  // reached through a list still counts as a direct use.
+  std::unordered_set<int> prepack_src_ids;
+  std::unordered_set<int> direct_use_ids;
+  const auto* chain_prescan = graph->chain();
+  if (chain_prescan) {
+    for (unsigned ci = 0; ci < chain_prescan->size(); ci++) {
+      const auto* oc = chain_prescan->Get(ci);
+      const bool is_prepack = oc->name()->str() == kPrepackOpName;
+      const auto* a = oc->args();
+      if (!a) {
+        continue;
+      }
+      for (unsigned j = 0; j < a->size(); j++) {
+        int id = static_cast<int>(a->Get(j));
+        if (is_prepack && j == 0) {
+          prepack_src_ids.insert(id);
+        } else if (!is_prepack) {
+          direct_use_ids.insert(id);
+          const auto* v = values ? values->Get(id) : nullptr;
+          if (v && v->value_type() == vkgraph::GraphTypes::ValueList) {
+            const auto* items = v->value_as_ValueList()->items();
+            if (items) {
+              for (unsigned k = 0; k < items->size(); k++) {
+                direct_use_ids.insert(static_cast<int>(items->Get(k)));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
   for (int i = 0; i < num_vals; i++) {
     const auto* val = values->Get(i);
     if (!val || val->value_type() == vkgraph::GraphTypes::NONE) {
@@ -269,60 +312,51 @@ void WebGPUGraph::build(
         int constant_id = vk_tensor->constant_id();
         int mem_obj_id = vk_tensor->mem_obj_id();
 
-        // Constants always get dedicated buffers regardless of mem_obj_id
+        // Constants are dedicated. Every constant is recorded as a
+        // ConstantSource and materialized via materialize_constant (one
+        // CPU->GPU write); a constant consumed ONLY via prepack is deferred
+        // (no eager buffer -- its prepack node performs that one write).
         if (constant_id >= 0 || mem_obj_id < 0) {
           tensor_mem_obj_ids_[i] = -1;
-          WGPUBufferDescriptor buf_desc = {};
-          buf_desc.size = std::max(tensor.nbytes, size_t(4));
-          buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-              WGPUBufferUsage_CopySrc;
-          buf_desc.mappedAtCreation = false;
-          tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-          if (constant_id >= 0 && constant_data && tensor.nbytes > 0) {
+
+          if (constant_id >= 0) {
             const auto* constants = graph->constants();
-            if (constants &&
-                constant_id < static_cast<int>(constants->size())) {
-              const auto* vk_bytes = constants->Get(constant_id);
-              if (vk_bytes->offset() != UINT64_MAX) {
-                const uint8_t* src = constant_data + vk_bytes->offset();
-                wgpuQueueWriteBuffer(
-                    queue_, tensor.buffer, 0, src, tensor.nbytes);
-              } else if (
-                  vk_bytes->named_key() != nullptr &&
-                  named_data_map != nullptr) {
-                // Constant stored in the PTE named-data map.
-                auto buf =
-                    named_data_map->get_data(vk_bytes->named_key()->c_str());
-                if (!buf.ok()) {
-                  throw std::runtime_error(
-                      std::string("WebGPU: named constant '") +
-                      vk_bytes->named_key()->c_str() +
-                      "' not found in NamedDataMap");
-                }
-                if (buf->size() < tensor.nbytes) {
-                  throw std::runtime_error(
-                      std::string("WebGPU: named constant '") +
-                      vk_bytes->named_key()->c_str() + "' undersized: have " +
-                      std::to_string(buf->size()) + " bytes, need " +
-                      std::to_string(tensor.nbytes));
-                }
-                wgpuQueueWriteBuffer(
-                    queue_, tensor.buffer, 0, buf->data(), tensor.nbytes);
-                buf->Free();
-              } else {
-                throw std::runtime_error(
-                    "WebGPU: constant has no inline offset and no named-data key");
-              }
-            } else {
+            if (!constants ||
+                constant_id >= static_cast<int>(constants->size())) {
               throw std::runtime_error(
                   "WebGPU: constant_id set but the constants table is missing "
                   "or the id is out of range");
             }
-          } else if (constant_id >= 0 && tensor.nbytes > 0) {
-            // constant_id set but constant_data null -> fail loud.
-            throw std::runtime_error(
-                "WebGPU: constant_id set but constant_data is null");
+            const auto* vk_bytes = constants->Get(constant_id);
+            ConstantSource cs;
+            cs.nbytes = tensor.nbytes;
+            if (vk_bytes->offset() != UINT64_MAX) {
+              cs.inline_offset = vk_bytes->offset();
+            } else if (vk_bytes->named_key() != nullptr) {
+              cs.named_key = vk_bytes->named_key()->str();
+            } else {
+              throw std::runtime_error(
+                  "WebGPU: constant has no inline offset and no named-data key");
+            }
+            constant_sources_[i] = std::move(cs);
+          }
+
+          // Defer constants consumed solely via prepack: skip the eager buffer.
+          const bool defer = constant_id >= 0 &&
+              prepack_src_ids.count(i) != 0 && direct_use_ids.count(i) == 0;
+          if (!defer) {
+            WGPUBufferDescriptor buf_desc = {};
+            buf_desc.size = std::max(tensor.nbytes, size_t(4));
+            buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+                WGPUBufferUsage_CopySrc;
+            buf_desc.mappedAtCreation = false;
+            tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+            // Same single CPU->GPU write the prepack node uses (no
+            // duplication).
+            if (constant_id >= 0) {
+              materialize_constant(i, tensor.buffer);
+            }
           }
         } else {
           // Shared buffer: track required size, defer allocation to pass 2
@@ -458,6 +492,47 @@ void WebGPUGraph::build(
       webgpu_operator_registry().get_op_fn(op_name)(*this, args);
     }
   }
+
+  // Prepack nodes (Phase 3) materialized their constants directly into the
+  // consumer buffers via materialize_constant; no separate copy pass needed.
+  // The .pte bytes are freed right after build() returns (WebGPUBackend
+  // processed->Free()), so clear the build-only source pointers.
+  constant_data_ = nullptr;
+  named_data_map_ = nullptr;
+}
+
+// Uploads constant `const_value_id`'s recorded source bytes into dst.
+void WebGPUGraph::materialize_constant(int const_value_id, WGPUBuffer dst) {
+  const auto it = constant_sources_.find(const_value_id);
+  if (it == constant_sources_.end()) {
+    throw std::runtime_error(
+        "WebGPU: no source recorded for constant id " +
+        std::to_string(const_value_id));
+  }
+  const ConstantSource& cs = it->second;
+  if (cs.nbytes == 0) {
+    return; // zero-byte constant: nothing to upload
+  }
+  if (cs.inline_offset != UINT64_MAX) {
+    if (constant_data_ == nullptr) {
+      throw std::runtime_error("WebGPU: inline constant data is null");
+    }
+    wgpuQueueWriteBuffer(
+        queue_, dst, 0, constant_data_ + cs.inline_offset, cs.nbytes);
+  } else if (cs.named_key.empty()) {
+    throw std::runtime_error("WebGPU: constant has no source");
+  } else {
+    const std::string who = "WebGPU: named constant '" + cs.named_key + "' ";
+    if (named_data_map_ == nullptr) { // key recorded, but nothing to resolve it
+      throw std::runtime_error(who + "needs a NamedDataMap (none provided)");
+    }
+    auto buf = named_data_map_->get_data(cs.named_key.c_str());
+    if (!buf.ok() || buf->size() < cs.nbytes) { // size read only when ok()
+      throw std::runtime_error(who + (buf.ok() ? "undersized" : "not found"));
+    }
+    wgpuQueueWriteBuffer(queue_, dst, 0, buf->data(), cs.nbytes);
+    buf->Free();
+  }
 }
 
 WGPUShaderModule WebGPUGraph::get_or_create_shader(
@@ -780,10 +855,11 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
       stats.num_tensors++;
-      // Shared tensors are tracked via shared_buffer_sizes_
+      // Shared tensors are tracked via shared_buffer_sizes_; a deferred
+      // prepack-routed constant has no buffer (no GPU memory) -> not counted.
       bool is_shared =
           i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
-      if (!is_shared) {
+      if (!is_shared && tensors_[i].buffer != nullptr) {
         stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
       }
     }
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index a914c8710ce..3572f751a06 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -50,6 +50,15 @@ struct OutputCopy {
   size_t nbytes = 0;
 };
 
+// CPU-side record of where a constant's bytes live in the .pte (inline offset
+// or named-data key) plus their size; mirrors Vulkan's TensorRef. build()
+// records one per constant tensor; materialize_constant() uploads the bytes.
+struct ConstantSource {
+  uint64_t inline_offset = UINT64_MAX; // UINT64_MAX => not inline (use key)
+  std::string named_key; // non-empty => fetch from named_data_map_
+  size_t nbytes = 0; // bytes to upload (the tensor's nbytes)
+};
+
 struct ExecuteConfig {
   size_t chunk_size = 0;
   size_t initial_chunk_size = 0;
@@ -180,6 +189,11 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Write constant `const_value_id`'s recorded bytes into dst via one CPU->GPU
+  // transfer (eager constants and prepack nodes both use it). Build-time only:
+  // the .pte bytes are freed after build(). Mirrors Vulkan prepack_standard.
+  void materialize_constant(int const_value_id, WGPUBuffer dst);
+
   void add_uniform_buffer_bytes(size_t bytes) {
     uniform_buffer_bytes_ += bytes;
   }
@@ -286,6 +300,13 @@ class WebGPUGraph {
 
   std::vector<WebGPUDispatch> dispatches_;
 
+  // Byte source (offset/named-key + size) of every constant tensor, keyed by
+  // value id; read by materialize_constant. constant_data_/named_data_map_
+  // point at the .pte bytes and are valid only during build().
+  const uint8_t* constant_data_ = nullptr;
+  const executorch::runtime::NamedDataMap* named_data_map_ = nullptr;
+  std::unordered_map<int, ConstantSource> constant_sources_;
+
   ExecuteConfig execute_config_;
 
   // Caches for reusing GPU objects across dispatches.
diff --git a/backends/webgpu/runtime/ops/prepack/Prepack.cpp b/backends/webgpu/runtime/ops/prepack/Prepack.cpp
new file mode 100644
index 00000000000..71414f91787
--- /dev/null
+++ b/backends/webgpu/runtime/ops/prepack/Prepack.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+
+#include <stdexcept>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// et_vk.prepack.default handler: uploads the src constant into out's buffer.
+void prepack_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // Operand layout: [src (constant), out].
+  if (args.size() != 2) {
+    throw std::runtime_error("WebGPU prepack: expected 2 args (src, out)");
+  }
+  const int src_id = args.at(0);
+  const auto& src_tensor = graph.get_tensor(src_id);
+  const auto& out_tensor = graph.get_tensor(args.at(1));
+  if (src_tensor.dims != out_tensor.dims) {
+    throw std::runtime_error("WebGPU prepack: src/out shape mismatch");
+  }
+  if (src_tensor.elem_size != out_tensor.elem_size) {
+    throw std::runtime_error(
+        "WebGPU prepack: src/out dtype mismatch (cast unsupported)");
+  }
+  if (src_tensor.nbytes != out_tensor.nbytes) {
+    throw std::runtime_error("WebGPU prepack: src/out byte-size mismatch");
+  }
+  if (out_tensor.buffer == nullptr) {
+    throw std::runtime_error("WebGPU prepack: null out buffer binding");
+  }
+
+  // For a deferred constant this is its only upload: the .pte bytes go straight
+  // into the consumer's buffer (no eager src buffer, no buffer->buffer copy).
+  // Writing once is only correct while `out` is a dedicated buffer -- the
+  // partitioner gives prepack outputs mem_obj_id=-1, so no memory-planned
+  // transient can alias it and be overwritten later by execute().
+  graph.materialize_constant(src_id, out_tensor.buffer);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(et_vk.prepack.default, prepack_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
index 100e48dfbfd..84b5349ef2d 100644
--- a/backends/webgpu/scripts/test_webgpu_native_ci.sh
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -57,6 +57,12 @@ ROPE_XK_GOLDEN="/tmp/webgpu_rope_xk_golden.bin"
 ROPE_DECODE_MODEL="/tmp/webgpu_rope_decode.pte"
 ROPE_DECODE_XQ_GOLDEN="/tmp/webgpu_rope_decode_xq_golden.bin"
 ROPE_DECODE_XK_GOLDEN="/tmp/webgpu_rope_decode_xk_golden.bin"
+PREPACK_MODEL="/tmp/webgpu_prepack.pte"
+PREPACK_GOLDEN="/tmp/webgpu_prepack_golden.bin"
+PREPACK2_MODEL="/tmp/webgpu_prepack_two_const.pte"
+PREPACK2_GOLDEN="/tmp/webgpu_prepack_two_const_golden.bin"
+PREPACK_TIED_MODEL="/tmp/webgpu_prepack_tied_const.pte"
+PREPACK_TIED_GOLDEN="/tmp/webgpu_prepack_tied_const_golden.bin"
 
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models
@@ -75,6 +81,13 @@ export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}')
 export_rope_model('${ROPE_DECODE_MODEL}', '${ROPE_DECODE_XQ_GOLDEN}', '${ROPE_DECODE_XK_GOLDEN}', 'decode')
 " || echo "WARN: rope export failed; apply_rotary_emb configs will FAIL in webgpu_native_test"
 
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_two_const_model, export_prepack_tied_const_model
+export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}')
+export_prepack_two_const_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}')
+export_prepack_tied_const_model('${PREPACK_TIED_MODEL}', '${PREPACK_TIED_GOLDEN}')
+" || echo "WARN: prepack export failed; prepack configs will FAIL in webgpu_native_test"
+
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
 export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
@@ -172,6 +185,12 @@ if [[ -x "${BIN_DIR}/webgpu_native_test" ]] &&
       WEBGPU_TEST_ROPE_DECODE_MODEL="${ROPE_DECODE_MODEL}" \
       WEBGPU_TEST_ROPE_DECODE_XQ_GOLDEN="${ROPE_DECODE_XQ_GOLDEN}" \
       WEBGPU_TEST_ROPE_DECODE_XK_GOLDEN="${ROPE_DECODE_XK_GOLDEN}" \
+      WEBGPU_TEST_PREPACK_MODEL="${PREPACK_MODEL}" \
+      WEBGPU_TEST_PREPACK_GOLDEN="${PREPACK_GOLDEN}" \
+      WEBGPU_TEST_PREPACK2_MODEL="${PREPACK2_MODEL}" \
+      WEBGPU_TEST_PREPACK2_GOLDEN="${PREPACK2_GOLDEN}" \
+      WEBGPU_TEST_PREPACK_TIED_MODEL="${PREPACK_TIED_MODEL}" \
+      WEBGPU_TEST_PREPACK_TIED_GOLDEN="${PREPACK_TIED_GOLDEN}" \
       "${BIN_DIR}/webgpu_native_test"
 else
   echo "(skipping webgpu_native_test: executorch wheel absent — exports did not run)"
diff --git a/backends/webgpu/test/ops/prepack/__init__.py b/backends/webgpu/test/ops/prepack/__init__.py
new file mode 100644
index 00000000000..2e41cd717f6
--- /dev/null
+++ b/backends/webgpu/test/ops/prepack/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/prepack/test_prepack.py b/backends/webgpu/test/ops/prepack/test_prepack.py
new file mode 100644
index 00000000000..0769177143f
--- /dev/null
+++ b/backends/webgpu/test/ops/prepack/test_prepack.py
@@ -0,0 +1,142 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Constant-tensor prepack (`et_vk.prepack`) export + golden for the WebGPU
+backend.
+
+The VulkanPartitioner wraps every constant feeding a delegated op in an
+`et_vk.prepack.default` node that materializes the constant into a GPU buffer at
+init. Model `M(x) = x + w` (w a constant) routes `w` through prepack, so the
+delegate must run the prepack copy for the output to equal `x + w` rather than
+`x + 0 = x`. The input is a deterministic /16 ramp so the native binary
+reconstructs it bit-for-bit; the torch-computed golden is written for the native
+binary to compare (it has no ATen).
+"""
+
+import unittest
+
+import executorch.backends.vulkan.custom_ops_lib  # noqa: F401
+
+import torch
+from executorch.backends.vulkan import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# 4x4 constant weight, small enough to dump and reason about by hand.
+N = 4
+
+
+class _AddConst(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        # arange weight: non-zero everywhere so an unrun prepack (out = x + 0 = x)
+        # is unambiguously distinguishable from a correct one (out = x + w).
+        self.w = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N)
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.w
+
+
+class _AddTwoConst(torch.nn.Module):
+    # Two constants => two prepack nodes (the multi-copy path E2E Llama needs);
+    # add-only so it stays delegated with just this stack's registered ops.
+    def __init__(self) -> None:
+        super().__init__()
+        self.w1 = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N)
+        )
+        self.w2 = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N) * 0.5 - 3.0
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.w1 + self.w2
+
+
+class _AddTiedConst(torch.nn.Module):
+    # Two separate Parameters with BYTE-IDENTICAL data => two prepack nodes that
+    # are expected to share ONE SHA256 named-data key (tied/duplicate weights), so
+    # the prepack handler materializes the same key twice (get_data + Free each).
+    def __init__(self) -> None:
+        super().__init__()
+        self.w1 = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N)
+        )
+        self.w2 = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N)
+        )
+        # Pins only the equal-bytes premise; the dedup to one key is NOT asserted.
+        assert torch.equal(self.w1, self.w2)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.w1 + self.w2
+
+
+def _inputs() -> tuple[torch.Tensor]:
+    # ((i % 13) - 6) / 16: exact in fp32, matches test_webgpu_native.cpp.
+    idx = torch.arange(N * N, dtype=torch.int64)
+    x = (((idx % 13) - 6).to(torch.float32) / 16.0).reshape(N, N)
+    return (x,)
+
+
+def _export(model, inputs):
+    ep = torch.export.export(model.eval(), inputs)
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+class TestPrepack(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        # Each model must fully delegate -- every constant wrapped in a prepack
+        # node inside a VulkanBackend delegate (single, multi-const, tied).
+        for name, model in (
+            ("x + w", _AddConst()),
+            ("x + w1 + w2", _AddTwoConst()),
+            ("x + w + w (tied)", _AddTiedConst()),
+        ):
+            with self.subTest(model=name):
+                et = _export(model, _inputs())
+                found = any(
+                    d.id == "VulkanBackend"
+                    for plan in et.executorch_program.execution_plan
+                    for d in plan.delegates
+                )
+                self.assertTrue(found, f"Expected a VulkanBackend delegate: {name}")
+
+
+def _write(model, pte_path: str, golden_path: str) -> None:
+    (x,) = _inputs()
+    golden = model.eval()(x)
+    et = _export(model, (x,))
+    with open(pte_path, "wb") as f:
+        f.write(et.buffer)
+    golden.detach().numpy().astype("<f4").tofile(golden_path)
+    print(f"Exported {pte_path}; golden {golden_path} ({golden.numel()} floats)")
+
+
+def export_prepack_model(pte_path: str, golden_path: str) -> None:
+    """Export `x + w` (one prepacked constant): writes the .pte and the torch
+    golden (raw LE fp32). The native test rebuilds the /16 ramp input itself."""
+    _write(_AddConst(), pte_path, golden_path)
+
+
+def export_prepack_two_const_model(pte_path: str, golden_path: str) -> None:
+    """Export `x + w1 + w2` (two distinct prepacked constants, i.e. the
+    multi-copy path): writes the .pte and the torch golden (raw LE fp32)."""
+    _write(_AddTwoConst(), pte_path, golden_path)
+
+
+def export_prepack_tied_const_model(pte_path: str, golden_path: str) -> None:
+    """Export `x + w1 + w2` with BYTE-IDENTICAL w1 and w2, which are expected to
+    share one named-data key, so two prepack nodes materialize the same key
+    (checks per-call buffer ownership / no double-free on tied weights)."""
+    _write(_AddTiedConst(), pte_path, golden_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index 6a607bcab17..ad7ad2f2fc2 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -536,6 +536,74 @@ static bool test_rope(
   return true;
 }
 
+static bool test_prepack(
+    const std::string& model_path,
+    const std::string& golden_path,
+    const std::string& label = "x + const w") {
+  // Model output vs torch golden (test_prepack.py); unrun copy leaves zeros.
+  constexpr int n = 4; // matches N in test_prepack.py
+  constexpr int numel = n * n;
+  printf("\n--- Test: prepack (%s, %dx%d) ---\n", label.c_str(), n, n);
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  std::vector<float> golden = load_golden(golden_path, numel);
+  if (golden.empty()) {
+    printf("FAIL: could not load golden %s\n", golden_path.c_str());
+    return false;
+  }
+
+  // ((i % 13) - 6) / 16: exact in fp32, matches test_prepack.py::_inputs.
+  std::vector<float> x_data(numel);
+  for (int i = 0; i < numel; i++) {
+    x_data[i] = static_cast<float>((i % 13) - 6) / 16.0f;
+  }
+  auto x = make_tensor_ptr({n, n}, std::vector<float>(x_data));
+
+  auto result = module.forward({EValue(x)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.numel() != numel) {
+    printf(
+        "FAIL: output numel %zu != expected %d\n",
+        (size_t)out_tensor.numel(),
+        numel);
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_abs_err = 0.0f, max_rel_err = 0.0f;
+  // quant_within_tol gates each element on abs-OR-rel error; a global rel gate
+  // would spuriously fail near-zero outputs, where rel error explodes.
+  const bool within = quant_within_tol(
+      out_data, golden.data(), numel, 1e-3f, 1e-3f, &max_abs_err, &max_rel_err);
+  printf(
+      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
+      max_abs_err,
+      max_rel_err,
+      numel);
+  if (!within) {
+    printf("FAIL: prepack exceeds tolerance 1e-3\n");
+    return false;
+  }
+  printf("PASS: prepack test\n");
+  return true;
+}
+
 // Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden.
 static bool test_q4gsw_config(
     const Q4gswConfig& cfg,
@@ -1614,6 +1682,30 @@ int main(int argc, char** argv) {
        64},
   };
 
+  std::string prepack_model_path, prepack_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_MODEL")) {
+    prepack_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_GOLDEN")) {
+    prepack_golden_path = env;
+  }
+
+  std::string prepack2_model_path, prepack2_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_MODEL")) {
+    prepack2_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_GOLDEN")) {
+    prepack2_golden_path = env;
+  }
+
+  std::string prepack_tied_model_path, prepack_tied_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_MODEL")) {
+    prepack_tied_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_TIED_GOLDEN")) {
+    prepack_tied_golden_path = env;
+  }
+
   // SDPA sweep: configs self-discover their sdpa_<name>.pte/.golden.bin under
   // this directory (default "" = the embedded-file root / cwd). Set
   // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/).
@@ -1679,6 +1771,24 @@ int main(int argc, char** argv) {
     }
   }
 
+  if (!prepack_model_path.empty() && !prepack_golden_path.empty()) {
+    ok = test_prepack(prepack_model_path, prepack_golden_path) && ok;
+  }
+
+  if (!prepack2_model_path.empty() && !prepack2_golden_path.empty()) {
+    ok = test_prepack(
+             prepack2_model_path, prepack2_golden_path, "x + w1 + w2") &&
+        ok;
+  }
+
+  if (!prepack_tied_model_path.empty() && !prepack_tied_golden_path.empty()) {
+    ok = test_prepack(
+             prepack_tied_model_path,
+             prepack_tied_golden_path,
+             "x + w + w (tied weights, shared key)") &&
+        ok;
+  }
+
   bool sdpa_ran = false;
   bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran);
   if (sdpa_ran) {