From d4053045960fbb61af24cda5284a0dea259cac77 Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Mon, 15 Jun 2026 15:37:32 -0700
Subject: [PATCH 1/2] V2 quantizer: fix IO-boundary shared clusters left in
 float

Summary:
Shared-op clusters (e.g. `cat`, `view`, `reshape`) on the quantized IO boundary were silently left in float by the composable TOSA quantizer (`_TOSAQuantizerV2`), causing them to fall off the Ethos-U integer delegate onto CPU.

`SharedQspecQuantizer` only propagates a qspec that it finds on an already-quantized neighbor. If a cluster's only quantized neighbors were a uint8 model input (whose qspec is intentionally skipped by `_skip_shared_qspec_from_io` to confine uint8 to the IO boundary) and/or an input-state placeholder annotated without an `output_qspec`, there was no qspec to propagate, so the cluster was rejected and remained in float.

The fix adds `_is_quantized_io_boundary`, which detects annotated `placeholder`/`output` nodes; these signal that the cluster is on the quantized data path even when their qspec is filtered out. `_get_shared_clique` now returns a `touches_quantized_io` flag alongside the shared nodes and adjacent qspecs. When `_annotate_shared_cluster` finds `adjacent_qspecs` empty but the cluster touches the quantized IO boundary, it seeds quantization from the global config's input-activation qspec (provided a global config is set and supplies one) instead of rejecting the cluster. `_TOSAQuantizerV2.set_global` now also assigns the config to `shared_qspec_quantizer.global_config` so the fallback is wired automatically.

This restores the correctness fix from D107320847, which was abandoned because its other fix (parameter-operand weight misclassification) had already been resolved via the `is_weight` `PARAMETER_TARGETS` refactor.

This change was developed with assistance from Claude.

Differential Revision: D108662081
---
 backends/arm/quantizer/arm_quantizer.py       |  1 +
 backends/arm/quantizer/arm_quantizer_utils.py | 41 ++++++++++++--
 .../quantizer/test_uint8_io_quantization.py   | 53 +++++++++++++++++++
 3 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py
index 3508410509c..0080d77ab69 100644
--- a/backends/arm/quantizer/arm_quantizer.py
+++ b/backends/arm/quantizer/arm_quantizer.py
@@ -1220,6 +1220,7 @@ def set_global(
             quantization_config, node_finder, self.pattern_matcher
         )
         self.global_config = quantization_config
+        self.shared_qspec_quantizer.global_config = quantization_config
         return self
 
     def set_node_target(
diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py
index d4c2dfebdee..9680406ec0e 100644
--- a/backends/arm/quantizer/arm_quantizer_utils.py
+++ b/backends/arm/quantizer/arm_quantizer_utils.py
@@ -480,6 +480,7 @@ class SharedQspecQuantizer(Quantizer, QuantizerReporterUser):
     def __init__(self, targets: Optional[list[Callable[..., object]]] = None) -> None:
         super().__init__()
         QuantizerReporterUser.__init__(self)
+        self.global_config: Optional[QuantizationConfig] = None
         if targets is None:
             self.targets = self.SHARED_QSPEC_OPS_DEFAULT
             self.support_config_path = (
@@ -551,10 +552,24 @@ def _append_input_qspec(
             return
         adjacent_qspecs.append(input_qspec)
 
-    def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]:
+    def _is_quantized_io_boundary(self, node: Node) -> bool:
+        """Return True if node is a model input/output annotated by the quantizer.
+
+        Such a node sits on the quantized interface but its qspec is often
+        filtered out of shared-cluster propagation: a uint8 IO qspec is skipped
+        by _skip_shared_qspec_from_io, and an input-state placeholder may carry
+        an annotation with no output_qspec. Its presence still signals that the
+        cluster is on the quantized data path.
+        """
+        return node.op in ("placeholder", "output") and self._is_annotated(node)
+
+    def _get_shared_clique(
+        self, root_node: Node
+    ) -> tuple[set[Node], list[Any], bool]:
         shared_nodes = set()
         bfs_queue = [root_node]
         adjacent_qspecs: list[Any] = []
+        touches_quantized_io = False
 
         while bfs_queue:
             node = bfs_queue.pop(0)
@@ -563,12 +578,14 @@ def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any]]:
             for input_node in node.all_input_nodes:
                 self._maybe_enqueue_shared_node(input_node, shared_nodes, bfs_queue)
                 self._append_output_qspec(input_node, adjacent_qspecs)
+                touches_quantized_io |= self._is_quantized_io_boundary(input_node)
 
             for output_node in node.users.keys():
                 self._maybe_enqueue_shared_node(output_node, shared_nodes, bfs_queue)
                 self._append_input_qspec(output_node, node, adjacent_qspecs)
+                touches_quantized_io |= self._is_quantized_io_boundary(output_node)
 
-        return shared_nodes, adjacent_qspecs
+        return shared_nodes, adjacent_qspecs, touches_quantized_io
 
     def _should_skip_while_shared_qspec(self, node: Node) -> bool:
         return node.target == torch.ops.higher_order.while_loop and bool(
@@ -623,7 +640,25 @@ def _annotate_shared_cluster(self, root_node: Node) -> None:
             )
             return
 
-        shared_nodes, adjacent_qspecs = self._get_shared_clique(root_node)
+        shared_nodes, adjacent_qspecs, touches_quantized_io = self._get_shared_clique(
+            root_node
+        )
+
+        # If there is no neighbor qspec to propagate but the cluster sits on the
+        # quantized I/O boundary (e.g. a state-passthrough cat whose only neighbors
+        # are a uint8 model input skipped by _skip_shared_qspec_from_io and an
+        # input-state placeholder with no output_qspec), initiate quantization from
+        # the global config rather than leaving the cluster in float. Without this,
+        # such clusters fall off the integer delegate onto CPU.
+        if (
+            len(adjacent_qspecs) == 0
+            and touches_quantized_io
+            and self.global_config is not None
+        ):
+            global_input_qspec = self.global_config.get_input_act_qspec()
+            if global_input_qspec is not None:
+                adjacent_qspecs = [global_input_qspec]
+
         node_order = {node: index for index, node in enumerate(root_node.graph.nodes)}
         ordered_nodes = sorted(shared_nodes, key=lambda node: node_order.get(node, 0))
 
diff --git a/backends/arm/test/quantizer/test_uint8_io_quantization.py b/backends/arm/test/quantizer/test_uint8_io_quantization.py
index 7461ca85a6f..32bf6fbc590 100644
--- a/backends/arm/test/quantizer/test_uint8_io_quantization.py
+++ b/backends/arm/test/quantizer/test_uint8_io_quantization.py
@@ -4,8 +4,11 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e
+from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
 
 from executorch.backends.arm.quantizer import (
+    get_symmetric_quantization_config,
     get_uint8_io_quantization_config,
     TOSAQuantizer,
 )
@@ -24,6 +27,20 @@ def forward(self, x):
         return self.fc2(self.relu(self.fc1(x)))
 
 
+class CloneAtIoBoundary(torch.nn.Module):
+    """zero-arithmetic cluster whose only adjacent annotated neighbours are
+    uint8-annotated IO nodes (input placeholder + graph output).
+
+    With set_global(int8) + set_io(uint8), both the placeholder and the output
+    node carry uint8 qspecs that _skip_shared_qspec_from_io filters out, leaving
+    adjacent_qspecs empty. Before the IO-boundary fallback fix in
+    SharedQspecQuantizer, this caused the cluster to stay in float.
+    """
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.clone(x)
+
+
 def test_uint8_io_quantization_config_tosa_INT_applies_to_io():
     model = SimpleMLP().eval()
     test_data = (torch.rand(1, 4),)
@@ -40,3 +57,39 @@ def test_uint8_io_quantization_config_tosa_INT_applies_to_io():
         output_qspecs={io_config.output_activation: 1},
     )
     pipeline.run()
+
+
+def test_io_boundary_shared_cluster_is_quantized():
+    """Regression: a zero-arithmetic cluster adjacent only to uint8-annotated IO
+    nodes must be annotated with the global int8 qspec, not left in float.
+
+    _skip_shared_qspec_from_io filters the uint8 qspec from IO nodes, so when
+    the cluster's only neighbours are such nodes adjacent_qspecs ends up empty.
+    The fix in SharedQspecQuantizer detects the IO-boundary via
+    _is_quantized_io_boundary and falls back to global_config.get_input_act_qspec().
+    """
+    model = CloneAtIoBoundary().eval()
+    test_data = (torch.rand(1, 4),)
+    compile_spec = common.get_tosa_compile_spec("TOSA-1.0+INT")
+
+    quantizer = TOSAQuantizer(compile_spec, use_composable_quantizer=True)
+    quantizer.set_global(get_symmetric_quantization_config())
+    quantizer.set_io(get_uint8_io_quantization_config())
+
+    exported = torch.export.export(model, test_data, strict=True)
+    prepared = prepare_pt2e(exported.module(), quantizer)
+
+    clone_nodes = [
+        n
+        for n in prepared.graph.nodes
+        if n.op == "call_function" and n.target == torch.ops.aten.clone.default
+    ]
+    assert len(clone_nodes) == 1, f"Expected 1 clone node, got {len(clone_nodes)}"
+    clone_node = clone_nodes[0]
+
+    assert Q_ANNOTATION_KEY in clone_node.meta, (
+        "clone node was not annotated — IO-boundary cluster stayed in float"
+    )
+    assert clone_node.meta[Q_ANNOTATION_KEY].output_qspec is not None, (
+        "clone node has no output_qspec — IO-boundary cluster stayed in float"
+    )

From c0ac9b6b9161be042f8fdccefb1664944b24b9b0 Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Tue, 16 Jun 2026 14:22:40 -0700
Subject: [PATCH 2/2] Apply lintrunner formatting to V2 quantizer IO-boundary
 fix

Runs lintrunner -a on the two files flagged by the Lint check on #20291
(UFMT import ordering and signature wrapping, DOCFORMATTER docstrings).
Formatting only; no logic changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 backends/arm/quantizer/arm_quantizer_utils.py |  8 ++++----
 .../quantizer/test_uint8_io_quantization.py   | 19 ++++++++++---------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py
index 9680406ec0e..a59ccff87b1 100644
--- a/backends/arm/quantizer/arm_quantizer_utils.py
+++ b/backends/arm/quantizer/arm_quantizer_utils.py
@@ -553,19 +553,19 @@ def _append_input_qspec(
         adjacent_qspecs.append(input_qspec)
 
     def _is_quantized_io_boundary(self, node: Node) -> bool:
-        """Return True if node is a model input/output annotated by the quantizer.
+        """Return True if node is a model input/output annotated by the
+        quantizer.
 
         Such a node sits on the quantized interface but its qspec is often
         filtered out of shared-cluster propagation: a uint8 IO qspec is skipped
         by _skip_shared_qspec_from_io, and an input-state placeholder may carry
         an annotation with no output_qspec. Its presence still signals that the
         cluster is on the quantized data path.
+
         """
         return node.op in ("placeholder", "output") and self._is_annotated(node)
 
-    def _get_shared_clique(
-        self, root_node: Node
-    ) -> tuple[set[Node], list[Any], bool]:
+    def _get_shared_clique(self, root_node: Node) -> tuple[set[Node], list[Any], bool]:
         shared_nodes = set()
         bfs_queue = [root_node]
         adjacent_qspecs: list[Any] = []
diff --git a/backends/arm/test/quantizer/test_uint8_io_quantization.py b/backends/arm/test/quantizer/test_uint8_io_quantization.py
index 32bf6fbc590..3b839dc01c0 100644
--- a/backends/arm/test/quantizer/test_uint8_io_quantization.py
+++ b/backends/arm/test/quantizer/test_uint8_io_quantization.py
@@ -4,8 +4,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
-from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e
-from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
 
 from executorch.backends.arm.quantizer import (
     get_symmetric_quantization_config,
@@ -14,6 +12,8 @@
 )
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.test_pipeline import QuantizationPipeline
+from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e
+from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
 
 
 class SimpleMLP(torch.nn.Module):
@@ -28,13 +28,14 @@ def forward(self, x):
 
 
 class CloneAtIoBoundary(torch.nn.Module):
-    """zero-arithmetic cluster whose only adjacent annotated neighbours are
+    """Zero-arithmetic cluster whose only adjacent annotated neighbours are
     uint8-annotated IO nodes (input placeholder + graph output).
 
     With set_global(int8) + set_io(uint8), both the placeholder and the output
     node carry uint8 qspecs that _skip_shared_qspec_from_io filters out, leaving
     adjacent_qspecs empty. Before the IO-boundary fallback fix in
     SharedQspecQuantizer, this caused the cluster to stay in float.
+
     """
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -87,9 +88,9 @@ def test_io_boundary_shared_cluster_is_quantized():
     assert len(clone_nodes) == 1, f"Expected 1 clone node, got {len(clone_nodes)}"
     clone_node = clone_nodes[0]
 
-    assert Q_ANNOTATION_KEY in clone_node.meta, (
-        "clone node was not annotated — IO-boundary cluster stayed in float"
-    )
-    assert clone_node.meta[Q_ANNOTATION_KEY].output_qspec is not None, (
-        "clone node has no output_qspec — IO-boundary cluster stayed in float"
-    )
+    assert (
+        Q_ANNOTATION_KEY in clone_node.meta
+    ), "clone node was not annotated — IO-boundary cluster stayed in float"
+    assert (
+        clone_node.meta[Q_ANNOTATION_KEY].output_qspec is not None
+    ), "clone node has no output_qspec — IO-boundary cluster stayed in float"