pytorch
diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py
Lines changed: 80 additions & 2 deletions
diff --git a/backends/arm/_passes/fuse_constant_ops_pass.py b/backends/arm/_passes/fuse_constant_ops_pass.py
Lines changed: 2 additions & 1 deletion
diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py
Lines changed: 126 additions & 20 deletions
@@ -13,6 +13,7 @@
 from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass
 from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT
 from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
+from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 from executorch.exir.backend.utils import WhyNoPartitionReporter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
@@ -50,6 +51,15 @@ def get_view(op):
     raise RuntimeError(f"Can't get meandim decomposition for op {op}")
 
 
+def get_quantization(op):
+    """Return the (quantize, dequantize) op pair matching a dequantize op.
+
+    Args:
+        op: Operator target to inspect.
+
+    Returns:
+        A tuple ``(q_op, dq_op)`` of the same type (per-channel / per-tensor)
+        as ``op`` when ``op`` is found in ``DQ_OPS``: ``dq_op`` is ``op``
+        itself and ``q_op`` is the entry of ``Q_OPS`` at the same index.
+        ``None`` if ``op`` is not in ``DQ_OPS``.
+
+    """
+    if op in DQ_OPS:
+        # Input of op can be placeholder, can't use that to get quant node directly.
+        # The quant op is looked up by position instead; this assumes Q_OPS and
+        # DQ_OPS are index-aligned sequences -- TODO confirm against
+        # executorch.backends.arm.constants.
+        quant_type_index = DQ_OPS.index(op)
+        return Q_OPS[quant_type_index], op
+    return None
+
+
 class DecomposeMeanDimPass(ArmPass):
     """
     Decomposes a meandim into avg_pool and/or sum + mul (1/N) depending on which dims the mean is taken for:
@@ -121,6 +131,7 @@ def call_operator(self, op, args, kwargs, meta):
                 dims_to_reduce = [dim - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
+            x = self._maybe_insert_q_dq_after(x, meta)
 
         # Reduce (h,w) dims by avg pool if possible
         x, dims_to_reduce = self._reduce_by_average_pool(op, x, dims_to_reduce, meta)
@@ -133,7 +144,7 @@ def call_operator(self, op, args, kwargs, meta):
             dims_to_reduce = [dim + len(original_dims) - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, temp_shape), {}, meta, True)
-
+            x = self._maybe_insert_q_dq_after(x, meta)
         # Reduce remaining dims by sum
         x = self._reduce_by_sum(op, x, dims_to_reduce, meta, dtype)
 
@@ -156,6 +167,45 @@ def _reduce_by_sum(self, op, input_node, dims, meta, dtype):
         full = super().call_operator(
             full_op, ([1] * len(output_shape), 1 / N), {"dtype": dtype}, meta, True
         )
+        if (quant_ops := get_quantization(input_node.node.target)) is not None:
+            # Insert Q and DQ nodes after full op.
+            # Since the value of full is known (1/N), we can compute quant params such that dq(qmax) == 1/N.
+            q_op, dq_op = quant_ops
+            qmax = input_node.node.args[4]
+            full_quant_args = (
+                1 / (N * qmax),  # Scale to map qmax to 1/N
+                0,  # Zero point
+                *input_node.node.args[3:],
+            )
+            q_args = (full, *full_quant_args)
+            full = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (full, *full_quant_args)
+            full = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
+            # Insert Q and DQ nodes after sum op.
+            # Scale needs to be adjusted with N, since it was computed on data after the division with N.
+            sum_quant_args = (input_node.node.args[1] * N, *input_node.node.args[2:])
+            q_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
         return super().call_operator(mul_op, (sum, full), {}, meta, True)
 
     def _reduce_by_average_pool(self, op, input_node, dims, meta):
@@ -190,10 +240,38 @@ def _reduce_by_average_pool(self, op, input_node, dims, meta):
         )
 
         if is_supported:
+            out = super().call_operator(avgpool_op, args, {}, meta, True)
+            out = self._maybe_insert_q_dq_after(out, meta)
             return (
-                super().call_operator(avgpool_op, args, {}, meta, True),
+                out,
                 dims_to_reduce_by_sum,
             )
 
         else:
             return input_node, dims
+
+    def _maybe_insert_q_dq_after(self, op, meta):
+        """Insert a q-dq pair after ``op`` if its input node is a dequant node.
+
+        The inserted pair reuses the quantization parameters of the dequant
+        node feeding ``op`` (its ``args[1:]``) without modification.
+
+        Args:
+            op: Result of a previous ``call_operator`` call; it must expose a
+                ``.node`` attribute with at most one input node.
+            meta: Metadata forwarded to the inserted q and dq operators.
+
+        Returns:
+            The result of the inserted dq operator, or ``op`` unchanged when
+            the input node of ``op`` is not a dequant node.
+
+        Raises:
+            ValueError: If ``op.node`` has more than one input node.
+
+        """
+
+        if len(op.node.all_input_nodes) > 1:
+            raise ValueError(
+                f"Expected one input to {op.node}, got inputs {op.node.all_input_nodes}"
+            )
+        # NOTE(review): only the "more than one input" case is rejected above;
+        # a node without any input nodes would raise IndexError on this line.
+        input_node = op.node.all_input_nodes[0]
+        if (quant_ops := get_quantization(input_node.target)) is not None:
+            q_op, dq_op = quant_ops
+            # Reuse every argument of the dequant node except its first one,
+            # which is replaced by the tensor being (de)quantized below.
+            quant_args = list(input_node.args[1:])
+            q_args = (op, *quant_args)
+            out = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            # Dequantize straight away with the same parameters.
+            dq_args = (out, *quant_args)
+            return super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+        else:
+            return op
@@ -65,7 +65,8 @@ def resolve_arg(arg):
             if isinstance(arg, torch.fx.Node) and arg in input_nodes:
                 idx = input_nodes.index(arg)
                 t = get_param_tensor(self.exported_program, arg)
-                if qparams:
+                # Check if qparams exist for this arg
+                if qparams and idx in qparams.keys():
                     t = qparams[idx].dequantize_value(t)
                 return t
             if isinstance(arg, tuple):
 
@@ -2,6 +2,12 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+"""Provide quantization annotation logic for Arm backends.
+
+This module computes per-node quantization properties and applies input/output
+annotations to FX graphs using TorchAO qspecs.
+
+"""
 
 import logging
 import operator
@@ -44,12 +50,31 @@ class _QuantProperty:
 
 
 class _OpQuantProperties:
+    """Collect input/output quantization properties for a node.
+
+    Attributes:
+        quant_inputs (List[_QuantProperty]): Quantization specs for inputs
+            indexed by argument positions.
+        quant_output (Optional[_QuantProperty]): Quantization spec for the
+            node's output when applicable.
+
+    """
+
     def __init__(self):
         self.quant_inputs: List[_QuantProperty] = []
         self.quant_output: Optional[_QuantProperty] = None
 
 
 def _as_list(x):
+    """Return ``x`` wrapped as a list if needed.
+
+    Args:
+        x: Value or list of values.
+
+    Returns:
+        list: ``x`` if already a list; otherwise ``[x]``.
+
+    """
     if isinstance(x, list):
         return x
     else:
@@ -66,16 +91,19 @@ def _is_ok_for_quantization(
     A node can be quantized if:
     - All inputs that are required for quantization are of type `float32`
       and are not large scalar values.
-    - The output of the node itself is of type `float32` and is not a large scalar.
+    - The output of the node itself is of type `float32` and is not a large
+      scalar.
 
     Args:
         node (Node): The node being analyzed.
-        quant_properties (_OpQuantProperties): Contains quantization properties for
-            the node, including input and output quantization specifications.
-        gm (torch.fx.GraphModule): The graph module containing the computational graph.
+        quant_properties (_OpQuantProperties): Contains quantization properties
+            for the node, including input and output quantization specifications.
+        gm (torch.fx.GraphModule): The graph module containing the computational
+            graph.
 
     Returns:
         bool: `True` if the node can be quantized, otherwise `False`.
+
     """
     # Check output
     if quant_properties.quant_output is not None:
@@ -127,16 +155,28 @@ def _is_ok_for_quantization(
 
 
 def _get_node_target(module: torch.nn.Module | torch.fx.GraphModule, target_str: str):
+    """Get an attribute from a module by dotted path.
+
+    Args:
+        module (torch.nn.Module | torch.fx.GraphModule): Root module.
+        target_str (str): Dotted attribute path, e.g., ``"sub.weight"``.
+
+    Returns:
+        Any: Resolved attribute on the module.
+
+    """
     targets = target_str.split(".")
     for target in targets[:-1]:
         module = module.get_submodule(target)
     return getattr(module, targets[-1])
 
 
 def _is_large_scalar(node: Node, gm: torch.fx.GraphModule):
-    """Check if input is a large scalar value. So that we can skip quantization for the
-    node since histc op (in HistogramObserver) only works for values up to certain upper
-    bound.
+    """Return True if input is a large scalar value.
+
+    Large scalars are skipped because ``torch.histc`` supports values only up
+    to a certain upper bound.
+
     """
     HISTC_UPPER_BOUND = 3.4028235e15
     if node.op == "get_attr" and isinstance(node.target, str):
@@ -166,11 +206,12 @@ def _is_non_float_tensor(node: Node) -> bool:
         bool: `True` if the data type is not float32, otherwise `False`.
 
     Note:
-        - If `node.meta["val"]` is a `list`, the function returns `True` if **any**
-          element is **not** an instance of `FakeTensor` or does **not** have
+        - If `node.meta["val"]` is a `list`, the function returns `True` if
+          any element is not an instance of `FakeTensor` or does not have
           `torch.float32` as its data type.
-        - If node.meta["val"] is missing or is not an instance of `FakeTensor`, the
-          function returns True.
+        - If node.meta["val"] is missing or is not an instance of `FakeTensor`,
+          the function returns True.
+
     """
     if "val" in node.meta and isinstance(node.meta["val"], Sequence):
         return any(
@@ -186,6 +227,20 @@ def _is_non_float_tensor(node: Node) -> bool:
 
 
 def _annotate_input(node: Node, quant_property: _QuantProperty):
+    """Annotate a node's input with the given qspec.
+
+    Maps the specified input argument(s) to the provided quantization spec and
+    optionally marks the input node(s) as annotated.
+
+    Args:
+        node (Node): Node whose input should be annotated.
+        quant_property (_QuantProperty): Input index and qspec(s).
+
+    Raises:
+        RuntimeError: If the node is already annotated.
+        TypeError: If an input argument is not a ``Node`` instance.
+
+    """
     if is_annotated(node):
         raise RuntimeError(
             f"Cannot annotate input: node '{node.name}' is already annotated"
@@ -211,6 +266,18 @@ def _annotate_input(node: Node, quant_property: _QuantProperty):
 
 
 def _annotate_output(node: Node, quant_property: _QuantProperty):
+    """Annotate a node's output with the given qspec.
+
+    Args:
+        node (Node): Node whose output should be annotated.
+        quant_property (_QuantProperty): Output index and qspec.
+
+    Raises:
+        RuntimeError: If the node is already annotated.
+        ValueError: If ``mark_annotated`` is True, ``optional`` is True, or
+            ``index`` is not zero.
+
+    """
     if is_annotated(node):
         raise RuntimeError(
             f"Cannot annotate output: node '{node.name}' is already annotated"
@@ -230,12 +297,13 @@ def _annotate_output(node: Node, quant_property: _QuantProperty):
 def _match_pattern(
     node: Node, pattern: List[List], filter_fn: Optional[Callable[[Node], bool]] = None
 ) -> bool:
-    """
-    Check if there's a chain of node.ancestors? -> node -> node.descendant? that matches the
-    chain provided in 'pattern'. If 'filter_fn' is provided, check that all the nodes in the
-    chain pass the filtering.
+    """Check whether a node chain matches a pattern.
+
+    Verify a chain of ancestors -> node -> descendants matches the provided
+    ``pattern``. If ``filter_fn`` is provided, require all nodes in the chain
+    to pass the filter. Each pattern element is a list of disjunctive node
+    targets.
 
-    Each 'pattern' element is composed of a list of disjunctive nodes types.
     """
     if len(pattern) < 1:
         raise ValueError("No pattern provided")
@@ -382,6 +450,21 @@ def _match_pattern(
 def get_quant_properties(  # noqa: C901
     node: Node, gm: torch.fx.GraphModule, quantization_config
 ) -> _OpQuantProperties | None:
+    """Compute quantization properties for a node.
+
+    Determine which inputs and/or outputs should be annotated for quantization
+    based on the node's operator and surrounding pattern.
+
+    Args:
+        node (Node): Node to analyze.
+        gm (torch.fx.GraphModule): Owning graph module.
+        quantization_config: Source for activation/weight/bias qspecs.
+
+    Returns:
+        _OpQuantProperties | None: Properties to apply, or ``None`` if the
+            node is unsupported or not suitable for quantization.
+
+    """
     input_act_qspec = quantization_config.get_input_act_qspec()
     weight_qspec = quantization_config.get_weight_qspec()
     output_act_qspec = quantization_config.get_output_act_qspec()
@@ -390,6 +473,7 @@ def get_quant_properties(  # noqa: C901
     quant_properties = _OpQuantProperties()
 
     def any_or_hardtanh_min_zero(n: Node):
+        """Return True for any op or hardtanh with ``min_val == 0``."""
         # Check that if the node is a hardtanh, its min_val is zero
         return n.target != torch.ops.aten.hardtanh.default or n.args[1] == 0
 
@@ -524,12 +608,19 @@ def any_or_hardtanh_min_zero(n: Node):
         quant_properties.quant_output = _QuantProperty(0, shared_qspec)
     elif node.target in (torch.ops.aten.where.self,):
         true_node = ensure_type(Node, node.args[1])
-        shared_qspec = SharedQuantizationSpec(true_node)
+        input_qspec = (
+            SharedQuantizationSpec(true_node)
+            if is_output_annotated(true_node)
+            else input_act_qspec
+        )
         quant_properties.quant_inputs = [
-            _QuantProperty(1, shared_qspec),
-            _QuantProperty(2, shared_qspec),
+            _QuantProperty(1, input_qspec),
+            _QuantProperty(2, SharedQuantizationSpec((true_node, node))),
         ]
-        quant_properties.quant_output = _QuantProperty(0, shared_qspec)
+        quant_properties.quant_output = _QuantProperty(
+            0,
+            SharedQuantizationSpec((true_node, node)),
+        )
     elif node.target in _one_to_one_shared_input_or_input_act_qspec:
         input_node = ensure_type(Node, node.args[0])
         input_qspec = (
@@ -636,6 +727,21 @@ def annotate_graph(  # type: ignore[return]
     quantization_config: QuantizationConfig,
     filter_fn: Optional[Callable[[Node], bool]] = None,
 ) -> Optional[List[List[Node]]]:
+    """Annotate supported nodes in a graph with quantization specs.
+
+    Iterate through call_function nodes, compute quantization properties, and
+    apply input/output annotations. A filter can restrict which nodes are
+    considered.
+
+    Args:
+        gm (torch.fx.GraphModule): Graph to annotate.
+        quantization_config (QuantizationConfig): Default qspecs for nodes.
+        filter_fn (Optional[Callable[[Node], bool]]): Optional node predicate.
+
+    Returns:
+        Optional[List[List[Node]]]: Reserved for future use; currently None.
+
+    """
     for node in gm.graph.nodes:
         if node.op != "call_function":
             continue