pytorch
diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py
Lines changed: 80 additions & 2 deletions
diff --git a/backends/arm/test/ops/test_avg_pool2d.py b/backends/arm/test/ops/test_avg_pool2d.py
Lines changed: 13 additions & 1 deletion
diff --git a/backends/arm/test/tester/analyze_output_utils.py b/backends/arm/test/tester/analyze_output_utils.py
Lines changed: 74 additions & 35 deletions
@@ -13,6 +13,7 @@
 from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass
 from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT
 from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
+from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 from executorch.exir.backend.utils import WhyNoPartitionReporter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
@@ -50,6 +51,15 @@ def get_view(op):
     raise RuntimeError(f"Can't get meandim decomposition for op {op}")
 
 
+def get_quantization(op):
+    """Return the ``(quantize_op, dequantize_op)`` pair matching a dequantize op.
+
+    ``op`` is looked up in ``DQ_OPS``; its position there selects the quantize op
+    from ``Q_OPS``. This presumably relies on both sequences listing the
+    per-channel/per-tensor variants in the same order.
+
+    Returns:
+        ``(Q_OPS[i], op)`` when ``op`` sits at position ``i`` of ``DQ_OPS``,
+        ``None`` when ``op`` is not a dequantize op.
+    """
+    if op not in DQ_OPS:
+        return None
+    # The input of a dequant node can be a placeholder, so the quant op cannot be
+    # read off the graph; derive it from the dequant op's position instead.
+    return Q_OPS[DQ_OPS.index(op)], op
+
+
 class DecomposeMeanDimPass(ArmPass):
     """
     Decomposes a meandim into avg_pool and/or sum + mul (1/N) depending on which dims the mean is taken for:
@@ -121,6 +131,7 @@ def call_operator(self, op, args, kwargs, meta):
                 dims_to_reduce = [dim - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
+            x = self._maybe_insert_q_dq_after(x, meta)
 
         # Reduce (h,w) dims by avg pool if possible
         x, dims_to_reduce = self._reduce_by_average_pool(op, x, dims_to_reduce, meta)
@@ -133,7 +144,7 @@ def call_operator(self, op, args, kwargs, meta):
             dims_to_reduce = [dim + len(original_dims) - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, temp_shape), {}, meta, True)
-
+            x = self._maybe_insert_q_dq_after(x, meta)
         # Reduce remaining dims by sum
         x = self._reduce_by_sum(op, x, dims_to_reduce, meta, dtype)
 
@@ -156,6 +167,45 @@ def _reduce_by_sum(self, op, input_node, dims, meta, dtype):
         full = super().call_operator(
             full_op, ([1] * len(output_shape), 1 / N), {"dtype": dtype}, meta, True
         )
+        if (quant_ops := get_quantization(input_node.node.target)) is not None:
+            # Insert Q and DQ nodes after full op.
+            # Since the value of full (1/N) is known, we can compute quant params such that
+            # dq(qmax) == 1/N, i.e. the constant 1/N quantizes to qmax.
+            q_op, dq_op = quant_ops
+            qmax = input_node.node.args[4]
+            full_quant_args = (
+                1 / (N * qmax),  # Scale to map qmax to 1/N
+                0,  # Zero point
+                *input_node.node.args[3:],
+            )
+            q_args = (full, *full_quant_args)
+            full = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (full, *full_quant_args)
+            full = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
+            # Insert Q and DQ nodes after sum op.
+            # Scale needs to be adjusted with N, since it was computed on data after the division with N.
+            sum_quant_args = (input_node.node.args[1] * N, *input_node.node.args[2:])
+            q_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
         return super().call_operator(mul_op, (sum, full), {}, meta, True)
 
     def _reduce_by_average_pool(self, op, input_node, dims, meta):
@@ -190,10 +240,38 @@ def _reduce_by_average_pool(self, op, input_node, dims, meta):
         )
 
         if is_supported:
+            out = super().call_operator(avgpool_op, args, {}, meta, True)
+            out = self._maybe_insert_q_dq_after(out, meta)
             return (
-                super().call_operator(avgpool_op, args, {}, meta, True),
+                out,
                 dims_to_reduce_by_sum,
             )
 
         else:
             return input_node, dims
+
+    def _maybe_insert_q_dq_after(self, op, meta):
+        """Re-quantize the output of ``op`` when its input is a dequantize node.
+
+        If the single input node of ``op`` is a dequant node (its target is found
+        by ``get_quantization``), a quantize/dequantize pair reusing that node's
+        quantization arguments (``args[1:]``) is inserted after ``op`` and the
+        dequantized result is returned. Otherwise ``op`` is returned unchanged.
+
+        Args:
+            op: Result of a previous ``call_operator`` call; must expose ``.node``.
+            meta: Node metadata forwarded to the inserted q/dq operators.
+
+        Raises:
+            ValueError: If ``op.node`` does not have exactly one input node.
+        """
+        input_nodes = op.node.all_input_nodes
+        # Reject zero inputs as well as several: indexing an empty list below would
+        # otherwise surface as an unrelated IndexError.
+        if len(input_nodes) != 1:
+            raise ValueError(
+                f"Expected one input to {op.node}, got inputs {input_nodes}"
+            )
+        input_node = input_nodes[0]
+        quant_ops = get_quantization(input_node.target)
+        if quant_ops is None:
+            return op
+
+        q_op, dq_op = quant_ops
+        # Everything after the tensor argument of the dequant node is its
+        # quantization parameters; reuse them verbatim for the new q/dq pair.
+        quant_args = list(input_node.args[1:])
+        quantized = super().call_operator(
+            q_op, (op, *quant_args), kwargs={}, meta=meta, updated=True
+        )
+        return super().call_operator(
+            dq_op, (quantized, *quant_args), kwargs={}, meta=meta, updated=True
+        )
@@ -23,7 +23,7 @@
     VgfPipeline,
 )
 
-aten_op = "torch.ops.aten.avg_pool2d.default"
+aten_op = "avg_pool2d.default"
 exir_op = "executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"
 
 input_t = Tuple[torch.Tensor]
@@ -34,6 +34,15 @@ def forward(self, *args, **kwargs):
         return super().forward(*args, **kwargs)
 
 
+class BecomesMeanInToEdge(torch.nn.Module):
+    """Adaptive average pool down to a 1x1 output.
+
+    This average pool is converted to a mean when lowering to edge, so the
+    decompose_meandim pass does not trigger until the backend pipeline, which
+    requires extra care.
+    """
+
+    def forward(self, x: torch.Tensor):
+        output_size = (1, 1)
+        return torch.nn.functional.adaptive_avg_pool2d(x, output_size)
+
+
 test_modules = {
     "zeros": lambda: (AvgPool2d(4, 2, 0, False), (torch.zeros(1, 16, 50, 32),)),
     "ones": lambda: (AvgPool2d(4, 2, 0, False, True), (torch.ones(1, 16, 50, 32),)),
@@ -110,6 +119,9 @@ def forward(self, *args, **kwargs):
         AvgPool2d(3, (1, 3), 1, count_include_pad=False),
         (torch.rand(1, 16, 54, 54),),
     ),
+    "becomes_mean_rank3": lambda: (BecomesMeanInToEdge(), (torch.rand(2, 8, 8),)),
+    "becomes_mean_rank4": lambda: (BecomesMeanInToEdge(), (torch.rand(2, 2, 8, 8),)),
+    "becomes_mean_rank5": lambda: (BecomesMeanInToEdge(), (torch.rand(2, 2, 8, 8),)),
 }
 
 
 
@@ -5,6 +5,7 @@
 
 import logging
 import tempfile
+from typing import Any, cast, Sequence
 
 import torch
 from executorch.backends.arm.test.runner_utils import (
@@ -17,9 +18,30 @@
 logger = logging.getLogger(__name__)
 
 
-def _print_channels(result, reference, channels_close, C, H, W, rtol, atol):
+TensorLike = torch.Tensor | tuple[torch.Tensor, ...]
+
+
+def _ensure_tensor(value: TensorLike) -> torch.Tensor:
+    """Return ``value`` itself if it is a tensor, otherwise its first element.
+
+    Raises:
+        TypeError: If ``value`` is neither a tensor nor a non-empty sequence whose
+            first element is a tensor.
+    """
+    if not isinstance(value, torch.Tensor):
+        # Only index into the container when it is non-empty.
+        first = value[0] if value else None
+        if not isinstance(first, torch.Tensor):
+            raise TypeError("Expected a Tensor or a non-empty tuple of Tensors")
+        return first
+    return value
+
+
+def _print_channels(
+    result: torch.Tensor,
+    reference: torch.Tensor,
+    channels_close: Sequence[bool],
+    C: int,
+    H: int,
+    W: int,
+    rtol: float,
+    atol: float,
+) -> str:
 
     output_str = ""
+    exp = "000"
     booldata = False
     if reference.dtype == torch.bool or result.dtype == torch.bool:
         booldata = True
@@ -62,7 +84,15 @@ def _print_channels(result, reference, channels_close, C, H, W, rtol, atol):
     return output_str
 
 
-def _print_elements(result, reference, C, H, W, rtol, atol):
+def _print_elements(
+    result: torch.Tensor,
+    reference: torch.Tensor,
+    C: int,
+    H: int,
+    W: int,
+    rtol: float,
+    atol: float,
+) -> str:
     output_str = ""
     for y in range(H):
         res = "["
@@ -92,14 +122,16 @@ def _print_elements(result, reference, C, H, W, rtol, atol):
 
 
 def print_error_diffs(
-    tester,
-    result: torch.Tensor | tuple,
-    reference: torch.Tensor | tuple,
-    quantization_scale=None,
-    atol=1e-03,
-    rtol=1e-03,
-    qtol=0,
-):
+    tester_or_result: Any,
+    result_or_reference: TensorLike,
+    reference: TensorLike | None = None,
+    # Force remaining args to be keyword-only to keep the two positional call patterns unambiguous.
+    *,
+    quantization_scale: float | None = None,
+    atol: float = 1e-03,
+    rtol: float = 1e-03,
+    qtol: float = 0,
+) -> None:
     """
     Prints the error difference between a result tensor and a reference tensor in NCHW format.
     Certain formatting rules are applied to clarify errors:
@@ -130,15 +162,16 @@ def print_error_diffs(
 
 
     """
-
-    if isinstance(reference, tuple):
-        reference = reference[0]
-    if isinstance(result, tuple):
-        result = result[0]
-
-    if not result.shape == reference.shape:
+    if reference is None:
+        result = _ensure_tensor(cast(TensorLike, tester_or_result))
+        reference_tensor = _ensure_tensor(result_or_reference)
+    else:
+        result = _ensure_tensor(result_or_reference)
+        reference_tensor = _ensure_tensor(reference)
+
+    if result.shape != reference_tensor.shape:
         raise ValueError(
-            f"Output needs to be of same shape: {result.shape} != {reference.shape}"
+            f"Output needs to be of same shape: {result.shape} != {reference_tensor.shape}"
         )
     shape = result.shape
 
@@ -161,29 +194,29 @@ def print_error_diffs(
 
     # Reshape tensors to 4D NCHW format
     result = torch.reshape(result, (N, C, H, W))
-    reference = torch.reshape(reference, (N, C, H, W))
+    reference_tensor = torch.reshape(reference_tensor, (N, C, H, W))
 
     output_str = ""
     for n in range(N):
         output_str += f"BATCH {n}\n"
         result_batch = result[n, :, :, :]
-        reference_batch = reference[n, :, :, :]
+        reference_batch = reference_tensor[n, :, :, :]
 
         is_close = torch.allclose(result_batch, reference_batch, rtol, atol)
         if is_close:
             output_str += ".\n"
         else:
-            channels_close = [None] * C
+            channels_close: list[bool] = [False] * C
             for c in range(C):
                 result_hw = result[n, c, :, :]
-                reference_hw = reference[n, c, :, :]
+                reference_hw = reference_tensor[n, c, :, :]
 
                 channels_close[c] = torch.allclose(result_hw, reference_hw, rtol, atol)
 
             if any(channels_close) or len(channels_close) == 1:
                 output_str += _print_channels(
                     result[n, :, :, :],
-                    reference[n, :, :, :],
+                    reference_tensor[n, :, :, :],
                     channels_close,
                     C,
                     H,
@@ -193,17 +226,23 @@ def print_error_diffs(
                 )
             else:
                 output_str += _print_elements(
-                    result[n, :, :, :], reference[n, :, :, :], C, H, W, rtol, atol
+                    result[n, :, :, :],
+                    reference_tensor[n, :, :, :],
+                    C,
+                    H,
+                    W,
+                    rtol,
+                    atol,
                 )
         if reference_batch.dtype == torch.bool or result_batch.dtype == torch.bool:
             mismatches = (reference_batch != result_batch).sum().item()
             total = reference_batch.numel()
             output_str += f"(BOOLEAN tensor) {mismatches} / {total} elements differ ({mismatches / total:.2%})\n"
 
     # Only compute numeric error metrics if tensor is not boolean
-    if reference.dtype != torch.bool and result.dtype != torch.bool:
-        reference_range = torch.max(reference) - torch.min(reference)
-        diff = torch.abs(reference - result).flatten()
+    if reference_tensor.dtype != torch.bool and result.dtype != torch.bool:
+        reference_range = torch.max(reference_tensor) - torch.min(reference_tensor)
+        diff = torch.abs(reference_tensor - result).flatten()
         diff = diff[diff.nonzero()]
         if not len(diff) == 0:
             diff_percent = diff / reference_range
@@ -230,14 +269,14 @@ def print_error_diffs(
 
 
 def dump_error_output(
-    tester,
-    reference_output,
-    stage_output,
-    quantization_scale=None,
-    atol=1e-03,
-    rtol=1e-03,
-    qtol=0,
-):
+    tester: Any,
+    reference_output: TensorLike,
+    stage_output: TensorLike,
+    quantization_scale: float | None = None,
+    atol: float = 1e-03,
+    rtol: float = 1e-03,
+    qtol: float = 0,
+) -> None:
     """
     Prints Quantization info and error tolerances, and saves the differing tensors to disc.
     """