pytorch
diff --git a/‎backends/arm/_passes/decompose_meandim_pass.py‎
Lines changed: 80 additions & 2 deletions b/‎backends/arm/_passes/decompose_meandim_pass.py‎
Lines changed: 80 additions & 2 deletions
diff --git a/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 2 additions & 1 deletion b/‎backends/arm/_passes/fuse_constant_ops_pass.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/arm/operators/op_avg_pool2d.py‎
Lines changed: 10 additions & 43 deletions b/‎backends/arm/operators/op_avg_pool2d.py‎
Lines changed: 10 additions & 43 deletions
diff --git a/‎backends/arm/operators/op_clamp.py‎
Lines changed: 29 additions & 82 deletions b/‎backends/arm/operators/op_clamp.py‎
Lines changed: 29 additions & 82 deletions
@@ -13,6 +13,7 @@
 from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass
 from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT
 from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass
+from executorch.backends.arm.constants import DQ_OPS, Q_OPS
 from executorch.exir.backend.utils import WhyNoPartitionReporter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
@@ -50,6 +51,15 @@ def get_view(op):
     raise RuntimeError(f"Can't get meandim decomposition for op {op}")
 
 
+def get_quantization(op):
+    """Look up the quantize op that pairs with a dequantize op.
+
+    The quantize op is taken from Q_OPS at the same position that op occupies in DQ_OPS, so the
+    returned pair is of the same kind (per-channel / per-tensor) as op.
+
+    Returns:
+        A (quantize_op, op) tuple when op is listed in DQ_OPS, otherwise None.
+    """
+    try:
+        # The input of a dequant op can be a placeholder, so the quant op is found through the
+        # op tables rather than by looking at the node feeding op.
+        position = DQ_OPS.index(op)
+    except ValueError:
+        return None
+    return Q_OPS[position], op
+
+
 class DecomposeMeanDimPass(ArmPass):
     """
     Decomposes a meandim into avg_pool and/or sum + mul (1/N) depending on which dims the mean is taken for:
@@ -121,6 +131,7 @@ def call_operator(self, op, args, kwargs, meta):
                 dims_to_reduce = [dim - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
+            x = self._maybe_insert_q_dq_after(x, meta)
 
         # Reduce (h,w) dims by avg pool if possible
         x, dims_to_reduce = self._reduce_by_average_pool(op, x, dims_to_reduce, meta)
@@ -133,7 +144,7 @@ def call_operator(self, op, args, kwargs, meta):
             dims_to_reduce = [dim + len(original_dims) - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, temp_shape), {}, meta, True)
-
+            x = self._maybe_insert_q_dq_after(x, meta)
         # Reduce remaining dims by sum
         x = self._reduce_by_sum(op, x, dims_to_reduce, meta, dtype)
 
@@ -156,6 +167,45 @@ def _reduce_by_sum(self, op, input_node, dims, meta, dtype):
         full = super().call_operator(
             full_op, ([1] * len(output_shape), 1 / N), {"dtype": dtype}, meta, True
         )
+        if (quant_ops := get_quantization(input_node.node.target)) is not None:
+            # Insert Q and DQ nodes after full op.
+            # Since the value of full is known, we can compute quant params such that
+            # dq(q_max_value) == 1/N, i.e. the constant maps onto qmax.
+            q_op, dq_op = quant_ops
+            qmax = input_node.node.args[4]
+            full_quant_args = (
+                1 / (N * qmax),  # Scale to map qmax to 1/N
+                0,  # Zero point
+                *input_node.node.args[3:],
+            )
+            q_args = (full, *full_quant_args)
+            full = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (full, *full_quant_args)
+            full = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
+            # Insert Q and DQ nodes after sum op.
+            # Scale needs to be adjusted with N, since it was computed on data after the division with N.
+            sum_quant_args = (input_node.node.args[1] * N, *input_node.node.args[2:])
+            q_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                q_op,
+                q_args,
+                kwargs={},
+                meta=meta,
+                updated=True,
+            )
+            dq_args = (sum, *sum_quant_args)
+            sum = super().call_operator(
+                dq_op, dq_args, kwargs={}, meta=meta, updated=True
+            )
+
         return super().call_operator(mul_op, (sum, full), {}, meta, True)
 
     def _reduce_by_average_pool(self, op, input_node, dims, meta):
@@ -190,10 +240,38 @@ def _reduce_by_average_pool(self, op, input_node, dims, meta):
         )
 
         if is_supported:
+            out = super().call_operator(avgpool_op, args, {}, meta, True)
+            out = self._maybe_insert_q_dq_after(out, meta)
             return (
-                super().call_operator(avgpool_op, args, {}, meta, True),
+                out,
                 dims_to_reduce_by_sum,
             )
 
         else:
             return input_node, dims
+
+    def _maybe_insert_q_dq_after(self, op, meta):
+        """Insert a q-dq pair after op when the input node of op is a dequant node.
+
+        The inserted quantize and dequantize ops reuse the quantization arguments of that input
+        node (every argument after its first one), so they carry identical quantization parameters.
+
+        Args:
+            op: Value returned by call_operator; op.node must have exactly one input node.
+            meta: Node metadata forwarded to the inserted operators.
+
+        Returns:
+            The output of the inserted dequantize op, or op unchanged when its input node is not a
+            dequant node.
+
+        Raises:
+            ValueError: If op.node does not have exactly one input node.
+        """
+
+        input_nodes = op.node.all_input_nodes
+        # Reject zero inputs too; otherwise the indexing below fails with a bare IndexError.
+        if len(input_nodes) != 1:
+            raise ValueError(
+                f"Expected one input to {op.node}, got inputs {input_nodes}"
+            )
+        input_node = input_nodes[0]
+
+        quant_ops = get_quantization(input_node.target)
+        if quant_ops is None:
+            return op
+
+        q_op, dq_op = quant_ops
+        # Skip the first argument of the dequant node (its data input); the rest are reused as-is.
+        quant_args = list(input_node.args[1:])
+        out = super().call_operator(
+            q_op,
+            (op, *quant_args),
+            kwargs={},
+            meta=meta,
+            updated=True,
+        )
+        return super().call_operator(
+            dq_op, (out, *quant_args), kwargs={}, meta=meta, updated=True
+        )
@@ -65,7 +65,8 @@ def resolve_arg(arg):
             if isinstance(arg, torch.fx.Node) and arg in input_nodes:
                 idx = input_nodes.index(arg)
                 t = get_param_tensor(self.exported_program, arg)
-                if qparams:
+                # Check if qparams exist for this arg
+                if qparams and idx in qparams.keys():
                     t = qparams[idx].dequantize_value(t)
                 return t
             if isinstance(arg, tuple):
 
@@ -33,6 +33,7 @@ class AvgPool2dVisitor(NodeVisitor):
 
     tosa_specs = [
         TosaSpecification.create_from_string("TOSA-1.0+INT"),
+        TosaSpecification.create_from_string("TOSA-1.0+FP"),
     ]
 
     def __init__(self, *args):
@@ -105,43 +106,6 @@ def _build_generic_avgpool2d(
             attr,
         )
 
-    def define_node(
-        self,
-        node: torch.fx.Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        validate_num_inputs(self.target, inputs, [3, 4, 5, 6, 7])
-        validate_same_dtype(self.target, [inputs[0], output], ts)
-        validate_valid_dtype(
-            self.target, [inputs[0], output], ts.DType.INT8, output.tosa_spec
-        )
-
-        accumulator_type = ts.DType.INT32
-
-        input_qargs = get_input_qparams(node)
-        input_zp = input_qargs[0].get_zp_per_tensor()
-
-        output_qargs = get_output_qparams(node)
-        output_zp = output_qargs[0].get_zp_per_tensor()
-
-        self._build_generic_avgpool2d(
-            node, tosa_graph, inputs, output, input_zp, output_zp, accumulator_type
-        )
-
-
-@register_node_visitor
-class AvgPool2dVisitor_FP(AvgPool2dVisitor):
-    target = "aten.avg_pool2d.default"
-
-    tosa_specs = [
-        TosaSpecification.create_from_string("TOSA-1.0+FP"),
-    ]
-
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def define_node(
         self,
         node: torch.fx.Node,
@@ -159,14 +123,17 @@ def define_node(
         )
 
         if inputs[0].dtype == ts.DType.INT8:
-            super().define_node(node, tosa_graph, inputs, output)
+            accumulator_type = ts.DType.INT32
+            input_qargs = get_input_qparams(node)
+            input_zp = input_qargs[0].get_zp_per_tensor()
 
-        if inputs[0].dtype == ts.DType.FP32:
+            output_qargs = get_output_qparams(node)
+            output_zp = output_qargs[0].get_zp_per_tensor()
+        else:
             accumulator_type = ts.DType.FP32
-            # Initilize zero point to zero.
             input_zp = 0
             output_zp = 0
 
-            self._build_generic_avgpool2d(
-                node, tosa_graph, inputs, output, input_zp, output_zp, accumulator_type
-            )
+        self._build_generic_avgpool2d(
+            node, tosa_graph, inputs, output, input_zp, output_zp, accumulator_type
+        )
@@ -1,5 +1,4 @@
 # Copyright 2025 Arm Limited and/or its affiliates.
-# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree
@@ -27,18 +26,19 @@
 
 
 @register_node_visitor
-class ClampVisitor_INT(NodeVisitor):
+class ClampVisitor(NodeVisitor):
     target = "aten.clamp.default"
 
     tosa_specs = [
         TosaSpecification.create_from_string("TOSA-1.0+INT"),
+        TosaSpecification.create_from_string("TOSA-1.0+FP"),
     ]
 
     def __init__(self, *args):
         super().__init__(*args)
 
     def _get_min_max_arguments(
-        self, node: Node, dtype_min: int | float, dtype_max: int | float
+        self, node: Node, dtype: torch.dtype
     ) -> Tuple[int | float, int | float]:
 
         def cast_type(value: Any) -> int | float:
@@ -48,6 +48,13 @@ def cast_type(value: Any) -> int | float:
                 # Attempt to cast to float
                 return float(value)
 
+        if dtype.is_floating_point:
+            dtype_min = torch.finfo(dtype).min
+            dtype_max = torch.finfo(dtype).max
+        else:
+            dtype_min = torch.iinfo(dtype).min
+            dtype_max = torch.iinfo(dtype).max
+
         min_arg = dtype_min
         max_arg = dtype_max
 
@@ -60,53 +67,15 @@ def cast_type(value: Any) -> int | float:
 
         return min_arg, max_arg
 
-    def define_node(
-        self,
-        node: Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        validate_num_inputs(self.target, inputs, [2, 3])
-        validate_same_dtype(self.target, [inputs[0], output], ts)
-        validate_valid_dtype(
-            self.target, [inputs[0], output], [ts.DType.INT8], output.tosa_spec
-        )
-
-        # NOTE: Quantization of the min/max arguments is handled by QuantizeOperatorArguments
-        min_int8, max_int8 = self._get_min_max_arguments(
-            node,
-            torch.iinfo(torch.int8).min,
-            torch.iinfo(torch.int8).max,
-        )
-
-        attr = ts.TosaSerializerAttribute()
-        attr.ClampAttribute(
-            np.frombuffer(np.int8(min_int8).tobytes(), dtype=np.uint8).tolist(),
-            np.frombuffer(np.int8(max_int8).tobytes(), dtype=np.uint8).tolist(),
-            ts.NanPropagationMode.PROPAGATE,
-        )
-
-        self._serialize_operator(
-            node,
-            tosa_graph,
-            ts.Op.CLAMP,
-            [inputs[0].name],
-            [output.name],
-            attr,
-        )
-
-
-@register_node_visitor
-class ClampVisitor_FP(ClampVisitor_INT):
-    # inheriting 'target' from INT class
-
-    tosa_specs = [
-        TosaSpecification.create_from_string("TOSA-1.0+FP"),
-    ]
-
-    def __init__(self, *args):
-        super().__init__(*args)
+    def _to_bytes(self, value: int | float, dtype: torch.dtype) -> list[int]:
+        """Serialize a scalar to the raw bytes of its numpy representation.
+
+        Args:
+            value: Scalar to serialize; it is first converted to the numpy type matching dtype.
+            dtype: One of torch.float32, torch.float16 or torch.int8.
+
+        Returns:
+            The bytes of the converted scalar as a list of uint8 values (a list, not a bytes
+            object), in the order produced by numpy's tobytes().
+
+        Raises:
+            ValueError: If dtype is not one of the supported dtypes.
+        """
+        numpy_types = {
+            torch.float32: np.float32,
+            torch.float16: np.float16,
+            torch.int8: np.int8,
+        }
+        numpy_type = numpy_types.get(dtype)
+        if numpy_type is None:
+            raise ValueError(f"Unsupported dtype for to_bytes: {dtype}")
+        return np.frombuffer(numpy_type(value).tobytes(), dtype=np.uint8).tolist()
 
     def define_node(
         self,
@@ -120,42 +89,20 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [inputs[0], output],
-            [ts.DType.FP16, ts.DType.FP32],
+            [ts.DType.INT8, ts.DType.FP16, ts.DType.FP32],
             output.tosa_spec,
         )
 
+        node_input_dtype = node.meta["val"].dtype
+        # NOTE: Quantization of the min/max arguments is handled by QuantizeOperatorArguments
+        min_val, max_val = self._get_min_max_arguments(node, node_input_dtype)
+
         attr = ts.TosaSerializerAttribute()
-        match inputs[0].dtype:
-            case ts.DType.FP16:
-                min_f, max_f = self._get_min_max_arguments(
-                    node,
-                    torch.finfo(torch.float16).min,
-                    torch.finfo(torch.float16).max,
-                )
-                min_bytes = np.frombuffer(
-                    np.float16(min_f).tobytes(), dtype=np.uint8
-                ).tolist()
-                max_bytes = np.frombuffer(
-                    np.float16(max_f).tobytes(), dtype=np.uint8
-                ).tolist()
-            case ts.DType.FP32:
-                min_f, max_f = self._get_min_max_arguments(
-                    node,
-                    torch.finfo(torch.float32).min,
-                    torch.finfo(torch.float32).max,
-                )
-                min_bytes = np.frombuffer(
-                    np.float32(min_f).tobytes(), dtype=np.uint8
-                ).tolist()
-                max_bytes = np.frombuffer(
-                    np.float32(max_f).tobytes(), dtype=np.uint8
-                ).tolist()
-            case _:
-                raise RuntimeError(
-                    f"Internal error: Unsupported dtype {inputs[0].dtype} in {self.target}"
-                )
-
-        attr.ClampAttribute(min_bytes, max_bytes, ts.NanPropagationMode.PROPAGATE)
+        attr.ClampAttribute(
+            self._to_bytes(min_val, node_input_dtype),
+            self._to_bytes(max_val, node_input_dtype),
+            nan_mode=ts.NanPropagationMode.PROPAGATE,
+        )
 
         self._serialize_operator(
             node,