Update

swolchok · swolchok · commit f9b96eb086f5 · 2025-07-21T10:41:44.000-07:00
[ghstack-poisoned]
diff --git a/README.md b/README.md
@@ -29,6 +29,7 @@ Platform Support:
   - Arm
   - Cadence
   - MediaTek
+  - NXP
   - OpenVINO
   - Qualcomm
   - Vulkan
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -67,8 +67,8 @@
 )
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
+from .match_arg_dtype_pass import MatchArgDtypePass  # noqa
 from .match_arg_ranks_pass import MatchArgRanksPass  # noqa
-from .match_where_self_arg_dtype_pass import MatchWhereSelfDtypePass  # noqa
 from .mm_to_bmm_pass import ConvertMmToBmmPass  # noqa
 from .remove_clone_pass import RemoveClonePass  # noqa
 from .replace_scalar_with_tensor_pass import (  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -66,8 +66,8 @@
     InsertCastForOpsWithInt64InputPass,
     InsertRescalePass,
     InsertTableOpsPass,
+    MatchArgDtypePass,
     MatchArgRanksPass,
-    MatchWhereSelfDtypePass,
     QuantizeOperatorArguments,
     RemoveClonePass,
     ReplaceInfValues,
@@ -116,7 +116,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ConvertAnyDefaultDimDimsPass())
-        self.add_pass(MatchWhereSelfDtypePass())
+        self.add_pass(MatchArgDtypePass())
         if self.tosa_spec.is_U55_subset:
             self.add_pass(CastToInt32Pass())
 
@@ -193,8 +193,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ConvertAnyDefaultDimDimsPass())
-        self.add_pass(MatchWhereSelfDtypePass())
-
+        self.add_pass(MatchArgDtypePass())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass(exported_program))  # type: ignore[call-arg]
diff --git a/backends/arm/_passes/decompose_grouped_conv.py b/backends/arm/_passes/decompose_grouped_conv.py
@@ -6,6 +6,7 @@
 from copy import copy
 
 import torch
+from executorch.backends.arm.tosa_quant_utils import QuantArgs
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
@@ -48,7 +49,40 @@ def _get_decomposition(op):
                     torch.ops.aten.cat.default,
                 )
             case _:
-                raise RuntimeError("Unvalid op for grouped conv decomposition.")
+                raise RuntimeError("Invalid op for grouped conv decomposition")
+
+    @staticmethod
+    def _split_per_channel_qparams(qarg, index, output_slice_size):
+        if qarg is not None and qarg.per_channel:
+            start_index = index * output_slice_size
+            stop_index = (index + 1) * output_slice_size
+            return QuantArgs(
+                scale=qarg.scale[start_index:stop_index],
+                zp=qarg.zp[start_index:stop_index],
+                qmin=qarg.qmin,
+                qmax=qarg.qmax,
+                dtype=qarg.dtype,
+                axis=qarg.axis,
+                per_channel=qarg.per_channel,
+            )
+        return qarg
+
+    @staticmethod
+    def _get_meta_copy(meta, i, output_slice_size):
+        meta_copy = meta.copy()
+        if "input_qparams" in meta.data and len(meta.data["input_qparams"]) > 0:
+            # Handle per-channel quantization by splitting quantization params
+            # similarly to how activations/weights/biases are split.
+            new_qparams = meta.data.get("input_qparams").copy()
+            # Get quantization params of the weights and slice them.
+            qarg = new_qparams[1]
+            new_qparams[1] = DecomposeGroupedConv._split_per_channel_qparams(
+                qarg, index=i, output_slice_size=output_slice_size
+            )
+
+            meta_copy.data["input_qparams"] = new_qparams
+
+        return meta_copy
 
     def call_operator(self, op, args, kwargs, meta):
         if op == exir_ops.edge.aten.convolution.default:
@@ -105,7 +139,6 @@ def call_operator(self, op, args, kwargs, meta):
             if bias_node is None:
                 bias_slices.append(None)
             else:
-
                 start_index = i * output_slice_size
                 stop_index = (i + 1) * output_slice_size
                 slice_args = (bias_node, 0, start_index, stop_index)
@@ -115,20 +148,23 @@ def call_operator(self, op, args, kwargs, meta):
                 )
 
         output_slices = []
-        for input_slice, filter_slice, bias_slice in zip(
-            input_slices, filter_slices, bias_slices
+        for i, (input_slice, filter_slice, bias_slice) in enumerate(
+            zip(input_slices, filter_slices, bias_slices)
         ):
 
+            meta_copy = DecomposeGroupedConv._get_meta_copy(meta, i, output_slice_size)
+
             if op == exir_ops.edge.aten.convolution.default:
                 conv_args = (input_slice, filter_slice, bias_slice, *args[3:8], 1)
             elif op == torch.ops.aten.conv2d.default:
                 conv_args = (input_slice, filter_slice, bias_slice, *args[3:6], 1)
             else:
-                raise RuntimeError("Unvalid op for grouped conv decomposition.")
+                raise RuntimeError("Invalid op for grouped conv decomposition")
 
             output_slices.append(
-                super().call_operator(conv_op, conv_args, kwargs, meta)
+                super().call_operator(conv_op, conv_args, kwargs, meta_copy)
             )
 
         cat_args = (output_slices, 1)
-        return super().call_operator(cat_op, cat_args, kwargs, no_q_dq_meta)
+        # Propagate the original metadata (including quantization params) to the concatenated output.
+        return super().call_operator(cat_op, cat_args, kwargs, meta)
diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from copy import copy
 from math import prod
 
 import torch
@@ -75,35 +76,47 @@ def call_operator(self, op, args, kwargs, meta):
             return super().call_operator(op, args, kwargs, meta)
 
         x = get_node_arg(args, 0)
-        input_shape = x.data.size()
-        output_shape = meta["val"].size()
+        input_shape = list(x.data.shape)
+        output_shape = list(meta["val"].shape)
         dims_to_reduce = get_node_arg(args, 1)
         dims_to_reduce = [dim % len(input_shape) for dim in dims_to_reduce]
+        dims_to_reduce = [dim for dim in dims_to_reduce if input_shape[dim] != 1]
 
         dtype = meta["val"].dtype
         view_op = get_view(op)
 
-        if len(input_shape) > 4:
-            raise NotImplementedError(
-                f"{op} with rank > 4 is currently not supported for the TOSA backend."
-            )
+        # Reshape to 4D
+        if len(input_shape) != 4:
+            new_shape = copy(input_shape)
+
+            while len(new_shape) < 4:
+                new_shape.insert(0, 1)
+                dims_to_reduce = [dim + 1 for dim in dims_to_reduce]
 
-        # Unsqueeze to 4D
-        if len(input_shape) < 4:
-            pad_n = 4 - len(input_shape)
-            new_shape = [1] * pad_n + list(input_shape)
-            dims_to_reduce = [dim + pad_n for dim in dims_to_reduce]
+            while len(new_shape) > 4:
+                i = new_shape.pop(0)
+                new_shape[0] = new_shape[0] * i
+                dims_to_reduce = [dim - 1 for dim in dims_to_reduce]
 
             x = super().call_operator(view_op, (x, new_shape), {}, meta, True)
 
         # Reduce (h,w) dims by avg pool if possible
         x, dims_to_reduce = self._reduce_by_average_pool(op, x, dims_to_reduce, meta)
 
+        # Re-attach the leading input dims if the input had rank > 4
+        if len(input_shape) > 4:
+            original_dims = input_shape[0:-4]
+            temp_shape = list(x.data.shape)[1:]
+            temp_shape = original_dims + temp_shape
+            dims_to_reduce = [dim + len(original_dims) - 1 for dim in dims_to_reduce]
+
+            x = super().call_operator(view_op, (x, temp_shape), {}, meta, True)
+
         # Reduce remaining dims by sum
         x = self._reduce_by_sum(op, x, dims_to_reduce, meta, dtype)
 
         # Reshape to correct output shape if necessary
-        if x.data.size() != output_shape:
+        if list(x.data.shape) != output_shape:
             x = super().call_operator(view_op, (x, output_shape), {}, meta, True)
 
         return x
diff --git a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py
@@ -75,7 +75,7 @@ class FoldAndAnnotateQParamsPass(ArmPass):
      node.
      The quantization parameters from the DQ/Q nodes are stored as meta values to be
      accessible for later lowering and serialization passes.
-     The assumption is that the quantization annotatation adds DQ nodes for all tensor
+     The assumption is that the quantization annotation adds DQ nodes for all tensor
      inputs to the target one Q node to the output.
 
      Example ('executorch_exir_dialects_edge__ops_' prefix removed from operators for readability):
@@ -95,7 +95,7 @@ class FoldAndAnnotateQParamsPass(ArmPass):
 
         output_dq: "f32[5]" = quantized_decomposed_dequantize_per_tensor_default(aten_add_tensor_q, 0.05487706884741783, -128, -128, 127, torch.int8)
 
-    The quantization parameters for x_dq and aten_add_tensor_q are store in meta for the aten_add_tensor node.
+    The quantization parameters for x_dq and aten_add_tensor_q are stored in meta for the aten_add_tensor node.
 
     """
 
@@ -132,7 +132,7 @@ def fold_and_annotate_arg(
                 nodes_to_remove.add(arg)
             if input_qparams is not None and input_qparams != arg_quant_params:
                 # Two args are quantized differently
-                raise RuntimeError("Input qparams does not match!")
+                raise RuntimeError("Input qparams do not match")
             input_qparams = arg_quant_params
         if input_qparams is not None:
             node.meta["input_qparams"][i] = input_qparams
diff --git a/backends/arm/_passes/match_arg_dtype_pass.py b/backends/arm/_passes/match_arg_dtype_pass.py
@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
-from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.backends.arm._passes.arm_pass_utils import create_node, get_node_arg
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 
@@ -26,7 +26,7 @@ def get_largest_dtype(dtype_1, dtype_2):
     return dtype_1 if DTYPE_RANK[dtype_1] > DTYPE_RANK[dtype_2] else dtype_2
 
 
-class MatchWhereSelfDtypePass(ExportPass):
+class MatchArgDtypePass(ExportPass):
     """Pass to match data types of non-condition input tensors.
 
     Edge dialect allows different data types for non-condition tensors, while TOSA
@@ -38,14 +38,18 @@ class MatchWhereSelfDtypePass(ExportPass):
 
     """
 
+    targeted_ops = {exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.where.self}
+
     def call(self, graph_module: torch.fx.GraphModule):
         modified_graph = False
         graph = graph_module.graph
-        node_list = graph.find_nodes(
-            op="call_function", target=exir_ops.edge.aten.where.self
-        )
-        for node in node_list:
-            cond, input_, other_ = node.args
+
+        for node in list(graph.nodes):
+            if node.op != "call_function" or node.target not in self.targeted_ops:
+                continue
+
+            input_ = get_node_arg(node.args, 0)
+            other_ = get_node_arg(node.args, 1)
 
             input_dtype = input_.meta["val"].dtype
             other_dtype = other_.meta["val"].dtype
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
@@ -385,8 +385,6 @@ def forward(self, x):
     f"{k},per_channel_quant={q}": (lambda v=v, q=q: (v(), q))
     for (k, v) in test_data_MI.items()
     for q in [True, False]
-    # TODO: Invalid TOSA graph (MLETORCH-1144)
-    if (k not in ["groups", "groups_bias"]) and (q is True)
 }
 
 fvp_xfails = {
diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py
@@ -195,6 +195,21 @@ class MeanDim(torch.nn.Module):
             (-4, -3, -2, -1),
             False,
         ),
+        "rank5_01234": lambda: (
+            torch.rand(1, 1, 7, 3, 2),
+            (-5, -4, -3, -2, -1),
+            False,
+        ),
+        "rank5_234": lambda: (
+            torch.rand(1, 1, 7, 3, 2),
+            (-3, -2, -1),
+            False,
+        ),
+        "rank5_12": lambda: (
+            torch.rand(1, 1, 7, 3, 2),
+            (1, 2),
+            False,
+        ),
         "u55_avg_pool_not_supported": lambda: (
             torch.rand(1, 1, 1, 257),
             (0, 1, 2, 3),
@@ -236,7 +251,14 @@ def test_mean_dim_tosa_BI(test_data):
     pipeline.run()
 
 
-@common.parametrize("test_data", MeanDim.test_data_suite)
+xfails = {
+    "rank5_01234": "Rank 5 graph input currently not supported in EthosUBackend (passes since CHW are all averaged over so data order does not matter in this case)",
+    "rank5_234": "Rank 5 graph input currently not supported in EthosUBackend (passes since CHW are all averaged over so data order does not matter in this case)",
+    "rank5_12": "Rank 5 graph input currently not supported in EthosUBackend",
+}
+
+
+@common.parametrize("test_data", MeanDim.test_data_suite, xfails=xfails, strict=False)
 @common.XfailIfNoCorstone300
 def test_mean_dim_u55_BI(test_data):
     test_data, dim, keep_dim = test_data()
@@ -256,7 +278,7 @@ def test_mean_dim_u55_BI(test_data):
     pipeline.run()
 
 
-@common.parametrize("test_data", MeanDim.test_data_suite)
+@common.parametrize("test_data", MeanDim.test_data_suite, xfails=xfails, strict=False)
 @common.XfailIfNoCorstone320
 def test_mean_dim_u85_BI(test_data):
     test_data, dim, keep_dim = test_data()
diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py
@@ -242,21 +242,16 @@ def test_add_scalar_u85_BI():
 
 
 # SUB MI ------------------------------------------------------
-mi_sub_xfails = {
-    "int_r1_ts": "TypeError: All IO needs to have the same data type, got input 1: 8, input 2: 6 and output: 8",
-    "int_r4_ts": "TypeError: All IO needs to have the same data type, got input 1: 8, input 2: 6 and output: 8",
-    **xfails,
-}
 
 
-@common.parametrize("test_data", tensor_scalar_tests, xfails=mi_sub_xfails)
+@common.parametrize("test_data", tensor_scalar_tests, xfails=xfails)
 def test_sub_tensor_tosa_MI_scalar(test_data):
     """Tests regular sub with one scalar input."""
     pipeline = TosaPipelineMI[input_t1](Sub(), test_data, aten_op=Sub.aten_op)
     pipeline.run()
 
 
-@common.parametrize("test_data", tensor_scalar_tests, xfails=mi_sub_xfails)
+@common.parametrize("test_data", tensor_scalar_tests, xfails=xfails)
 def test_sub_tensor_tosa_MI_inplace(test_data):
     """Tests inplace sub with one scalar input."""
     pipeline = TosaPipelineMI[input_t1](SubInplace(), test_data, aten_op=[])
diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py
diff --git a/backends/cortex_m/ops/targets.bzl b/backends/cortex_m/ops/targets.bzl
diff --git a/docs/source/backends-nxp.md b/docs/source/backends-nxp.md
diff --git a/docs/source/index.md b/docs/source/index.md

Original file line number	Diff line number	Diff line change
`@@ -385,8 +385,6 @@ def forward(self, x):`
`385`	`385`	`f"{k},per_channel_quant={q}": (lambda v=v, q=q: (v(), q))`
`386`	`386`	`for (k, v) in test_data_MI.items()`
`387`	`387`	`for q in [True, False]`
`388`		`- # TODO: Invalid TOSA graph (MLETORCH-1144)`
`389`		`- if (k not in ["groups", "groups_bias"]) and (q is True)`
`390`	`388`	`}`
`391`	`389`
`392`	`390`	`fvp_xfails = {`