Commit 0d71247

NXP backend: Add support for aten.clone with contiguous memory format.
Such a node is sometimes added into a QDQ cluster during lowering to the edge dialect, when a tensor has a memory format that is not supported by the following node.
1 parent 3374ff8 commit 0d71247
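
For context, the pattern behind this description is a `permute` whose output is then reshaped: the permute only changes the tensor's strides, and lowering may insert a `clone` with contiguous memory format (or a `_clone_dim_order`) so the data is physically rearranged before the reshape. A minimal sketch of reproducing this, not part of the commit; whether the node shows up as `aten.clone` or `dim_order_ops._clone_dim_order` (or at all, without quantization) depends on the ExecuTorch version and export configuration:

import torch
from executorch.exir import to_edge

class PermuteReshape(torch.nn.Module):
    def forward(self, x):
        x = torch.permute(x, [0, 3, 1, 2])       # only the strides change, not the data layout
        return torch.reshape(x, [1, 32, 2, 8])   # needs contiguous data -> lowering may insert a clone

ep = torch.export.export(PermuteReshape().eval(), (torch.randn(1, 8, 8, 8),), strict=True)
edge = to_edge(ep)
# Inspect the edge graph for the inserted clone / _clone_dim_order node.
print(edge.exported_program().graph)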

File tree

4 files changed, +218 −50 lines changed

backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py

Lines changed: 25 additions & 3 deletions
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+
 from executorch.backends.nxp.backend.ir.converter.node_converter import (
     CustomDelegationOptions,
     NodeConverter,
@@ -13,10 +14,31 @@
 
 
 def _has_supported_memory_format(node: Node) -> bool:
-    if "memory_format" in node.kwargs.keys():
-        return node.kwargs["memory_format"] == torch.preserve_format
+    """The node can either represent an `aten.clone` or a `dim_order_ops._clone_dim_order` operator."""
+    memory_format = node.kwargs.get("memory_format", None)  # Attribute of `aten.clone`.
+    dim_order = node.kwargs.get(
+        "dim_order", None
+    )  # Attribute of `dim_order_ops._clone_dim_order`.
+
+    if (memory_format, dim_order) == (torch.preserve_format, None):
+        # The operator does nothing (e.g. originated as a `Dropout`).
+        return True
+
+    contiguous_dim_order = list(range(len(node.meta["val"].shape)))
+    if (memory_format, dim_order) in [
+        (torch.contiguous_format, None),
+        (None, contiguous_dim_order),
+    ]:
+        # Sometimes there is a `permute_copy` (Transpose) in Executorch, which doesn't actually permute the data in
+        # memory. Instead, it just changes the `strides` (memory format) to match the permutation. Then, some
+        # following operator may or may not support the particular strides (e.g. `mul` supports anything but
+        # `view_copy` does not), so the `clone` may be inserted to actually permute the data in memory to the
+        # `contiguous` format. This is purely an Executorch issue, and there is no equivalent system in NeutronIR.
+        # In NeutronIR, every tensor is stored in memory exactly as its shape suggests. Therefore, the `clone` can
+        # simply be omitted.
+        return True
 
-    return True
+    return False
 
 
 class CloneConverter(NodeConverter):
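
The reasoning in the new comment block rests on the distinction between a tensor's strides and its actual data layout. A small stand-alone PyTorch illustration of what the inserted `clone(memory_format=contiguous)` does (illustration only, not part of the diff):

import torch

x = torch.arange(24).reshape(2, 3, 4)    # contiguous, strides (12, 4, 1)
y = x.permute(2, 0, 1)                   # shape (4, 2, 3); only the strides change, data stays put
print(y.is_contiguous())                 # False
print(y.stride())                        # (1, 12, 4)

z = y.contiguous()                       # what the inserted clone does: physically reorder the data
print(z.stride())                        # (6, 3, 1) - the layout now matches the shape
print(torch.equal(y, z))                 # True - same values, different memory layout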

backends/nxp/edge_passes/move_auxiliary_operator_into_separate_qdq_cluster_pass.py

Lines changed: 49 additions & 45 deletions
@@ -20,6 +20,8 @@
 Relu = exir_ops.edge.aten.relu.default
 Sigmoid = exir_ops.edge.aten.sigmoid.default
 Tanh = exir_ops.edge.aten.tanh.default
+Clone = exir_ops.edge.aten.clone.default
+CloneDimOrder = exir_ops.edge.dim_order_ops._clone_dim_order.default
 
 
 def insert_qdq_pair_after_node(
@@ -69,29 +71,29 @@ def _is_quantize(node_: Node) -> bool:
 
 class MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass(NeutronEdgePass):
     """
-    (same diagram as below, but without the `...` / `.` continuation marks around `<main_cluster_node>`)
+
+                                               ┌─────▼──────┐
+                │                              │ dequantize │
+          ┌─────▼──────┐                       └─────┬──────┘
+          │ dequantize │                       ┌─────▼──────┐
+          └─────┬──────┘                       │ <aux_node> │
+          ┌─────▼──────┐                       └─────┬──────┘
+          │ <aux_node> │                        ┌────▼─────┐     ┐
+          └─────┬──────┘                        │ quantize │     │
+     ┌──────────▼──────────┐     replaced with  └────┬─────┘     │
+  ...┤ <main_cluster_node> ├... ──────────────►      │           │ newly added nodes
+     └──────────┬──────────┘                   ┌─────▼──────┐    │
+                ▼                              │ dequantize │    │
+                .                              └─────┬──────┘    ┘
+           ┌────▼─────┐                   ┌──────────▼──────────┐
+           │ quantize │                ...┤ <main_cluster_node> ├...
+           └────┬─────┘                   └──────────┬──────────┘
+                ▼                                    ▼
+                                                     .
+                                                ┌────▼─────┐
+                                                │ quantize │
+                                                └────┬─────┘
+
     """
 
     # Dictionary mapping main cluster nodes to auxiliary nodes, for which this optimization will be applied.
@@ -102,6 +104,7 @@ class MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass(NeutronEdgePass):
         MM: [
             ViewCopy,
         ],
+        ViewCopy: [Clone, CloneDimOrder],
     }
 
     def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
@@ -152,28 +155,28 @@ def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
 
 class MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass(NeutronEdgePass):
     """
-    (same diagram as below, but without the `...` / `.` continuation marks around `<main_cluster_node>`)
+
+                                               ┌─────▼──────┐
+                │                              │ dequantize │
+          ┌─────▼──────┐                       └─────┬──────┘
+          │ dequantize │                             .
+          └─────┬──────┘                  ┌──────────▼──────────┐
+                                       ...┤ <main_cluster_node> ├...
+                .                         └──────────┬──────────┘
+     ┌──────────▼──────────┐     replaced with  ┌────▼─────┐     ┐
+  ...┤ <main_cluster_node> ├... ──────────────► │ quantize │     │
+     └──────────┬──────────┘                    └────┬─────┘     │
+          ┌─────▼──────┐                             │           │ newly added nodes
+          │ <aux_node> │                       ┌─────▼──────┐    │
+          └─────┬──────┘                       │ dequantize │    │
+           ┌────▼─────┐                        └─────┬──────┘    ┘
+           │ quantize │                        ┌─────▼──────┐
+           └────┬─────┘                        │ <aux_node> │
+                ▼                              └─────┬──────┘
+                                                ┌────▼─────┐
+                                                │ quantize │
+                                                └────┬─────┘
+
     """
 
    # Dictionary mapping main cluster nodes to auxiliary nodes, for which this optimization will be applied.
@@ -198,6 +201,7 @@ class MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass(NeutronEdgePass):
            Sigmoid,
            Tanh,
        ],
+        ViewCopy: [Clone, CloneDimOrder],
     }
 
     def run(self, graph_module: torch.fx.GraphModule) -> PassResult:
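
The new `ViewCopy: [Clone, CloneDimOrder]` entries mean that a `clone` (or `_clone_dim_order`) adjacent to a `view_copy` inside a QDQ cluster is split into its own cluster by inserting a quantize/dequantize pair, as the diagrams above show. A minimal sketch of running just the leading-operator pass, using the same entry points the new test exercises (it assumes `edge_program_manager` already holds a quantized edge program containing a dequantize → clone → view_copy → quantize cluster):

from executorch.backends.nxp.edge_passes.move_auxiliary_operator_into_separate_qdq_cluster_pass import (
    MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass,
)
from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
    NeutronEdgePassManager,
)

# `edge_program_manager` is assumed to come from the usual quantize + export_to_edge flow
# (see test_clone__to_contiguous_format below).
edge_program_manager = NeutronEdgePassManager(
    [MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass()]
)(edge_program_manager)
# Afterwards the clone lives in its own cluster:
#   dequantize -> clone -> quantize -> dequantize -> view_copy -> quantize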

backends/nxp/neutron_partitioner.py

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@ class QDQCluster:
         exir_ops.edge.aten.relu.default,
         exir_ops.edge.aten.sigmoid.default,
         exir_ops.edge.aten.tanh.default,
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.dim_order_ops._clone_dim_order.default,
     ]
 
     def __init__(self):

backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py

Lines changed: 142 additions & 2 deletions
@@ -2,8 +2,6 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-
-
 import itertools
 import unittest
 
@@ -14,18 +12,40 @@
 from executorch.backends.nxp.backend.edge_program_converter import (
     EdgeProgramToIRConverter,
 )
+from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import (
+    PermuteCopyConverter,
+)
+from executorch.backends.nxp.backend.node_format_inference import NodeFormatInference
+from executorch.backends.nxp.edge_passes.move_auxiliary_operator_into_separate_qdq_cluster_pass import (
+    MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass,
+)
+from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import (
+    NeutronEdgePassManager,
+)
+from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import (
+    RemoveIOQuantOpsPass,
+)
+from executorch.backends.nxp.neutron_partitioner import QDQClusterRecognizer
+from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer
+from executorch.backends.nxp.tests import executors
 from executorch.backends.nxp.tests.executorch_pipeline import (
+    _quantize_model,
+    get_random_calibration_inputs,
     to_edge_program,
+    to_model_input_spec,
     to_quantized_edge_program,
 )
 from executorch.backends.nxp.tests.executors import (
     convert_run_compare,
     graph_contains_any,
     graph_contains_any_of_ops,
+    OverrideTargetSupportCheck,
     ToChannelFirstPreprocess,
     ToChannelLastPreprocess,
 )
+from executorch.exir import EdgeCompileConfig
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.extension.export_util.utils import export_to_edge
 from parameterized import parameterized
 from torch import nn
 from torch.export import ExportedProgram
@@ -76,6 +96,42 @@ def forward(self, x):
         return self.block(x)
 
 
+class TransposeReshapeModel(nn.Module):
+
+    def __init__(self, new_shape: list[int]):
+        super().__init__()
+        self.new_shape = new_shape
+
+    def forward(self, x):
+        # `x` should be 4D.
+
+        x = torch.add(x, x)
+        x = torch.permute(x, [0, 3, 1, 2])
+        # A `clone(memory_format=contiguous)` will be added here during the lowering to edge dialect.
+        x = torch.reshape(x, self.new_shape)
+
+        return x
+
+
+class PermuteCopyReshapeModel(nn.Module):
+
+    def __init__(self, new_shape: list[int], permutation: list[int]):
+        super().__init__()
+        self.new_shape = new_shape
+        self.permutation = permutation
+
+    def forward(self, x):
+        # `x` should be 4D.
+
+        x = torch.add(x, x)
+        x = torch.permute(x, self.permutation)
+        # A `clone(memory_format=contiguous)` will be added here during the lowering to edge dialect.
+        x = torch.reshape(x, self.new_shape)
+        x = torch.add(x, x)
+
+        return x
+
+
 class TestCloneConverter(unittest.TestCase):
     __test__ = False  # Prevent interfering with PyTest tests
 
@@ -185,3 +241,87 @@ def test_clone_pool_view_copy_quant(self, input_shape: tuple[int] = (1, 64, 25,
             input_data=input_data,
             atol=1.0,
         )
+
+    def test_clone__to_contiguous_format(self):
+        input_shape = (1, 8, 8, 8)
+        new_shape = [1, 32, 2, 8]
+
+        model = TransposeReshapeModel(new_shape).eval()
+
+        calibration_inputs = get_random_calibration_inputs(
+            to_model_input_spec(input_shape)
+        )
+
+        example_input = calibration_inputs[0]
+
+        exir_program_aten = torch.export.export(model, example_input, strict=True)
+
+        exir_program_aten__module_quant = _quantize_model(
+            exir_program_aten.module(), NeutronQuantizer(), calibration_inputs
+        )
+
+        edge_compile_config = EdgeCompileConfig(_check_ir_validity=False)
+        edge_program_manager = export_to_edge(
+            exir_program_aten__module_quant,
+            example_input,
+            edge_compile_config=edge_compile_config,
+        )
+
+        # Make sure the `aten.clone` was inserted as expected.
+        nodes = list(edge_program_manager.exported_program().graph.nodes)
+        assert nodes[9].target == exir_ops.edge.dim_order_ops._clone_dim_order.default
+        assert nodes[9].kwargs["dim_order"] == [0, 1, 2, 3]
+
+        # Move the `clone` out of the cluster with the `view_copy`.
+        edge_program_manager = NeutronEdgePassManager(
+            [MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass()]
+        )(edge_program_manager)
+
+        # Tag QDQ clusters, so the conversion works correctly.
+        QDQClusterRecognizer().tag_qdq_clusters(
+            list(edge_program_manager.exported_program().graph.nodes)
+        )
+        edge_program_manager.exported_program().graph_module.recompile()
+        edge_program_manager = edge_program_manager.transform(
+            [RemoveIOQuantOpsPass(edge_program_manager=edge_program_manager)]
+        )
+
+        # Identify the node formats.
+        NodeFormatInference(
+            edge_program_manager.exported_program()
+        ).identify_node_formats()
+
+        # Convert to the IR.
+        converted_model, _ = EdgeProgramToIRConverter().convert_program(
+            edge_program_manager.exported_program()
+        )
+
+        # Make sure the IR version produces the same outputs.
+        executors.convert_run_compare(
+            edge_program_manager.exported_program(),
+            np.random.random_integers(0, 255, input_shape).astype("int8"),
+            tfl_model=converted_model,
+        )
+
+    def test_clone__to_contiguous_format__non_delegated_permute_copy(self):
+        input_shape = (2, 4, 6, 8)
+        new_shape = [3, 4, 16, 2]
+        permutation = [3, 2, 1, 0]  # Unsupported by default.
+
+        model = PermuteCopyReshapeModel(new_shape, permutation).eval()
+
+        # Prohibit `permute_copy` delegation in case support for the permutation is added in the future.
+        def _unsupported_target(*_):
+            return False
+
+        with OverrideTargetSupportCheck(
+            PermuteCopyConverter, new_target_support_check=_unsupported_target
+        ):
+            ep = to_quantized_edge_program(model, input_shape).exported_program()
+
+        nodes = list(ep.graph.nodes)
+        assert not graph_contains_any_of_ops(
+            ep.graph, [exir_ops.edge.aten.clone.default]
+        )
+        assert nodes[3].name == "executorch_call_delegate"
+        assert nodes[6].target == exir_ops.edge.aten.permute_copy.default
+        assert nodes[9].name == "executorch_call_delegate_1"
