From fe5e8be3b004f7b92673ccdb1ad35f617d27157a Mon Sep 17 00:00:00 2001
From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com>
Date: Thu, 23 Jan 2025 14:49:44 +0100
Subject: [PATCH] Arm backend: enable dim_order (#7831)

Add support for to_dim_order_copy

With EdgeCompileConfig(_skip_dim_order=True) removed, _to_copy nodes are
converted into _to_dim_order_copy nodes. This commit extends our _to_copy
support check and adds a matching node visitor for _to_dim_order_copy.

Signed-off-by: Oscar Andersson <oscar.andersson@arm.com>
(cherry picked from commit 135e875f2f0c9ee9704899c1208512d5fcf697bd)
---
 .../arm/operator_support/to_copy_support.py   | 19 +++++++--
 backends/arm/operators/__init__.py            |  1 +
 .../arm/operators/op_to_dim_order_copy.py     | 40 +++++++++++++++++++
 .../arm/test/models/test_mobilenet_v2_arm.py  | 13 ++----
 backends/arm/test/ops/test_add.py             |  9 +----
 backends/arm/test/ops/test_linear.py          | 11 ++---
 backends/arm/test/ops/test_maximum.py         |  9 +----
 backends/arm/test/ops/test_minimum.py         |  9 +----
 backends/arm/test/ops/test_sum.py             |  9 +----
 backends/arm/test/tester/arm_tester.py        |  3 --
 examples/arm/aot_arm_compiler.py              |  2 -
 11 files changed, 72 insertions(+), 53 deletions(-)
 create mode 100644 backends/arm/operators/op_to_dim_order_copy.py

diff --git a/backends/arm/operator_support/to_copy_support.py b/backends/arm/operator_support/to_copy_support.py
index dcf2ce316be..1ed15b26abe 100644
--- a/backends/arm/operator_support/to_copy_support.py
+++ b/backends/arm/operator_support/to_copy_support.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -22,7 +22,10 @@
 
 @register_tosa_support_check
 class ToCopySupported(SupportedTOSAOperatorCheck):
-    targets = [exir_ops.edge.aten._to_copy.default]
+    targets = [
+        exir_ops.edge.aten._to_copy.default,
+        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
+    ]
 
     tosa_specs = [
         TosaSpecification.create_from_string("TOSA-0.80.0+BI"),
@@ -110,7 +113,7 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
             )
             return False
 
-        # Check memory format
+        # Check memory format (to_copy)
         if "memory_format" in node.kwargs:
             if node.kwargs["memory_format"] in (torch.preserve_format,):
                 logger.info(
@@ -119,4 +122,14 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
                 )
                 return False
 
+        # Check dim_order (to_dim_order_copy)
+        if "dim_order" in node.kwargs:
+            dim_order = node.kwargs["dim_order"]
+            if dim_order != list(range(len(dim_order))):
+                logger.info(
+                    f"Argument {dim_order=} is not supported for "
+                    f"{node.target.name()} right now."  # pyre-ignore[16]
+                )
+                return False
+
         return True
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
index 6db9c968f09..16c9f654c39 100644
--- a/backends/arm/operators/__init__.py
+++ b/backends/arm/operators/__init__.py
@@ -39,6 +39,7 @@
     op_sum,
     op_tanh,
     op_to_copy,
+    op_to_dim_order_copy,
     op_transpose,
     op_unsqueeze,
     op_upsample_nearest2d,
diff --git a/backends/arm/operators/op_to_dim_order_copy.py b/backends/arm/operators/op_to_dim_order_copy.py
new file mode 100644
index 00000000000..c2ec620b821
--- /dev/null
+++ b/backends/arm/operators/op_to_dim_order_copy.py
@@ -0,0 +1,40 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+from typing import List
+
+import serializer.tosa_serializer as ts
+import torch
+import tosa.Op as TosaOp
+
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+
+
+@register_node_visitor
+class ToDimOrderCopyVisitor(NodeVisitor):
+    """
+    Implement the type cast functionality of _to_dim_order_copy.
+
+    Other features like setting of the dim_order or moving a tensor to a
+    different device are not supported.
+
+    Also note that the node should not be quantized.
+    """
+
+    target = "dim_order_ops._to_dim_order_copy.default"
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+    ) -> None:
+        tosa_graph.addOperator(TosaOp.Op().CAST, [inputs[0].name], [output.name])
diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py
index 24af9cf41a2..1c16c84aefe 100644
--- a/backends/arm/test/models/test_mobilenet_v2_arm.py
+++ b/backends/arm/test/models/test_mobilenet_v2_arm.py
@@ -12,7 +12,6 @@
 from executorch.backends.arm.test import common, conftest
 
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir import EdgeCompileConfig
 from torchvision import models, transforms
 from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
 
@@ -45,10 +44,6 @@ class TestMobileNetV2(unittest.TestCase):
         "executorch_exir_dialects_edge__ops_aten__native_batch_norm_legit_no_training_default",
     }
 
-    _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig(
-        _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-    )
-
     def test_mv2_tosa_MI(self):
         (
             ArmTester(
@@ -59,7 +54,7 @@ def test_mv2_tosa_MI(self):
                 ),
             )
             .export()
-            .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config)
+            .to_edge_transform_and_lower()
             .to_executorch()
             .run_method_and_compare_outputs(inputs=self.model_inputs)
         )
@@ -75,7 +70,7 @@ def test_mv2_tosa_BI(self):
             )
             .quantize()
             .export()
-            .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config)
+            .to_edge_transform_and_lower()
             .to_executorch()
             # atol=1.0 is a defensive upper limit
             # TODO MLETROCH-72
@@ -92,7 +87,7 @@ def test_mv2_u55_BI(self):
             )
             .quantize()
             .export()
-            .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config)
+            .to_edge_transform_and_lower()
             .to_executorch()
             .serialize()
         )
@@ -110,7 +105,7 @@ def test_mv2_u85_BI(self):
             )
             .quantize()
             .export()
-            .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config)
+            .to_edge_transform_and_lower()
             .to_executorch()
             .serialize()
         )
diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py
index f40037f62fa..0c99a3224f4 100644
--- a/backends/arm/test/ops/test_add.py
+++ b/backends/arm/test/ops/test_add.py
@@ -12,7 +12,6 @@
 import torch
 from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir import EdgeCompileConfig
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
 
@@ -50,10 +49,6 @@ def __init__(self):
         def forward(self, x, y):
             return x + y
 
-    _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig(
-        _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-    )
-
     def _test_add_tosa_MI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
     ):
@@ -66,7 +61,7 @@ def _test_add_tosa_MI_pipeline(
             .export()
             .check_count({"torch.ops.aten.add.Tensor": 1})
             .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
@@ -86,7 +81,7 @@ def _test_add_tosa_BI_pipeline(
             .export()
             .check_count({"torch.ops.aten.add.Tensor": 1})
             .check(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py
index 8aabd365af4..c24d3625f84 100644
--- a/backends/arm/test/ops/test_linear.py
+++ b/backends/arm/test/ops/test_linear.py
@@ -14,7 +14,6 @@
 from executorch.backends.arm.test import common, conftest
 
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir import EdgeCompileConfig
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
 
@@ -106,10 +105,6 @@
 class TestLinear(unittest.TestCase):
     """tests the linear operation y = Ax + b"""
 
-    _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig(
-        _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-    )
-
     class Linear(torch.nn.Module):
         def __init__(
             self,
@@ -141,7 +136,7 @@ def _test_linear_tosa_MI_pipeline(
             .export()
             .check_count({"torch.ops.aten.linear.default": 1})
             .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config)
+            .to_edge_transform_and_lower()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
             .run_method_and_compare_outputs(inputs=test_data)
@@ -162,7 +157,7 @@ def _test_linear_tosa_BI_pipeline(
             .export()
             .check_count({"torch.ops.aten.linear.default": 1})
             .check(["torch.ops.quantized_decomposed"])
-            .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config)
+            .to_edge_transform_and_lower()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
             .run_method_and_compare_outputs(inputs=test_data, qtol=1)
@@ -184,7 +179,7 @@ def _test_linear_tosa_ethosu_BI_pipeline(
             .export()
             .check_count({"torch.ops.aten.linear.default": 1})
             .check(["torch.ops.quantized_decomposed"])
-            .to_edge_transform_and_lower(edge_compile_config=self._edge_compile_config)
+            .to_edge_transform_and_lower()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
             .serialize()
diff --git a/backends/arm/test/ops/test_maximum.py b/backends/arm/test/ops/test_maximum.py
index 61e1cccd0be..383558faee5 100644
--- a/backends/arm/test/ops/test_maximum.py
+++ b/backends/arm/test/ops/test_maximum.py
@@ -12,7 +12,6 @@
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir import EdgeCompileConfig
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
 
@@ -38,10 +37,6 @@ def __init__(self):
         def forward(self, x, y):
             return torch.maximum(x, y)
 
-    _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig(
-        _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-    )
-
     def _test_maximum_tosa_MI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
     ):
@@ -54,7 +49,7 @@ def _test_maximum_tosa_MI_pipeline(
             .export()
             .check_count({"torch.ops.aten.maximum.default": 1})
             .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
@@ -74,7 +69,7 @@ def _test_maximum_tosa_BI_pipeline(
             .export()
             .check_count({"torch.ops.aten.maximum.default": 1})
             .check(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
diff --git a/backends/arm/test/ops/test_minimum.py b/backends/arm/test/ops/test_minimum.py
index b63bf80f69c..c03a2f6e0ac 100644
--- a/backends/arm/test/ops/test_minimum.py
+++ b/backends/arm/test/ops/test_minimum.py
@@ -12,7 +12,6 @@
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir import EdgeCompileConfig
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
 
@@ -38,10 +37,6 @@ def __init__(self):
         def forward(self, x, y):
             return torch.minimum(x, y)
 
-    _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig(
-        _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-    )
-
     def _test_minimum_tosa_MI_pipeline(
         self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
     ):
@@ -54,7 +49,7 @@ def _test_minimum_tosa_MI_pipeline(
             .export()
             .check_count({"torch.ops.aten.minimum.default": 1})
             .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
@@ -74,7 +69,7 @@ def _test_minimum_tosa_BI_pipeline(
             .export()
             .check_count({"torch.ops.aten.minimum.default": 1})
             .check(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py
index 111517afbba..89e0592e0a1 100644
--- a/backends/arm/test/ops/test_sum.py
+++ b/backends/arm/test/ops/test_sum.py
@@ -11,7 +11,6 @@
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
-from executorch.exir import EdgeCompileConfig
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
 
@@ -50,10 +49,6 @@ class Sum(torch.nn.Module):
         def forward(self, x: torch.Tensor, dim: int, keepdim: bool):
             return x.sum(dim=dim, keepdim=keepdim)
 
-    _edge_compile_config: EdgeCompileConfig = EdgeCompileConfig(
-        _skip_dim_order=True,  # TODO(T182928844): Delegate dim order op to backend.
-    )
-
     def _test_sum_tosa_MI_pipeline(
         self, module: torch.nn.Module, test_data: tuple[exampledata_t]
     ):
@@ -66,7 +61,7 @@ def _test_sum_tosa_MI_pipeline(
             .export()
             .check_count({"torch.ops.aten.sum.dim_IntList": 1})
             .check_not(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
@@ -86,7 +81,7 @@ def _test_sum_tosa_BI_pipeline(
             .export()
             .check_count({"torch.ops.aten.sum.dim_IntList": 1})
             .check(["torch.ops.quantized_decomposed"])
-            .to_edge(config=self._edge_compile_config)
+            .to_edge()
             .partition()
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .to_executorch()
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 7b129a98877..5d755e71625 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -218,8 +218,6 @@ def to_edge(
             if config is not None:
                 to_edge_stage.edge_compile_conf = config
 
-        # TODO(T182928844): Delegate dim order op to backend.
-        to_edge_stage.edge_compile_conf._skip_dim_order = True
         return super().to_edge(to_edge_stage)
 
     def partition(self, partition_stage: Optional[Partition] = None):
@@ -245,7 +243,6 @@ def to_edge_transform_and_lower(
                 to_edge_and_lower_stage.partitioners = partitioners
             if edge_compile_config is not None:
                 to_edge_and_lower_stage.edge_compile_conf = edge_compile_config
-        to_edge_and_lower_stage.edge_compile_conf._skip_dim_order = True
         return super().to_edge_transform_and_lower(to_edge_and_lower_stage)
 
     def to_executorch(self, to_executorch_stage: Optional[ToExecutorch] | None = None):
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index 6d899c21461..c2cc1dbaa6c 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -511,7 +511,6 @@ def get_args():
             partitioner=[ArmPartitioner(compile_spec)],
             compile_config=EdgeCompileConfig(
                 _check_ir_validity=False,
-                _skip_dim_order=True,
             ),
         )
     else:
@@ -519,7 +518,6 @@ def get_args():
             exported_program,
             compile_config=EdgeCompileConfig(
                 _check_ir_validity=False,
-                _skip_dim_order=True,
             ),
         )