@@ -11,6 +11,7 @@
 
 # pyre-unsafe
 
+import copy
 import math
 from operator import neg
 from typing import cast, Dict, Iterable, Sequence, Set, Tuple
@@ -35,7 +36,12 @@
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
+from executorch.exir.dim_order_utils import get_memory_format
 from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
+from executorch.exir.passes.dim_order_ops_registry import (
+    DimOrderOpsMap,
+    MemoryFormatOpsMap,
+)
 from torch._subclasses import FakeTensor
 from torch.fx.node import Argument
 
@@ -1799,6 +1805,72 @@ def call_operator(
         )
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceToDimOrderCopyWithToCopyPass(ExportPass):
+    """
+    dim_order_ops::to_dim_order_copy is not supported by the Cadence backend, so
+    this pass must run at opt_level=0. If the dim order is sequential (i.e.
+    contiguous), the extra stride bookkeeping is unnecessary and the op can be
+    lowered to to_copy.
+    """
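+
+    # A sketch of the intended rewrite (argument values are illustrative; the op
+    # names follow the edge dialect used in this file):
+    #   dim_order_ops._to_dim_order_copy(x, dim_order=[0, 1, 2, 3])
+    #   -> aten._to_copy(x, memory_format=torch.contiguous_format)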
+
+    def call_operator(
+        self,
+        op,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
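+        # Only rewrite ops registered as dim-order ops; anything else passes
+        # through unchanged.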
+        if op not in DimOrderOpsMap:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Build new kwargs: drop dim_order and add memory_format for the new op.
+        nkwargs = dict(copy.deepcopy(kwargs))  # orig kwargs are immutable
+
+        ndim = None
+
+        # We can always get the rank, assuming it has been specialized.
+
+        # pyre-ignore[16]: `None` has no attribute `to_tensor`
+        if isinstance(args[0], ProxyValue) and args[0].is_tensor():
+            # pyre-ignore[16]: `None` has no attribute `to_tensor`
+            ndim = args[0].to_tensor().dim()
+        elif isinstance(args[0], torch.Tensor):
+            # pyre-ignore[16]: `None` has no attribute `dim`
+            ndim = args[0].dim()
+        elif isinstance(args[0], torch.fx.immutable_collections.immutable_list):
+            # pyre-ignore[6]: Incompatible parameter type
+            ndim = len(args[0])
+        else:
+            raise AssertionError(
+                f"Expected a Tensor or a ProxyValue but got {type(args[0])}"
+            )
+
+        # Pull out the requested dim_order; the memory format for the new op is
+        # derived from it below.
+        contiguous_dim_order = list(range(ndim))
+        dim_order = nkwargs.pop("dim_order", None)
+
+        # Cadence only supports the contiguous memory format.
+        assert (
+            dim_order is None
+            # pyre-ignore[6]: Incompatible parameter type
+            or len(dim_order) == 0
+            or dim_order == contiguous_dim_order
+        ), "Expected dim order in contiguous or preserve memory format, but got {}".format(
+            dim_order
+        )
+
+        # Derive the memory_format kwarg from the (validated) dim order.
+        # pyre-ignore[6]: Incompatible parameter type
+        nkwargs["memory_format"] = get_memory_format(dim_order)
+
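+        # Look up the memory-format counterpart of this dim-order op
+        # (e.g. _to_dim_order_copy -> _to_copy).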
+        memory_format_op = MemoryFormatOpsMap[op]
+
+        return super().call_operator(
+            memory_format_op,
+            args,
+            nkwargs,
+            meta,
+        )
+
+
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
 class ReplaceFullLikeWithFullPass(ExportPass):
     """
@@ -2108,4 +2180,5 @@ class CadenceReplaceOpsInGraph:
         ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
         ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
+        ReplaceToDimOrderCopyWithToCopyPass,
     ]