
Commit 0e8c14c

Update
[ghstack-poisoned]
2 parents: 84d6476 + 7bc06d1

38 files changed (+681 −191 lines)

backends/arm/operator_support/to_copy_support.py

Lines changed: 1 addition & 0 deletions
@@ -125,6 +125,7 @@ def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool
         # Check dim_order (to_dim_order_copy)
         if "dim_order" in node.kwargs:
             dim_order = node.kwargs["dim_order"]
+            # pyre-ignore[6]
             if dim_order != list(range(len(dim_order))):
                 logger.info(
                     f"Argument {dim_order=} is not supported for "

backends/cadence/aot/compiler.py

Lines changed: 5 additions & 1 deletion
@@ -33,6 +33,7 @@
     ExecutorchProgramManager,
     to_edge,
 )
+from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import PassResult
 from executorch.exir.passes import ToOutVarPass
 from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
@@ -186,14 +187,17 @@ def export_to_edge(
     edge_prog_manager = to_edge(
         expo_program,
         compile_config=EdgeCompileConfig(
-            _skip_dim_order=True,
             # Allow specific non-core aten ops in the IR.
             _core_aten_ops_exception_list=[
                 torch.ops.aten._native_batch_norm_legit_functional.default,
                 torch.ops.aten.linear.default,
                 torch.ops.aten.linalg_vector_norm.default,
                 torch.ops.aten.unfold.default,
                 torch.ops.aten.angle.default,
+                # Cadence replaced to_dim_order_copy with _to_copy for performance;
+                # skip the _to_copy op to get around the dim order check.
+                # Remove this entry once Cadence supports dim order.
                 exir_ops.edge.aten._to_copy.default,
             ],
         ),
         constant_methods=constant_methods,
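Taken together, this hunk drops `_skip_dim_order=True` (so `to_edge` keeps the dim-order representation) and instead exempts `aten._to_copy` from the core ATen check. A minimal sketch of that configuration pattern follows; the module and example inputs are illustrative and not part of this commit:

import torch
from executorch.exir import EdgeCompileConfig, to_edge
from executorch.exir.dialects._ops import ops as exir_ops

class CastToFloat(torch.nn.Module):
    def forward(self, x):
        return x.to(torch.float32)  # lowers to a copy/cast op in the edge IR

exported = torch.export.export(CastToFloat(), (torch.ones(2, 2, dtype=torch.int32),))
edge_manager = to_edge(
    exported,
    compile_config=EdgeCompileConfig(
        # No _skip_dim_order=True here, so dim-order ops stay enabled;
        # _to_copy is allowed through explicitly instead.
        _core_aten_ops_exception_list=[
            exir_ops.edge.aten._to_copy.default,
        ],
    ),
)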

backends/cadence/aot/replace_ops.py

Lines changed: 73 additions & 0 deletions
@@ -11,6 +11,7 @@
 
 # pyre-unsafe
 
+import copy
 import math
 from operator import neg
 from typing import cast, Dict, Iterable, Sequence, Set, Tuple
@@ -35,7 +36,12 @@
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
+from executorch.exir.dim_order_utils import get_memory_format
 from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
+from executorch.exir.passes.dim_order_ops_registry import (
+    DimOrderOpsMap,
+    MemoryFormatOpsMap,
+)
 from torch._subclasses import FakeTensor
 from torch.fx.node import Argument
 
@@ -1799,6 +1805,72 @@ def call_operator(
         )
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceToDimOrderCopyWithToCopyPass(ExportPass):
+    """
+    dim_order_ops::to_dim_order_copy is not supported, so this is an opt_level=0 pass.
+    If the dim order is sequential, we don't need the extra work with strides and
+    can just use to_copy.
+    """
+
+    def call_operator(
+        self,
+        op,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        meta: NodeMetadata,
+    ) -> ProxyValue:
+        if op not in DimOrderOpsMap:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Build new kwargs: dim_order is replaced by memory_format for the new op.
+        nkwargs = dict(copy.deepcopy(kwargs))  # orig kwargs are immutable
+
+        ndim = None
+
+        # We can always get the shape, assuming the rank is specialized.
+        # pyre-ignore[16]: `None` has no attribute `to_tensor`
+        if isinstance(args[0], ProxyValue) and args[0].is_tensor():
+            # pyre-ignore[16]: `None` has no attribute `to_tensor`
+            ndim = args[0].to_tensor().dim()
+        elif isinstance(args[0], torch.Tensor):
+            # pyre-ignore[16]: `None` has no attribute `dim`
+            ndim = args[0].dim()
+        elif isinstance(args[0], torch.fx.immutable_collections.immutable_list):
+            # pyre-ignore[6]: Incompatible parameter type
+            ndim = len(args[0])
+        else:
+            assert 0, f"Expecting a Tensor or a ProxyValue but got {type(args[0])}"
+
+        # Get the "to" memory format for the EdgeOp.
+        contiguous_dim_order = list(range(ndim))
+        dim_order = nkwargs.pop("dim_order", None)
+
+        # Cadence only supports the contiguous memory format.
+        assert (
+            dim_order is None
+            # pyre-ignore[6]: Incompatible parameter type
+            or len(dim_order) == 0
+            or dim_order == contiguous_dim_order
+        ), "Expected dim order in contiguous or preserve memory format, but got {}".format(
+            dim_order
+        )
+
+        # Bring back the memory format.
+        # pyre-ignore[6]: Incompatible parameter type
+        nkwargs["memory_format"] = get_memory_format(dim_order)
+
+        memory_format_op = MemoryFormatOpsMap[op]
+
+        return super().call_operator(
+            memory_format_op,
+            args,
+            nkwargs,
+            meta,
+        )
+
+
 @register_cadence_pass(CadencePassAttribute(opt_level=0))
 class ReplaceFullLikeWithFullPass(ExportPass):
     """
@@ -2108,4 +2180,5 @@ class CadenceReplaceOpsInGraph:
     ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
     ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
     ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
+    ReplaceToDimOrderCopyWithToCopyPass,
 ]
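In effect, the new pass rewrites only the keyword arguments: it pops a (contiguous) `dim_order`, validates it, and substitutes the equivalent `memory_format` before re-dispatching through the non-dim-order op from `MemoryFormatOpsMap`. Below is a minimal sketch of that kwarg rewrite in isolation; `_memory_format_for` is an illustrative stand-in for `get_memory_format`, not the ExecuTorch helper itself:

import copy
import torch

def _memory_format_for(dim_order):
    # Illustrative stand-in: the pass's assert already rejects every
    # non-contiguous dim order, so only this branch matters here.
    if dim_order is None or len(dim_order) == 0 or list(dim_order) == list(range(len(dim_order))):
        return torch.contiguous_format
    raise AssertionError(f"unsupported dim order: {dim_order}")

def rewrite_to_copy_kwargs(kwargs, ndim):
    # Mirrors the body of ReplaceToDimOrderCopyWithToCopyPass.call_operator:
    # copy the kwargs, drop dim_order, validate it, and add memory_format.
    nkwargs = dict(copy.deepcopy(kwargs))
    dim_order = nkwargs.pop("dim_order", None)
    contiguous_dim_order = list(range(ndim))
    assert dim_order is None or len(dim_order) == 0 or list(dim_order) == contiguous_dim_order
    nkwargs["memory_format"] = _memory_format_for(dim_order)
    return nkwargs

print(rewrite_to_copy_kwargs({"dtype": torch.float32, "dim_order": [0, 1, 2, 3]}, ndim=4))
# -> {'dtype': torch.float32, 'memory_format': torch.contiguous_format}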

backends/cadence/build_cadence_fusionG3.sh

Lines changed: 3 additions & 3 deletions
@@ -21,7 +21,7 @@ STEPWISE_BUILD=false
 
 if $STEPWISE_BUILD; then
   echo "Building ExecuTorch"
-  cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \
@@ -37,7 +37,7 @@ if $STEPWISE_BUILD; then
     -Bcmake-out .
 
   echo "Building any Cadence-specific binaries on top"
-  cmake -DBUCK2="$BUCK" \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \
    -DCMAKE_TOOLCHAIN_FILE=/home/zonglinpeng/ws/zonglinpeng/executorch/backends/cadence/cadence.cmake \
    -DCMAKE_INSTALL_PREFIX=cmake-out \
    -DCMAKE_BUILD_TYPE=Release \
@@ -61,7 +61,7 @@ if $STEPWISE_BUILD; then
 else
   echo "Building Cadence toolchain with ExecuTorch packages"
   cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
-  cmake -DBUCK2="$BUCK" \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \
    -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \
    -DHAVE_SYS_STAT_H=ON \
    -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \

backends/cadence/build_cadence_hifi4.sh

Lines changed: 3 additions & 3 deletions
@@ -21,7 +21,7 @@ STEPWISE_BUILD=false
 
 if $STEPWISE_BUILD; then
   echo "Building ExecuTorch"
-  cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \
@@ -36,7 +36,7 @@ if $STEPWISE_BUILD; then
     -Bcmake-out .
 
   echo "Building any Cadence-specific binaries on top"
-  cmake -DBUCK2="$BUCK" \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \
    -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
    -DCMAKE_INSTALL_PREFIX=cmake-out \
    -DCMAKE_BUILD_TYPE=Release \
@@ -60,7 +60,7 @@ if $STEPWISE_BUILD; then
 else
   echo "Building Cadence toolchain with ExecuTorch packages"
   cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
-  cmake -DBUCK2="$BUCK" \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \
    -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \
    -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
    -DCMAKE_INSTALL_PREFIX=cmake-out \

backends/cadence/build_cadence_runner.sh

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ main() {
   cd "${EXECUTORCH_ROOT}"
 
   rm -rf cmake-out
-  cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_DEVTOOLS=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
@@ -33,7 +33,7 @@ main() {
   local build_dir="cmake-out/${example_dir}"
   local cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
   rm -rf ${build_dir}
-  cmake -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \
+  CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_CADENCE_CPU_RUNNER=ON \
     -DEXECUTORCH_ENABLE_LOGGING=ON \

backends/cadence/fusion_g3/operators/op_exp.cpp

Lines changed: 4 additions & 4 deletions
@@ -49,9 +49,9 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
       out);
 #endif
 
-  if (out.scalar_type() == ScalarType::Float) {
-    float* const out_data = out.mutable_data_ptr<float>();
-    const float* const in_data = in.const_data_ptr<float>();
+  if (in.scalar_type() == ScalarType::Float) {
+    float* __restrict__ out_data = out.mutable_data_ptr<float>();
+    const float* __restrict__ in_data = in.const_data_ptr<float>();
 
     XT_KERNEL_CHECK(
         ctx, out, xa_nn_elm_exp_f32_f32, out_data, in_data, out.numel());
@@ -66,4 +66,4 @@ Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
+} // namespace cadence

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ void main() {
       div_by_x % out_limits.y,
       div_by_x / out_limits.y);
 
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (pos.z >= out_limits.z) {
     return;
   }

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ void main() {
   pos.y *= BATCH_SIZE_Y;
 
   // do not process if top pixel does not fit within the output range
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (pos.z >= out_limits.z) {
     return;
   }

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ void main() {
       div_by_x % out_limits.y,
       div_by_x / out_limits.y);
 
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (pos.z >= out_limits.z) {
    return;
  }
