
Commit 0704ae3

Add a fast path for _clone_dim_order (#15815)
### Summary
Add a direct memcpy fast path for the portable `_clone_dim_order` op, which can be a performance bottleneck. I'd like to optimize these clones out of the graph more aggressively, but this fast path should significantly reduce the perf impact in the meantime.

### Test plan
Existing correctness tests for the `_clone_dim_order` implementation should cover correctness. For performance, I ran a quick test on an x86 server with a (1, 128, 256, 256)-element tensor in the default dim order. This is a quick smoke test, not a proper benchmark. Numbers are included for both optimized and debug builds; the optimized build matters more, but very long debug runs are painful during development.

- Optimized build: before 27.9 ms, after 6.4 ms
- Debug build: before 5947.01 ms, after 7.2 ms
1 parent c247604 commit 0704ae3
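For intuition on where the win comes from, here is a minimal standalone sketch (not part of the commit) contrasting a per-element copy loop with a single `std::memcpy` over a buffer sized like the (1, 128, 256, 256) test tensor above; the `float` dtype and the timing harness are assumptions for illustration.

```cpp
// Minimal sketch, not from the commit: per-element copy vs. one memcpy
// over a buffer sized like the (1, 128, 256, 256) test tensor.
// Assumes float dtype; the real op's generic path also pays for dtype
// dispatch and dim-order-aware indexing, which this omits.
#include <chrono>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  constexpr size_t kNumel = 1ULL * 128 * 256 * 256;  // ~8.4M elements
  std::vector<float> src(kNumel, 1.0f);
  std::vector<float> dst(kNumel);

  auto t0 = std::chrono::steady_clock::now();
  for (size_t i = 0; i < kNumel; ++i) {
    dst[i] = src[i];  // element-wise copy, akin to the generic path
  }
  auto t1 = std::chrono::steady_clock::now();

  std::memcpy(dst.data(), src.data(), kNumel * sizeof(float));  // fast path
  auto t2 = std::chrono::steady_clock::now();

  auto ms = [](auto d) {
    return std::chrono::duration<double, std::milli>(d).count();
  };
  std::printf("loop:   %.2f ms\nmemcpy: %.2f ms\n", ms(t1 - t0), ms(t2 - t1));
}
```

Note that an optimizing compiler may turn the plain loop into a memcpy itself; the large gap in the commit's numbers, especially in the debug build, mostly reflects skipping the per-element dispatch and indexed copy rather than raw copy bandwidth.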

File tree

1 file changed: +40 −8 lines


kernels/portable/cpu/op__clone_dim_order.cpp

Lines changed: 40 additions & 8 deletions
```diff
@@ -10,6 +10,9 @@
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+#include <algorithm>
+#include <cstring>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,6 +22,30 @@ using Tensor = executorch::aten::Tensor;
 template <typename T>
 using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
 
+namespace {
+
+/**
+ * Checks the conditions for fast path direct memcpy. This can be used
+ * when the output dim order is unchanged.
+ */
+bool check_fast_path_conditions(
+    const Tensor& in,
+    OptionalArrayRef<int64_t> dim_order) {
+  if (!dim_order.has_value()) {
+    // No dim order means preserve input dim order.
+    return true;
+  }
+
+  auto input_dim_order = in.dim_order();
+  return std::equal(
+      dim_order.value().begin(),
+      dim_order.value().end(),
+      input_dim_order.begin(),
+      input_dim_order.end());
+}
+
+} // namespace
+
 /**
  * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
  * dim_order=None, Tensor(a!) out) -> Tensor(a!)
@@ -55,13 +82,18 @@ Tensor& _clone_dim_order_out(
     return out;
   }
 
-  // Select the correct input dtype and copy the tensors.
-  ET_SWITCH_REALHBBF16_TYPES(
-      self.scalar_type(),
-      ctx,
-      "dim_order_ops::_clone_dim_order.out",
-      CTYPE,
-      [&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
+  // Dispatch to the fast path if we can use direct memcpy.
+  if (check_fast_path_conditions(self, dim_order)) {
+    std::memcpy(out.mutable_data_ptr(), self.const_data_ptr(), self.nbytes());
+  } else {
+    // Select the correct input dtype and copy the tensors.
+    ET_SWITCH_REALHBBF16_TYPES(
+        self.scalar_type(),
+        ctx,
+        "dim_order_ops::_clone_dim_order.out",
+        CTYPE,
+        [&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
+  }
 
   return out;
 }
@@ -77,4 +109,4 @@ Tensor& _clone_dim_order_out(
 
 } // namespace native
 } // namespace executor
-} // namespace torch
+} // namespace torch
```
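To make the gating logic concrete, here is a hypothetical standalone analogue of `check_fast_path_conditions` (not ExecuTorch code): `std::vector` stands in for the tensor's dim order and `std::optional` for `OptionalArrayRef`, so the names and types below are illustrative assumptions only.

```cpp
// Hypothetical standalone analogue of check_fast_path_conditions, with
// std::vector standing in for the tensor's dim order and std::optional
// for OptionalArrayRef. Illustrative only; not ExecuTorch code.
#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

bool can_use_fast_path(
    const std::vector<uint8_t>& input_dim_order,  // stand-in for in.dim_order()
    const std::optional<std::vector<int64_t>>& dim_order) {
  if (!dim_order.has_value()) {
    // No dim order requested: preserve the input's order.
    return true;
  }
  // Fast path only when the requested order matches the input exactly.
  return std::equal(
      dim_order->begin(),
      dim_order->end(),
      input_dim_order.begin(),
      input_dim_order.end());
}

int main() {
  const std::vector<uint8_t> input_order = {0, 1, 2, 3};   // contiguous NCHW
  const std::vector<int64_t> same = {0, 1, 2, 3};
  const std::vector<int64_t> channels_last = {0, 2, 3, 1};

  const bool a = can_use_fast_path(input_order, std::nullopt);   // true
  const bool b = can_use_fast_path(input_order, same);           // true
  const bool c = can_use_fast_path(input_order, channels_last);  // false
  return (a && b && !c) ? 0 : 1;
}
```

The fast path covers two cases: no requested dim order, or a requested order identical to the input's. Any true permutation (e.g. contiguous to channels-last) still takes the dtype-switched element copy.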
