
Commit 0704ae3

Add a fast path for _clone_dim_order (#15815)
### Summary
Add a direct memcpy fast path for the portable `_clone_dim_order` op, which can be a performance bottleneck. I'd like to optimize these clones out of the graph more aggressively, but this fast path should significantly reduce the perf impact in the meantime.

### Test plan
Existing correctness tests for the `_clone_dim_order` implementation should cover correctness. For performance, I ran a quick test on an x86 server with a (1, 128, 256, 256)-element tensor in the default dim order. This is a quick smoke test, not a proper benchmark. Numbers are included for both optimized and debug builds; the optimized build matters more, but very long debug runs are painful during development.

- Optimized build: before 27.9 ms, after 6.4 ms
- Debug build: before 5947.01 ms, after 7.2 ms
1 parent c247604 commit 0704ae3
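For intuition on where the win comes from, here is a minimal standalone sketch (not part of the commit) contrasting a per-element copy loop with a single `std::memcpy` over a buffer sized like the (1, 128, 256, 256) test tensor above; the `float` dtype and the timing harness are assumptions for illustration.

```cpp
// Minimal sketch, not from the commit: per-element copy vs. one memcpy
// over a buffer sized like the (1, 128, 256, 256) test tensor.
// Assumes float dtype; the real op's generic path also pays for dtype
// dispatch and dim-order-aware indexing, which this omits.
#include <chrono>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  constexpr size_t kNumel = 1ULL * 128 * 256 * 256;  // ~8.4M elements
  std::vector<float> src(kNumel, 1.0f);
  std::vector<float> dst(kNumel);

  auto t0 = std::chrono::steady_clock::now();
  for (size_t i = 0; i < kNumel; ++i) {
    dst[i] = src[i];  // element-wise copy, akin to the generic path
  }
  auto t1 = std::chrono::steady_clock::now();

  std::memcpy(dst.data(), src.data(), kNumel * sizeof(float));  // fast path
  auto t2 = std::chrono::steady_clock::now();

  auto ms = [](auto d) {
    return std::chrono::duration<double, std::milli>(d).count();
  };
  std::printf("loop:   %.2f ms\nmemcpy: %.2f ms\n", ms(t1 - t0), ms(t2 - t1));
}
```

Note that an optimizing compiler may turn the plain loop into a memcpy itself; the large gap in the commit's numbers, especially in the debug build, mostly reflects skipping the per-element dispatch and indexed copy rather than raw copy bandwidth.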

File tree

1 file changed: +40 −8 lines


kernels/portable/cpu/op__clone_dim_order.cpp

Lines changed: 40 additions & 8 deletions
```diff
@@ -10,6 +10,9 @@
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+#include <algorithm>
+#include <cstring>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,6 +22,30 @@ using Tensor = executorch::aten::Tensor;
 template <typename T>
 using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
 
+namespace {
+
+/**
+ * Checks the conditions for fast path direct memcpy. This can be used
+ * when the output dim order is unchanged.
+ */
+bool check_fast_path_conditions(
+    const Tensor& in,
+    OptionalArrayRef<int64_t> dim_order) {
+  if (!dim_order.has_value()) {
+    // No dim order means preserve input dim order.
+    return true;
+  }
+
+  auto input_dim_order = in.dim_order();
+  return std::equal(
+      dim_order.value().begin(),
+      dim_order.value().end(),
+      input_dim_order.begin(),
+      input_dim_order.end());
+}
+
+} // namespace
+
 /**
  * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
  * dim_order=None, Tensor(a!) out) -> Tensor(a!)
@@ -55,13 +82,18 @@ Tensor& _clone_dim_order_out(
     return out;
   }
 
-  // Select the correct input dtype and copy the tensors.
-  ET_SWITCH_REALHBBF16_TYPES(
-      self.scalar_type(),
-      ctx,
-      "dim_order_ops::_clone_dim_order.out",
-      CTYPE,
-      [&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
+  // Dispatch to the fast path if we can use direct memcpy.
+  if (check_fast_path_conditions(self, dim_order)) {
+    std::memcpy(out.mutable_data_ptr(), self.const_data_ptr(), self.nbytes());
+  } else {
+    // Select the correct input dtype and copy the tensors.
+    ET_SWITCH_REALHBBF16_TYPES(
+        self.scalar_type(),
+        ctx,
+        "dim_order_ops::_clone_dim_order.out",
+        CTYPE,
+        [&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
+  }
 
   return out;
 }
@@ -77,4 +109,4 @@ Tensor& _clone_dim_order_out(
 
 } // namespace native
 } // namespace executor
-} // namespace torch
+} // namespace torch
```
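To make the gating logic concrete, here is a hypothetical standalone analogue of `check_fast_path_conditions` (not ExecuTorch code): `std::vector` stands in for the tensor's dim order and `std::optional` for `OptionalArrayRef`, so the names and types below are illustrative assumptions only.

```cpp
// Hypothetical standalone analogue of check_fast_path_conditions, with
// std::vector standing in for the tensor's dim order and std::optional
// for OptionalArrayRef. Illustrative only; not ExecuTorch code.
#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

bool can_use_fast_path(
    const std::vector<uint8_t>& input_dim_order,  // stand-in for in.dim_order()
    const std::optional<std::vector<int64_t>>& dim_order) {
  if (!dim_order.has_value()) {
    // No dim order requested: preserve the input's order.
    return true;
  }
  // Fast path only when the requested order matches the input exactly.
  return std::equal(
      dim_order->begin(),
      dim_order->end(),
      input_dim_order.begin(),
      input_dim_order.end());
}

int main() {
  const std::vector<uint8_t> input_order = {0, 1, 2, 3};   // contiguous NCHW
  const std::vector<int64_t> same = {0, 1, 2, 3};
  const std::vector<int64_t> channels_last = {0, 2, 3, 1};

  const bool a = can_use_fast_path(input_order, std::nullopt);   // true
  const bool b = can_use_fast_path(input_order, same);           // true
  const bool c = can_use_fast_path(input_order, channels_last);  // false
  return (a && b && !c) ? 0 : 1;
}
```

The fast path covers two cases: no requested dim order, or a requested order identical to the input's. Any true permutation (e.g. contiguous to channels-last) still takes the dtype-switched element copy.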
