Skip to content

Commit 3a02146

Browse files
Add _clone_dim_order portable kernel (#12974)
### Summary This is PR 1 of 3 implementing a dim order aware clone op. Currently, clone ops are removed during export as no-ops, causing memory layout (dim order) changes to be lost. This can cause backend failures, incorrect outputs when ops expect specific layouts, and performance degradation. This set of PRs introduces a dim order aware clone op, `_clone_dim_order`, which preserves memory layout changes by explicitly storing dim order information. This is implemented by replacing standard clone ops with this variant during export and updating the clone removal transform to preserve clones that change layout. This PR adds the portable CPU kernel for the `_clone_dim_order` op, implementing a clone variant that preserves dim order at runtime. The portable kernel validates dtype and layout compatibility, resizes the output tensor if needed, and performs an element-wise clone of the tensors. Note: A future PR will add the ATen kernel for `_clone_dim_order`. Related PRs: - PR 2: [#12971](#12971) - Register `_clone_dim_order` op and map `aten.clone` - PR 3: [#12976](#12976) - Update RemoveCloneOpsTransform to be dim_order aware Fixes #12645 ### Test plan Added kernel runtime tests to verify: - Tensors of all real dtypes are cloned correctly. - Failure when input and output tensor shapes mismatch. - Failure with unsupported memory formats. - Failure when `non_blocking=true` since the portable kernel only supports blocking data transfer. - Dynamic shape outputs are cloned with correct values. - Layout conversions are cloned correctly for `contiguous` to `channels_last`, `channels_last` to `contiguous`, and `channels_last` is preserved. All runtime tests pass via: `build-ninja/kernels/test/portable_kernels_test` --------- Co-authored-by: Gasoonjia <[email protected]>
1 parent 9308d2c commit 3a02146

File tree

9 files changed

+486
-24
lines changed

9 files changed

+486
-24
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/kernels/portable/cpu/scalar_utils.h>
10+
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
11+
#include <executorch/runtime/kernel/kernel_includes.h>
12+
13+
namespace torch {
14+
namespace executor {
15+
namespace native {
16+
17+
using Tensor = executorch::aten::Tensor;
18+
19+
template <typename T>
20+
using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
21+
22+
/**
23+
* _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
24+
* dim_order=None, Tensor(a!) out) -> Tensor(a!)
25+
*
26+
* Clones via element-wise copy while preserving dim_order.
27+
*/
28+
Tensor& _clone_dim_order_out(
29+
KernelRuntimeContext& ctx,
30+
const Tensor& self,
31+
bool non_blocking,
32+
OptionalArrayRef<int64_t> dim_order,
33+
Tensor& out) {
34+
(void)ctx;
35+
36+
// Ensure input and output dtype match.
37+
ET_KERNEL_CHECK(
38+
ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out);
39+
40+
// Ensure output has the same layout as input or matches dim_order.
41+
ET_KERNEL_CHECK(
42+
ctx,
43+
check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
44+
InvalidArgument,
45+
out);
46+
47+
// Ensure input and output shapes match, resizing if necessary.
48+
ET_KERNEL_CHECK(
49+
ctx,
50+
resize_tensor(out, self.sizes()) == torch::executor::Error::Ok,
51+
InvalidArgument,
52+
out);
53+
54+
if (self.numel() == 0) {
55+
return out;
56+
}
57+
58+
// Select the correct input dtype and copy the tensors.
59+
ET_SWITCH_REALHBBF16_TYPES(
60+
self.scalar_type(),
61+
ctx,
62+
"dim_order_ops::_clone_dim_order.out",
63+
CTYPE,
64+
[&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
65+
66+
return out;
67+
}
68+
69+
/**
 * Context-free overload of _clone_dim_order_out.
 *
 * Constructs a default KernelRuntimeContext and forwards all arguments to
 * the context-taking variant.
 */
Tensor& _clone_dim_order_out(
    const Tensor& self,
    bool non_blocking,
    OptionalArrayRef<int64_t> dim_order,
    Tensor& out) {
  executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext default_ctx{};
  return _clone_dim_order_out(default_ctx, self, non_blocking, dim_order, out);
}
77+
78+
} // namespace native
79+
} // namespace executor
80+
} // namespace torch

kernels/portable/cpu/op__to_dim_order_copy.cpp

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,29 +29,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
2929
template <typename T>
3030
using Optional = std::optional<T>;
3131

32-
namespace {
33-
34-
template <typename SELF_CTYPE, typename OUT_CTYPE>
35-
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
36-
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
37-
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();
38-
39-
// Here we make a slightly off-label use of
40-
// BroadcastIndexesRange. It always assumes it doesn't have to care
41-
// about different dim_order between input and output, but we can
42-
// just force it to respect strides (and thus dim_order) for its
43-
// inputs using support_noncontiguous_input_tensors=true, and then pretend
44-
// the output is just another input.
45-
for (const auto [unused_index, self_data_index, out_data_index] :
46-
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
47-
/*dummy output*/ self, self, out)) {
48-
(void)unused_index;
49-
out_data[out_data_index] =
50-
static_cast<OUT_CTYPE>(self_data[self_data_index]);
51-
}
52-
}
53-
} // namespace
54-
5532
// _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]?
5633
// dim_order=None, Tensor(a!) out) -> Tensor(a!)
5734
Tensor& _to_dim_order_copy_out(

kernels/portable/cpu/util/copy_ops_util.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010
#include <c10/util/irange.h>
1111

12+
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
1213
#include <executorch/runtime/kernel/kernel_includes.h>
1314

1415
namespace torch {
@@ -77,6 +78,29 @@ void as_strided_copy(
7778
}
7879
}
7980

81+
/**
 * Copies and casts a tensor while preserving input dim_order.
 *
 * @tparam SELF_CTYPE Element type of the source tensor `self`.
 * @tparam OUT_CTYPE Element type of the destination tensor `out`; each
 *     source element is converted via static_cast.
 * @param self Source tensor; its strides (and thus dim_order) are respected
 *     during iteration.
 * @param out Destination tensor, written element-wise. Assumed to already
 *     have matching sizes (caller resizes before invoking this helper).
 */
template <typename SELF_CTYPE, typename OUT_CTYPE>
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
  auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
  auto out_data = out.mutable_data_ptr<OUT_CTYPE>();

  // Here we make a slightly off-label use of
  // BroadcastIndexesRange. It always assumes it doesn't have to care
  // about different dim_order between input and output, but we can
  // just force it to respect strides (and thus dim_order) for its
  // inputs using support_noncontiguous_input_tensors=true, and then pretend
  // the output is just another input.
  for (const auto [unused_index, self_data_index, out_data_index] :
       BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
           /*dummy output*/ self, self, out)) {
    // Only the per-tensor data indexes are needed; the logical output index
    // is unused because `self` stands in as the dummy output above.
    (void)unused_index;
    out_data[out_data_index] =
        static_cast<OUT_CTYPE>(self_data[self_data_index]);
  }
}
103+
80104
bool check_cat_args(
81105
executorch::aten::ArrayRef<Tensor> tensors,
82106
int64_t dim,

kernels/portable/cpu/util/targets.bzl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,9 @@ def define_common_targets():
147147
"copy_ops_util.h",
148148
],
149149
compiler_flags = ["-Wno-missing-prototypes"],
150+
exported_deps = [
151+
":broadcast_util",
152+
],
150153
deps = [
151154
"//executorch/runtime/kernel:kernel_includes",
152155
],
@@ -348,7 +351,6 @@ def define_common_targets():
348351
],
349352
)
350353

351-
352354
runtime.cxx_library(
353355
name = "arange_util{}".format(suffix),
354356
srcs = ["arange_util.cpp"],

kernels/portable/functions.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,3 +1009,8 @@
10091009
kernels:
10101010
- arg_meta: null
10111011
kernel_name: torch::executor::_to_dim_order_copy_out
1012+
1013+
- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
1014+
kernels:
1015+
- arg_meta: null
1016+
kernel_name: torch::executor::_clone_dim_order_out

kernels/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ add_custom_target(
108108
set(all_test_sources
109109
"BinaryLogicalOpTest.cpp"
110110
"op__to_dim_order_copy_test.cpp"
111+
"op__clone_dim_order_test.cpp"
111112
"op_abs_test.cpp"
112113
"op_acos_test.cpp"
113114
"op_acosh_test.cpp"

0 commit comments

Comments
 (0)