Skip to content

Commit 3a02146

Browse files
Add _clone_dim_order portable kernel (#12974)
### Summary This is PR 1 of 3 implementing a dim order aware clone op. Currently, clone ops are removed during export as no-ops, causing memory layout (dim order) changes to be lost. This can cause backend failures, incorrect outputs when ops expect specific layouts, and performance degradation. This set of PRs introduces a dim order aware clone op, `_clone_dim_order`, which preserves memory layout changes by explicitly storing dim order information. This is implemented by replacing standard clone ops with this variant during export and updating the clone removal transform to preserve clones that change layout. This PR adds the portable CPU kernel for the `_clone_dim_order` op, implementing a clone variant that preserves dim order at runtime. The portable kernel validates dtype and layout compatibility, resizes the output tensor if needed, and performs an element-wise clone of the tensors. Note: A future PR will add the ATen kernel for `_clone_dim_order`. Related PRs: - PR 2: [#12971](#12971) - Register `_clone_dim_order` op and map `aten.clone` - PR 3: [#12976](#12976) - Update RemoveCloneOpsTransform to be dim_order aware Fixes #12645 ### Test plan Added kernel runtime tests to verify: - Tensors of all real dtypes are cloned correctly. - Failure when input and output tensor shapes mismatch. - Failure with unsupported memory formats. - Failure when `non_blocking=true` since the portable kernel only supports blocking data transfer. - Dynamic shape outputs are cloned with correct values. - Layout conversions are cloned correctly for `contiguous` to `channels_last`, `channels_last` to `contiguous`, and `channels_last` is preserved. All runtime tests pass via: `build-ninja/kernels/test/portable_kernels_test` --------- Co-authored-by: Gasoonjia <[email protected]>
1 parent 9308d2c commit 3a02146

File tree

9 files changed

+486
-24
lines changed

9 files changed

+486
-24
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/kernels/portable/cpu/scalar_utils.h>
10+
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
11+
#include <executorch/runtime/kernel/kernel_includes.h>
12+
13+
namespace torch {
14+
namespace executor {
15+
namespace native {
16+
17+
using Tensor = executorch::aten::Tensor;
18+
19+
template <typename T>
20+
using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
21+
22+
/**
23+
* _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
24+
* dim_order=None, Tensor(a!) out) -> Tensor(a!)
25+
*
26+
* Clones via element-wise copy while preserving dim_order.
27+
*/
28+
Tensor& _clone_dim_order_out(
29+
KernelRuntimeContext& ctx,
30+
const Tensor& self,
31+
bool non_blocking,
32+
OptionalArrayRef<int64_t> dim_order,
33+
Tensor& out) {
34+
(void)ctx;
35+
36+
// Ensure input and output dtype match.
37+
ET_KERNEL_CHECK(
38+
ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out);
39+
40+
// Ensure output has the same layout as input or matches dim_order.
41+
ET_KERNEL_CHECK(
42+
ctx,
43+
check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
44+
InvalidArgument,
45+
out);
46+
47+
// Ensure input and output shapes match, resizing if necessary.
48+
ET_KERNEL_CHECK(
49+
ctx,
50+
resize_tensor(out, self.sizes()) == torch::executor::Error::Ok,
51+
InvalidArgument,
52+
out);
53+
54+
if (self.numel() == 0) {
55+
return out;
56+
}
57+
58+
// Select the correct input dtype and copy the tensors.
59+
ET_SWITCH_REALHBBF16_TYPES(
60+
self.scalar_type(),
61+
ctx,
62+
"dim_order_ops::_clone_dim_order.out",
63+
CTYPE,
64+
[&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
65+
66+
return out;
67+
}
68+
69+
/**
 * Context-free overload of _clone_dim_order_out.
 *
 * Constructs a default KernelRuntimeContext and forwards all arguments to
 * the context-taking variant.
 */
Tensor& _clone_dim_order_out(
    const Tensor& self,
    bool non_blocking,
    OptionalArrayRef<int64_t> dim_order,
    Tensor& out) {
  executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext default_ctx{};
  return _clone_dim_order_out(default_ctx, self, non_blocking, dim_order, out);
}
77+
78+
} // namespace native
79+
} // namespace executor
80+
} // namespace torch

kernels/portable/cpu/op__to_dim_order_copy.cpp

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,29 +29,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
2929
template <typename T>
3030
using Optional = std::optional<T>;
3131

32-
namespace {
33-
34-
template <typename SELF_CTYPE, typename OUT_CTYPE>
35-
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
36-
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
37-
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();
38-
39-
// Here we make a slightly off-label use of
40-
// BroadcastIndexesRange. It always assumes it doesn't have to care
41-
// about different dim_order between input and output, but we can
42-
// just force it to respect strides (and thus dim_order) for its
43-
// inputs using support_noncontiguous_input_tensors=true, and then pretend
44-
// the output is just another input.
45-
for (const auto [unused_index, self_data_index, out_data_index] :
46-
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
47-
/*dummy output*/ self, self, out)) {
48-
(void)unused_index;
49-
out_data[out_data_index] =
50-
static_cast<OUT_CTYPE>(self_data[self_data_index]);
51-
}
52-
}
53-
} // namespace
54-
5532
// _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]?
5633
// dim_order=None, Tensor(a!) out) -> Tensor(a!)
5734
Tensor& _to_dim_order_copy_out(

kernels/portable/cpu/util/copy_ops_util.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#pragma once
1010
#include <c10/util/irange.h>
1111

12+
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
1213
#include <executorch/runtime/kernel/kernel_includes.h>
1314

1415
namespace torch {
@@ -77,6 +78,29 @@ void as_strided_copy(
7778
}
7879
}
7980

81+
/**
 * Copies and casts a tensor while preserving input dim_order.
 *
 * @tparam SELF_CTYPE Element type of the source tensor `self`.
 * @tparam OUT_CTYPE Element type of the destination tensor `out`; each
 *     source element is converted via static_cast.
 * @param self Source tensor; its strides (and thus dim_order) are respected
 *     during iteration.
 * @param out Destination tensor, written element-wise. Assumed to already
 *     have matching sizes (caller resizes before invoking this helper).
 */
template <typename SELF_CTYPE, typename OUT_CTYPE>
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
  auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
  auto out_data = out.mutable_data_ptr<OUT_CTYPE>();

  // Here we make a slightly off-label use of
  // BroadcastIndexesRange. It always assumes it doesn't have to care
  // about different dim_order between input and output, but we can
  // just force it to respect strides (and thus dim_order) for its
  // inputs using support_noncontiguous_input_tensors=true, and then pretend
  // the output is just another input.
  for (const auto [unused_index, self_data_index, out_data_index] :
       BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
           /*dummy output*/ self, self, out)) {
    // Only the per-tensor data indexes are needed; the logical output index
    // is unused because `self` stands in as the dummy output above.
    (void)unused_index;
    out_data[out_data_index] =
        static_cast<OUT_CTYPE>(self_data[self_data_index]);
  }
}
103+
80104
bool check_cat_args(
81105
executorch::aten::ArrayRef<Tensor> tensors,
82106
int64_t dim,

kernels/portable/cpu/util/targets.bzl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,9 @@ def define_common_targets():
147147
"copy_ops_util.h",
148148
],
149149
compiler_flags = ["-Wno-missing-prototypes"],
150+
exported_deps = [
151+
":broadcast_util",
152+
],
150153
deps = [
151154
"//executorch/runtime/kernel:kernel_includes",
152155
],
@@ -348,7 +351,6 @@ def define_common_targets():
348351
],
349352
)
350353

351-
352354
runtime.cxx_library(
353355
name = "arange_util{}".format(suffix),
354356
srcs = ["arange_util.cpp"],

kernels/portable/functions.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,3 +1009,8 @@
10091009
kernels:
10101010
- arg_meta: null
10111011
kernel_name: torch::executor::_to_dim_order_copy_out
1012+
1013+
- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
1014+
kernels:
1015+
- arg_meta: null
1016+
kernel_name: torch::executor::_clone_dim_order_out

kernels/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ add_custom_target(
108108
set(all_test_sources
109109
"BinaryLogicalOpTest.cpp"
110110
"op__to_dim_order_copy_test.cpp"
111+
"op__clone_dim_order_test.cpp"
111112
"op_abs_test.cpp"
112113
"op_acos_test.cpp"
113114
"op_acosh_test.cpp"

0 commit comments

Comments
 (0)