
Commit 1f60bb7

Add clone_dim_order kernel (portable + ATen) and layout conversion tests
1 parent d1c87e4 commit 1f60bb7

8 files changed: +233 −0 lines changed

exir/tests/test_memory_format_ops_pass.py

Lines changed: 32 additions & 0 deletions
@@ -28,6 +28,8 @@
     MemoryFormatOpsPassTestUtils,
     MemoryFormatTestSet,
     PropagateToCopyChannalsLastModule,
+    SimpleCloneChannelsLastModule,
+    SimpleCloneContiguousModule,
     SimpleEmptyChannelLastModule,
     SimpleEmptyContiguoustModule,
     SimpleToCopyChannelsLastModule,
@@ -91,6 +93,36 @@ def test_op_empty_replacement_contiguous(self) -> None:
             ),
         )
 
+    def test_op_clone_replacement_contiguous(self) -> None:
+        model = SimpleCloneContiguousModule()
+        MemoryFormatOpsPassTestUtils.memory_format_test_runner(
+            self,
+            MemoryFormatTestSet(
+                module=model.eval(),
+                op=torch.ops.aten.clone.default,
+                sample_input=(
+                    torch.randn((3, 4, 5, 6)).to(memory_format=torch.channels_last),
+                ),
+                target_memory_format=torch.contiguous_format,
+                _load_for_executorch_from_buffer=_load_for_executorch_from_buffer,
+            ),
+        )
+
+    def test_op_clone_replacement_channels_last(self) -> None:
+        model = SimpleCloneChannelsLastModule()
+        MemoryFormatOpsPassTestUtils.memory_format_test_runner(
+            self,
+            MemoryFormatTestSet(
+                module=model.eval(),
+                op=torch.ops.aten.clone.default,
+                sample_input=(
+                    torch.randn((3, 4, 5, 6)).to(memory_format=torch.contiguous_format),
+                ),
+                target_memory_format=torch.channels_last,
+                _load_for_executorch_from_buffer=_load_for_executorch_from_buffer,
+            ),
+        )
+
     def test_op_dim_order_update(self) -> None:
         MemoryFormatOpsPassTestUtils.memory_format_test_runner(
             self,
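For reference, the two new test modules exercised above are defined with the shared memory-format test utilities rather than in this diff. They are assumed to be thin wrappers around Tensor.clone with an explicit memory_format, roughly like this hypothetical sketch:

import torch

# Hypothetical sketches of the modules referenced by the new tests; the real
# definitions live with the shared test utilities and may differ in detail.
class SimpleCloneContiguousModule(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Clone into contiguous (NCHW) layout regardless of the input layout.
        return x.clone(memory_format=torch.contiguous_format)


class SimpleCloneChannelsLastModule(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Clone into channels-last (NHWC) layout regardless of the input layout.
        return x.clone(memory_format=torch.channels_last)

Each test then feeds a tensor in the opposite layout and checks that the exported clone ends up in the target memory format.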
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/aten/cpu/util/copy_ops_util.h>
+#include <executorch/runtime/core/exec_aten/util/dim_order_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+using MemoryFormat = executorch::aten::MemoryFormat;
+
+template <typename T>
+using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
+
+template <typename T>
+using Optional = std::optional<T>;
+
+/**
+ * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
+ * dim_order=None, Tensor(a!) out) -> Tensor(a!)
+ *
+ * Clones with explicit dim_order, using the corresponding memory format.
+ */
+Tensor& _clone_dim_order_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& self,
+    bool non_blocking,
+    OptionalArrayRef<int64_t> dim_order,
+    Tensor& out) {
+  // Ensure output has the same layout as input or matches dim_order.
+  ET_KERNEL_CHECK(
+      ctx,
+      check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
+      InvalidArgument,
+      out);
+
+  Optional<MemoryFormat> memory_format = get_memory_format(dim_order);
+  at::clone_outf(self, memory_format, out);
+
+  return out;
+}
+
+Tensor& _clone_dim_order_out(
+    const Tensor& self,
+    bool non_blocking,
+    OptionalArrayRef<int64_t> dim_order,
+    Tensor& out) {
+  KernelRuntimeContext ctx{};
+  return _clone_dim_order_out(ctx, self, non_blocking, dim_order, out);
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
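Since the ATen-backed kernel above just maps dim_order onto a memory format and defers to at::clone_outf, its observable behavior matches an eager-mode clone with an explicit memory_format. A minimal illustrative sketch of that behavior in Python (not part of this commit):

import torch

# Cloning into a different memory format changes the physical layout (strides)
# but never the logical values.
x = torch.randn(2, 3, 4, 5)                     # contiguous NCHW, strides (60, 20, 5, 1)
y = x.clone(memory_format=torch.channels_last)  # NHWC layout,     strides (60, 1, 15, 3)

assert torch.equal(x, y)
assert y.is_contiguous(memory_format=torch.channels_last)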

kernels/aten/cpu/util/copy_ops_util.cpp

Lines changed: 22 additions & 0 deletions
@@ -15,6 +15,28 @@ namespace torch {
 namespace executor {
 
 using Tensor = executorch::aten::Tensor;
+using MemoryFormat = executorch::aten::MemoryFormat;
+
+/**
+ * Determines the memory format (Contiguous or ChannelsLast) corresponding to
+ * the dim_order. Provides support for bridging torch.memory_format with
+ * ExecuTorch's dim_order.
+ */
+std::optional<MemoryFormat> get_memory_format(
+    executorch::aten::OptionalArrayRef<int64_t> dim_order) {
+  if (!dim_order.has_value()) {
+    return executorch::aten::nullopt;
+  }
+  if (is_contiguous_dim_order(
+          dim_order.value().data(), dim_order.value().size())) {
+    return MemoryFormat::Contiguous;
+  } else if (is_channels_last_dim_order(
+                 dim_order.value().data(), dim_order.value().size())) {
+    return MemoryFormat::ChannelsLast;
+  } else {
+    ET_ASSERT_UNREACHABLE();
+  }
+}
 
 bool check__to_dim_order_copy_args(
     const Tensor& input,
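get_memory_format only has to distinguish the two layouts ATen can express as a memory format; any other permutation is treated as unreachable. The dim_order values it inspects correspond to memory formats as in this illustrative Python snippet (Tensor.dim_order() is assumed to be available in the installed PyTorch build):

import torch

nchw = torch.empty(2, 3, 4, 5)                      # contiguous
nhwc = nchw.to(memory_format=torch.channels_last)   # channels-last

print(nchw.dim_order())  # (0, 1, 2, 3) -> MemoryFormat::Contiguous
print(nhwc.dim_order())  # (0, 2, 3, 1) -> MemoryFormat::ChannelsLast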

kernels/aten/cpu/util/copy_ops_util.h

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,9 @@
 namespace torch {
 namespace executor {
 
+std::optional<MemoryFormat> get_memory_format(
+    executorch::aten::OptionalArrayRef<int64_t> dim_order);
+
 bool check__to_dim_order_copy_args(
     const Tensor& input,
     bool non_blocking,

kernels/aten/edge_dialect_aten_op.yaml

Lines changed: 5 additions & 0 deletions
@@ -11,3 +11,8 @@
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::_to_dim_order_copy_out
+
+- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::_clone_dim_order_out
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+
+template <typename T>
+using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
+
+/**
+ * _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
+ * dim_order=None, Tensor(a!) out) -> Tensor(a!)
+ *
+ * Clones via element-wise copy while preserving dim_order.
+ */
+Tensor& _clone_dim_order_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& self,
+    bool non_blocking,
+    OptionalArrayRef<int64_t> dim_order,
+    Tensor& out) {
+  (void)ctx;
+
+  // Ensure input and output dtype match.
+  ET_KERNEL_CHECK(
+      ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out);
+
+  // Ensure output has the same layout as input or matches dim_order.
+  ET_KERNEL_CHECK(
+      ctx,
+      check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
+      InvalidArgument,
+      out);
+
+  // Ensure input and output shapes match, resizing if necessary.
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, self.sizes()) == torch::executor::Error::Ok,
+      InvalidArgument,
+      out);
+
+  if (self.numel() == 0) {
+    return out;
+  }
+
+  // Select the correct input dtype and copy the tensors.
+  ET_SWITCH_REALHBBF16_TYPES(
+      self.scalar_type(),
+      ctx,
+      "dim_order_ops::_clone_dim_order.out",
+      CTYPE,
+      [&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
+
+  return out;
+}
+
+Tensor& _clone_dim_order_out(
+    const Tensor& self,
+    bool non_blocking,
+    OptionalArrayRef<int64_t> dim_order,
+    Tensor& out) {
+  executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{};
+  return _clone_dim_order_out(context, self, non_blocking, dim_order, out);
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch

kernels/portable/cpu/util/copy_ops_util.h

Lines changed: 24 additions & 0 deletions
@@ -9,6 +9,7 @@
 #pragma once
 #include <c10/util/irange.h>
 
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -77,6 +78,29 @@ void as_strided_copy(
   }
 }
 
+/**
+ * Copies and casts a tensor while preserving input dim_order.
+ */
+template <typename SELF_CTYPE, typename OUT_CTYPE>
+void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
+  auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
+  auto out_data = out.mutable_data_ptr<OUT_CTYPE>();
+
+  // Here we make a slightly off-label use of
+  // BroadcastIndexesRange. It always assumes it doesn't have to care
+  // about different dim_order between input and output, but we can
+  // just force it to respect strides (and thus dim_order) for its
+  // inputs using support_noncontiguous_input_tensors=true, and then pretend
+  // the output is just another input.
+  for (const auto [unused_index, self_data_index, out_data_index] :
+       BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
+           /*dummy output*/ self, self, out)) {
+    (void)unused_index;
+    out_data[out_data_index] =
+        static_cast<OUT_CTYPE>(self_data[self_data_index]);
+  }
+}
+
 bool check_cat_args(
     executorch::aten::ArrayRef<Tensor> tensors,
     int64_t dim,
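The BroadcastIndexesRange trick in _to_dim_order_copy_impl boils down to visiting every logical index once and translating it into a strided offset for each tensor separately, so the input and output may use different dim_orders. A standalone Python sketch of that index arithmetic (illustrative only; the helper names here are made up and are not ExecuTorch APIs):

import itertools

def strides_for(sizes, dim_order):
    # Innermost dimension in `dim_order` gets stride 1; grow outward from there.
    strides = [0] * len(sizes)
    acc = 1
    for d in reversed(dim_order):
        strides[d] = acc
        acc *= sizes[d]
    return strides

def copy_preserving_dim_order(src, dst, sizes, src_dim_order, dst_dim_order):
    # Visit each logical index once; map it to a strided offset per tensor.
    src_strides = strides_for(sizes, src_dim_order)
    dst_strides = strides_for(sizes, dst_dim_order)
    for idx in itertools.product(*(range(s) for s in sizes)):
        src_off = sum(i * s for i, s in zip(idx, src_strides))
        dst_off = sum(i * s for i, s in zip(idx, dst_strides))
        dst[dst_off] = src[src_off]

# Copy a 2x3x4x5 contiguous (NCHW) buffer into a channels-last (NHWC) buffer.
sizes = (2, 3, 4, 5)
src = list(range(2 * 3 * 4 * 5))
dst = [0] * len(src)
copy_preserving_dim_order(src, dst, sizes, (0, 1, 2, 3), (0, 2, 3, 1))

After the copy, dst holds the same logical values as src, just laid out channels-last.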

kernels/portable/functions.yaml

Lines changed: 5 additions & 0 deletions
@@ -1009,3 +1009,8 @@
   kernels:
     - arg_meta: null
      kernel_name: torch::executor::_to_dim_order_copy_out
+
+- func: dim_order_ops::_clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::_clone_dim_order_out
