
Commit bc8da63

janeyx99 authored and pytorchmergebot committed
Move MemoryFormat/Layout to headeronly (pytorch#168034)
~This PR does change the semantics of the >> operator by using STD_TORCH_CHECK to throw the error instead of TORCH_CHECK. Jane (who is writing this message) thinks it is okay because it is the error case when an invalid MemoryFormat or Layout is getting passed into >>, so the UX benefits of TORCH_CHECK over STD_TORCH_CHECK there are not significant enough to warrant making a new copy of Layout and MemoryFormat's >> APIs.~

Never mind! We shouldn't change TORCH_CHECK to STD_TORCH_CHECK for core usage ever, cuz the traceback info and c10::Error is very much desired!! So the solution is to not migrate the >>s. I pushed new commits to the stack to remove the >> code, but for reference, pytorch@8a30179 has all the code that I ended up deleting.

Pull Request resolved: pytorch#168034
Approved by: https://github.com/janeyx99
ghstack dependencies: pytorch#168025, pytorch#167802, pytorch#167803, pytorch#167804, pytorch#167962
Co-authored-by: Jane Xu <[email protected]>
1 parent f890837 commit bc8da63
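
The description above turns on the difference between TORCH_CHECK and STD_TORCH_CHECK in the enums' stream-printing error paths. As a reader aid, here is a minimal sketch of the pattern that stays behind in c10 (print_memory_format is a hypothetical helper name, not the exact c10 source): the invalid-enum branch keeps TORCH_CHECK so it still raises a c10::Error with traceback info, which the header-only STD_TORCH_CHECK does not provide.

#include <ostream>

#include <c10/core/MemoryFormat.h>
#include <c10/util/Exception.h>

// Hypothetical sketch of the kind of streaming helper kept in c10 rather than
// migrated to torch/headeronly, so its error path can keep using TORCH_CHECK.
inline std::ostream& print_memory_format(
    std::ostream& stream,
    c10::MemoryFormat memory_format) {
  switch (memory_format) {
    case c10::MemoryFormat::Contiguous:
      return stream << "Contiguous";
    case c10::MemoryFormat::Preserve:
      return stream << "Preserve";
    case c10::MemoryFormat::ChannelsLast:
      return stream << "ChannelsLast";
    case c10::MemoryFormat::ChannelsLast3d:
      return stream << "ChannelsLast3d";
    default:
      break;
  }
  // Invalid enum value: TORCH_CHECK raises c10::Error with traceback info,
  // which is why these printers were not moved to headeronly.
  TORCH_CHECK(false, "Unknown memory format");
  return stream;  // unreachable; keeps compilers quiet about the return path
}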

13 files changed: +366 −81 lines changed


c10/core/Layout.h

Lines changed: 1 addition & 22 deletions
@@ -3,30 +3,9 @@
 #include <c10/core/Backend.h>
 #include <c10/util/Exception.h>
 
-#include <cstdint>
-#include <ostream>
+#include <torch/headeronly/core/Layout.h>
 
 namespace c10 {
-enum class Layout : int8_t {
-  Strided,
-  Sparse,
-  SparseCsr,
-  Mkldnn,
-  SparseCsc,
-  SparseBsr,
-  SparseBsc,
-  Jagged,
-  NumOptions
-};
-
-constexpr auto kStrided = Layout::Strided;
-constexpr auto kSparse = Layout::Sparse;
-constexpr auto kSparseCsr = Layout::SparseCsr;
-constexpr auto kMkldnn = Layout::Mkldnn;
-constexpr auto kSparseCsc = Layout::SparseCsc;
-constexpr auto kSparseBsr = Layout::SparseBsr;
-constexpr auto kSparseBsc = Layout::SparseBsc;
-constexpr auto kJagged = Layout::Jagged;
 
 inline Layout layout_from_backend(Backend backend) {
   C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")

c10/core/MemoryFormat.h

Lines changed: 2 additions & 30 deletions
@@ -3,46 +3,18 @@
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Exception.h>
 
+#include <torch/headeronly/core/MemoryFormat.h>
+
 #include <cstdint>
-#include <ostream>
 #include <vector>
 
-// Memory format is not the property of a Tensor. It is the way to tell an
-// operator how the result should be organized in memory and nothing more. That
-// means memory format should never be used as return value for any tensor state
-// interrogation functions (internally and externally).
-//
-// Possible options are:
-//  Preserve:
-//    If any of the input tensors is in channels_last format, operator output
-//    should be in channels_last format
-//
-//  Contiguous:
-//    Regardless of input tensors format, the output should be contiguous
-//    Tensor.
-//
-//  ChannelsLast:
-//    Regardless of input tensors format, the output should be in channels_last
-//    format.
-
 namespace c10 {
-enum class MemoryFormat : int8_t {
-  Contiguous,
-  Preserve,
-  ChannelsLast,
-  ChannelsLast3d,
-  NumOptions
-};
 
 // If you are seeing this, it means that this call site was not checked if
 // the memory format could be preserved, and it was switched to old default
 // behaviour of contiguous
 #define LEGACY_CONTIGUOUS_MEMORY_FORMAT c10::get_contiguous_memory_format()
 
-inline MemoryFormat get_contiguous_memory_format() {
-  return MemoryFormat::Contiguous;
-}
-
 inline std::ostream& operator<<(
     std::ostream& stream,
     at::MemoryFormat memory_format) {
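
With the enum definitions now in torch/headeronly, code that has to stay header-only (for example stable-ABI extensions) can use Layout and MemoryFormat without pulling in the rest of c10. A minimal sketch of that usage follows, mirroring what the new aoti_abi_check tests below exercise; wants_dense_contiguous is a hypothetical helper, not part of this PR.

#include <torch/headeronly/core/Layout.h>
#include <torch/headeronly/core/MemoryFormat.h>

// Hypothetical helper: the enums and their convenience constants/functions
// now come from the header-only headers, so this compiles without c10.
inline bool wants_dense_contiguous(
    torch::headeronly::Layout layout,
    torch::headeronly::MemoryFormat memory_format) {
  return layout == torch::headeronly::kStrided &&
      memory_format == torch::headeronly::get_contiguous_memory_format();
}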

test/cpp/aoti_abi_check/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -16,8 +16,10 @@ set(AOTI_ABI_CHECK_TEST_SRCS
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
+  ${AOTI_ABI_CHECK_TEST_ROOT}/test_layout.cpp
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp
+  ${AOTI_ABI_CHECK_TEST_ROOT}/test_memoryformat.cpp
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_metaprogramming.cpp
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp
   ${AOTI_ABI_CHECK_TEST_ROOT}/test_scalartype.cpp

test/cpp/aoti_abi_check/test_layout.cpp (new file)

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+#include <gtest/gtest.h>
+
+#include <torch/headeronly/core/Layout.h>
+
+TEST(TestLayout, TestLayout) {
+  using torch::headeronly::Layout;
+  constexpr Layout expected_layouts[] = {
+      torch::headeronly::kStrided,
+      torch::headeronly::kSparse,
+      torch::headeronly::kSparseCsr,
+      torch::headeronly::kMkldnn,
+      torch::headeronly::kSparseCsc,
+      torch::headeronly::kSparseBsr,
+      torch::headeronly::kSparseBsc,
+      torch::headeronly::kJagged,
+  };
+  for (int8_t i = 0; i < static_cast<int8_t>(Layout::NumOptions); i++) {
+    EXPECT_EQ(static_cast<Layout>(i), expected_layouts[i]);
+  }
+}

test/cpp/aoti_abi_check/test_memoryformat.cpp (new file)

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+#include <gtest/gtest.h>
+
+#include <torch/headeronly/core/MemoryFormat.h>
+
+TEST(TestMemoryFormat, TestMemoryFormat) {
+  using torch::headeronly::MemoryFormat;
+  constexpr MemoryFormat expected_memory_formats[] = {
+      MemoryFormat::Contiguous,
+      MemoryFormat::Preserve,
+      MemoryFormat::ChannelsLast,
+      MemoryFormat::ChannelsLast3d,
+  };
+  for (int8_t i = 0; i < static_cast<int8_t>(MemoryFormat::NumOptions); i++) {
+    EXPECT_EQ(static_cast<MemoryFormat>(i), expected_memory_formats[i]);
+  }
+}
+
+TEST(TestMemoryFormat, get_contiguous_memory_format) {
+  using torch::headeronly::get_contiguous_memory_format;
+  using torch::headeronly::MemoryFormat;
+
+  EXPECT_EQ(get_contiguous_memory_format(), MemoryFormat::Contiguous);
+}

test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp

Lines changed: 5 additions & 3 deletions
@@ -10,14 +10,16 @@ using torch::stable::Tensor;
 Tensor my_empty(
     torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
     std::optional<torch::headeronly::ScalarType> dtype,
+    std::optional<torch::headeronly::Layout> layout,
     std::optional<torch::stable::Device> device,
-    std::optional<bool> pin_memory) {
-  return empty(size, dtype, device, pin_memory);
+    std::optional<bool> pin_memory,
+    std::optional<torch::headeronly::MemoryFormat> memory_format) {
+  return empty(size, dtype, layout, device, pin_memory, memory_format);
 }
 
 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
   m.def(
-      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
+      "my_empty(int[] size, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor");
 }
 
 STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {

test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/ops.py

Lines changed: 7 additions & 3 deletions
@@ -156,20 +156,24 @@ def test_get_num_threads() -> int:
     return torch.ops.libtorch_agnostic_2_10.test_get_num_threads.default()
 
 
-def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
+def my_empty(
+    size, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
+) -> Tensor:
     """
-    Creates an empty tensor with the specified size, dtype, device, and pin_memory.
+    Creates an empty tensor with the specified size, dtype, layout, device, pin_memory, and memory_format.
 
     Args:
         size: list[int] - size of the tensor to create
         dtype: ScalarType or None - data type of the tensor
+        layout: Layout or None - layout of the tensor
        device: Device or None - device on which to create the tensor
        pin_memory: bool or None - whether to use pinned memory
+        memory_format: MemoryFormat or None - memory format of the tensor
 
    Returns: Tensor - an uninitialized tensor with the specified properties
    """
    return torch.ops.libtorch_agnostic_2_10.my_empty.default(
-        size, dtype, device, pin_memory
+        size, dtype, layout, device, pin_memory, memory_format
    )

test/cpp_extensions/test_libtorch_agnostic.py

Lines changed: 65 additions & 15 deletions
@@ -14,6 +14,7 @@
 from torch.testing._internal.common_utils import (
     install_cpp_extension,
     IS_WINDOWS,
+    parametrize,
     run_tests,
     skipIfTorchDynamo,
     TestCase,
@@ -618,43 +619,92 @@ def test_get_num_threads(self, device):
         self.assertEqual(num_threads, expected_num_threads)
 
     @skipIfTorchVersionLessThan(2, 10)
-    def test_my_empty(self, device):
+    @parametrize("layout", [None, torch.strided, torch.sparse_coo])
+    @parametrize(
+        "memory_format", [None, torch.channels_last, torch.contiguous_format]
+    )
+    def test_my_empty(self, device, layout, memory_format):
         import libtorch_agnostic_2_10 as libtorch_agnostic
 
         deterministic = torch.are_deterministic_algorithms_enabled()
         try:
             # set use_deterministic_algorithms to fill uninitialized memory
             torch.use_deterministic_algorithms(True)
 
-            size = [2, 3]
-            result = libtorch_agnostic.ops.my_empty(size, None, None, None)
-            expected = torch.empty(size)
-            self.assertEqual(result, expected, exact_device=True)
+            # Use 4D size for channels_last, 2D otherwise
+            size = [2, 3, 4, 5] if memory_format == torch.channels_last else [2, 3]
+
+            # sparse_coo layout doesn't support memory_format parameter
+            if layout == torch.sparse_coo and memory_format is not None:
+                return
+
+            # Test default parameters
+            result = libtorch_agnostic.ops.my_empty(
+                size, None, layout, None, None, memory_format
+            )
+            expected = torch.empty(size, layout=layout, memory_format=memory_format)
+            self.assertEqual(result, expected, exact_device=True, exact_layout=True)
 
+            # Test with dtype
             result_float = libtorch_agnostic.ops.my_empty(
-                size, torch.float32, None, None
+                size, torch.float32, layout, None, None, memory_format
+            )
+            expected_float = torch.empty(
+                size,
+                dtype=torch.float32,
+                layout=layout,
+                memory_format=memory_format,
+            )
+            self.assertEqual(
+                result_float, expected_float, exact_device=True, exact_layout=True
             )
-            expected_float = torch.empty(size, dtype=torch.float32)
-            self.assertEqual(result_float, expected_float, exact_device=True)
 
+            # Test with dtype and device
             result_with_device = libtorch_agnostic.ops.my_empty(
-                size, torch.float64, device, None
+                size, torch.float64, layout, device, None, memory_format
             )
             expected_with_device = torch.empty(
-                size, dtype=torch.float64, device=device
+                size,
+                dtype=torch.float64,
+                layout=layout,
+                device=device,
+                memory_format=memory_format,
             )
             self.assertEqual(
-                result_with_device, expected_with_device, exact_device=True
+                result_with_device,
+                expected_with_device,
+                exact_device=True,
+                exact_layout=True,
             )
 
-            if device == "cuda":
+            # Verify layout if specified
+            if layout is not None:
+                self.assertEqual(result_with_device.layout, layout)
+
+            # Verify memory format if specified
+            if memory_format == torch.channels_last:
+                self.assertTrue(
+                    result_with_device.is_contiguous(
+                        memory_format=torch.channels_last
+                    )
+                )
+            elif memory_format == torch.contiguous_format:
+                self.assertTrue(result_with_device.is_contiguous())
+
+            # Test pin_memory on CUDA (only once, not for every parameter combination)
+            if device == "cuda" and layout is None and memory_format is None:
                 result_pinned = libtorch_agnostic.ops.my_empty(
-                    size, torch.float32, "cpu", True
+                    [2, 3], torch.float32, None, "cpu", True, None
                 )
                 expected_pinned = torch.empty(
-                    size, dtype=torch.float32, device="cpu", pin_memory=True
+                    [2, 3], dtype=torch.float32, device="cpu", pin_memory=True
+                )
+                self.assertEqual(
+                    result_pinned,
+                    expected_pinned,
+                    exact_device=True,
+                    exact_layout=True,
                 )
-                self.assertEqual(result_pinned, expected_pinned)
                 self.assertTrue(result_pinned.is_pinned())
         finally:
             torch.use_deterministic_algorithms(deterministic)

torch/csrc/stable/ops.h

Lines changed: 10 additions & 8 deletions
@@ -326,24 +326,26 @@ inline uint32_t get_num_threads() {
   return num_threads;
 }
 
-// We expect this to be the stable version of the empty op that takes in
-// device and dtype parameters. The empty op creates a tensor with uninitialized
-// values of the specified size, dtype, and device.
-// This function is only available in 2.10 because it uses the stableivalue
-// conversion for HeaderOnlyArrayRef<T>, which is only available in 2.10.
+// We expect this to be the stable version of the empty.memory_format op that
+// takes in device and dtype parameters. This function is only available in 2.10
+// because it uses the stableivalue conversion for HeaderOnlyArrayRef<T>, which
+// is only available in 2.10.
 inline torch::stable::Tensor empty(
     torch::headeronly::IntHeaderOnlyArrayRef size,
     std::optional<torch::headeronly::ScalarType> dtype = std::nullopt,
+    std::optional<torch::headeronly::Layout> layout = std::nullopt,
    std::optional<torch::stable::Device> device = std::nullopt,
-    std::optional<bool> pin_memory = std::nullopt) {
+    std::optional<bool> pin_memory = std::nullopt,
+    std::optional<torch::headeronly::MemoryFormat> memory_format =
+        std::nullopt) {
   const auto num_args = 6;
   std::array<StableIValue, num_args> stack{
       torch::stable::detail::from(size),
       torch::stable::detail::from(dtype),
-      torch::stable::detail::from(std::nullopt),
+      torch::stable::detail::from(layout),
      torch::stable::detail::from(device),
      torch::stable::detail::from(pin_memory),
-      torch::stable::detail::from(std::nullopt)};
+      torch::stable::detail::from(memory_format)};
   TORCH_ERROR_CODE_CHECK(torch_call_dispatcher(
       "aten::empty", "memory_format", stack.data(), TORCH_ABI_VERSION));
   return torch::stable::detail::to<torch::stable::Tensor>(stack[0]);