add copy_ support for float4 dtype (pytorch#169595)

vkuzo · pytorchmergebot · commit 3cf2f19f0a61 · 2025-12-05T11:19:55.000Z
Summary: Enables `copy_` support for the `torch.float4_e2m1fn_x2` dtype. This is useful when slicing a tensor along dim 1 and then calling `contiguous()`, which can happen in vLLM and therefore should be supported. Test Plan: ``` pytest test/quantization/core/experimental/test_floatx.py -s -k test_float4_e2m1fn_x2 ``` Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: pytorch#169595 Approved by: https://github.com/drisspg ghstack dependencies: pytorch#169575
diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp
@@ -235,6 +235,8 @@ void direct_copy_kernel(TensorIteratorBase &iter) {
     });
   } else if (dtype == ScalarType::ComplexHalf) {
     cpu_kernel(iter, [=](c10::complex<at::Half> a) -> c10::complex<at::Half> { return a; });
+  } else if (dtype == ScalarType::Float4_e2m1fn_x2) {
+    cpu_kernel(iter, [=](Float4_e2m1fn_x2 a) -> Float4_e2m1fn_x2 { return a; });
   } else if (isBitsType(dtype)) {
     AT_DISPATCH_BIT_TYPES(dtype, "copy_kernel", [&] {
       cpu_kernel(
diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu
@@ -234,6 +234,10 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) {
     AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] {
       gpu_kernel_nocast(iter, [] GPU_LAMBDA(scalar_t x) { return x; });
     });
+  } else if (dtype == ScalarType::Float4_e2m1fn_x2) {
+    TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting "
+      "Float4_e2m1fn_x2 to different types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype);
+    gpu_kernel_nocast(iter, [] GPU_LAMBDA(Float4_e2m1fn_x2 x) { return x; });
   } else {
     AT_DISPATCH_V2(
         dtype, "copy_", AT_WRAP([&] {
diff --git a/test/quantization/core/experimental/test_floatx.py b/test/quantization/core/experimental/test_floatx.py
@@ -412,6 +412,9 @@ def test_float4_e2m1fn_x2(self, device):
         x3 = copy.deepcopy(x1)
         self.assertEqual(x1, x3, atol=0, rtol=0)
 
+        # can call contiguous on a dim1 slice (calls `copy_` under the hood)
+        x1[:, 0:2048].contiguous()
+
     def test_f4_save_load(self, device):
         x1 = torch.randint(0, 10, (4, 4), device=device, dtype=torch.uint8).view(
             torch.float4_e2m1fn_x2