Skip to content

Commit ae64a53

Browse files
vkuzo authored and pytorchmergebot committed
make the float4 dtype support equality comparisons (pytorch#169575)
Summary: Makes `torch.allclose(a, b, atol=0, rtol=0)` work for `a` and `b` with dtype `torch.float4_e2m1fn_x2`. This is useful for testing. Test Plan: ``` pytest test/quantization/core/experimental/test_floatx.py -s -k test_float4_e2m1fn_x2 ``` Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: pytorch#169575 Approved by: https://github.com/eqy, https://github.com/drisspg
1 parent 82e30f3 commit ae64a53

File tree

4 files changed

+29
-9
lines changed

4 files changed

+29
-9
lines changed

aten/src/ATen/native/cpu/BinaryOpsKernel.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -624,38 +624,38 @@ void ge_kernel(TensorIteratorBase& iter) {
624624
void eq_kernel(TensorIteratorBase& iter) {
625625
// See Note [special-case bool outputs]
626626
if (iter.dtype() == ScalarType::Bool) {
627-
_AT_DISPATCH_ALL_TYPES_AND_BOOL(iter.common_dtype(), "eq_cpu", [&]() {
627+
AT_DISPATCH_V2(iter.common_dtype(), "eq_cpu", AT_WRAP([&]() {
628628
cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a == b; });
629-
});
629+
}), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kFloat4_e2m1fn_x2);
630630
} else {
631-
_AT_DISPATCH_ALL_TYPES_NO_BOOL(iter.common_dtype(), "eq_cpu", [&]() {
631+
AT_DISPATCH_V2(iter.common_dtype(), "eq_cpu", AT_WRAP([&]() {
632632
cpu_kernel_vec(
633633
iter,
634634
[](scalar_t a, scalar_t b) -> scalar_t {
635635
return static_cast<scalar_t>(a == b);
636636
},
637637
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
638638
-> Vectorized<scalar_t> { return a.eq(b); });
639-
});
639+
}), kComplexHalf, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kFloat4_e2m1fn_x2);
640640
}
641641
}
642642

643643
void ne_kernel(TensorIteratorBase& iter) {
644644
// See Note [special-case bool outputs]
645645
if (iter.dtype() == ScalarType::Bool) {
646-
_AT_DISPATCH_ALL_TYPES_AND_BOOL(iter.common_dtype(), "ne_cpu", [&]() {
646+
AT_DISPATCH_V2(iter.common_dtype(), "ne_cpu", AT_WRAP([&]() {
647647
cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a != b; });
648-
});
648+
}), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kFloat4_e2m1fn_x2);
649649
} else {
650-
_AT_DISPATCH_ALL_TYPES_NO_BOOL(iter.common_dtype(), "ne_cpu", [&]() {
650+
AT_DISPATCH_V2(iter.common_dtype(), "ne_cpu", AT_WRAP([&]() {
651651
cpu_kernel_vec(
652652
iter,
653653
[](scalar_t a, scalar_t b) -> scalar_t {
654654
return static_cast<scalar_t>(a != b);
655655
},
656656
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b)
657657
-> Vectorized<scalar_t> { return a.ne(b); });
658-
});
658+
}), kComplexHalf, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kFloat4_e2m1fn_x2);
659659
}
660660
}
661661

aten/src/ATen/native/cuda/CompareEQKernel.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ C10_NOINLINE void compare_eq_ne_kernel(TensorIteratorBase &iter, EqOpType op) {
3333
AT_DISPATCH_V2(iter.common_dtype(), "compare_eq_ne_cuda", AT_WRAP([&]() {
3434
opmath_symmetric_gpu_kernel_with_scalars<scalar_t, bool>(
3535
iter, CompareEqFunctor<scalar_t>(op));
36-
}), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
36+
}), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kFloat4_e2m1fn_x2);
3737
}
3838

3939
void eq_kernel_cuda(TensorIteratorBase& iter) {

test/quantization/core/experimental/test_floatx.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Owner(s): ["oncall: quantization"]
22

3+
import copy
34
import struct
45
import unittest
56

@@ -407,6 +408,10 @@ def test_float4_e2m1fn_x2(self, device):
407408
# can view uint8 as float4_e2m1fn_x2
408409
x2.view(torch.float4_e2m1fn_x2)
409410

411+
# can do equality comparisons
412+
x3 = copy.deepcopy(x1)
413+
self.assertEqual(x1, x3, atol=0, rtol=0)
414+
410415
def test_f4_save_load(self, device):
411416
x1 = torch.randint(0, 10, (4, 4), device=device, dtype=torch.uint8).view(
412417
torch.float4_e2m1fn_x2

torch/headeronly/util/Float4_e2m1fn_x2.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,23 @@ struct alignas(1) Float4_e2m1fn_x2 {
2525
C10_HOST_DEVICE explicit Float4_e2m1fn_x2(uint8_t val) : val_(val) {}
2626
};
2727

28+
/// Comparison operators
29+
inline C10_HOST_DEVICE bool operator==(
30+
const Float4_e2m1fn_x2& a,
31+
const Float4_e2m1fn_x2& b) {
32+
return a.val_ == b.val_;
33+
}
34+
35+
inline C10_HOST_DEVICE bool operator!=(
36+
const Float4_e2m1fn_x2& a,
37+
const Float4_e2m1fn_x2& b) {
38+
return a.val_ != b.val_;
39+
}
40+
2841
} // namespace c10
2942

3043
HIDDEN_NAMESPACE_BEGIN(torch, headeronly)
3144
using c10::Float4_e2m1fn_x2;
45+
using c10::operator==;
46+
using c10::operator!=;
3247
HIDDEN_NAMESPACE_END(torch, headeronly)

0 commit comments

Comments (0)