@@ -1564,7 +1564,6 @@ def kernel(X, Y, Z):
 @pytest.mark.parametrize(
     "op, dtype_x_str, mode, sem",
     itertools.chain.from_iterable([[
-        ('add', 'bfloat16', mode, sem),
         ('add', 'float16', mode, sem),
         ('add', 'uint32', mode, sem),
         ('add', 'int32', mode, sem),
@@ -1590,8 +1589,8 @@ def kernel(X, Y, Z):
 def test_atomic_rmw(op, dtype_x_str, mode, sem, device):
     check_type_supported(dtype_x_str, device)
     if is_interpreter():
-        if dtype_x_str == 'float16' or dtype_x_str == 'bfloat16':
-            pytest.xfail("Only test atomic bfloat16/float16 ops on GPU")
+        if dtype_x_str == 'float16':
+            pytest.xfail("Only test atomic float16 ops on GPU")

     n_programs = 5

@@ -1606,14 +1605,12 @@ def kernel(X, Z):
     sem_arg = sem if sem is None else f'"{sem}"'
     kernel = patch_kernel(kernel, {'GENERATE_TEST_HERE': f'tl.atomic_{op}(Z, x, sem={sem_arg})'})
     numpy_op = {'add': np.sum, 'max': np.max, 'min': np.min}[op]
-    max_neutral = float('-inf') if dtype_x_str in float_dtypes_with_bfloat16 else np.iinfo(getattr(np, dtype_x_str)).min
-    min_neutral = float('inf') if dtype_x_str in float_dtypes_with_bfloat16 else np.iinfo(getattr(np, dtype_x_str)).max
+    max_neutral = float('-inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).min
+    min_neutral = float('inf') if dtype_x_str in float_dtypes else np.iinfo(getattr(np, dtype_x_str)).max
     neutral = {'add': 0, 'max': max_neutral, 'min': min_neutral}[op]

     # triton result
     rs = RandomState(17)
-    dst_type = 'bfloat16' if (dtype_x_str == 'bfloat16') else None
-    dtype_x_str = 'float32' if (dtype_x_str == 'bfloat16') else dtype_x_str
     x = np.array([2**i for i in range(n_programs)], dtype=getattr(np, dtype_x_str))
     if mode == 'all_neg':
         x = -np.abs(x)
@@ -1625,17 +1622,12 @@ def kernel(X, Z):
     if mode == 'max_pos':
         idx = rs.randint(n_programs, size=(1, )).item()
         x[idx] = np.max(np.abs(x)) + 1
-    x_tri = to_triton(x, device=device, dst_type=dst_type)
+    x_tri = to_triton(x, device=device)

-    z_tri = to_triton(np.array([neutral], dtype=getattr(np, dtype_x_str)), device=device, dst_type=dst_type)
+    z_tri = to_triton(np.array([neutral], dtype=getattr(np, dtype_x_str)), device=device)
     h = kernel[(n_programs, )](x_tri, z_tri)
     # torch result
-    if dst_type == 'bfloat16':
-        z_ref = numpy_op(x).astype(getattr(np, dtype_x_str))
-        # trunc mantissa for a fair comparison of accuracy
-        z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
-    else:
-        z_ref = numpy_op(x).astype(getattr(np, dtype_x_str))
+    z_ref = numpy_op(x).astype(getattr(np, dtype_x_str))
     # compare
     exact = op not in ['add']
     if exact:
@@ -1646,12 +1638,6 @@ def kernel(X, Z):
     if not is_cuda():
         return

-    # atom.add.bf16 is unsupported prior to Hopper so instead we generate an
-    # atom.cas add loop on Ampere and prior
-    if dst_type == 'bfloat16' and torch.cuda.get_device_capability()[0] < 9:
-        assert f"atom.{sem_str}.global.cas" in h.asm["ptx"]
-        return
-
     assert f"atom.global.gpu.{sem_str}" in h.asm["ptx"]


@@ -1676,7 +1662,7 @@ def kernel(X):
                          for shape in [(2, 2), (2, 8), (8, 2), (8, 8), (32, 32), (64, 64)]
                          for axis in [0, 1]
                          for num_ctas in num_ctas_list
-                         for dtype_x_str in ['bfloat16', 'float16', 'float32', 'uint64', 'int64', 'float64']
+                         for dtype_x_str in ['float16', 'float32', 'uint64', 'int64', 'float64']
                          for check_return_val in ([True, False] if is_hip() else [True])])
 def test_tensor_atomic_rmw(shape, axis, num_ctas, dtype_x_str, check_return_val, device):
     check_type_supported(dtype_x_str, device)
@@ -1690,14 +1676,14 @@ def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.const
         off1 = tl.arange(0, SHAPE1)
         x = tl.load(X + off0[:, None] * SHAPE1 + off1[None, :])

-        if DTYPE == tl.float16 or DTYPE == tl.bfloat16:
+        if DTYPE == tl.float16:
             # sum can have bad numerics when accumulating in float16.
             # if we're dealing with float16, do the sum in float32.
             x = x.to(tl.float32)

         z = tl.sum(x, axis=AXIS)

-        if DTYPE == tl.float16 or DTYPE == tl.bfloat16:
+        if DTYPE == tl.float16:
             z = z.to(DTYPE)

         if AXIS == 1:
@@ -1713,7 +1699,7 @@ def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.const
     x = numpy_random((shape0, shape1), dtype_str=dtype_x_str, rs=rs)
     z_shape = (shape0, ) if axis == 1 else (shape1, )
     z = numpy_random(z_shape, dtype_str=dtype_x_str, rs=rs)
-    old = np.zeros(z_shape, dtype=z.dtype)
+    old = np.zeros(z_shape, dtype=getattr(np, dtype_x_str))
     # reference results
     if x.dtype == np.float16:
         # do the sum in float32 to reduce numerical variation
@@ -1722,31 +1708,17 @@ def kernel(Z, X, OLD, AXIS: tl.constexpr, SHAPE0: tl.constexpr, SHAPE1: tl.const
         z_ref = z + np.sum(x, axis=axis, keepdims=False)
     old_ref = np.copy(z)
     # triton result
-    x_tri = to_triton(x, device=device, dst_type=dtype_x_str)
-    z_tri = to_triton(z, device=device, dst_type=dtype_x_str)
-    old_tri = to_triton(old, device=device, dst_type=dtype_x_str)
+    x_tri = to_triton(x, device=device)
+    z_tri = to_triton(z, device=device)
+    old_tri = to_triton(old, device=device)

     def torch_to_triton_dtype(t):
-        if t == torch.bfloat16:
-            return tl.bfloat16
         if t == torch.float16:
             return tl.float16
         return None

     kernel[(1, )](z_tri, x_tri, old_tri, axis, shape0, shape1, torch_to_triton_dtype(x_tri.dtype), check_return_val,
                   num_ctas=num_ctas)
-
-    if dtype_x_str == 'bfloat16':
-        # trunc mantissa for a fair comparison of accuracy
-        z_ref = (z_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
-        old_ref = (old_ref.view('uint32') & np.uint32(0xffff0000)).view('float32')
-        # mantissa trunc is not enough, bump up the relative tolerance as well
-        np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.5)
-        # check return vals, but use assert_allclose for bf16
-        if check_return_val:
-            np.testing.assert_allclose(old_ref, to_numpy(old_tri), rtol=0.5)
-        return
-
     np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=1e-4)
     if check_return_val:
         np.testing.assert_equal(old_ref, to_numpy(old_tri))
@@ -1756,9 +1728,8 @@ def torch_to_triton_dtype(t):
 @pytest.mark.parametrize("size, num_ctas, dtype_x_str", [(size, num_ctas, dtype_x_str)
                                                          for size in [2, 4, 8, 32, 64, 128]
                                                          for num_ctas in num_ctas_list
-                                                         for dtype_x_str in ['bfloat16', 'float16', 'float32']])
+                                                         for dtype_x_str in ['float16', 'float32']])
 def test_tensor_atomic_add_non_exclusive_offset(size, num_ctas, dtype_x_str, device):
-    check_type_supported(dtype_x_str, device)

     @triton.jit
     def kernel(X, val, NUM: tl.constexpr):
@@ -1768,9 +1739,8 @@ def kernel(X, val, NUM: tl.constexpr):
         tl.atomic_add(X + offset // 2, val)

     shape = (size // 2, size)
-    dtype = getattr(torch, dtype_x_str)
-    x = torch.zeros(shape, dtype=dtype, device=device)
-    val = torch.randn((size**2), dtype=dtype, device=device)
+    x = torch.zeros(shape, dtype=getattr(torch, dtype_x_str), device=device)
+    val = torch.randn((size**2), dtype=getattr(torch, dtype_x_str), device=device)
     kernel[(1, )](x, val, size, num_warps=1, num_ctas=num_ctas)
     ref = val[0::2] + val[1::2]
     torch.testing.assert_close(ref, x.reshape(math.prod(shape)))
@@ -1783,7 +1753,7 @@ def kernel(X, val, NUM: tl.constexpr):
                          for idx_order in ['increase', 'decrease', 'random_no_duplication', 'random']
                          for mask_step in range(1, 5)
                          for num_ctas in num_ctas_list
-                         for dtype_x_str in ['bfloat16', 'float16', 'float32']])
+                         for dtype_x_str in ['float16', 'float32']])
 def test_tensor_atomic_add_access_patterns(shape, idx_order, mask_step, num_ctas, dtype_x_str, device):
     check_type_supported(dtype_x_str, device)
     if is_interpreter():
@@ -1811,9 +1781,8 @@ def kernel(in_ptr, idx_ptr, out_ptr, shape0, shape1, mask_step, XBLOCK: tl.const
     if idx_order == 'random':
         idx = torch.randint(0, shape1, size=(shape0, shape1), device=device)

-    dtype = getattr(torch, dtype_x_str)
-    val = torch.randn((shape0, shape1), dtype=dtype, device=device)
-    dst = torch.randn((shape0, shape1), dtype=dtype, device=device)
+    val = torch.randn((shape0, shape1), dtype=getattr(torch, dtype_x_str), device=device)
+    dst = torch.randn((shape0, shape1), dtype=getattr(torch, dtype_x_str), device=device)

     dst_ref = dst.clone()

@@ -1825,11 +1794,6 @@ def kernel(in_ptr, idx_ptr, out_ptr, shape0, shape1, mask_step, XBLOCK: tl.const
             cnt += 1

     kernel[(1, )](val, idx, dst, shape0, shape1, mask_step, 64, num_ctas=num_ctas)
-
-    if dtype_x_str == 'bfloat16':
-        torch.testing.assert_close(dst_ref, dst, rtol=0.1, atol=0.1)
-        return
-
     np.testing.assert_allclose(to_numpy(dst_ref), to_numpy(dst), atol=1e-2)

