@@ -1353,8 +1353,6 @@ def test_atomic_rmw(op, dtype_x_str, mode, sem, device):
             pytest.xfail("Only test atomic bfloat16/float16 ops on GPU")
     if "uint" in dtype_x_str and mode in ["min_neg", "all_neg"]:
         pytest.xfail("uint cannot be negative")
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")
 
     n_programs = 5
 
@@ -1443,8 +1441,6 @@ def kernel(X):
                           for check_return_val in ([True, False] if is_hip() else [True])])
 def test_tensor_atomic_rmw(shape, axis, num_ctas, dtype_x_str, check_return_val, device):
     check_type_supported(dtype_x_str, device)
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")
     shape0, shape1 = shape
     # triton kernel
 
@@ -1524,8 +1520,6 @@ def torch_to_triton_dtype(t):
                                                          for dtype_x_str in ['bfloat16', 'float16', 'float32']])
 def test_tensor_atomic_add_non_exclusive_offset(size, num_ctas, dtype_x_str, device):
     check_type_supported(dtype_x_str, device)
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")
 
     @triton.jit
     def kernel(X, val, NUM: tl.constexpr):
@@ -1550,8 +1544,6 @@ def kernel(X, val, NUM: tl.constexpr):
                                                          for dtype_x_str in ['bfloat16', 'float16', 'float32']])
 def test_tensor_atomic_add_shift_1(size, num_ctas, dtype_x_str, device):
     check_type_supported(dtype_x_str, device)
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")
 
     @triton.jit
     def kernel(X, val, NUM: tl.constexpr):
@@ -1588,9 +1580,6 @@ def test_tensor_atomic_add_access_patterns(shape, idx_order, mask_step, num_ctas
     if is_interpreter():
         pytest.xfail("not supported in the interpreter")
 
-    if is_xpu() and dtype_x_str == 'bfloat16':
-        pytest.skip("bfloat16 not yet supported for xpu")
-
     @triton.jit
     def kernel(in_ptr, idx_ptr, out_ptr, shape0, shape1, mask_step, XBLOCK: tl.constexpr):
         xoffset = tl.program_id(0) * XBLOCK