 # testing utilities
 from triton_kernels.testing import assert_close, compute_actual_scale
 # target-specific utilities
-from triton_kernels.target_info import is_hip, is_xpu, is_hip_cdna3, is_cuda, is_hip_cdna4
+from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_hip_cdna4

 # ---------------
 # initialize data
@@ -507,11 +507,9 @@ def round_x(x, idx):
 @pytest.mark.parametrize("m", [8, 16, 32, 64, 128])
 @pytest.mark.parametrize("n", [8, 16, 32, 64, 128])
 @pytest.mark.parametrize("k", [8, 16, 32, 64, 128])
-def test_small_batch_matmul(m, n, k):
+def test_small_batch_matmul(m, n, k, device):
     if is_hip():
         pytest.skip("Not fully tested on AMD")
-    if is_xpu():
-        pytest.xfail("Enable: https://github.com/intel/intel-xpu-backend-for-triton/issues/5092")

     if m * n * k > 16384:
         pytest.skip()
@@ -521,7 +519,7 @@ def test_small_batch_matmul(m, n, k):
     def _make_tensor(shape, dtype, trans):
         if trans:
             shape = (shape[0], shape[2], shape[1])
-        t = alloc_rand(shape, "cuda", dtype)
+        t = alloc_rand(shape, device, dtype)
         return t.transpose(1, 2) if trans else t

     for x_transpose, w_transpose, bias, dtype in itertools.product(
@@ -530,7 +528,7 @@ def _make_tensor(shape, dtype, trans):
         (False, True),
         (False, True),
         (torch.float16, torch.bfloat16, torch.float8_e5m2),
-        if (
+        if is_cuda() and (
             torch.cuda.get_device_capability()[0] < 10
             and dtype is torch.float8_e5m2
             and (not w_transpose)
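
Note: the reworked test requests a pytest fixture named `device` (in place of the hard-coded `"cuda"` string), which is expected to come from a `conftest.py`. The sketch below is only an illustration of what such a fixture could look like; the fixture name is taken from the diff, but the selection logic here is an assumption, not the repository's actual implementation.

```python
# Hypothetical conftest.py sketch providing a `device` fixture.
# The real fixture in the repository may be defined differently
# (for example, via a --device command-line option).
import pytest
import torch


@pytest.fixture
def device():
    # Prefer CUDA when available; fall back to Intel XPU, then CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"
```

With a fixture like this, `alloc_rand(shape, device, dtype)` allocates on whichever backend the fixture selects, and the added `is_cuda()` guard keeps the CUDA capability check from running on non-CUDA devices.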