@@ -476,11 +476,9 @@ def round_x(x, idx):
 @pytest.mark.parametrize("m", [8, 16, 32, 64, 128])
 @pytest.mark.parametrize("n", [8, 16, 32, 64, 128])
 @pytest.mark.parametrize("k", [8, 16, 32, 64, 128])
-def test_small_batch_matmul(m, n, k):
+def test_small_batch_matmul(m, n, k, device):
     if is_hip():
         pytest.skip("Not fully tested on AMD")
-    if is_xpu():
-        pytest.xfail("Enable: https://github.com/intel/intel-xpu-backend-for-triton/issues/5092")
 
     if m * n * k > 16384:
         pytest.skip()
@@ -490,7 +488,7 @@ def test_small_batch_matmul(m, n, k):
     def _make_tensor(shape, dtype, trans):
         if trans:
             shape = (shape[0], shape[2], shape[1])
-        t = alloc_rand(shape, "cuda", dtype)
+        t = alloc_rand(shape, device, dtype)
         return t.transpose(1, 2) if trans else t
 
     for x_transpose, w_transpose, bias, dtype in itertools.product(
@@ -499,7 +497,7 @@ def _make_tensor(shape, dtype, trans):
         (False, True),
         (torch.float16, torch.bfloat16, torch.float8_e5m2),
     ):
-        if (
+        if device == "cuda" and (
             torch.cuda.get_device_capability()[0] < 10
             and dtype is torch.float8_e5m2
             and (not w_transpose)
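
The new `device` argument is injected by pytest as a fixture rather than a `parametrize` value, which is what lets the same test body run on CUDA and other backends. A minimal sketch of the conftest.py plumbing such a fixture typically relies on; the `--device` option name and its default here are assumptions, not taken from this diff:

```python
# conftest.py -- hypothetical sketch of a pytest "device" fixture.
# The fixture name matches the new test parameter; the "--device"
# command-line option and its "cuda" default are assumptions.
import pytest


def pytest_addoption(parser):
    # Register a command-line option selecting the backend under test.
    parser.addoption("--device", action="store", default="cuda",
                     help="device to run tests on, e.g. cuda or xpu")


@pytest.fixture
def device(request):
    # Injected into any test that declares a `device` parameter.
    return request.config.getoption("--device")
```

With plumbing like this, the test runs unmodified on other backends (e.g. `pytest --device xpu`), which is consistent with the rest of the diff: the XPU-specific `xfail`, the hard-coded `"cuda"` passed to `alloc_rand`, and the unguarded `torch.cuda.get_device_capability()` check are all replaced by device-agnostic code.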