
Commit 632d234

Implement make_opt_flags function for XPU and enable tests in test_matmul.py (#5051)
**Note for reviewers:** I've kept only the most basic heuristics. If you have improvements in mind that are already known to work better and don't need testing, we can apply them directly in this pull request. If you have improvements that would need testing, please mention them as well, but I'd prefer to land this basic version as quickly as possible and tune it in separate PRs.

Pass rate: 84.11% -> 89.04%

---------

Signed-off-by: Anatoly Myachev <[email protected]>
1 parent a0e532a commit 632d234

9 files changed: +168 −28 lines


python/triton_kernels/tests/test_matmul.py

Lines changed: 23 additions & 18 deletions
@@ -20,7 +20,7 @@
 # testing utilities
 from triton_kernels.testing import assert_close, compute_actual_scale
 # target-specific utilities
-from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_hip_cdna4
+from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_xpu, is_hip_cdna4

 # ---------------
 # initialize data
@@ -73,7 +73,7 @@ def init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act, n_expt_sh
     if mode == 'batched' or (not has_y_gammas) or (has_y_gammas and (gindx is not None) and act_dtype.itemsize >= 2):
         gs0 = None
         gs1 = None
-    if "float8" in str(weight_dtype) and torch.cuda.get_device_capability()[0] < 10:
+    if is_cuda() and "float8" in str(weight_dtype) and torch.cuda.get_device_capability()[0] < 10:
         w = w.transpose(-1, -2).contiguous().transpose(-1, -2)
     return x, w, bias, gs0, gs1

@@ -294,6 +294,10 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
         if split_k > 1:
             pytest.skip("splitK hasn't been fully tested on AMD GPU.")

+    elif is_xpu():
+        if split_k > 1:
+            pytest.skip("FIXME: https://github.com/intel/intel-xpu-backend-for-triton/issues/5074")
+
     if "float8_e4m3fnuz" in (weight_dtype_str, act_dtype_str) and not is_hip_cdna3():
         pytest.skip("float8_e4m3fnuz only tested on AMD CDNA3 Platform")

@@ -308,20 +312,21 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
             pytest.skip("Non-scale swizzling not supported on CDNA4 yet")
         if n % 32 != 0 or k % (32 * 8) != 0:
             pytest.skip(f"Shape {m}x{n}x{k} is not supported for scale swizzling on AMD GPU")
-    if torch.cuda.get_device_capability()[0] < 9:
-        pytest.skip("NYI. Ampere swizzling.")
-    if torch.cuda.get_device_capability()[0] < 10:
-        if "mxfloat4" not in weight_dtype_str:
-            pytest.skip("NYI. Hopper swizzling just implemented for mxfp4.")
-        if k % 64 != 0 or n % 64 != 0:
-            # Automatic padding not implemented for Hopper swizzle
-            pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")
+    if is_cuda():
+        if torch.cuda.get_device_capability()[0] < 9:
+            pytest.skip("NYI. Ampere swizzling.")
+        if torch.cuda.get_device_capability()[0] < 10:
+            if "mxfloat4" not in weight_dtype_str:
+                pytest.skip("NYI. Hopper swizzling just implemented for mxfp4.")
+            if k % 64 != 0 or n % 64 != 0:
+                # Automatic padding not implemented for Hopper swizzle
+                pytest.skip("Hopper swizzling acts on a 64x64 tile (4x1 mma tiles).")

     # launch metadata for batched / mx types may not work yet.
     torch.manual_seed(0)

     block_k = None
-    if is_persistent and weight_dtype_str.startswith("mx") and torch.cuda.get_device_capability()[0] < 10:
+    if is_cuda() and is_persistent and weight_dtype_str.startswith("mx") and torch.cuda.get_device_capability()[0] < 10:
         # Override block_k for testing correctness. The default is temporarily 128 for
         # performance reasons which doesn't work with persistent matmul.
         # TODO: revisit when Triton is better for H100 + MXFP4
@@ -436,7 +441,7 @@ def round_x(x, idx):

     round_y = lambda y: (y / y_scale).to(act_dtype).to(torch.float32) * y_scale if sep_scatter else y
     ref_y = matmul_ogs_torch(x_ref, w_ref, bias_ref, #
-                             rdata, gindx, sindx, round_x=round_x, round_y=round_y, gammas=gs1_ref)
+                             rdata, gindx, sindx, round_x=round_x, round_y=round_y, gammas=gs1_ref, device=device)
     scale = lambda val, scal: val if scal is None else val / scal
     if n_expt_shards > 1:
         if do_scatter:
@@ -549,21 +554,21 @@ def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter,
     (4096, 4096, 0),
 ])
 @pytest.mark.parametrize("view_x_as_zero_cols", [False, True])
-def test_zero_reduction_dim(m, n, k, view_x_as_zero_cols):
+def test_zero_reduction_dim(m, n, k, view_x_as_zero_cols, device):
     torch.manual_seed(0)

     if view_x_as_zero_cols:
-        x = torch.randn(m, m, device="cuda", dtype=torch.bfloat16)
+        x = torch.randn(m, m, device=device, dtype=torch.bfloat16)
         x = x[:0, :].transpose(-1, -2)
     else:
-        x = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
-    w = torch.randn(k, n, device="cuda", dtype=torch.bfloat16)
-    bias = torch.randn(n, device="cuda", dtype=torch.float32)
+        x = torch.randn(m, k, device=device, dtype=torch.bfloat16)
+    w = torch.randn(k, n, device=device, dtype=torch.bfloat16)
+    bias = torch.randn(n, device=device, dtype=torch.float32)

     try:
         tri_y = matmul_ogs(x, w, bias)
     except opt_flags.InapplicableConstraint:
         pytest.skip("inapplicable constraint")
-    ref_y = matmul_ogs_torch(x, w, bias, round_x=lambda x, idx: x, round_y=lambda y: y)
+    ref_y = matmul_ogs_torch(x, w, bias, round_x=lambda x, idx: x, round_y=lambda y: y, device=device)

     assert_close(ref_y, tri_y)
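
The `device` argument added above comes from a pytest fixture rather than a hard-coded "cuda" string. The conftest.py that provides it is not part of this diff; the following is only a minimal sketch of how such a fixture is typically wired up, assuming the option name matches the `--device xpu` flag passed by scripts/test-triton.sh below:

    # conftest.py sketch (illustrative, not from this commit)
    import pytest

    def pytest_addoption(parser):
        # assumed to mirror the --device flag used in scripts/test-triton.sh
        parser.addoption("--device", action="store", default="cuda")

    @pytest.fixture
    def device(request):
        # e.g. "cuda" or "xpu"; the tests above allocate tensors on this device
        return request.config.getoption("--device")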

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 2 additions & 2 deletions
@@ -549,7 +549,7 @@ def matmul_ogs_torch(x, w, bias,
                      betas = None,
                      gammas = None,
                      round_x = None, round_y = None,
-                     ):
+                     device: str = "cuda"):
     is_input_batched = x.ndim == 3
     assert x.dtype.itemsize > 1
     assert w.dtype.itemsize > 1
@@ -588,7 +588,7 @@ def matmul_ogs_torch(x, w, bias,
         else:
             idx = gather_indx.src_indx[lo:hi] // n_expts_act
         batch = i if is_input_batched else 0
-        out = torch.matmul(round_x(x[batch, idx, :], torch.arange(lo, hi, device="cuda")).float(),
+        out = torch.matmul(round_x(x[batch, idx, :], torch.arange(lo, hi, device=device)).float(),
                            w[i].float())
         if bias is not None:
             out += bias[i, :] if betas is None else bias[i, :] * betas[lo:hi, None]
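
With `device` threaded through, the reference implementation no longer assumes CUDA for its internal `torch.arange`. Below is a minimal usage sketch on XPU, mirroring the call pattern from test_zero_reduction_dim above (the shapes are illustrative and the import paths are assumed from the package layout, not taken from this diff):

    import torch
    from triton_kernels.matmul_ogs import matmul_ogs, matmul_ogs_torch
    from triton_kernels.testing import assert_close

    device = "xpu"  # assumes an XPU-enabled PyTorch build
    x = torch.randn(64, 32, device=device, dtype=torch.bfloat16)
    w = torch.randn(32, 16, device=device, dtype=torch.bfloat16)
    bias = torch.randn(16, device=device, dtype=torch.float32)

    tri_y = matmul_ogs(x, w, bias)
    # the torch.arange inside matmul_ogs_torch now lands on `device` instead of "cuda"
    ref_y = matmul_ogs_torch(x, w, bias, round_x=lambda v, idx: v, round_y=lambda v: v, device=device)
    assert_close(ref_y, tri_y)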

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 81 additions & 1 deletion
@@ -4,7 +4,7 @@
 import triton
 from triton_kernels.target_info import get_cdna_version
 import torch
-from .opt_flags_details import opt_flags_amd, opt_flags_nvidia
+from .opt_flags_details import opt_flags_amd, opt_flags_nvidia, opt_flags_intel


 @dataclass
@@ -30,6 +30,84 @@ def __post_init__(self):
         raise ValueError("Not supported")


+def make_default_opt_flags_intel(
+    out_dtype,
+    lhs_dtype,
+    rhs_dtype,
+    precision_config,
+    m,
+    n,
+    k,
+    routing_data,
+    can_use_persistent_tma,
+    can_use_fused_scatter,
+    enforce_bitwise_invariance,
+    epilogue_effective_itemsize,
+    constraints,
+):
+    constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages"]
+    assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
+    # tokens per expert
+    if routing_data is None:
+        tokens_per_expt = m
+    elif routing_data.expected_tokens_per_expt is None:
+        tokens_per_expt = max(1, m // routing_data.n_expts_tot)
+    else:
+        tokens_per_expt = routing_data.expected_tokens_per_expt
+    # pid swizzling
+    group_m = 8
+    xcd_swizzle = 1
+    # block_m
+    if constraints.get("block_m", None):
+        block_m = constraints["block_m"]
+    elif enforce_bitwise_invariance:
+        block_m = 128
+    else:
+        block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
+    # block n
+    block_n = opt_flags_intel.compute_block_n(n)
+    # is_persistent
+    is_persistent = constraints.get("is_persistent", False)
+    # block k
+    if constraints.get("block_k", None) is not None:
+        block_k = constraints["block_k"]
+    else:
+        block_k = opt_flags_intel.compute_block_k(k, is_persistent, precision_config)
+    # split_k
+    if constraints.get("split_k", None) is not None:
+        split_k = constraints["split_k"]
+    elif is_persistent or enforce_bitwise_invariance or precision_config.act_scale is not None or precision_config.out_scale is not None:
+        split_k = 1
+    else:
+        estimated_actual_grid_size = opt_flags_intel.compute_grid_size(None, m, n, block_m, block_n)
+        split_k = opt_flags_intel.compute_split_k(block_k, k, estimated_actual_grid_size)
+
+    epilogue_subtile = constraints.get('epilogue_subtile', None)
+    if epilogue_subtile is None:
+        epilogue_subtile = 1
+
+    ret = OptFlags(
+        block_m=block_m,
+        block_n=block_n,
+        block_k=block_k,
+        num_warps=opt_flags_intel.compute_num_warps(block_m, block_n),
+        num_stages=constraints.get("num_stages", 2),
+        fused_scatter=constraints.get('fused_scatter', False),
+        group_m=group_m,
+        xcd_swizzle=xcd_swizzle,
+        w_cache_modifier=None,
+        split_k=split_k,
+        is_persistent=is_persistent,
+        epilogue_subtile=epilogue_subtile,
+        arch=None,
+        target_kernel_kwargs=dict(),
+        idle_sms=0,
+    )
+    # check constraints
+    assert all(getattr(ret, ck) == cv for ck, cv in constraints.items() if cv is not None), f"{ret} != {constraints}"
+    return ret
+
+
 def make_default_opt_flags_amd(
     out_dtype,
     lhs_dtype,
@@ -296,6 +374,8 @@ def make_opt_flags(
             enforce_bitwise_invariance, epilogue_effective_itemsize,
             _opt_flags_constraints]
     backend = triton.runtime.driver.active.get_current_target().backend
+    if backend == "xpu":
+        return make_default_opt_flags_intel(*args)
     if backend == "hip":
         return make_default_opt_flags_amd(*args)
     if backend == "cuda":
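
For intuition, the block_m branch above behaves like this for a ragged workload (a sketch with made-up numbers, not taken from this commit):

    import triton

    m, n_expts_tot = 1000, 8
    tokens_per_expt = max(1, m // n_expts_tot)                            # 125
    block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))  # -> 128

block_n, block_k and split_k are delegated to the opt_flags_intel helpers added in the new file below.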
python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_intel.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+import torch
+import triton
+
+
+def compute_grid_size(routing_data, m, n, block_m, block_n):
+    if routing_data is not None:
+        grid_m = routing_data.n_blocks(m, block_m)
+    else:
+        grid_m = triton.cdiv(m, block_m)
+    grid_n = (n + block_n - 1) // block_n
+    return grid_m * grid_n
+
+
+def compute_block_n(n: int):
+    # block_n:
+    return max(16, min(128, triton.next_power_of_2(n)))
+
+
+def compute_block_k(k: int | None, is_persistent: bool, precision_config):
+    if k is not None:
+        block_k = max(32, min(128, triton.next_power_of_2(k)))
+    has_mx_weight_scale = precision_config is not None and precision_config.weight_scale is not None
+    if is_persistent and has_mx_weight_scale:
+        block_k = min(block_k, 128)
+    return block_k
+
+
+def compute_split_k(block_k: int, k: int | None, grid_size: int) -> int:
+    device_props = torch.xpu.get_device_properties(0)
+    n_sms = device_props.gpu_subslice_count
+    split_k = n_sms // grid_size
+    if k is not None:
+        # avoid split_k for small k
+        num_block_k = triton.cdiv(k, block_k)
+        split_k = min(split_k, num_block_k // 4)
+    split_k = max(split_k, 1)
+    return split_k
+
+
+def compute_num_warps(block_m, block_n):
+    return max(block_m * block_n // 4096, 4)
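
A worked example of how these heuristics combine for the 1000x400x400 ragged shape that appears in the skiplists below (the subslice count is an assumed value; at runtime it comes from torch.xpu.get_device_properties):

    import triton

    m, n, k = 1000, 400, 400
    block_m, block_n = 128, 128                                         # block_m rule and compute_block_n
    block_k = max(32, min(128, triton.next_power_of_2(k)))              # 128
    grid = triton.cdiv(m, block_m) * ((n + block_n - 1) // block_n)     # 8 * 4 = 32
    n_sms = 64                                                          # assumed gpu_subslice_count
    split_k = max(1, min(n_sms // grid, triton.cdiv(k, block_k) // 4))  # min(2, 4 // 4) -> 1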

python/triton_kernels/triton_kernels/target_info.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
     "has_native_mxfp",
     "is_cuda",
     "is_hip",
+    "is_xpu",
     "is_hip_cdna3",
     "is_hip_cdna4",
     "is_xpu",
Lines changed: 9 additions & 3 deletions
@@ -1,3 +1,9 @@
-tests/test_matmul.py::test_op
-tests/test_matmul.py::test_fused_act
-tests/test_matmul.py::test_zero_reduction_dim
+# https://github.com/intel/intel-xpu-backend-for-triton/issues/5074
+tests/test_matmul.py::test_op[False-False-False-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-False-False-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-False-True-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-False-True-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-False-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-False-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-True-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-True-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
Lines changed: 9 additions & 3 deletions
@@ -1,3 +1,9 @@
-tests/test_matmul.py::test_op
-tests/test_matmul.py::test_fused_act
-tests/test_matmul.py::test_zero_reduction_dim
+# https://github.com/intel/intel-xpu-backend-for-triton/issues/5074
+tests/test_matmul.py::test_op[False-False-False-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-False-False-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-False-True-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-False-True-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-False-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-False-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-True-True-False-16-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]
+tests/test_matmul.py::test_op[False-True-True-True-False-128-1000-400-400-ragged-float8_e4m3fn-float8_e4m3fn-3-1-1-1-False-None]

scripts/test-triton.sh

Lines changed: 1 addition & 1 deletion
@@ -577,7 +577,7 @@ run_triton_kernels_tests() {
   cd $TRITON_PROJ/python/triton_kernels/tests

   TRITON_TEST_SUITE=triton_kernels \
-    run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-8} --device xpu .
+    run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-4} --device xpu .
 }

 test_triton() {

third_party/intel/backend/compiler.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ class XPUOptions:
     generate_native_code: bool = False
     advanced_path: bool = False
     enable_tile_load_linear_layout: bool = True
+    arch: str = None
     # FIXME: enable for XPU: https://github.com/intel/intel-xpu-backend-for-triton/issues/4954
     instrumentation_mode: str = ""
