Commit e33b846

wdziurdz authored and anmyachev committed
Align behavior with CUDA/HIP: skip test_matmul when swiglu_opts is not None and do_gamma is set
Signed-off-by: Witold Dziurdz <[email protected]>
(cherry picked from commit 1479afd)
1 parent af6e538 · commit e33b846

3 files changed: +15 -11 lines changed


python/triton_kernels/tests/test_matmul.py

Lines changed: 5 additions & 1 deletion
@@ -16,7 +16,7 @@
 # testing utilities
 from triton_kernels.testing import assert_close, make_random_tensor
 # target-specific utilities
-from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_hip_cdna4
+from triton_kernels.target_info import is_hip, is_hip_cdna3, is_cuda, is_hip_cdna4, is_xpu
 from triton_kernels.swiglu import swiglu, swiglu_fn
 from triton_kernels.swiglu import PrecisionConfig as SwiGLUPrecisionConfig

@@ -243,6 +243,10 @@ def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, do_gamma,
         if split_k is not None and split_k > 1:
             pytest.skip("splitK hasn't been fully tested on AMD GPU.")

+    elif is_xpu():
+        if swiglu_opts is not None and do_gamma:
+            pytest.xfail("NYI: swiglu and gamma not supported together")
+
     if "float8_e4m3fnuz" in (weight_dtype_str, act_dtype_str) and not is_hip_cdna3():
         pytest.xfail("float8_e4m3fnuz only tested on AMD CDNA3 Platform")
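
For context, the new guard follows the same pattern the suite already uses for the CUDA/HIP branches: probe the backend with a helper from triton_kernels.target_info and mark the unsupported combination as an expected failure before any kernel is built. Below is a minimal standalone sketch of that pattern; the local is_xpu stub is an assumption for illustration only, the real helper being the one imported in the diff above.

    import pytest

    def is_xpu() -> bool:
        # Stand-in for triton_kernels.target_info.is_xpu; assume a non-XPU host here.
        return False

    def maybe_mark_unsupported(swiglu_opts, do_gamma):
        # Mirrors the added guard: fusing swiglu with gamma is not yet implemented on XPU,
        # so the case is reported as an expected failure instead of erroring out.
        if is_xpu() and swiglu_opts is not None and do_gamma:
            pytest.xfail("NYI: swiglu and gamma not supported together")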

python/triton_kernels/triton_kernels/matmul_details/opt_flags.py

Lines changed: 8 additions & 8 deletions
@@ -54,7 +54,7 @@ def make_default_opt_flags_intel(
     m,
     n,
     k,
-    routing_data,
+    ragged_metadata,
     can_use_persistent_tma,
     can_use_split_k,
     enforce_bitwise_invariance,
@@ -65,13 +65,13 @@ def make_default_opt_flags_intel(
 ):
     constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "epilogue_subtile", "num_stages", "max_allowable_mn"]
     assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
-    # tokens per expert
-    if routing_data is None:
-        tokens_per_expt = m
-    elif routing_data.expected_tokens_per_expt is None:
-        tokens_per_expt = max(1, m // routing_data.n_expts_tot)
+    # tokens per slice
+    if ragged_metadata is None:
+        slice_size = m
+    elif ragged_metadata.expected_slice_size is None:
+        slice_size = max(1, m // ragged_metadata.n_slices)
     else:
-        tokens_per_expt = routing_data.expected_tokens_per_expt
+        slice_size = ragged_metadata.expected_slice_size
     # pid swizzling
     group_m = 8
     xcd_swizzle = 1
@@ -81,7 +81,7 @@ def make_default_opt_flags_intel(
     elif enforce_bitwise_invariance:
         block_m = 128
     else:
-        block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
+        block_m = max(16, min(triton.next_power_of_2(slice_size), 128))
     # block n
     block_n = opt_flags_intel.compute_block_n(n)
     # is_persistent
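
This file only renames the ragged-batch metadata that drives the block_m heuristic (routing_data / tokens_per_expt become ragged_metadata / slice_size); the fallback logic itself is unchanged. A rough self-contained sketch of that heuristic is below, with a hypothetical RaggedMetadata dataclass standing in for the real type and a bit_length-based replacement for triton.next_power_of_2.

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class RaggedMetadata:
        # Hypothetical stand-in: only the two fields read by the heuristic are modeled.
        n_slices: int
        expected_slice_size: Optional[int] = None

    def resolve_slice_size(ragged_metadata: Optional[RaggedMetadata], m: int) -> int:
        # Same fallback chain as in make_default_opt_flags_intel after the rename.
        if ragged_metadata is None:
            return m
        if ragged_metadata.expected_slice_size is None:
            return max(1, m // ragged_metadata.n_slices)
        return ragged_metadata.expected_slice_size

    def default_block_m(slice_size: int) -> int:
        # block_m heuristic from the same function: clamp the next power of two to [16, 128].
        next_pow2 = 1 << max(0, slice_size - 1).bit_length()
        return max(16, min(next_pow2, 128))

For example, resolve_slice_size(RaggedMetadata(n_slices=8), m=1024) gives 128, and default_block_m(128) picks block_m = 128.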

python/triton_kernels/triton_kernels/matmul_details/opt_flags_details/opt_flags_intel.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@

 def compute_grid_size(routing_data, m, n, block_m, block_n):
     if routing_data is not None:
-        grid_m = routing_data.n_blocks(m, block_m)
+        grid_m = routing_data.n_blocks(routing_data.n_slices, m, block_m)
     else:
         grid_m = triton.cdiv(m, block_m)
     grid_n = (n + block_n - 1) // block_n
@@ -19,7 +19,7 @@ def compute_block_n(n: int):
 def compute_block_k(k: int | None, is_persistent: bool, precision_config):
     if k is not None:
         block_k = max(32, min(128, triton.next_power_of_2(k)))
-    has_mx_weight_scale = precision_config is not None and precision_config.weight_scale is not None
+    has_mx_weight_scale = precision_config is not None and precision_config.b_mx_scale is not None
     if is_persistent and has_mx_weight_scale:
         block_k = min(block_k, 128)
     return block_k
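
The last file adapts the Intel flag helpers to the renamed interfaces: routing_data.n_blocks now also receives the slice count, and the MX weight scale is read from precision_config.b_mx_scale instead of precision_config.weight_scale. For the dense branch of compute_grid_size (no ragged metadata), the grid arithmetic reduces to plain ceiling division; a small sketch with a local cdiv standing in for triton.cdiv:

    def cdiv(a: int, b: int) -> int:
        # Ceiling division, equivalent to triton.cdiv.
        return -(a // -b)

    def dense_grid(m: int, n: int, block_m: int, block_n: int) -> tuple[int, int]:
        # Dense fallback of compute_grid_size: tile rows by block_m and columns by block_n.
        return cdiv(m, block_m), cdiv(n, block_n)

    # Example: a 1000 x 768 problem with 128 x 64 tiles launches an 8 x 12 grid.
    assert dense_grid(1000, 768, 128, 64) == (8, 12)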
