
Commit 318fa9c

[triton_kernels] forbid use of split_k > 1 with fused scatter (#8618)
The API doesn't accept a scale for the intermediate tensor produced between split_k and the fused scatter, so this mode should be disabled for now. It will be re-enabled after expert aggregation is moved out of the matmul_ogs API.
1 parent 14fd9cb commit 318fa9c
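
For context, the rule this commit enforces can be sketched as follows. This is a hedged illustration, not the library API: pick_split_k, its arguments, and the heuristic_default fallback are hypothetical stand-ins that mirror the can_use_split_k gate and the InapplicableConstraint check added in opt_flags.py below.

class InapplicableConstraint(Exception):
    pass

def pick_split_k(constraints, scatter_indx=None, x_has_mx=False, w_has_mx=False, heuristic_default=1):
    # split_k > 1 produces an intermediate tensor that carries no scale, so it
    # is only usable when there is no fused scatter and no mx-scaled operand.
    can_use_split_k = scatter_indx is None and not x_has_mx and not w_has_mx
    requested = constraints.get("split_k")
    if requested is not None and requested > 1 and not can_use_split_k:
        raise InapplicableConstraint("cannot enforce `split_k=True` constraint")
    if requested is not None:
        return requested
    # the real code derives its default from a launch-grid heuristic; any such
    # heuristic only applies when split-k is usable at all
    return heuristic_default if can_use_split_k else 1

For example, pick_split_k({"split_k": 2}, scatter_indx=object()) now raises InapplicableConstraint instead of silently producing an unscaled split-k intermediate.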

File tree: 4 files changed, +40 -61 lines

python/triton_kernels/tests/test_matmul.py

Lines changed: 18 additions & 26 deletions
@@ -303,27 +303,27 @@ class Case:
     ],
 )
 @pytest.mark.parametrize("block_m", [16, 128])
-@pytest.mark.parametrize("do_gather, do_scatter, fused_scatter, inner_expt_opt", [
-    (False, False, False, None),
-    (True, False, False, None),
-    (False, True, False, None),
-    (False, True, True, None),
-    (True, True, False, None),
-    (True, True, True, None),
-    (False, False, False, "pad_w"),
-    (False, False, False, "pad_x"),
+@pytest.mark.parametrize("do_gather, do_scatter, inner_expt_opt", [
+    (False, False, None),
+    (True, False, None),
+    (False, True, None),
+    (False, True, None),
+    (True, True, None),
+    (True, True, None),
+    (False, False, "pad_w"),
+    (False, False, "pad_x"),
 ])
 @pytest.mark.parametrize("has_y_gammas", [False, True])
 @pytest.mark.parametrize("is_persistent", [False, True])
-def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
+def test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
             n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, colmajor_mxfp_weight, epilogue_subtile,
             x_transpose, w_transpose, y_transpose,
             device, opt_flags_scope):
     # We catch and re-invoke pytest.skip(), because otherwise pytest may hold a reference to
     # the frame that called pytest.skip, including all the tensors, leading to OOM.
     skip_message = None
     try:
-        _test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
+        _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
                  n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, colmajor_mxfp_weight, epilogue_subtile,
                  x_transpose, w_transpose, y_transpose,
                  device, opt_flags_scope)
@@ -333,7 +333,7 @@ def test_op(
     if skip_message is not None:
         pytest.skip(skip_message)
 
-def _test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
+def _test_op(m, n, k, split_k, do_gather, do_scatter, inner_expt_opt, has_y_gammas, is_persistent, n_expts_tot,
              n_expts_act, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, colmajor_mxfp_weight, epilogue_subtile,
              x_transpose, w_transpose, y_transpose,
             device, opt_flags_scope):
@@ -362,9 +362,6 @@ def _test_op(
     if "float8_e4m3fnuz" in (weight_dtype_str, act_dtype_str) and not is_hip_cdna3():
         pytest.skip("float8_e4m3fnuz only tested on AMD CDNA3 Platform")
 
-    if fused_scatter and split_k is not None and split_k > 1:
-        pytest.skip("fused scatter scratchpad not supported with split_k")
-
     if hbm_swizzling:
         if is_hip():
             if not is_hip_cdna4():
@@ -413,7 +410,6 @@ def _test_op(
         "block_m": block_m,
         "block_k": block_k,
         "split_k": split_k,
-        "fused_scatter": fused_scatter,
         "is_persistent": is_persistent,
         "epilogue_subtile": epilogue_subtile,
     }
@@ -726,12 +722,11 @@ def test_set_idle_sms():
     (800, 800, 400, "batched"),
 ])
 @pytest.mark.parametrize("split_k", [1, 2])
-@pytest.mark.parametrize("do_gather, do_scatter, fused_scatter", [
-    (False, False, False),
-    (True, False, False),
-    (False, True, False),
-    (True, True, False),
-    (True, True, True),
+@pytest.mark.parametrize("do_gather, do_scatter", [
+    (False, False),
+    (True, False),
+    (False, True),
+    (True, True),
 ])
 @pytest.mark.parametrize("is_persistent, epilogue_subtile", [
     (False, None),
@@ -743,16 +738,13 @@ def test_set_idle_sms():
     (1.0, 1.2),
     (0.7, 1.0),
 ])
-def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter, is_persistent, epilogue_subtile,
+def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, is_persistent, epilogue_subtile,
                    swiglu_alpha, swiglu_limit, device, opt_flags_scope):
-    if fused_scatter and split_k > 1:
-        pytest.skip("fused scatter scratchpad not supported with split_k")
     torch.manual_seed(0)
     constraints = {
         "is_persistent": is_persistent,
         "epilogue_subtile": epilogue_subtile,
         "split_k": split_k,
-        "fused_scatter": fused_scatter,
     }
     n_expts_tot, n_expts_act = 1, 1
     opt_flags.update_opt_flags_constraints(constraints)

python/triton_kernels/tests/test_matmul_details/test_opt_flags_split_k.py

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ def get_flags(split_k, max_mn):
         k,
         None,
         False,
-        False,
+        True,
         False,
         0,
         False,

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 8 additions & 7 deletions
@@ -419,9 +419,10 @@ def matmul_ogs(x, w, bias,
     has_gather_tma = has_gather and target_info.has_tma_gather()
     # hopper w/ mxfp4 doesn't support TMA
     can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
+    can_use_split_k = scatter_indx is None and not x_has_mx and not w_has_mx
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
                                batch_size, M, N, w.shape[-2], routing_data,
-                               can_use_tma, scatter_indx is not None, epilogue.effective_itemsize,
+                               can_use_tma, can_use_split_k, epilogue.effective_itemsize,
                                x_transpose, y_acc_in is not None,
                                inner_routing_data.block_k if inner_routing_data is not None else None,
                                )
@@ -618,21 +619,21 @@ def matmul_ogs(x, w, bias,
         **fused_comm_kwargs,
         **opt_flags.target_kernel_kwargs)
 
+    assert not (opt_flags.split_k > 1 and scatter_indx is not None)
     out_final_mx_scale = None
     if opt_flags.split_k > 1:
         assert not out_matmul_has_mx
-        has_scatter = scatter_indx is not None
         postprocess_fn1 = ReducePostprocessFn(specs=reduce_fused_activation.specs, fn_args=reduce_fused_activation.fn_args)
         postprocess_fn2 = None if has_scatter else ReducePostprocessFn(specs=epilogue.specs, fn_args=epilogue.fn_arg_values_finalize)
         y, y_mx_scale = reduce(
             x = out_matmul.view(out_matmul.shape[0], -1, out_matmul.shape[-1]),
             dim = 0,
             # output data/metadata
-            y = None if has_scatter else memory["output"].view(-1, memory["output"].shape[-1]),
-            y_dtype = out_matmul.dtype if has_scatter else memory["output"].dtype,
-            y_flex = OutFlexData() if has_scatter else precision_config.flex_ctx.out_data,
-            y_flex_saturate_inf = None if has_scatter else precision_config.flexpoint_saturate_inf,
-            y_has_mx = scatter_indx is None and precision_config.out_scale is not None,
+            y = memory["output"].view(-1, memory["output"].shape[-1]),
+            y_dtype = memory["output"].dtype,
+            y_flex = precision_config.flex_ctx.out_data,
+            y_flex_saturate_inf = precision_config.flexpoint_saturate_inf,
+            y_has_mx = precision_config.out_scale is not None,
             # fused functions
             postprocess_fn1 = postprocess_fn1,
             postprocess_fn2 = postprocess_fn2,

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 13 additions & 27 deletions
@@ -22,7 +22,6 @@ class OptFlags:
     w_cache_modifier: str
     split_k: int
     is_persistent: bool
-    fused_scatter: bool
     idle_sms: int
     epilogue_subtile: int | None
     arch: str
@@ -56,14 +55,14 @@ def make_default_opt_flags_amd(
     k,
     routing_data,
     can_use_persistent_tma,
-    can_use_fused_scatter,
+    can_use_split_k,
     enforce_bitwise_invariance,
     epilogue_effective_itemsize,
     x_transpose,
     has_y_acc_in,
     constraints,
 ):
-    constraints_supported = ["block_m", "block_n", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile", "max_allowable_mn"]
+    constraints_supported = ["block_m", "block_n", "block_k", "split_k", "is_persistent", "epilogue_subtile", "max_allowable_mn"]
     assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
     # tokens per expert
     if routing_data is None:
@@ -102,13 +101,12 @@
     )
     is_persistent = constraints.get("is_persistent", False)
     # split_k:
+    split_k = 1
     if constraints.get("max_allowable_mn", 0) > 0 and constraints.get("split_k") is not None:
         split_k = max_allowable_mn(constraints["max_allowable_mn"], m, n, constraints.get("split_k"))
     elif constraints.get("split_k", None) is not None:
         split_k = constraints["split_k"]
-    elif is_persistent or enforce_bitwise_invariance:
-        split_k = 1
-    else:
+    elif can_use_split_k and not enforce_bitwise_invariance:
         grid_size = grid_m * ((n + block_n - 1) // block_n)
         n_cu = torch.cuda.get_device_properties(0).multi_processor_count
         split_k = max(1, n_cu // grid_size)
@@ -156,7 +154,6 @@ def replace_with_valid_constraint(k: str, v):
         w_cache_modifier=w_cache_modifier,
         split_k=split_k,
         is_persistent=is_persistent,
-        fused_scatter=constraints.get('fused_scatter', False),
         idle_sms=0,
         epilogue_subtile=epilogue_subtile,
         arch=None,
@@ -177,14 +174,14 @@ def make_default_opt_flags_nvidia(
     k,
     routing_data,
     can_use_persistent_tma,
-    can_use_fused_scatter,
+    can_use_split_k,
     enforce_bitwise_invariance,
     epilogue_effective_itemsize,
     x_transpose,
     has_y_acc_in,
     constraints,
 ):
-    constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages", "idle_sms", "max_allowable_mn"]
+    constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "epilogue_subtile", "num_stages", "idle_sms", "max_allowable_mn"]
     assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
     # tokens per expert
     if routing_data is None or batch_size > 1:
@@ -236,26 +233,21 @@
     if constraints.get("block_k", None) is not None:
         block_k = constraints["block_k"]
     # split_k
+    split_k = 1
     if constraints.get("max_allowable_mn", 0) > 0 and constraints.get("split_k") is not None:
         split_k = max_allowable_mn(constraints["max_allowable_mn"], m, n, constraints.get("split_k"))
     elif constraints.get("split_k", None) is not None:
         split_k = constraints["split_k"]
-    elif is_persistent or enforce_bitwise_invariance or precision_config.act_scale is not None or precision_config.out_scale is not None:
-        split_k = 1
-    else:
+    elif can_use_split_k and not enforce_bitwise_invariance:
         estimated_actual_grid_size = opt_flags_nvidia.compute_grid_size(None, batch_size, m, n, block_m, block_n)
         split_k = opt_flags_nvidia.compute_split_k(block_k, k, estimated_actual_grid_size)
-    if split_k > 1:
-        # With split_k, results are written in f32. Use that for the following computations.
-        out_dtype = torch.float32
     compute_num_stages_args = (
         precision_config,
         is_persistent,
-
         block_m,
         block_n,
         block_k,
-        out_dtype,
+        torch.float32 if split_k > 1 else out_dtype,
         lhs_dtype,
         rhs_dtype,
         x_transpose,
@@ -276,11 +268,6 @@
     if constraints.get("num_stages", None):
         num_stages = constraints["num_stages"]
     assert num_stages >= 1
-    # fused scatter scratchpad
-    if constraints.get("fused_scatter", None) is not None:
-        fused_scatter = constraints["fused_scatter"]
-    else:
-        fused_scatter = can_use_fused_scatter and split_k == 1
     # Handshake with the HBM swizzling
     num_warps = opt_flags_nvidia.compute_num_warps(block_m, block_n, is_persistent, precision_config)
     ret = OptFlags(
@@ -289,7 +276,6 @@
         block_k=block_k,
         num_warps=num_warps,
         num_stages=num_stages,
-        fused_scatter=fused_scatter,
         group_m=group_m,
         xcd_swizzle=xcd_swizzle,
         w_cache_modifier=None,
@@ -343,16 +329,16 @@
     k,
     routing_data,
     can_use_persistent_tma,
-    can_use_fused_scatter,
+    can_use_split_k,
     epilogue_effective_itemsize,
     x_transpose,
     has_y_acc_in,
     block_k,
 ):
     if _opt_flags_constraints.get("is_persistent", False) and not can_use_persistent_tma:
         raise InapplicableConstraint("cannot enforce `is_persistent=True` constraint")
-    if _opt_flags_constraints.get("fused_scatter", False) and not can_use_fused_scatter:
-        raise InapplicableConstraint("cannot enforce `fused_scatter=True` constraint")
+    if _opt_flags_constraints.get("split_k") is not None and _opt_flags_constraints.get("split_k") > 1 and not can_use_split_k:
+        raise InapplicableConstraint("cannot enforce `split_k=True` constraint")
     if _opt_flags_constraints.get("max_allowable_mn"):
         if not _opt_flags_constraints.get("split_k"):
             raise InapplicableConstraint("split_k also needs to be provided with max_allowable_mn")
@@ -366,7 +352,7 @@
         opt_flags_constraints = opt_flags_constraints.copy()
         opt_flags_constraints.update(block_k=block_k, split_k=1)
     args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, batch_size, m, n, k,
-            routing_data, can_use_persistent_tma, can_use_fused_scatter,
+            routing_data, can_use_persistent_tma, can_use_split_k,
            enforce_bitwise_invariance, epilogue_effective_itemsize, x_transpose, has_y_acc_in,
            opt_flags_constraints]
     backend = triton.runtime.driver.active.get_current_target().backend
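
Taken together with the test changes above, callers should stop passing a fused_scatter constraint. A hedged usage sketch mirroring the updated test_fused_act (the import path is assumed from the file layout shown above; tensors, routing, and the matmul call itself are omitted):

from triton_kernels.matmul_ogs_details import opt_flags

# "fused_scatter" is no longer a supported constraint key. A split_k > 1
# constraint is only honored when the matmul has no fused scatter and no
# mx-scaled operands; otherwise make_opt_flags raises InapplicableConstraint.
constraints = {
    "is_persistent": False,
    "epilogue_subtile": None,
    "split_k": 2,
}
opt_flags.update_opt_flags_constraints(constraints)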
