
Commit eb7cdba

[KERNELS] Fix and enable batched matmul with split-k. (#8327)
1 parent bc22e6e commit eb7cdba

4 files changed: +25 −16 lines

python/triton_kernels/tests/test_matmul.py

Lines changed: 6 additions & 5 deletions
@@ -114,7 +114,7 @@ def _apply_padding_and_fill_unused_part_with_nan(t, is_padded):
 # ---------------


-def init_precision(out_dtype, act_use_flexpoint, weight_dtype, weight_mxfp, n_expts_tot=1, expt_is_inner=False, device="cuda"):
+def init_precision(out_dtype, act_use_flexpoint, weight_dtype, weight_mxfp, mode, n_expts_tot=1, expt_is_inner=False, device="cuda"):
     weight_use_flexpoint = weight_dtype.itemsize == 1 and not weight_mxfp
     # flexpoint
     make_tensor = lambda val0, val1: torch.tensor([val0, val1] * (n_expts_tot // 2) +
@@ -133,8 +133,8 @@ def init_precision(out_dtype, act_use_flexpoint, weight_dtype, weight_mxfp, n_ex
         ) if weight_use_flexpoint else InFlexData(),
         out_data=OutFlexData(
             dtype=out_dtype,
-            expected_scale=make(4.00, 5.00, expt_is_inner),
-            actual_scale=make(0, 0, expt_is_inner),
+            expected_scale=make(4.00, 5.00, mode == "batched" or expt_is_inner),
+            actual_scale=make(0, 0, mode == "batched" or expt_is_inner),
             checksum_scale=None,
         ) if act_use_flexpoint else OutFlexData(),
     )
@@ -233,6 +233,7 @@ class Case:
     Case(1000, 700, 700, "ragged", "float16", "float16", 8, 2, split_k=9),
     Case(16, 16, 1000, "batched", "float16", "float16", 5, 1, split_k=None),
     Case(16, 16, 1000, "batched", "float8_e5m2", "float8_e5m2", 5, 1, split_k=None),
+    Case(16, 16, 2048, "batched", "float8_e5m2", "float8_e5m2", 6, 1, split_k=5),
     # mx types:
     Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1),
     Case(16, 256, 256, "plain", "bfloat16", "mxfloat4_e2m1", 1, 1, hbm_swizzling=True),
@@ -412,7 +413,7 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, inner_expt_o
     weight_dtype = dtype_str_to_torch(weight_dtype_str)
     act_dtype = dtype_str_to_torch(act_dtype_str)
     precision_opt = init_precision(act_dtype, act_is_float8, weight_dtype, weight_mxfp,
-                                   n_expts_tot, expt_is_inner, device=device)
+                                   mode, n_expts_tot, expt_is_inner, device=device)
     # precision_opt.x_pad_trans_requires_flexpoint = False
     if mode == "ragged":
         m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, do_gather, do_scatter,
@@ -667,7 +668,7 @@ def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter,
     else:
         rdata = gindx = sindx = None

-    precision_opt = init_precision(act_dtype, str(act_dtype).startswith("torch.float8"), weight_dtype, False, n_expts_tot, device=device)
+    precision_opt = init_precision(act_dtype, str(act_dtype).startswith("torch.float8"), weight_dtype, False, mode, n_expts_tot, device=device)
     x, w, bias, _, _ = init_compute_data(m, n, k, rdata, gindx, sindx, n_expts_tot, n_expts_act, mode,
                                          act_dtype, weight_dtype, False, requires_grad=False, device=device)
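For context on what the new split_k=5 batched case exercises: split-k partitions the K dimension across programs, accumulates partial products in float32, and then reduces over the split axis. A minimal PyTorch sketch of the equivalent reference computation (illustrative only, not the test's actual reference path; shapes are assumptions):

import torch

def batched_split_k_reference(x, w, split_k):
    # x: (B, M, K), w: (B, K, N) -- assumed layouts for illustration
    B, M, K = x.shape
    N = w.shape[-1]
    chunk = (K + split_k - 1) // split_k
    partial = torch.zeros(split_k, B, M, N, dtype=torch.float32, device=x.device)
    for s in range(split_k):
        lo, hi = s * chunk, min(K, (s + 1) * chunk)
        if lo < hi:
            # each split handles one K-slice; accumulation happens in float32
            partial[s] = x[:, :, lo:hi].float() @ w[:, lo:hi, :].float()
    # the final reduction over the split axis is what reduce_grouped performs
    return partial.sum(dim=0)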

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 10 additions & 4 deletions
@@ -244,8 +244,9 @@ def init_allocation(x, w, precision_config, fused_activation,
     scratchpad = dict()
     if opt_flags.split_k > 1 or (scatter_indx is not None and not opt_flags.fused_scatter):
         scratch_out_dtype = torch.float32 if opt_flags.split_k > 1 else out_dtype
-        scratchpad["matmul"] = ((opt_flags.split_k, 1, M, N), scratch_out_dtype)
+        scratchpad["matmul"] = ((opt_flags.split_k, batch_dim, M, N), scratch_out_dtype)
     if "matmul" in scratchpad and precision_config.out_scale is not None:
+        assert batch_dim == 1, "batch_dim > 1 not supported yet"
         scratchpad["mx_out_scale"] = ((opt_flags.split_k, 1, M, triton.cdiv(N, MXFP_BLOCK_SIZE)), torch.uint8)
     return MatmulAllocation(x.device, output, scratchpad)
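The allocation change above gives the split-k scratch a real batch dimension. A small sketch of the resulting layout (sizes are illustrative, not taken from the library):

import torch

split_k, batch_dim, M, N = 5, 6, 16, 16
# float32 scratch holding one partial result per K-split and per batch
scratch = torch.empty(split_k, batch_dim, M, N, dtype=torch.float32)
# reducing over the split axis yields the batched output shape (batch_dim, M, N)
out = scratch.sum(dim=0)
assert out.shape == (batch_dim, M, N)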

@@ -323,11 +324,14 @@ def reduce_grouped(x: torch.Tensor, indx: torch.Tensor, out: torch.Tensor, out_m
     Returns
     - The input tensor `x` (modified in place).
     """
+    M = x.shape[2]  # Only used for per-batch flex scale.
     if indx is None and x.shape[0] == 1:
         return x.squeeze(0), None
     if indx is not None:
         num_groups = indx.shape[0]
     else:
+        # Handle batched matmul (K, B, M, N) by pretending it to be (K, 1, B*M, N).
+        x = x.view(x.shape[0], 1, x.shape[1] * x.shape[2], x.shape[3])
        num_groups = x.shape[-2]
     if x_flex is None:
         x_flex = InFlexData()
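The view trick added above folds the batch dimension into the row dimension so the existing grouped-reduction path is reused unchanged; summing over the split axis is unaffected by the reshape. A standalone check of that equivalence (shapes are illustrative):

import torch

K_split, B, M, N = 5, 6, 16, 16
x = torch.randn(K_split, B, M, N)
# pretend (K, B, M, N) is (K, 1, B*M, N), reduce, then restore the batch shape
flat = x.view(K_split, 1, B * M, N)
reduced = flat.sum(dim=0).view(B, M, N)
assert torch.allclose(reduced, x.sum(dim=0))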
@@ -351,8 +355,10 @@ def reduce_grouped(x: torch.Tensor, indx: torch.Tensor, out: torch.Tensor, out_m
         x_flex.reinterpret(x), x.stride(0), x.stride(2), x.stride(3), #
         x_expected_scale, # scalar input scale
         out_flex.reinterpret(out), out.stride(1), out.stride(2), #
-        out_expected_scale, out_actual_scale, out_checksum_scale, indx, #
-        x.shape[0], x.shape[-1], #
+        out_expected_scale, out_actual_scale, out_checksum_scale,
+        out_flex is not None and out_flex.is_per_batch,
+        indx,
+        x.shape[0], M, x.shape[-1], #
         x_mx_scale, stride_mxb, stride_mxs, #
         out_mx_scale, stride_omxs, #
         *fused_activation.fn_args, fused_activation.reduction_n,
@@ -629,7 +635,7 @@ def matmul_ogs(x, w, bias,
         precision_config.allow_tf32,
         precision_config.flexpoint_saturate_inf,
         flex.rhs_data.is_per_batch,
-        flex.out_data.is_per_batch,
+        out_matmul_flex.is_per_batch,
         flex.acc_data.is_per_batch,
         opt_flags.block_m,
         opt_flags.block_n,

python/triton_kernels/triton_kernels/matmul_ogs_details/_reduce_grouped.py

Lines changed: 7 additions & 1 deletion
@@ -9,7 +9,7 @@ def _reduce_grouped(X, stride_xb: tl.uint64, stride_xm: tl.uint64, stride_xn, #
        XScale, # input scalar flex scale
        Out, stride_om: tl.uint64, stride_on, # output tensor
        OutExpectedScale, OutActualScale, OutChecksumScale, # output scalar flex scales
-       InIndx, B, N, #
+       PER_BATCH_OUT_SCALE: tl.constexpr, InIndx, B, M, N, #
        XMxScale, stride_mxb: tl.uint64,
        stride_mxs: tl.uint64, # optional per-32-col output MXFP scales (uint8)
        OutMxScale, stride_omxs: tl.uint64, # optional per-32-col output MXFP scales (uint8)
@@ -42,6 +42,12 @@ def _reduce_grouped(X, stride_xb: tl.uint64, stride_xm: tl.uint64, stride_xn, #
         XScalePtrs = XMxScale + tl.arange(0, BLOCK_N // 32) * stride_xn
     if HAS_OUT_MX_SCALE:
         OutScalePtrs = OutMxScale + tl.arange(0, BLOCK_N_OUT // 32) * stride_on
+    if PER_BATCH_OUT_SCALE:
+        out_batch_idx = pid_t // M
+        OutExpectedScale += out_batch_idx
+        OutActualScale += out_batch_idx
+        if OutChecksumScale is not None:
+            OutChecksumScale += out_batch_idx
     x_scale = load_scale(XScale)
     for n_curr in tl.range(0, N, BLOCK_N, num_stages=4):
         acc = tl.zeros([BLOCK_N_OUT], dtype=tl.float32)
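A plain-Python illustration of the indexing added above: once the batched tensor is viewed as B*M rows, a program's row id recovers its batch as pid_t // M, and that index offsets the per-batch output flex-scale pointers. The list below stands in for the scale buffers; names and values are illustrative:

M, B = 16, 6
out_expected_scale = [1.0 + b for b in range(B)]  # stand-in for a per-batch scale buffer
for pid_t in range(B * M):
    out_batch_idx = pid_t // M              # same arithmetic as the kernel
    scale = out_expected_scale[out_batch_idx]
    assert scale == 1.0 + out_batch_idx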

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 2 additions & 6 deletions
@@ -90,9 +90,7 @@ def make_default_opt_flags_amd(
     )
     is_persistent = constraints.get("is_persistent", False)
     # split_k:
-    if batch_size > 1:
-        split_k = 1 # currently not supported
-    elif constraints.get("split_k", None) is not None:
+    if constraints.get("split_k", None) is not None:
         split_k = constraints["split_k"]
     elif is_persistent or enforce_bitwise_invariance:
         split_k = 1
@@ -222,9 +220,7 @@ def make_default_opt_flags_nvidia(
         # TODO: swizzle the HBM layout of the weights instead
         block_n, block_k = block_k, block_n
     # split_k
-    if batch_size > 1:
-        split_k = 1 # currently not supported
-    elif constraints.get("split_k", None) is not None:
+    if constraints.get("split_k", None) is not None:
         split_k = constraints["split_k"]
     elif is_persistent or enforce_bitwise_invariance or precision_config.act_scale is not None or precision_config.out_scale is not None:
         split_k = 1
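With the batch_size > 1 early-out removed in both backends, split_k resolution follows the chain shown above. A hedged sketch of that order (the helper is illustrative, not the library's API, and it simplifies the NVIDIA branch's extra scale-related conditions into the persistent/bitwise case):

def resolve_split_k(constraints, is_persistent, enforce_bitwise_invariance, heuristic_split_k):
    # an explicit constraint now applies to batched matmuls as well
    if constraints.get("split_k", None) is not None:
        return constraints["split_k"]
    # persistent kernels and bitwise-invariant runs keep a single K partition
    if is_persistent or enforce_bitwise_invariance:
        return 1
    # otherwise fall back to whatever heuristic the backend computes
    return heuristic_split_k

# e.g. resolve_split_k({"split_k": 5}, False, False, 4) returns 5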
