Commit f1872ed

[KERNELS] Improve block sizes for batched matmul_ogs with small m/n/k. (triton-lang#7897)
(Previously, block sizes could be much bigger than m/n/k.) Example perf difference:

```
H100:
B=500000 M=8 N=8 K=8
  >> torch.float16      0.850 ms -> 0.388 ms
  >> torch.bfloat16     0.828 ms -> 0.354 ms
  >> torch.float8_e5m2  0.829 ms -> 0.373 ms
B=500000 M=16 N=16 K=16
  >> torch.float16      0.791 ms -> 0.381 ms
  >> torch.bfloat16     0.790 ms -> 0.382 ms
  >> torch.float8_e5m2  0.779 ms -> 0.366 ms

GB200:
B=500000 M=8 N=8 K=8
  >> torch.float16      0.676 ms -> 0.314 ms
  >> torch.bfloat16     0.652 ms -> 0.297 ms
  >> torch.float8_e5m2  0.659 ms -> 0.294 ms
B=500000 M=16 N=16 K=16
  >> torch.float16      0.622 ms -> 0.305 ms
  >> torch.bfloat16     0.606 ms -> 0.306 ms
  >> torch.float8_e5m2  0.616 ms -> 0.296 ms
```
1 parent 139ce65 commit f1872ed
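
For context, a rough sketch of the kind of batched call these figures describe. The timing harness below is illustrative and assumed (it is not the benchmark used for the numbers above); it presumes the `triton_kernels` package is installed and a CUDA device is available.

```python
import torch
from triton_kernels.matmul_ogs import matmul_ogs

# Many tiny matmuls, as in the commit message: B=500000, M=N=K=8.
B, M, N, K = 500_000, 8, 8, 8
x = torch.randn(B, M, K, device="cuda", dtype=torch.float16)
w = torch.randn(B, K, N, device="cuda", dtype=torch.float16)

matmul_ogs(x, w, None)  # warm-up call (bias=None)
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
y = matmul_ogs(x, w, None)  # y has shape (B, M, N)
end.record()
torch.cuda.synchronize()
print(f"{start.elapsed_time(end):.3f} ms")
```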

File tree

7 files changed: +97 −28 lines changed


Makefile

Lines changed: 1 addition & 1 deletion

```diff
@@ -35,7 +35,7 @@ test-unit: all
     --ignore=language/test_subprocess.py --ignore=test_debug.py
     $(PYTEST) -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
     $(PYTEST) -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
-    $(PYTEST) -s -n 8 python/triton_kernels/tests/
+    $(PYTEST) -s -n 6 python/triton_kernels/tests/
     TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
 # Run attention separately to avoid out of gpu memory
     $(PYTEST) -vs python/tutorials/06-fused-attention.py
```

python/triton_kernels/tests/test_matmul.py

Lines changed: 54 additions & 1 deletion

```diff
@@ -1,6 +1,7 @@
 # isort: off
 # fmt: off
 from dataclasses import dataclass, fields, replace
+import itertools
 import pytest
 import torch
 from typing import Union
@@ -517,14 +518,66 @@ def round_x(x, idx):
         tri_y_scale).abs() < 1e-10, f"ref_y_scale: {ref_y_scale}, tri_y_scale: {tri_y_scale.item()}"
 
 
+# Test that we don't use unsupported block sizes.
+@pytest.mark.parametrize("m", [8, 16, 32, 64, 128])
+@pytest.mark.parametrize("n", [8, 16, 32, 64, 128])
+@pytest.mark.parametrize("k", [8, 16, 32, 64, 128])
+def test_small_batch_matmul(m, n, k):
+    if is_hip():
+        pytest.skip("Not fully tested on AMD")
+
+    if m * n * k > 16384:
+        pytest.skip()
+
+    BATCH_SIZE = 10000
+
+    def _make_tensor(shape, dtype, trans):
+        if trans:
+            shape = (shape[0], shape[2], shape[1])
+        t = alloc_rand(shape, "cuda", dtype)
+        return t.transpose(1, 2) if trans else t
+
+    for x_transpose, w_transpose, bias, dtype in itertools.product(
+        (False, True),
+        (False, True),
+        (False, True),
+        (torch.float16, torch.bfloat16, torch.float8_e5m2),
+    ):
+        if (
+            torch.cuda.get_device_capability()[0] < 10
+            and dtype is torch.float8_e5m2
+            and (not w_transpose)
+        ):
+            continue  # Not supported
+
+        x = _make_tensor((BATCH_SIZE, m, k), dtype, x_transpose)
+        w = _make_tensor((BATCH_SIZE, k, n), dtype, w_transpose)
+        bias = _make_tensor((BATCH_SIZE, n), torch.float32, False) if bias else None
+        tri_y = matmul_ogs(x, w, bias)
+
+        # ref_y = matmul_ogs_torch(x.float(), w.float(), bias)
+
+        # This is faster than matmul_ogs_torch.
+        ref_y = torch.bmm(x.float(), w.float())
+        if bias is not None:
+            ref_y += bias[:, None, :]
+
+        assert_close(
+            ref_y,
+            tri_y,
+            maxtol=4e-1 if dtype is torch.float8_e5m2 else None,
+            rmstol=4e-2 if dtype is torch.float8_e5m2 else None,
+        )
+
+
 def test_set_idle_sms():
     if not is_cuda():
         pytest.skip("Only supported on CUDA")
     from triton_kernels.matmul_ogs_details.opt_flags import make_opt_flags
     num_idle_sms = 24
     matmul_ogs_set_idle_sms(num_idle_sms)
     flags = make_opt_flags(torch.float32, torch.float32, torch.float32, PrecisionConfig(), \
-                           1024, 1024, 1024, None, True, False, 1)
+                           1, 1024, 1024, 1024, None, True, False, 1)
     assert flags.idle_sms == num_idle_sms
 
 
```

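A side note on the `_make_tensor` helper in the new test: allocating the swapped shape and then calling `.transpose(1, 2)` yields a view with the requested logical shape but transposed strides, which is what exercises the transposed-input code paths. A minimal standalone illustration (shapes are arbitrary):

```python
import torch

B, M, K = 4, 8, 16
t = torch.empty((B, K, M)).transpose(1, 2)
print(t.shape)            # torch.Size([4, 8, 16]) -- logical (B, M, K)
print(t.stride())         # (128, 1, 8) -- memory is laid out as (B, K, M)
print(t.is_contiguous())  # False
```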
python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -444,7 +444,7 @@ def matmul_ogs(x, w, bias,
     can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
     can_use_fused_scatter = scatter_indx is not None and fused_activation.specs.fn is None
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
-                               M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
+                               batch_size, M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
     )
     if w_scale is not None and opt_flags.is_persistent and not target_info.has_native_mxfp():
         raise NotImplementedError("Must use non-persistent kernel for simulated MXFP")
@@ -631,10 +631,10 @@ def matmul_ogs_torch(x, w, bias,
         assert routing_data is None, "routing not supported in batched mode"
         assert w.ndim == 3 and w.shape[0] == x.shape[0]
     if round_x is None:
-        round_x = lambda x: x
+        round_x = lambda x, idx: x
     if round_y is None:
         round_y = lambda x: x
-    if bias.ndim == 1:
+    if bias is not None and bias.ndim == 1:
         bias = bias.view(1, *bias.shape)
     if w.ndim == 2:
         w = w.view(1, *w.shape)
```

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -96,7 +96,8 @@ def _matmul_ogs(
         tl.assume(stride_w_mx_k >= 0)
     if stride_w_mx_n is not None:
         tl.assume(stride_w_mx_n >= 0)
-    tl.assume(stride_b_e >= 0)
+    if B is not None:
+        tl.assume(stride_b_e >= 0)
     tl.assume(batch_size >= 0)
     tl.assume(grid_m >= 0)
     tl.assume(grid_n >= 0)
```

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 14 additions & 7 deletions

```diff
@@ -36,6 +36,7 @@ def make_default_opt_flags_amd(
     lhs_dtype,
     rhs_dtype,
     precision_config,
+    batch_size,
     m,
     n,
     k,
@@ -134,6 +135,7 @@ def make_default_opt_flags_nvidia(
     lhs_dtype,
     rhs_dtype,
     precision_config,
+    batch_size,
     m,
     n,
     k,
@@ -147,7 +149,7 @@ def make_default_opt_flags_nvidia(
     constraints_supported = ["block_m", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile", "num_stages", "idle_sms"]
     assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
     # tokens per expert
-    if routing_data is None:
+    if routing_data is None or batch_size > 1:
         tokens_per_expt = m
     elif routing_data.expected_tokens_per_expt is None:
         tokens_per_expt = max(1, m // routing_data.n_expts_tot)
@@ -165,11 +167,11 @@ def make_default_opt_flags_nvidia(
         block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
     # block n
     arch = None
-    block_n = opt_flags_nvidia.compute_block_n(n, arch, precision_config)
+    block_n, block_n_tma = opt_flags_nvidia.compute_block_n(n, arch, precision_config)
     # is_persistent
-    grid_size = opt_flags_nvidia.compute_grid_size(routing_data, m, n, block_m, block_n)
+    grid_size_tma = opt_flags_nvidia.compute_grid_size(routing_data, batch_size, m, n, block_m, block_n_tma)
     n_sms = torch.cuda.get_device_properties(0).multi_processor_count
-    tiles_per_sm = grid_size / n_sms
+    tiles_per_sm = grid_size_tma / n_sms
     supports_persistent = can_use_persistent_tma and (arch is None or int(arch[2:-1]) >= 9)
     if constraints.get("is_persistent", None) is not None:
         is_persistent = constraints["is_persistent"]
@@ -179,6 +181,10 @@ def make_default_opt_flags_nvidia(
     # TEMP CHANGE
     if precision_config.act_scale is not None or precision_config.out_scale is not None:
         is_persistent = False
+    # TMA is slower for batched matmuls with small m/n/k.
+    if m * n * k < 131072:
+        is_persistent = False
+    block_n = block_n_tma if is_persistent else block_n
     # block k
     if constraints.get("block_k", None) is not None:
         block_k = constraints["block_k"]
@@ -190,7 +196,7 @@ def make_default_opt_flags_nvidia(
     elif is_persistent or enforce_bitwise_invariance or precision_config.act_scale is not None or precision_config.out_scale is not None:
         split_k = 1
     else:
-        estimated_actual_grid_size = opt_flags_nvidia.compute_grid_size(None, m, n, block_m, block_n)
+        estimated_actual_grid_size = opt_flags_nvidia.compute_grid_size(None, batch_size, m, n, block_m, block_n)
         split_k = opt_flags_nvidia.compute_split_k(block_k, k, estimated_actual_grid_size)
     if split_k > 1:
         # With split_k, results are written in f32. Use that for the following computations.
@@ -225,7 +231,7 @@ def make_default_opt_flags_nvidia(
     else:
         fused_scatter = can_use_fused_scatter and split_k == 1
     # Handshake with the HBM swizzling
-    num_warps = opt_flags_nvidia.compute_num_warps(block_m, block_n, precision_config)
+    num_warps = opt_flags_nvidia.compute_num_warps(block_m, block_n, is_persistent, precision_config)
     ret = OptFlags(
         block_m=block_m,
         block_n=block_n,
@@ -276,6 +282,7 @@ def make_opt_flags(
     lhs_dtype,
     rhs_dtype,
     precision_config,
+    batch_size,
     m,
     n,
     k,
@@ -290,7 +297,7 @@ def make_opt_flags(
     if _opt_flags is not None:
         assert not _opt_flags_constraints
         return _opt_flags
-    args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, m, n, k,
+    args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, batch_size, m, n, k,
            routing_data, can_use_persistent_tma, can_use_fused_scatter,
            enforce_bitwise_invariance, epilogue_effective_itemsize,
            _opt_flags_constraints]
```

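A minimal standalone sketch of the small-problem heuristic introduced here. The 131072 threshold and the block_n arithmetic are taken from this diff and from `compute_block_n` in the file below; the rest of the persistence decision is simplified, so treat this as an approximation rather than the actual selection logic.

```python
import triton

def pick_block_n_and_persistence(m: int, n: int, k: int, supports_persistent: bool):
    # Default branch of compute_block_n: the non-TMA path may shrink block_n
    # down to 8, while the TMA path keeps a minimum of 16.
    target = min(128, triton.next_power_of_2(n))
    block_n, block_n_tma = max(8, target), max(16, target)

    # TMA (persistent) kernels are slower for batched matmuls with small m/n/k.
    is_persistent = supports_persistent and m * n * k >= 131072
    return (block_n_tma if is_persistent else block_n), is_persistent

print(pick_block_n_and_persistence(8, 8, 8, True))        # (8, False)
print(pick_block_n_and_persistence(128, 128, 128, True))  # (128, True)
```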
python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 11 additions & 9 deletions

```diff
@@ -6,24 +6,25 @@
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE
 
 
-def compute_grid_size(routing_data, m, n, block_m, block_n):
-    if routing_data is not None:
+def compute_grid_size(routing_data, batch_size, m, n, block_m, block_n):
+    if routing_data is not None and batch_size == 1:
         grid_m = routing_data.n_blocks(m, block_m)
     else:
         grid_m = triton.cdiv(m, block_m)
     grid_n = (n + block_n - 1) // block_n
-    return grid_m * grid_n
+    return batch_size * grid_m * grid_n
 
 
 def compute_block_n(n: int, arch, precision_config):
     # block_n:
     layout = get_layout(precision_config.weight_scale)
     if isinstance(layout, HopperMXScaleLayout) and layout.num_warps == 4:
-        return 128
+        return 128, 128
     elif precision_config.max_num_imprecise_acc is None and n > 128:
-        return 256
+        return 256, 256
     else:
-        return max(16, min(128, triton.next_power_of_2(n)))
+        target = min(128, triton.next_power_of_2(n))
+        return max(8, target), max(16, target)
 
 
 def compute_block_k(m: int, k: int | None, is_persistent: bool, lhs_dtype, rhs_dtype, precision_config):
@@ -35,7 +36,8 @@ def compute_block_k(m: int, k: int | None, is_persistent: bool, lhs_dtype, rhs_d
     if rhs_width == 4 and not has_native_mxfp:
         block_k = 128
     elif k is not None:
-        block_k = max(32, min(triton.next_power_of_2(k), block_k))
+        min_block_k = 32 if is_persistent or lhs_width != 16 or rhs_width != 16 else 16
+        block_k = max(min_block_k, min(triton.next_power_of_2(k), block_k))
     has_mx_weight_scale = precision_config is not None and precision_config.weight_scale is not None
     if has_native_mxfp and is_persistent and has_mx_weight_scale:
         block_k = min(block_k, 128)
@@ -54,11 +56,11 @@ def compute_split_k(block_k: int, k: int | None, grid_size: int) -> int:
     return split_k
 
 
-def compute_num_warps(block_m, block_n, precision_config):
+def compute_num_warps(block_m, block_n, is_persistent: bool, precision_config):
     layout = get_layout(precision_config.weight_scale)
     if isinstance(layout, HopperMXScaleLayout):
         return layout.num_warps
-    return max(block_m * block_n // 4096, 4)
+    return max(block_m * block_n // 4096, 4 if is_persistent else 1)
 
 
 def compute_num_stages(
```

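A small worked example of what the updated helpers imply for a tiny batched problem. The block sizes used below (block_m=16, block_n=8) are what the defaults in opt_flags.py would pick for M=N=K=8; the arithmetic simply mirrors `compute_grid_size` and `compute_num_warps` above.

```python
import triton

# B=10000 batched matmuls with M=N=K=8.
batch_size, m, n = 10_000, 8, 8
block_m, block_n = 16, 8

# compute_grid_size now scales by batch_size: one tile per batch entry here.
grid = batch_size * triton.cdiv(m, block_m) * triton.cdiv(n, block_n)
print(grid)  # 10000

# compute_num_warps now allows a single warp for tiny non-persistent tiles
# (previously the floor was 4 warps regardless of tile size).
print(max(block_m * block_n // 4096, 1))  # 1
```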
python/triton_kernels/triton_kernels/routing.py

Lines changed: 12 additions & 6 deletions

```diff
@@ -333,12 +333,18 @@ def compute_expt_data_torch(hist, n_expts_tot, n_gates):
         token_offs_pad[block_m] = torch.cat((torch.zeros(1, device=device), token_offs_pad[block_m]))
         token_offs_pad[block_m] = token_offs_pad[block_m].int()
         # compute data required to drive ragged batch matmul
-        block_pid_map[block_m] = -torch.ones(max_n_tiles, device=device)
-        for e in range(n_expts_tot):
-            offset = token_offs_pad[block_m][e]
-            for b in range(n_tiles[e]):
-                block_pid_map[block_m][offset + b] = (b << 16) + e
-        block_pid_map[block_m] = block_pid_map[block_m].int()
+        block_pid_map[block_m] = -torch.ones(max_n_tiles, dtype=torch.int32, device=device)
+
+        # for e in range(n_expts_tot):
+        #     offset = token_offs_pad[block_m][e]
+        #     for b in range(n_tiles[e]):
+        #         block_pid_map[block_m][offset + b] = (b << 16) + e
+
+        col = torch.arange(max_n_tiles, device=device)
+        map_vals = torch.arange(n_expts_tot, device=device)[:, None] + (col << 16)[None, :]
+        map_idxs = token_offs_pad[block_m][:-1, None] + col[None, :]
+        mask = col[None, :] < n_tiles[:, None]
+        block_pid_map[block_m].index_put_((map_idxs[mask], ), map_vals.int()[mask])
     return ExptData(hist, token_offs_raw, token_offs_pad, block_pid_map)
 
 
```

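The vectorized `block_pid_map` construction above is equivalent to the double loop it replaces (kept as a comment in the diff). A self-contained check on small, made-up inputs; tensor names mirror the diff, but the values are hypothetical:

```python
import torch

# Made-up example: 3 experts with 2, 0, and 3 tiles respectively.
n_tiles = torch.tensor([2, 0, 3])
token_offs_pad = torch.tensor([0, 2, 2, 5])  # prefix sum of n_tiles with a leading 0
max_n_tiles = int(n_tiles.sum())
n_expts_tot = n_tiles.numel()

# Reference: the original Python double loop.
ref = -torch.ones(max_n_tiles, dtype=torch.int32)
for e in range(n_expts_tot):
    offset = token_offs_pad[e]
    for b in range(n_tiles[e]):
        ref[offset + b] = (b << 16) + e

# Vectorized construction from the diff.
out = -torch.ones(max_n_tiles, dtype=torch.int32)
col = torch.arange(max_n_tiles)
map_vals = torch.arange(n_expts_tot)[:, None] + (col << 16)[None, :]
map_idxs = token_offs_pad[:-1, None] + col[None, :]
mask = col[None, :] < n_tiles[:, None]
out.index_put_((map_idxs[mask],), map_vals.int()[mask])

assert torch.equal(ref, out)
print(out)  # tensor([     0,  65536,      2,  65538, 131074], dtype=torch.int32)
```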