
Commit 9801a7a

Reapply "[KERNELS] Improve block sizes for batched matmul_ogs with small m/n/k (#7897)" (#8084)
This reverts commit 0a2e3a3. (Verified that this is still faster on GB200 on top of recent fixes.)
1 parent 1c03c46 commit 9801a7a
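
The change threads the batch size through the kernel's flag selection so that batched matmuls with small m/n/k get smaller, non-TMA block sizes. A minimal usage sketch of the batched entry point, with shapes chosen to fall in the small-m/n/k regime this commit targets; the import path is assumed from the repository layout and a CUDA device is assumed available:

# Illustrative sketch only (not part of the commit).
import torch
from triton_kernels.matmul_ogs import matmul_ogs

batch, m, n, k = 10000, 16, 16, 16          # m * n * k = 4096, well below 131072
x = torch.randn(batch, m, k, device="cuda", dtype=torch.float16)
w = torch.randn(batch, k, n, device="cuda", dtype=torch.float16)
y = matmul_ogs(x, w, None)                  # bias=None; opt flags now see batch_size=batch
# The result has shape (batch, m, n), matching torch.bmm(x, w) as in the new test below.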

File tree

4 files changed: +80 −18 lines


python/triton_kernels/tests/test_matmul.py

Lines changed: 54 additions & 1 deletion
@@ -1,6 +1,7 @@
 # isort: off
 # fmt: off
 from dataclasses import dataclass, fields, replace
+import itertools
 import pytest
 import torch
 from typing import Union
@@ -470,14 +471,66 @@ def round_x(x, idx):
         tri_y_scale).abs() < 1e-10, f"ref_y_scale: {ref_y_scale}, tri_y_scale: {tri_y_scale.item()}"


+# Test that we don't use unsupported block sizes.
+@pytest.mark.parametrize("m", [8, 16, 32, 64, 128])
+@pytest.mark.parametrize("n", [8, 16, 32, 64, 128])
+@pytest.mark.parametrize("k", [8, 16, 32, 64, 128])
+def test_small_batch_matmul(m, n, k):
+    if is_hip():
+        pytest.skip("Not fully tested on AMD")
+
+    if m * n * k > 16384:
+        pytest.skip()
+
+    BATCH_SIZE = 10000
+
+    def _make_tensor(shape, dtype, trans):
+        if trans:
+            shape = (shape[0], shape[2], shape[1])
+        t = alloc_rand(shape, "cuda", dtype)
+        return t.transpose(1, 2) if trans else t
+
+    for x_transpose, w_transpose, bias, dtype in itertools.product(
+        (False, True),
+        (False, True),
+        (False, True),
+        (torch.float16, torch.bfloat16, torch.float8_e5m2),
+    ):
+        if (
+            torch.cuda.get_device_capability()[0] < 10
+            and dtype is torch.float8_e5m2
+            and (not w_transpose)
+        ):
+            continue  # Not supported
+
+        x = _make_tensor((BATCH_SIZE, m, k), dtype, x_transpose)
+        w = _make_tensor((BATCH_SIZE, k, n), dtype, w_transpose)
+        bias = _make_tensor((BATCH_SIZE, n), torch.float32, False) if bias else None
+        tri_y = matmul_ogs(x, w, bias)
+
+        # ref_y = matmul_ogs_torch(x.float(), w.float(), bias)
+        # This is faster than matmul_ogs_torch.
+        ref_y = torch.bmm(x.float(), w.float())
+        if bias is not None:
+            ref_y += bias[:, None, :]
+
+        assert_close(
+            ref_y,
+            tri_y,
+            maxtol=4e-1 if dtype is torch.float8_e5m2 else None,
+            rmstol=4e-2 if dtype is torch.float8_e5m2 else None,
+        )
+
+
 def test_set_idle_sms():
     if not is_cuda():
         pytest.skip("Only supported on CUDA")
     from triton_kernels.matmul_ogs_details.opt_flags import make_opt_flags
     num_idle_sms = 24
     matmul_ogs_set_idle_sms(num_idle_sms)
     flags = make_opt_flags(torch.float32, torch.float32, torch.float32, PrecisionConfig(), \
-        1024, 1024, 1024, None, True, False, 1)
+        1, 1024, 1024, 1024, None, True, False, 1)
     assert flags.idle_sms == num_idle_sms

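For reference, the parametrization above only exercises small problems: the early pytest.skip() drops any (m, n, k) combination whose product exceeds 16384. A small standalone sketch (plain Python, no test harness) that enumerates which combinations actually run:

# Sketch: count the (m, n, k) parametrizations kept by the skip condition
# m * n * k > 16384 in test_small_batch_matmul.
import itertools

sizes = [8, 16, 32, 64, 128]
kept = [(m, n, k) for m, n, k in itertools.product(sizes, repeat=3) if m * n * k <= 16384]
print(f"{len(kept)} of {len(sizes) ** 3} combinations run")  # (8, 8, 8) runs; (128, 128, 128) is skipped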

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 1 addition & 1 deletion
@@ -368,7 +368,7 @@ def matmul_ogs(x, w, bias,
     can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
     can_use_fused_scatter = has_scatter and (fused_activation.specs.fn is None) and (epilogue.specs.fn is None) and (routing_data.n_expts_act == 1)
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
-        M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
+        batch_size, M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
     )
     if not can_use_fused_scatter and opt_flags.fused_scatter:
         raise InapplicableConstraint("Fused scatter is not supported")
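
With this change, every caller of make_opt_flags passes the batch size ahead of M, N, K. A minimal sketch of the updated call, mirroring the test_set_idle_sms update above; import paths are assumed and the trailing arguments are copied verbatim from the test:

# Sketch of the updated make_opt_flags call order: batch_size (here 1) now
# comes right after the precision config and before M, N, K.
import torch
from triton_kernels.matmul_ogs import PrecisionConfig
from triton_kernels.matmul_ogs_details.opt_flags import make_opt_flags

flags = make_opt_flags(torch.float32, torch.float32, torch.float32, PrecisionConfig(),
                       1, 1024, 1024, 1024, None, True, False, 1)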

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 14 additions & 7 deletions
@@ -35,6 +35,7 @@ def make_default_opt_flags_amd(
     lhs_dtype,
     rhs_dtype,
     precision_config,
+    batch_size,
     m,
     n,
     k,
@@ -133,6 +134,7 @@ def make_default_opt_flags_nvidia(
     lhs_dtype,
     rhs_dtype,
     precision_config,
+    batch_size,
     m,
     n,
     k,
@@ -146,7 +148,7 @@ def make_default_opt_flags_nvidia(
     constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages", "idle_sms"]
     assert not any([c not in constraints_supported for c in constraints]), constraints.keys()
     # tokens per expert
-    if routing_data is None:
+    if routing_data is None or batch_size > 1:
         tokens_per_expt = m
     elif routing_data.expected_tokens_per_expt is None:
         tokens_per_expt = max(1, m // routing_data.n_expts_tot)
@@ -164,11 +166,11 @@ def make_default_opt_flags_nvidia(
     block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
     # block n
     arch = None
-    block_n = opt_flags_nvidia.compute_block_n(n, arch, precision_config)
+    block_n, block_n_tma = opt_flags_nvidia.compute_block_n(n, arch, precision_config)
     # is_persistent
-    grid_size = opt_flags_nvidia.compute_grid_size(routing_data, m, n, block_m, block_n)
+    grid_size_tma = opt_flags_nvidia.compute_grid_size(routing_data, batch_size, m, n, block_m, block_n_tma)
     n_sms = torch.cuda.get_device_properties(0).multi_processor_count
-    tiles_per_sm = grid_size / n_sms
+    tiles_per_sm = grid_size_tma / n_sms
     supports_persistent = can_use_persistent_tma and (arch is None or int(arch[2:-1]) >= 9)
     if constraints.get("is_persistent", None) is not None:
         is_persistent = constraints["is_persistent"]
@@ -178,6 +180,10 @@ def make_default_opt_flags_nvidia(
     # TEMP CHANGE
     if precision_config.act_scale is not None or precision_config.out_scale is not None:
         is_persistent = False
+    # TMA is slower for batched matmuls with small m/n/k.
+    if m * n * k < 131072:
+        is_persistent = False
+    block_n = block_n_tma if is_persistent else block_n
     # block k
     if constraints.get("block_k", None) is not None:
         block_k = constraints["block_k"]
@@ -189,7 +195,7 @@ def make_default_opt_flags_nvidia(
     elif is_persistent or enforce_bitwise_invariance or precision_config.act_scale is not None or precision_config.out_scale is not None:
         split_k = 1
     else:
-        estimated_actual_grid_size = opt_flags_nvidia.compute_grid_size(None, m, n, block_m, block_n)
+        estimated_actual_grid_size = opt_flags_nvidia.compute_grid_size(None, batch_size, m, n, block_m, block_n)
         split_k = opt_flags_nvidia.compute_split_k(block_k, k, estimated_actual_grid_size)
     if split_k > 1:
         # With split_k, results are written in f32. Use that for the following computations.
@@ -224,7 +230,7 @@ def make_default_opt_flags_nvidia(
     else:
         fused_scatter = can_use_fused_scatter and split_k == 1
     # Handshake with the HBM swizzling
-    num_warps = opt_flags_nvidia.compute_num_warps(block_m, block_n, precision_config)
+    num_warps = opt_flags_nvidia.compute_num_warps(block_m, block_n, is_persistent, precision_config)
     ret = OptFlags(
         block_m=block_m,
         block_n=block_n,
@@ -275,6 +281,7 @@ def make_opt_flags(
     lhs_dtype,
     rhs_dtype,
     precision_config,
+    batch_size,
     m,
     n,
     k,
@@ -291,7 +298,7 @@ def make_opt_flags(
     if _opt_flags is not None:
         assert not _opt_flags_constraints
         return _opt_flags
-    args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, m, n, k,
+    args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, batch_size, m, n, k,
             routing_data, can_use_persistent_tma, can_use_fused_scatter,
             enforce_bitwise_invariance, epilogue_effective_itemsize,
             _opt_flags_constraints]
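
The core of the NVIDIA flag changes is a size-based opt-out of the persistent TMA path: when m * n * k falls below 131072, persistence is disabled and the smaller non-TMA block_n (which may now be as low as 8) is kept. A standalone sketch of that decision, not the library code itself; names mirror the diff but the logic is simplified:

# Hedged sketch of the new is_persistent / block_n selection.
def pick_persistent_and_block_n(m, n, k, block_n, block_n_tma, supports_persistent):
    is_persistent = supports_persistent
    # TMA is slower for batched matmuls with small m/n/k.
    if m * n * k < 131072:
        is_persistent = False
    # The TMA-friendly block_n is only used when the persistent path is taken.
    return is_persistent, (block_n_tma if is_persistent else block_n)

# Example: m = n = k = 16 gives 4096 < 131072, so the non-persistent path
# with block_n as small as 8 is chosen.
print(pick_persistent_and_block_n(16, 16, 16, 8, 16, True))  # (False, 8)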

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 11 additions & 9 deletions
@@ -6,24 +6,25 @@
 from triton_kernels.numerics_details.mxfp_details._downcast_to_mxfp import MXFP_BLOCK_SIZE


-def compute_grid_size(routing_data, m, n, block_m, block_n):
-    if routing_data is not None:
+def compute_grid_size(routing_data, batch_size, m, n, block_m, block_n):
+    if routing_data is not None and batch_size == 1:
         grid_m = routing_data.n_blocks(m, block_m)
     else:
         grid_m = triton.cdiv(m, block_m)
     grid_n = (n + block_n - 1) // block_n
-    return grid_m * grid_n
+    return batch_size * grid_m * grid_n


 def compute_block_n(n: int, arch, precision_config):
     # block_n:
     layout = get_layout(precision_config.weight_scale)
     if isinstance(layout, HopperMXScaleLayout) and layout.num_warps == 4:
-        return 128
+        return 128, 128
     elif precision_config.max_num_imprecise_acc is None and n > 128:
-        return 256
+        return 256, 256
     else:
-        return max(16, min(128, triton.next_power_of_2(n)))
+        target = min(128, triton.next_power_of_2(n))
+        return max(8, target), max(16, target)


 def compute_block_k(m: int, k: int | None, is_persistent: bool, lhs_dtype, rhs_dtype, precision_config):
@@ -35,7 +36,8 @@ def compute_block_k(m: int, k: int | None, is_persistent: bool, lhs_dtype, rhs_dtype, precision_config):
     if rhs_width == 4 and not has_native_mxfp:
         block_k = 128
     elif k is not None:
-        block_k = max(32, min(triton.next_power_of_2(k), block_k))
+        min_block_k = 32 if is_persistent or lhs_width != 16 or rhs_width != 16 else 16
+        block_k = max(min_block_k, min(triton.next_power_of_2(k), block_k))
     has_mx_weight_scale = precision_config is not None and precision_config.weight_scale is not None
     if has_native_mxfp and is_persistent and has_mx_weight_scale:
         block_k = min(block_k, 128)
@@ -54,11 +56,11 @@ def compute_split_k(block_k: int, k: int | None, grid_size: int) -> int:
     return split_k


-def compute_num_warps(block_m, block_n, precision_config):
+def compute_num_warps(block_m, block_n, is_persistent: bool, precision_config):
     layout = get_layout(precision_config.weight_scale)
     if isinstance(layout, HopperMXScaleLayout):
         return layout.num_warps
-    return max(block_m * block_n // 4096, 4)
+    return max(block_m * block_n // 4096, 4 if is_persistent else 1)


 def compute_num_stages(
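
Concretely, compute_block_n now returns a (non-TMA, TMA) pair so small n can map to a block of 8 on the non-persistent path, and compute_grid_size multiplies the tile count by the batch size. A hedged worked example of the fallback branches (simplified; MX layouts, max_num_imprecise_acc, and routing are omitted):

# Simplified re-statement of the two helpers for small n, used only to show
# the resulting numbers; the real functions handle more cases.
import triton

def block_n_pair(n):
    target = min(128, triton.next_power_of_2(n))
    return max(8, target), max(16, target)          # (non-TMA block_n, TMA block_n)

def grid_size(batch_size, m, n, block_m, block_n):
    # Without routing data (or with batch_size > 1), the grid is one m x n
    # tile grid per batch element.
    return batch_size * triton.cdiv(m, block_m) * ((n + block_n - 1) // block_n)

print(block_n_pair(8))                    # (8, 16)
print(grid_size(10000, 16, 8, 16, 8))     # 10000 * 1 * 1 = 10000 tiles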
