Skip to content

Commit 55613a7

Browse files
authored
[KERNELS] tuning for small batch MoE (#8206)
Small-batch MoE should ideally have a block_m that is not too large, but is still >= the number of tokens per expert in the large majority of cases. This minimizes how often weights are loaded in duplicate. It also improves mxfp4 in bandwidth-bound cases by loading a full cache line.
1 parent de2ba39 commit 55613a7

File tree

1 file changed

+11
-1
lines changed
  • python/triton_kernels/triton_kernels/matmul_ogs_details

1 file changed

+11
-1
lines changed

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,10 @@
11
# isort: off
22
# fmt: off
33
from dataclasses import dataclass
4+
45
import triton
56
from triton_kernels.target_info import get_cdna_version
7+
from triton_kernels.tensor import FP4
68
import torch
79
from .opt_flags_details import opt_flags_amd, opt_flags_nvidia
810
from triton_kernels.tensor import bitwidth
@@ -186,7 +188,11 @@ def make_default_opt_flags_nvidia(
186188
elif enforce_bitwise_invariance:
187189
block_m = 128
188190
else:
189-
block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
191+
if tokens_per_expt <= 64 and routing_data is not None and routing_data.expt_hist is not None:
192+
# Ragged and likely memory bound; set the block size higher to minimize loading weights more than once.
193+
block_m = max(16, min(triton.next_power_of_2(2 * tokens_per_expt), 64))
194+
else:
195+
block_m = max(16, min(triton.next_power_of_2(tokens_per_expt), 128))
190196
# block n
191197
arch = None
192198
block_n, block_n_tma = opt_flags_nvidia.compute_block_n(n, arch, precision_config)
@@ -209,6 +215,10 @@ def make_default_opt_flags_nvidia(
209215
block_k = constraints["block_k"]
210216
else:
211217
block_k = opt_flags_nvidia.compute_block_k(m, k, is_persistent, lhs_dtype, rhs_dtype, precision_config)
218+
if block_n == 256 and block_k == 128 and block_m <= 64 and is_persistent and rhs_dtype == FP4 and k >= 4096 and tokens_per_expt > 1:
219+
# Swap block_n and block_k for mxfp4 weights so that block_k is a full cacheline, so long as K is sufficiently large.
220+
# TODO: swizzle the HBM layout of the weights instead
221+
block_n, block_k = block_k, block_n
212222
# split_k
213223
if batch_size > 1:
214224
split_k = 1 # currently not supported

0 commit comments

Comments
 (0)