[BENCH] make MoE routing another 4% faster (#7396)

apgoucher · web-flow · commit 80449c22b39a · 2025-07-06T13:03:56.000-07:00
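This reduces routing runtime from 12.3us to 11.8us by tweaking block
sizes (HIST_BLOCK_M in SortTokens.forward and TILE_SIZE in
sum_bitmatrix_rows) and by unrolling the streaming_topk merge loop with
tl.static_range when it has at most 4 iterations.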
diff --git a/python/triton_kernels/triton_kernels/reduction_details/reduce_bitmatrix.py b/python/triton_kernels/triton_kernels/reduction_details/reduce_bitmatrix.py
@@ -88,7 +88,7 @@ def sum_bitmatrix_rows(x, out_ret, partials_block_size=None, n_rows_raw=None):
     n_rows_pad, n_cols_raw = x.shape_pad[0], x.shape_raw[1]
     assert out_ret.shape == (n_cols_raw, )
 
-    TILE_SIZE = 2
+    TILE_SIZE = max(1, 128 // PARTIALS_BLOCK_M)
     BLOCK_MM = PARTIALS_BLOCK_M * TILE_SIZE
 
     pids_x = cdiv(n_rows_pad, BLOCK_MM)
diff --git a/python/triton_kernels/triton_kernels/routing.py b/python/triton_kernels/triton_kernels/routing.py
@@ -94,7 +94,7 @@ class SortTokens(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, expt_scal, expt_indx, bitmatrix):
-        HIST_BLOCK_M = 64
+        HIST_BLOCK_M = 32
         INDX_OFFS_BLOCK_M = 512
         MEMSET_BLOCK = 1024
         cdiv = triton.cdiv
diff --git a/python/triton_kernels/triton_kernels/topk_details/_topk_forward.py b/python/triton_kernels/triton_kernels/topk_details/_topk_forward.py
@@ -62,7 +62,7 @@ def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.co
     acc = tl.topk(x, N_EXPTS_ACT, dim=1)
 
     # subsequent iterations:
-    for _i in range(loop_iterations):
+    for _i in (tl.static_range if loop_iterations <= 4 else range)(loop_iterations):
         acc = tl.bitonic_merge(acc)  # ensure sorted ascending for the merge
         X_ptrs -= BLOCK_N
         offs_x_n -= BLOCK_N