Commit 0b1cf48

[BENCH] Routing improvements (#7369)
This improves end-to-end routing performance by about 30%:

- fp32 logits: 22.8us --> 18.0us
- fp16 logits: 17.3us --> 12.2us

via a combination of several optimisations, including liberally fusing kernels in order to reduce the total number of launches from 7 to 4. There are now only four obligatory kernel launches:

- `_topk_forward`
- `_sum_bitmatrix_rows`
- `_combined_routing_memset`
- `_combined_routing_compute`

although in an expert-sharding world, extra kernels are inserted between `_topk_forward` and `_sum_bitmatrix_rows` to mutate the bitmatrix and update the other data structures produced by `_topk_forward`.
1 parent 9af26ee commit 0b1cf48
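
The fused launches all follow the same "trampoline" pattern: a single 1-D grid covers several small workloads, and each program branches on `tl.program_id(0)` to decide which job it performs. Below is a minimal, self-contained sketch of that pattern; the kernel and variable names are hypothetical and not the ones added by this commit.

```python
# Minimal sketch of the pid-trampoline pattern used by the combined kernels.
# All names here are illustrative, not the ones added in this commit.
import torch
import triton
import triton.language as tl


@triton.jit
def _fill_sentinel(Out, n, sentinel, BLOCK: tl.constexpr, pid):
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    tl.store(Out + offs, sentinel, mask=offs < n)


@triton.jit
def _iota(Out, n, BLOCK: tl.constexpr, pid):
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    tl.store(Out + offs, offs, mask=offs < n)


@triton.jit
def _combined_trampoline(A, n_a, B, n_b, blocks_a, sentinel, BLOCK: tl.constexpr):
    # One launch, two workloads: low pids handle A, the remaining pids handle B.
    pid = tl.program_id(0)
    if pid < blocks_a:
        _fill_sentinel(A, n_a, sentinel, BLOCK, pid)
    else:
        _iota(B, n_b, BLOCK, pid - blocks_a)


a = torch.empty(1000, dtype=torch.int32, device="cuda")
b = torch.empty(3000, dtype=torch.int32, device="cuda")
BLOCK = 512
blocks_a = triton.cdiv(a.numel(), BLOCK)
blocks_b = triton.cdiv(b.numel(), BLOCK)
_combined_trampoline[(blocks_a + blocks_b, )](a, a.numel(), b, b.numel(), blocks_a, -1, BLOCK=BLOCK)
```

The combined kernels in this commit dispatch over more sub-tasks than this, but the structure is identical: the host computes how many programs each sub-task needs and passes those block counts (`blocks1a`, `blocks2a`, ...) into the kernel so it can slice up the pid range.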

4 files changed: +157 / -79 lines


python/triton_kernels/tests/test_routing.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -5,7 +5,7 @@
 from triton_kernels.testing import assert_equal
 
 
-def init_data(n_tokens, n_expts_tot, dtype=torch.float32, device="cuda"):
+def init_data(n_tokens, n_expts_tot, dtype=torch.float16, device="cuda"):
     logits = torch.randn((n_tokens, n_expts_tot), dtype=dtype, device=device, requires_grad=True)
     return logits
 
@@ -26,7 +26,7 @@ def test_op(n_tokens_pad, n_tokens_raw, n_expts_tot, n_expts_act, sm_first, use_
     else:
         n_routing_rows = torch.tensor([n_tokens_raw], dtype=torch.int32, device=device)
     n_gates_raw = n_tokens_raw * n_expts_act
-    tri_logits = init_data(n_tokens_pad, n_expts_tot, device=device).detach()
+    tri_logits = init_data(n_tokens_pad, n_expts_tot, device=device, dtype=torch.float32).detach()
     tri_logits[n_tokens_raw:, :] = float("inf")  # should not be used
     tri_logits = tri_logits.requires_grad_(True)
     ref_logits = tri_logits.clone().detach().requires_grad_(True)
```

python/triton_kernels/triton_kernels/routing.py

Lines changed: 80 additions & 37 deletions

```diff
@@ -1,9 +1,8 @@
 import torch
 import triton
 from dataclasses import dataclass, field
-from .routing_details._routing_compute import _routing_memset_indx
-from .routing_details._routing_compute import _routing_compute_indx_offs
-from .routing_details._routing_compute import _routing_compute_indx
+from .routing_details._routing_compute import _combined_routing_compute
+from .routing_details._routing_compute import _combined_routing_memset
 from .routing_details._routing_compute import _routing_clear_bitmatrix
 from .routing_details._expt_data import _expt_data_memset
 from .routing_details._expt_data import _expt_data_compute
@@ -115,32 +114,42 @@ def forward(ctx, expt_scal, expt_indx, bitmatrix):
         topk_indx = combined_indx[:n_gates_pad]
         gate_indx = combined_indx[n_gates_pad:]
         gate_scal = torch.empty(n_gates_pad, dtype=dtype, device=device)
-        _routing_memset_indx[(cdiv(n_gates_pad * 2, MEMSET_BLOCK) + 1, )](
+
+        token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1a, blocks2a, MEMSET_BLOCK_A, HIST2_BLOCK_M, block_m_log2_start, block_m_num = _compute_expt_data_internal(
+            hist, n_expts_tot, n_gates_pad)
+
+        blocks1b = cdiv(n_gates_pad * 2, MEMSET_BLOCK) + n_expts_tot + 1
+        blocks2b = cdiv(n_tokens_pad, HIST_BLOCK_M)
+
+        _combined_routing_memset[(blocks1a + blocks1b, )](
             combined_indx, n_gates_pad * 2, -1, MEMSET_BLOCK, hist,  #
-            expt_offs, hist.shape[0], BLOCK_N=512  #
-        )
-        _routing_compute_indx_offs[(n_expts_tot, )](
-            expt_offs, partial_hist,  # inputs
+            expt_offs, hist.shape[0], n_expts_tot, partial_hist,  # inputs
             partial_hist.shape[0], partial_hist.stride(0), partial_hist.stride(1),  # outputs
-            BLOCK_M=INDX_OFFS_BLOCK_M,  # tunable parameters
+            token_offs_combined, token_offs_combined.stride(0),  #
+            blocks1a, block_pid_map,  #
+            block_m_log2_start, SIZES=block_m_num, BLOCK_A=MEMSET_BLOCK_A,  # optimization parameters
+            BLOCK_N=512, BLOCK_M=INDX_OFFS_BLOCK_M,  # tunable parameters
         )
+
         indx_offs = partial_hist
-        _routing_compute_indx[(cdiv(n_tokens_pad, HIST_BLOCK_M), )](
+
+        _combined_routing_compute[(blocks2a + blocks2b, )](
             topk_indx, gate_indx, gate_scal,  # outputs
             expt_scal, expt_indx, indx_offs, indx_offs.stride(0), indx_offs.stride(1),  # inputs
-            n_tokens_pad, n_tokens_raw,  # input shape
-            BLOCK_M=HIST_BLOCK_M,  # tunable parameters
-            N_EXPTS_ACT=n_expts_act,  # constants
-            num_warps=1 if HIST_BLOCK_M * n_expts_act // 32 < 4 else 4  #
+            expt_offs, n_tokens_pad, n_tokens_raw,  # input shape
+            HIST_BLOCK_M, n_expts_act,  # constants
+            hist, token_offs_pad, token_offs_pad.stride(0), block_pid_map, block_pid_map.stride(0),  # outputs
+            block_m_log2_start, block_m_num, HIST2_BLOCK_M, blocks2a,  # etc.
         )
+
         ctx.n_tokens_raw = n_tokens_raw
         ctx.n_tokens_pad = n_tokens_pad
         ctx.n_expts_act = n_expts_act
         ctx.save_for_backward(gate_indx)
-        return hist, topk_indx, gate_indx, gate_scal
+        return hist, topk_indx, gate_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map
 
     @staticmethod
-    def backward(ctx, _0, _1, _2, dgate_scal):
+    def backward(ctx, _0, _1, _2, dgate_scal, _3, _4, _5):
         (gate_indx, ) = ctx.saved_tensors
         dgate_scal = dgate_scal[gate_indx]
         dgate_scal = dgate_scal.reshape(ctx.n_tokens_pad, ctx.n_expts_act)
@@ -193,16 +202,17 @@ def log2_power_of_two(x):
     return x.bit_length() - 1
 
 
-def compute_expt_data(expt_hist, n_expts_tot, n_gates):
-    if expt_hist is None:
-        return ExptData(None, None, None, None)
-    MEMSET_BLOCK = 128
+block_m_log2_start = 4
+
+
+def _compute_expt_data_internal(expt_hist, n_expts_tot, n_gates):
+
+    MEMSET_BLOCK = 512
     HIST2_BLOCK_M = 512
     device = expt_hist.device
     n_expts_tot = n_expts_tot
     cdiv = triton.cdiv
     # block_ms are all powers-of-two between 16 and 128 (inclusive)
-    block_m_log2_start = 4
     block_m_log2_end = 9 if is_hip() else 8
     block_m_num = block_m_log2_end - block_m_log2_start
     if n_gates <= n_expts_tot:
@@ -212,26 +222,53 @@ def compute_expt_data(expt_hist, n_expts_tot, n_gates):
     # allocate memory
     pad = lambda x: cdiv(x, MEMSET_BLOCK) * MEMSET_BLOCK
     dtype = torch.int32
-    token_offs_raw = torch.empty((n_expts_tot + 1, ), dtype=dtype, device=device)
-    token_offs_pad = torch.empty((block_m_num, pad(n_expts_tot + 1)), dtype=dtype, device=device)
+
+    token_offs_combined = torch.empty((block_m_num + 1, pad(n_expts_tot + 1)), dtype=dtype, device=device)
+
+    token_offs_raw = token_offs_combined[0][:n_expts_tot + 1]
+    token_offs_pad = token_offs_combined[1:]
+
     block_pid_map = torch.empty((block_m_num, pad(max_n_tiles)), dtype=dtype, device=device)
+    memset_grid = torch.numel(block_pid_map) // MEMSET_BLOCK  # exact division
     # compute outputs
     token_offs_pad = token_offs_pad[:, :n_expts_tot + 1]
     block_pid_map = block_pid_map[:, :max_n_tiles]
-    memset_grid = cdiv(block_pid_map.shape[1], MEMSET_BLOCK) + 1
-    _expt_data_memset[(memset_grid, block_m_num)](
-        expt_hist, n_expts_tot, token_offs_raw,  #
-        token_offs_pad, token_offs_pad.stride(0),  #
-        block_pid_map, block_pid_map.stride(0),  #
-        block_m_log2_start, BLOCK=MEMSET_BLOCK,  # optimization parameters
-        num_warps=1)
-    _expt_data_compute[(n_expts_tot, block_m_num)](
+
+    blocks1 = memset_grid + block_m_num + 1
+    blocks2 = n_expts_tot * block_m_num
+
+    return token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1, blocks2, MEMSET_BLOCK, HIST2_BLOCK_M, block_m_log2_start, block_m_num
+
+
+def _unpack_into_dict(x):
+
+    block_m_log2_end = block_m_log2_start + x.shape[0]
+    x = {2**j: x[i, :] for i, j in enumerate(range(block_m_log2_start, block_m_log2_end))}
+    return x
+
+
+def compute_expt_data(expt_hist, n_expts_tot, n_gates):
+
+    if expt_hist is None:
+        return ExptData(None, None, None, None)
+
+    # this just computes the kernel arguments:
+    token_offs_combined, token_offs_raw, token_offs_pad, block_pid_map, blocks1, blocks2, MEMSET_BLOCK, HIST2_BLOCK_M, block_m_log2_start, block_m_num = _compute_expt_data_internal(
+        expt_hist, n_expts_tot, n_gates)
+
+    _expt_data_memset[(blocks1, )](
+        expt_hist, n_expts_tot,  #
+        token_offs_combined, token_offs_combined.stride(0),  #
+        block_pid_map,  #
+        block_m_log2_start, SIZES=block_m_num, BLOCK=MEMSET_BLOCK,  # optimization parameters
+        num_warps=4)
+    _expt_data_compute[(blocks2, )](
         expt_hist, token_offs_pad, token_offs_pad.stride(0), block_pid_map, block_pid_map.stride(0),  # outputs
-        block_m_log2_start, BLOCK=HIST2_BLOCK_M,  # optimization parameters
+        block_m_log2_start, SIZES=block_m_num, BLOCK=HIST2_BLOCK_M,  # optimization parameters
         num_warps=4)
-    # unpack into datastructure
-    token_offs_pad = {2**j: token_offs_pad[i, :] for i, j in enumerate(range(block_m_log2_start, block_m_log2_end))}
-    block_pid_map = {2**j: block_pid_map[i, :] for i, j in enumerate(range(block_m_log2_start, block_m_log2_end))}
+
+    token_offs_pad = _unpack_into_dict(token_offs_pad)
+    block_pid_map = _unpack_into_dict(block_pid_map)
     return ExptData(expt_hist, token_offs_raw, token_offs_pad, block_pid_map)
 
 
@@ -249,12 +286,18 @@ def routing(logits, n_expts_act, sm_first=False, expt_indx=None, simulated_ep=1,
     # mutate bitmatrix
     if simulated_ep > 1:
         expt_scal, expt_indx, bitmatrix = prune_routing(expt_scal, expt_indx, bitmatrix, simulated_ep)
-    hist, topk_indx, gate_indx, gate_scal = sort_tokens(expt_scal, expt_indx, bitmatrix)
+    hist, topk_indx, gate_indx, gate_scal, token_offs_raw, token_offs_pad, block_pid_map = sort_tokens(
+        expt_scal, expt_indx, bitmatrix)
+
+    token_offs_pad = _unpack_into_dict(token_offs_pad)
+    block_pid_map = _unpack_into_dict(block_pid_map)
+    expt_data = ExptData(hist, token_offs_raw, token_offs_pad, block_pid_map)
+
     # pack the matmul data structure
     n_expts_tot = logits.shape[-1] // simulated_ep
     gather_indx = GatherIndx(src_indx=topk_indx, dst_indx=gate_indx)
     scatter_indx = ScatterIndx(src_indx=gate_indx, dst_indx=topk_indx)
-    expt_data = compute_expt_data(hist, n_expts_tot, topk_indx.numel())
+
    return RoutingData(gate_scal, hist, n_expts_tot, n_expts_act, expt_data), gather_indx, scatter_indx
 
 
```
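
As an aside, the `_unpack_into_dict` helper introduced above simply re-keys the rows of the stacked `(block_m_num, ...)` buffers by their block_m value. A standalone illustration of the keying scheme (the tensor contents here are made up; only the dictionary construction mirrors routing.py):

```python
# Illustration of the _unpack_into_dict keying scheme from routing.py.
import torch

block_m_log2_start = 4                         # block_ms start at 2**4 == 16

x = torch.zeros((4, 8), dtype=torch.int32)     # e.g. block_m_num == 4 stacked rows
block_m_log2_end = block_m_log2_start + x.shape[0]
unpacked = {2**j: x[i, :] for i, j in enumerate(range(block_m_log2_start, block_m_log2_end))}

print(sorted(unpacked.keys()))                 # [16, 32, 64, 128] -- one row per block_m
```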

python/triton_kernels/triton_kernels/routing_details/_expt_data.py

Lines changed: 22 additions & 27 deletions

```diff
@@ -8,50 +8,45 @@ def _cdiv_pow2(n, log2_k):
 
 
 @triton.jit
-def _expt_data_memset(Hist, n_expts_tot, MDTokStarts, MDTileStarts, tile_starts_stridem, MDTileInfo, tile_infos_stridem,
-                      first_tile_dim_log2, BLOCK: tl.constexpr):
-    pid_n = tl.program_id(0)
-    pid_m = tl.program_id(1)
+def _expt_data_memset(Hist, n_expts_tot, MDStarts, tile_starts_stridem, MDTileInfo, first_tile_dim_log2,
+                      SIZES: tl.constexpr, BLOCK: tl.constexpr):
 
-    tile_dim_log2 = first_tile_dim_log2 + pid_m
-    # if pid == 0 - initialize cumsums
-    if pid_n == 0:
-        MDTileStarts += pid_m * tile_starts_stridem
+    pid = tl.program_id(0)
 
-        x_tok = tl.zeros([BLOCK], dtype=MDTokStarts.dtype.element_ty)
-        x_tile = tl.zeros([BLOCK], dtype=MDTileStarts.dtype.element_ty)
+    if pid <= SIZES:
 
-        Tok_ptrs = MDTokStarts + tl.arange(0, BLOCK)
-        Tile_ptrs = MDTileStarts + tl.arange(0, BLOCK)
+        MDStarts += pid * tile_starts_stridem
+        x_tile = tl.zeros([BLOCK], dtype=MDStarts.dtype.element_ty)
+        Tile_ptrs = MDStarts + tl.arange(0, BLOCK)
+        tile_dim_log2 = tl.where(pid == 0, 0, pid + first_tile_dim_log2 - 1)
 
         for i in range(0, n_expts_tot + 1, BLOCK):
+
             offs_n = tl.arange(0, BLOCK) + i
             mask_n0 = offs_n < n_expts_tot
-            mask_n1 = offs_n < n_expts_tot + 1
             hist_tok = tl.load(Hist + offs_n, mask=mask_n0, other=0)
             hist_tile = _cdiv_pow2(hist_tok, tile_dim_log2)
-            tok_starts = tl.cumsum(hist_tok, 0) + x_tok
-            x_tok += tl.sum(hist_tok, 0).to(MDTokStarts.dtype.element_ty)
-            tile_starts = tl.cumsum(hist_tile, 0) + x_tile
-            x_tile += tl.sum(hist_tile, 0).to(MDTileStarts.dtype.element_ty)
 
-            tl.store(Tok_ptrs, tok_starts - hist_tok, mask=mask_n1)
-            tl.store(Tile_ptrs, tile_starts - hist_tile, mask=mask_n1)
-
-            Tok_ptrs += BLOCK
+            tile_starts = tl.cumsum(hist_tile, 0) + x_tile
+            x_tile += tl.sum(hist_tile, 0).to(MDStarts.dtype.element_ty)
+            tl.store(Tile_ptrs, tile_starts - hist_tile)
             Tile_ptrs += BLOCK
 
     else:
-        MDTileInfo += pid_m * tile_infos_stridem
-        TileInfoOut = MDTileInfo + (pid_n - 1) * BLOCK + tl.arange(0, BLOCK)
+
+        pid -= (SIZES + 1)
+        TileInfoOut = MDTileInfo + pid * BLOCK + tl.arange(0, BLOCK)
         tl.store(TileInfoOut, 0xffffffff)
 
 
 @triton.jit
 def _expt_data_compute(Hist, MDTileStarts, tile_starts_stridem, MDTileInfo, tile_info_stridem, first_tile_dim_log2,
-                       BLOCK: tl.constexpr):
-    expt_id = tl.program_id(0)
-    buff_id = tl.program_id(1)
+                       SIZES: tl.constexpr, BLOCK: tl.constexpr):
+
+    pid = tl.program_id(0)
+
+    expt_id = pid // SIZES
+    buff_id = pid % SIZES
 
     MDTileStarts += buff_id * tile_starts_stridem
     MDTileInfo += buff_id * tile_info_stridem
@@ -62,7 +57,7 @@ def _expt_data_compute(Hist, MDTileStarts, tile_starts_stridem, MDTileInfo, tile
 
     tile_off = tl.load(MDTileStarts + expt_id)
     MDTileInfo += tile_off
-    # MDTileInfo += tl.load(MDTilesStart + expt_id)
+
     for block_off in range(0, n_blocks, BLOCK):
         block_offs = block_off + tl.arange(0, BLOCK)
         data = (block_offs << 16) + expt_id
```

python/triton_kernels/triton_kernels/routing_details/_routing_compute.py

Lines changed: 53 additions & 13 deletions

```diff
@@ -1,6 +1,8 @@
 import triton
 import triton.language as tl
 
+from ._expt_data import _expt_data_compute, _expt_data_memset
+
 
 @triton.jit
 def _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size,  # histogram
@@ -18,13 +20,10 @@ def _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size,  # histog
 
 
 @triton.jit
-def _routing_compute_indx_offs(TokensStart, PartialHist, shape_pm, stride_pm, stride_pn, BLOCK_M: tl.constexpr):
-    expt_id = tl.program_id(0)
+def _routing_compute_indx_offs(PartialHist, shape_pm, stride_pm, stride_pn, BLOCK_M: tl.constexpr, expt_id):
     offs_m = tl.arange(0, BLOCK_M)
-    # initialize first row of the output
-    start = tl.load(TokensStart + expt_id)
     # iterate over input data
-    curr_sum = start
+    curr_sum = 0
     for _ in range(0, shape_pm, BLOCK_M):
         offs = offs_m * stride_pm + expt_id * stride_pn
         curr = tl.load(PartialHist + offs, mask=offs_m < shape_pm)
@@ -47,10 +46,10 @@ def _keyed_add(x, y):
 
 
 @triton.jit
-def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, stride_pn,
-                          n_tokens_pad, NTokensRaw, BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr):
+def _routing_compute_indx(pid_m, GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm,
+                          stride_pn, TokensStart, n_tokens_pad, NTokensRaw, BLOCK_M: tl.constexpr,
+                          N_EXPTS_ACT: tl.constexpr):
 
-    pid_m = tl.program_id(0)
     n_tokens = n_tokens_pad
     if NTokensRaw is not None:
         n_tokens = tl.load(NTokensRaw)
@@ -75,14 +74,31 @@ def _routing_compute_indx(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx,
     expts_and_inclusive_run_lengths = tl.associative_scan(x, 0, _keyed_add)
     exclusive_run_lengths = (expts_and_inclusive_run_lengths - 1) & 0xffff
 
-    gates = tl.load(PartialOffs + pid_m * stride_pm + expert * stride_pn, mask=(expert != 0xffff))
+    gates = tl.load(PartialOffs + pid_m * stride_pm + expert * stride_pn, mask=mask)
+    gates += tl.load(TokensStart + expert, mask=mask)
     gates += exclusive_run_lengths
 
     tl.store(ScatterIndx + offs, gates, mask=mask)
     tl.store(GatherIndx + gates, offs, mask=mask)
     tl.store(GateScal + gates, gate_scal, mask=mask)
 
 
+@triton.jit
+def _combined_routing_compute(GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm, stride_pn,
+                              TokensStart, n_tokens_pad, NTokensRaw, BLOCK_M: tl.constexpr, N_EXPTS_ACT: tl.constexpr,
+                              Hist, MDTileStarts, tile_starts_stridem, MDTileInfo, tile_info_stridem,
+                              first_tile_dim_log2, SIZES: tl.constexpr, BLOCK: tl.constexpr, blocks2a):
+
+    pid = tl.program_id(0)
+    if pid < blocks2a:
+        _expt_data_compute(Hist, MDTileStarts, tile_starts_stridem, MDTileInfo, tile_info_stridem, first_tile_dim_log2,
+                           SIZES, BLOCK)
+    else:
+        pid -= blocks2a
+        _routing_compute_indx(pid, GatherIndx, ScatterIndx, GateScal, ExptScal, ExptIndx, PartialOffs, stride_pm,
+                              stride_pn, TokensStart, n_tokens_pad, NTokensRaw, BLOCK_M, N_EXPTS_ACT)
+
+
 @triton.jit
 def _routing_clear_bitmatrix(Bitmatrix, stride_bm, stride_bn, shape_bn, cutoff, BLOCK_N: tl.constexpr):
     pid_m = tl.program_id(0)
@@ -98,13 +114,37 @@ def _routing_clear_bitmatrix(Bitmatrix, stride_bm, stride_bn, shape_bn, cutoff,
 
 
 @triton.jit
-def _routing_memset_indx(Indx, size, sentinel, BLOCK: tl.constexpr, ExpertHist, FinalExpertOffs, hist_size,
-                         BLOCK_N: tl.constexpr):
+def _combined_routing_memset(Indx, size, sentinel, BLOCK: tl.constexpr, ExpertHist, FinalExpertOffs, hist_size,
+                             n_expts_tot, PartialHist, shape_pm, stride_pm, stride_pn, MDStarts, tile_starts_stridem,
+                             blocks1a, MDTileInfo, first_tile_dim_log2, SIZES: tl.constexpr, BLOCK_A: tl.constexpr,
+                             BLOCK_N: tl.constexpr, BLOCK_M: tl.constexpr):
+    """
+    This kernel essentially combines 6 different pieces of functionality,
+    statically branching on the value of tl.program_id(0) to decide which
+    codepath to take.
+
+    pid == 0: create the token cumsum
+    1 <= pid <= SIZES: create a tile cumsum
+    SIZES < pid < blocks1a: initialise MDTileInfo to 0xffffffff
+    blocks1a <= pid < blocks1a + n_expts_tot: compute_indx_offs
+    pid == blocks1a + n_expts_tot: compute_expt_offs
+    pid > blocks1a + n_expts_tot: initialise Indx to sentinel
+
+    As each of these is a relatively trivial workload, launching them from
+    this single trampoline is beneficial as they can execute on different
+    streaming multiprocesses in parallel.
+    """
+
     pid = tl.program_id(0)
 
-    if pid == 0:
+    if pid < blocks1a:
+        _expt_data_memset(ExpertHist, n_expts_tot, MDStarts, tile_starts_stridem, MDTileInfo, first_tile_dim_log2,
+                          SIZES, BLOCK_A)
+    elif pid == n_expts_tot + blocks1a:
         _routing_compute_expt_offs(ExpertHist, FinalExpertOffs, hist_size, BLOCK_N)
+    elif pid < n_expts_tot + blocks1a:
+        _routing_compute_indx_offs(PartialHist, shape_pm, stride_pm, stride_pn, BLOCK_M, pid - blocks1a)
     else:
-        offs = (pid - 1) * BLOCK + tl.arange(0, BLOCK)
+        offs = (pid - n_expts_tot - blocks1a - 1) * BLOCK + tl.arange(0, BLOCK)
         mask = offs < size
         tl.store(Indx + offs, sentinel, mask=mask)
```
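
Tying this back to the launch site in routing.py: `_combined_routing_memset` runs on a 1-D grid of `blocks1a + blocks1b` programs, where `blocks1a` covers the fused `_expt_data_memset` work and `blocks1b = cdiv(n_gates_pad * 2, MEMSET_BLOCK) + n_expts_tot + 1` covers the remaining branches. A small sanity-check sketch with made-up sizes (only the two block-count formulas come from the diff):

```python
# Sanity-check sketch with illustrative numbers: count how many programs fall
# into each pid range described in the docstring of _combined_routing_memset.
from triton import cdiv

n_expts_tot, n_gates_pad = 128, 4096
MEMSET_BLOCK = 1024
block_m_num = 4                               # SIZES
memset_grid = 8                               # numel(block_pid_map) // MEMSET_BLOCK, made up here

blocks1a = memset_grid + block_m_num + 1      # _expt_data_memset portion of the grid
blocks1b = cdiv(n_gates_pad * 2, MEMSET_BLOCK) + n_expts_tot + 1
grid = blocks1a + blocks1b

# pid ranges, mirroring the docstring:
#   [0, blocks1a)                       -> _expt_data_memset (cumsums + 0xffffffff fill)
#   [blocks1a, blocks1a + n_expts_tot)  -> _routing_compute_indx_offs
#   blocks1a + n_expts_tot              -> _routing_compute_expt_offs
#   (blocks1a + n_expts_tot, grid)      -> memset Indx to the sentinel
print(grid, blocks1a, blocks1b)
```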
