
Commit cda4229

[BENCH] add renormalize knob in routing, so that the kernel can be used for qwen and mixtral moe family (#6896)
In some model families, such as Qwen1.5 or the Mixtral 8x7B / 8x22B MoE models, the expert weights are computed by applying softmax over all experts first and then taking top-k (without renormalization). To make the `routing` kernel compatible with those models, a new knob is added that turns off the softmax applied after top-k; instead, logits that are already softmax-ed are passed in and used directly.
1 parent ece03bb commit cda4229
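For reference, here is a minimal PyTorch sketch (illustrative only, not code from this commit) of the two conventions the knob switches between: the kernel's default of top-k on raw logits followed by a softmax over the selected values, versus the softmax-first, top-k-after convention described above.

```python
import torch

def route_renormalized(logits, k):
    # default kernel behaviour (renormalize=True):
    # top-k on the raw logits, then softmax over just the k selected values
    vals, idx = torch.topk(logits, k, dim=-1)
    return torch.softmax(vals, dim=-1), idx

def route_presoftmaxed(logits, k):
    # convention described above (renormalize=False):
    # softmax over all experts first, then top-k, with no renormalization
    probs = torch.softmax(logits, dim=-1)
    return torch.topk(probs, k, dim=-1)

logits = torch.randn(4, 8)  # 4 tokens, 8 experts (illustrative sizes)
w1, i1 = route_renormalized(logits, 2)
w2, i2 = route_presoftmaxed(logits, 2)
# softmax is monotonic per row, so the same experts are selected either way,
# but the weights differ: rows of w1 sum to 1, rows of w2 generally sum to less than 1
```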

File tree

4 files changed: +18, -12 lines

python/triton_kernels/tests/test_routing.py

Lines changed: 7 additions & 3 deletions
@@ -44,7 +44,8 @@ def ref_expt_data(routing_data, n_gates, block_m):
 @pytest.mark.parametrize("n_expts_tot, n_expts_act", [(128, 4), (1500, 8)])
 @pytest.mark.parametrize("block_m", [64, 128])
 @pytest.mark.parametrize("use_expt_indx", [False, True])
-def test_op(n_tokens, n_expts_tot, n_expts_act, block_m, use_expt_indx, device):
+@pytest.mark.parametrize("renormalize", [True, False])
+def test_op(n_tokens, n_expts_tot, n_expts_act, renormalize, block_m, use_expt_indx, device):
     torch.manual_seed(2)
     tri_logits = init_data(n_tokens, n_expts_tot, device=device).detach()
     ref_logits = tri_logits.clone()
@@ -55,8 +56,11 @@ def test_op(n_tokens, n_expts_tot, n_expts_act, block_m, use_expt_indx, device):
         ref_expt_indx = tri_expt_indx[:n_tokens]
     else:
         tri_expt_indx = ref_expt_indx = None
-    ref_routing_data, ref_gather, ref_scatter = routing_torch(ref_logits, n_expts_act, ref_expt_indx)
-    tri_routing_data, tri_gather, tri_scatter = routing(tri_logits, n_expts_act, tri_expt_indx)
+    if not renormalize:
+        tri_logits = torch.softmax(tri_logits, dim=-1)
+        ref_logits = torch.softmax(ref_logits, dim=-1)
+    ref_routing_data, ref_gather, ref_scatter = routing_torch(ref_logits, n_expts_act, renormalize, ref_expt_indx)
+    tri_routing_data, tri_gather, tri_scatter = routing(tri_logits, n_expts_act, renormalize, tri_expt_indx)
     ref_metadata = ref_expt_data(ref_routing_data, n_tokens * n_expts_act, block_m)
     tri_metadata = compute_metadata(tri_routing_data, n_tokens * n_expts_act, block_m)

python/triton_kernels/triton_kernels/routing.py

Lines changed: 5 additions & 4 deletions
@@ -53,7 +53,7 @@ def n_blocks(self, n_rows, block_m):
 # --------------------------


-def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
+def routing(logits, n_expts_act, renormalize=True, expt_indx=None, simulated_ep=1):
     from .topk import topk
     from .compaction import compaction
     cdiv = triton.cdiv
@@ -63,7 +63,7 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
     n_tokens, n_expts_tot = logits.shape
     n_gates = n_tokens * n_expts_act
     device = logits.device
-    expt_scal, expt_indx, bitmatrix = topk(logits, n_expts_act, y_indx=expt_indx)
+    expt_scal, expt_indx, bitmatrix = topk(logits, n_expts_act, apply_softmax=renormalize, y_indx=expt_indx)
     # mutate bitmatrix
     if simulated_ep > 1:
         assert n_expts_tot % simulated_ep == 0
@@ -108,7 +108,7 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
     return RoutingData(gate_scal, hist, n_expts_tot, n_expts_act), gather_indx, scatter_indx


-def routing_torch(logits, n_expts_act, expt_indx=None):
+def routing_torch(logits, n_expts_act, renormalize=True, expt_indx=None):

     def topk(vals, k, expt_indx):
         # topk of experts
@@ -121,7 +121,8 @@ def topk(vals, k, expt_indx):

     _, n_expts_tot = logits.shape
     expt_scal, expt_indx = topk(logits, n_expts_act, expt_indx)
-    expt_scal = torch.softmax(expt_scal, dim=-1)
+    if renormalize:
+        expt_scal = torch.softmax(expt_scal, dim=-1)
     # flatten topk data
     expt_scal = expt_scal.reshape(-1)
     expt_indx = expt_indx.reshape(-1).to(torch.int32)
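A hedged usage sketch of the new knob (the `triton_kernels.routing` import path, device, and tensor sizes are assumptions for illustration; only the `routing` signature comes from the diff above):

```python
import torch
from triton_kernels.routing import routing  # assumed import path

# illustrative sizes: 8 tokens routed over 128 experts, top-4
logits = torch.randn(8, 128, device="cuda", dtype=torch.float16)

# default path: the kernel softmaxes the selected top-k values itself
rdata, gather_indx, scatter_indx = routing(logits, 4)

# Qwen1.5 / Mixtral-style path: softmax over all experts is done up front,
# so pass the probabilities and disable the in-kernel softmax
probs = torch.softmax(logits, dim=-1)
rdata, gather_indx, scatter_indx = routing(probs, 4, renormalize=False)
```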

python/triton_kernels/triton_kernels/topk.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 from .bitmatrix import Bitmatrix


-def topk(x, k, dim=1, return_bitmatrix=True, y_indx=None):
+def topk(x, k, apply_softmax=True, dim=1, return_bitmatrix=True, y_indx=None):
     cdiv = lambda a, b: (a + b - 1) // b
     BLOCK_M = 32
     BLOCK_N = 32
@@ -39,5 +39,5 @@ def topk(x, k, dim=1, return_bitmatrix=True, y_indx=None):
              S, BLOCK_S, s_blocks, # thing to memset to zero
              BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, # tunable parameter
              N_EXPTS_PAD=n_cols_pad, N_EXPTS_ACT=k, # constants
-             )
+             APPLY_SOFTMAX=apply_softmax)
     return y_vals, y_indx, Bitmatrix(bitmatrix, [n_rows, n_cols], S)
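The standalone helper can be exercised the same way; a minimal sketch, assuming the module is importable as `triton_kernels.topk` and a CUDA device is available:

```python
import torch
from triton_kernels.topk import topk  # assumed import path

# caller applies softmax over all experts up front
x = torch.softmax(torch.randn(16, 128, device="cuda", dtype=torch.float16), dim=-1)
# apply_softmax=False keeps the selected values as-is instead of re-normalizing them in-kernel
y_vals, y_indx, bitmatrix = topk(x, 4, apply_softmax=False)
```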

python/triton_kernels/triton_kernels/topk_details/_topk.py

Lines changed: 4 additions & 3 deletions
@@ -72,7 +72,8 @@ def _topk(X, stride_xm, # inputs
           Yv, Yi, stride_ym, # topk values/indices
           USE_PROVIDED_INDX: tl.constexpr, Bits, stride_rm: tl.constexpr, stride_rn: tl.constexpr, n_rows, # bitmatrix
           n_expts_tot, S, BLOCK_S: tl.constexpr, s_blocks, # thing to memset
-          BLOCK_M: tl.constexpr, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr, BLOCK_N: tl.constexpr):
+          BLOCK_M: tl.constexpr, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr, BLOCK_N: tl.constexpr,
+          APPLY_SOFTMAX: tl.constexpr):

     pid = tl.program_id(0)

@@ -105,8 +106,8 @@ def _topk(X, stride_xm, # inputs
     y_indices = y & 0x0000FFFF
     y_values = (y >> x_nbits).to(x_utype).to(x_dtype, bitcast=True)

-    # normalize selected values
-    y_values = tl.softmax(y_values.to(tl.float32), dim=1, keep_dims=True).to(x_dtype)
+    if APPLY_SOFTMAX:
+        y_values = tl.softmax(y_values.to(tl.float32), dim=1, keep_dims=True).to(x_dtype)

     # write back
     Yv_ptrs = Yv + offs_m[:, None] * stride_ym + offs_y_n[None, :]
