Skip to content

Commit 2509898

Browse files
authored
[KERNELS] support 32-bit inputs in topk.py (#6856)
1 parent e64cda2 commit 2509898

File tree

4 files changed

+22
-16
lines changed

python/triton_kernels/bench/bench_mlp.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def _query_gpu_specs():
3838

3939
gpu_specs = {
4040
"NVIDIA H100 80GB HBM3": {"MAX_TFLOPS8": 1979, "MAX_TFLOPS16": 989, "MAX_TBPS": 3.35},
41-
"HGX GB200": {"MAX_TFLOPS8": 4500, "MAX_TFLOPS16": 2250, "MAX_TBPS": 8.0},
41+
"NVIDIA GB200": {"MAX_TFLOPS8": 4500, "MAX_TFLOPS16": 2250, "MAX_TBPS": 8.0},
4242
"AMD Instinct MI300X": {"MAX_TFLOPS8": 2615, "MAX_TFLOPS16": 1307, "MAX_TBPS": 5.3},
4343
"AMD Instinct MI325X": {"MAX_TFLOPS8": 2615, "MAX_TFLOPS16": 1307, "MAX_TBPS": 6.0},
4444
}
@@ -219,10 +219,11 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
219219
has_native_mx4 = torch.cuda.get_device_capability(0)[0] >= 10 or get_cdna_version() == 4
220220
if SPECS is None:
221221
print("Current GPU has no specs provided, utilization is N/A")
222-
batch_ranges = [(1024, 32768, 1024)]
222+
batch_ranges_dense = [(1024, 32768, 1024)]
223+
batch_ranges_moe = [(128, 512, 32), (512, 32000, 128)]
223224
dense_dtypes = ["fp8", "fp8"]
224225
quantized_dtypes = ["fp8", "mx4"] if has_native_mx4 else ["bf16", "mx4"]
225-
roofline_mlp(batch_ranges, 8192, 8192, 1, 1, *dense_dtypes, TP=1, EP=1, name="dense")
226-
roofline_mlp(batch_ranges, 8192, 8192, 1, 1, *quantized_dtypes, TP=1, EP=1, name="dense")
227-
roofline_mlp(batch_ranges, 5120, 8192, 128, 4, *dense_dtypes, TP=1, EP=1, name="llama4-maverick")
228-
roofline_mlp(batch_ranges, 5120, 8192, 128, 4, *quantized_dtypes, TP=1, EP=1, name="llama4-maverick")
226+
roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *dense_dtypes, TP=1, EP=1, name="dense")
227+
roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *quantized_dtypes, TP=1, EP=1, name="dense")
228+
roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *dense_dtypes, TP=1, EP=1, name="llama4-maverick")
229+
roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *quantized_dtypes, TP=1, EP=1, name="llama4-maverick")

python/triton_kernels/triton_kernels/routing.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
6262
HIST_BLOCK_M = 64
6363
INDX_OFFS_BLOCK_M = 512
6464
MEMSET_BLOCK = 1024
65-
assert logits.dtype.itemsize == 2
6665
n_tokens, n_expts_tot = logits.shape
6766
n_gates = n_tokens * n_expts_act
6867
device = logits.device

python/triton_kernels/triton_kernels/topk.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ def topk(x, k, dim=1, return_bitmatrix=True):
77
cdiv = lambda a, b: (a + b - 1) // b
88
BLOCK_M = 8
99
BLOCK_N = 128
10-
assert x.dtype.itemsize == 2
1110
assert x.ndim == 2
1211
assert x.shape[-1] < 32768
1312
assert dim == 1

python/triton_kernels/triton_kernels/topk_details/_topk.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
@triton.jit
66
def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr,
77
BLOCK_N: tl.constexpr):
8+
x_nbits: tl.constexpr = X.dtype.element_ty.primitive_bitwidth
9+
x_utype: tl.constexpr = tl.dtype(f"uint{x_nbits}")
10+
x_ultype: tl.constexpr = tl.dtype(f"uint{2*x_nbits}")
11+
x_dbtype: tl.constexpr = tl.dtype(f"fp{2*x_nbits}")
812

913
# subtract 1 from loop iterations because we peel the first (masked) iteration:
1014
loop_iterations: tl.constexpr = N_EXPTS_PAD // BLOCK_N - 1
@@ -15,8 +19,8 @@ def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.co
1519
# first iteration:
1620
X_ptrs = X + offs_m[:, None] * stride_xm + offs_x_n[None, :]
1721
x = tl.load(X_ptrs, mask=(mask_m & mask_n), other=float("-inf"))
18-
x = (x.to(tl.uint16, bitcast=True).to(tl.int32) << 16) | offs_x_n[None, :]
19-
x = x.to(tl.float32, bitcast=True)
22+
x = (x.to(x_utype, bitcast=True).to(x_ultype) << x_nbits) | offs_x_n[None, :]
23+
x = x.to(x_dbtype, bitcast=True)
2024

2125
acc = tl.topk(x, N_EXPTS_ACT, dim=1)
2226

@@ -26,8 +30,8 @@ def streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD: tl.co
2630
X_ptrs -= BLOCK_N
2731
offs_x_n -= BLOCK_N
2832
x = tl.load(X_ptrs, mask=mask_m, other=float("-inf"))
29-
x = (x.to(tl.uint16, bitcast=True).to(tl.int32) << 16) | offs_x_n[None, :]
30-
x = x.to(tl.float32, bitcast=True)
33+
x = (x.to(x_utype, bitcast=True).to(x_ultype) << x_nbits) | offs_x_n[None, :]
34+
x = x.to(x_dbtype, bitcast=True)
3135
acc = tl.maximum(acc, tl.topk(x, N_EXPTS_ACT, dim=1))
3236

3337
return acc
@@ -43,18 +47,21 @@ def _topk(X, stride_xm, # inputs
4347
tl.static_assert(BLOCK_N % 32 == 0)
4448
tl.static_assert(N_EXPTS_PAD % BLOCK_N == 0)
4549
x_dtype: tl.constexpr = X.dtype.element_ty
50+
x_nbits: tl.constexpr = X.dtype.element_ty.primitive_bitwidth
51+
x_utype: tl.constexpr = tl.dtype(f"uint{x_nbits}")
52+
x_ultype: tl.constexpr = tl.dtype(f"uint{2*x_nbits}")
4653

4754
# load logits
4855
offs_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
4956
mask_m = offs_m[:, None] < n_rows
5057
y = streaming_topk(X, stride_xm, n_expts_tot, offs_m, mask_m, N_EXPTS_PAD, N_EXPTS_ACT, BLOCK_N)
51-
y = y.to(tl.uint32, bitcast=True)
58+
y = y.to(x_ultype, bitcast=True)
5259

5360
# sort result in direction of ascending expert index
54-
y = (y << 16) | (y >> 16)
61+
y = (y << x_nbits) | (y >> x_nbits)
5562
y = tl.sort(y, dim=1)
56-
y_indices = y >> 16
57-
y_values = (y & 0x0000FFFF).to(tl.uint16).to(x_dtype, bitcast=True)
63+
y_indices = y >> x_nbits
64+
y_values = (y & ((1 << x_nbits) - 1)).to(x_utype).to(x_dtype, bitcast=True)
5865
y_values = tl.softmax(y_values.to(tl.float32), dim=1, keep_dims=True).to(x_dtype)
5966

6067
# write back

0 commit comments

Comments (0)