
Commit 8203497

add triton_softmax_topk
1 parent 2d4f7d4 commit 8203497

File tree

2 files changed: +163 −19 lines changed
lightllm/common/fused_moe/softmax_topk.py

Lines changed: 161 additions & 0 deletions (new file)
import torch
import triton
import triton.language as tl


@triton.jit
def softmax_topk_kernel(
    topk_weights_ptr,
    topk_indices_ptr,
    gating_output_ptr,
    input_row_stride,
    output_weights_row_stride,
    output_indices_row_stride,
    n_rows,
    n_cols,
    BLOCK_SIZE: tl.constexpr,
    top_k: tl.constexpr,
):
    # One program per token row.
    row_idx = tl.program_id(0)

    row_input_ptr = gating_output_ptr + row_idx * input_row_stride
    row_weights_ptr = topk_weights_ptr + row_idx * output_weights_row_stride
    row_indices_ptr = topk_indices_ptr + row_idx * output_indices_row_stride

    offsets = tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_cols

    # Out-of-range lanes load -inf so they can never win a max.
    values = tl.load(row_input_ptr + offsets, mask=mask, other=-float("inf"))

    # Numerically stable softmax: shift by the row max, then compute the
    # denominator once; it stays valid while entries are peeled off below.
    current_max = tl.max(values, axis=0)
    values = values - current_max
    numerators = tl.exp(values)
    denom = tl.sum(numerators, axis=0)

    # Iterative top-k: take the current max, emit its probability and index,
    # then mask it to -inf so the next iteration finds the runner-up.
    for i in range(top_k):
        logit = tl.max(values, axis=0)
        idx = tl.argmax(values, axis=0)

        prob = tl.exp(logit) / denom

        # Scalar store via lane 0: `offsets * 0` broadcasts the pointer to
        # BLOCK_SIZE lanes, and the mask keeps only the first one.
        lane0 = offsets == 0
        ptr_w = row_weights_ptr + i + offsets * 0
        ptr_i = row_indices_ptr + i + offsets * 0
        tl.store(ptr_w, tl.where(lane0, prob, 0.0), mask=lane0)
        tl.store(ptr_i, tl.where(lane0, idx, 0), mask=lane0)

        values = tl.where(offsets == idx, -float("inf"), values)

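The kernel never sorts: it builds the softmax denominator once, then runs top_k select-and-mask passes over the row. A minimal PyTorch sketch of the same math (illustration only, not part of the commit):

    def softmax_topk_reference(gating, k):
        # Same shift-exp-sum as the kernel, vectorized over rows.
        shifted = gating - gating.max(dim=-1, keepdim=True).values
        denom = shifted.exp().sum(dim=-1)
        vals, ids = [], []
        work = shifted.clone()
        for _ in range(k):
            logit, idx = work.max(dim=-1)
            vals.append(logit.exp() / denom)
            ids.append(idx)
            # Knock out the winner so the next pass finds the runner-up.
            work.scatter_(-1, idx.unsqueeze(-1), float("-inf"))
        return torch.stack(vals, dim=-1), torch.stack(ids, dim=-1)
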
def softmax_topk(gating_output: torch.Tensor, topk: int):
    assert gating_output.dim() == 2, "gating_output must be 2-D (num_tokens, num_experts)."
    num_tokens, num_experts = gating_output.shape
    device = gating_output.device

    # The kernel computes in fp32; upcast lower-precision gating logits.
    if gating_output.dtype != torch.float32:
        gating_output = gating_output.to(torch.float32)

    topk_vals = torch.empty((num_tokens, topk), dtype=torch.float32, device=device)
    topk_idxs = torch.empty((num_tokens, topk), dtype=torch.int32, device=device)

    # One block spans a whole row, so it must be a power of two >= num_experts.
    BLOCK_SIZE = triton.next_power_of_2(num_experts)

    grid = (num_tokens,)
    softmax_topk_kernel[grid](
        topk_vals,
        topk_idxs,
        gating_output,
        gating_output.stride(0),
        topk_vals.stride(0),
        topk_idxs.stride(0),
        num_tokens,
        num_experts,
        BLOCK_SIZE=BLOCK_SIZE,
        top_k=topk,
        num_warps=8,
    )
    return topk_vals, topk_idxs

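A quick smoke test (my sketch, assuming a CUDA device is available; indices can differ from torch.topk only on exact ties):

    gating = torch.randn(16, 128, device="cuda")
    weights, ids = softmax_topk(gating, 4)
    ref = torch.topk(torch.softmax(gating, dim=-1), 4, dim=-1)
    assert torch.allclose(weights, ref.values, atol=1e-6)
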
import sgl_kernel as sgl_ops


# Benchmark the Triton kernel against sgl_kernel and native PyTorch.
def benchmark(M, N, K):
    gating = torch.randn(M, N, device="cuda", dtype=torch.float32)
    torch.cuda.synchronize()

    # 1. SGL kernel
    sgl_vals = torch.empty((M, K), dtype=torch.float32, device="cuda")
    sgl_ids = torch.empty((M, K), dtype=torch.int32, device="cuda")
    # Warm-up
    sgl_ops.topk_softmax(sgl_vals, sgl_ids, torch.empty_like(sgl_ids), gating)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    sgl_ops.topk_softmax(sgl_vals, sgl_ids, torch.empty_like(sgl_ids), gating)
    end.record()
    torch.cuda.synchronize()
    t_sgl = start.elapsed_time(end) / 1000.0

    # 2. Triton kernel
    t0 = torch.cuda.Event(enable_timing=True)
    t1 = torch.cuda.Event(enable_timing=True)
    # Warm-up
    softmax_topk(gating, K)
    t0.record()
    triton_vals, triton_ids = softmax_topk(gating, K)
    t1.record()
    torch.cuda.synchronize()
    t_triton = t0.elapsed_time(t1) / 1000.0

    # 3. Native PyTorch
    start, end = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
    start.record()
    probs = torch.softmax(gating, dim=-1)
    torch_vals, torch_ids = torch.topk(probs, K, dim=-1)
    end.record()
    torch.cuda.synchronize()
    t_torch = start.elapsed_time(end) / 1000.0

    # Compare indices and weights.
    # Count mismatches of ordered indices.
    diff_sgl_triton_ids = (sgl_ids != triton_ids).sum().item()
    diff_torch_triton_ids = (torch_ids != triton_ids).sum().item()
    # Max absolute difference of weights, aligned by position.
    max_err_triton_torch = (triton_vals - torch_vals).abs().max().item()
    max_err_sgl_torch = (sgl_vals - torch_vals).abs().max().item()
    max_err_triton_sgl = (triton_vals - sgl_vals).abs().max().item()

    results = {
        "time_sgl": t_sgl,
        "time_triton": t_triton,
        "time_torch": t_torch,
        "mismatch_sgl_triton_ids": diff_sgl_triton_ids,
        "mismatch_torch_triton_ids": diff_torch_triton_ids,
        "max_err_triton_torch": max_err_triton_torch,
        "max_err_triton_sgl": max_err_triton_sgl,
        "max_err_sgl_torch": max_err_sgl_torch,
        "sgl_ids": sgl_ids,
        "triton_ids": triton_ids,
        "torch_ids": torch_ids,
        "sgl_vals": sgl_vals,
        "triton_vals": triton_vals,
        "torch_vals": torch_vals,
    }
    return results


if __name__ == "__main__":
    # Example: 8192 tokens, 1024 experts, Top-4
    M, N, K = 8192, 1024, 4
    res = benchmark(M, N, K)
    print(f"SGL time:     {res['time_sgl']:.6f}s")
    print(f"Triton time:  {res['time_triton']:.6f}s")
    print(f"PyTorch time: {res['time_torch']:.6f}s")
    print("Mismatch SGL vs Triton ids:  ", res["mismatch_sgl_triton_ids"])
    print("Mismatch Torch vs Triton ids:", res["mismatch_torch_triton_ids"])
    print("Max err Triton vs Torch:", res["max_err_triton_torch"])
    print("Max err Triton vs SGL:  ", res["max_err_triton_sgl"])
    print("Max err SGL vs Torch:   ", res["max_err_sgl_torch"])

lightllm/common/fused_moe/topk_select.py

Lines changed: 2 additions & 19 deletions
@@ -22,6 +22,7 @@
 from lightllm.utils.sgl_utils import sgl_ops
 from lightllm.utils.light_utils import light_ops
 from typing import Callable, List, Optional, Tuple
+from lightllm.common.fused_moe.softmax_topk import softmax_topk

 use_cuda_grouped_topk = os.getenv("LIGHTLLM_CUDA_GROUPED_TOPK", "False").upper() in ["ON", "TRUE", "1"]

@@ -33,24 +34,8 @@ def fused_topk(
     renormalize: bool,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
-    assert (
-        sgl_ops is not None
-    ), "sgl_kernel is not installed, you can't use the cuda fused_topk. \
-        You can solve it by running `pip install sgl_kernel`."

-    M, _ = hidden_states.shape
-
-    topk_weights = torch.empty(M, topk, dtype=torch.float32, device=hidden_states.device)
-    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
-    token_expert_indicies = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
-
-    sgl_ops.topk_softmax(
-        topk_weights,
-        topk_ids,
-        token_expert_indicies,
-        gating_output.float(),  # TODO(woosuk): Optimize this.
-    )
-    del token_expert_indicies  # Not used. Will be used in the future.
+    topk_weights, topk_ids = softmax_topk(gating_output, topk)

     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

@@ -69,7 +54,6 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
 ):
-
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     if scoring_func == "sigmoid":
         scores = torch.sigmoid(gating_output)

@@ -145,7 +129,6 @@ def cuda_grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
 ):
-
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     assert light_ops is not None, "lightllm_kernel is not installed."
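The unchanged renormalize branch rescales the selected top-k weights to sum to 1, regardless of which kernel produced them. A tiny worked example (illustration only):

    w = torch.tensor([[0.5, 0.3]])
    w = w / w.sum(dim=-1, keepdim=True)  # tensor([[0.6250, 0.3750]])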
