
Commit a5265c4

add triton_softmax_topk (#912)
Co-authored-by: wangzaijun <wzjhelloworld@qq.com>
1 parent 6360293 commit a5265c4

3 files changed: +218 -6 lines changed

3 files changed

+218
-6
lines changed
lightllm/common/fused_moe/softmax_topk.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
import torch
import triton
import triton.language as tl


@triton.jit
def softmax_topk_kernel(
    topk_weights_ptr,
    topk_indices_ptr,
    gating_output_ptr,
    input_row_stride,
    output_weights_row_stride,
    output_indices_row_stride,
    n_rows,
    n_cols,
    BLOCK_SIZE: tl.constexpr,
    BLOCK_TOPK: tl.constexpr,
    top_k: tl.constexpr,
    NEED_MASK: tl.constexpr,
    RENORM: tl.constexpr,
):
    # One program instance handles one row of the gating output.
    row_idx = tl.program_id(0)

    row_input_ptr = gating_output_ptr + row_idx * input_row_stride
    row_weights_ptr = topk_weights_ptr + row_idx * output_weights_row_stride
    row_indices_ptr = topk_indices_ptr + row_idx * output_indices_row_stride

    offsets = tl.arange(0, BLOCK_SIZE)
    if NEED_MASK:
        # BLOCK_SIZE is padded to a power of two; mask off the padded tail.
        mask = offsets < n_cols
        values = tl.load(row_input_ptr + offsets, mask=mask, other=-float("inf"))
    else:
        values = tl.load(row_input_ptr + offsets)

    # Numerically stable softmax denominator: subtract the row max first.
    current_max = tl.max(values, axis=0)
    values = values - current_max
    numerators = tl.exp(values)
    denom = tl.sum(numerators, axis=0)

    # Select the top-k entries one at a time: take the current argmax,
    # store its softmax probability and index, then mask it out with -inf.
    sum_prob = 0.0
    for i in range(top_k):
        logit = tl.max(values, axis=0)
        idx = tl.argmax(values, axis=0)

        prob = tl.exp(logit) / denom
        sum_prob += prob

        ptr_w = row_weights_ptr + i
        ptr_i = row_indices_ptr + i

        tl.store(ptr_w, prob)
        tl.store(ptr_i, idx)

        values = tl.where(offsets == idx, -float("inf"), values)

    if RENORM:
        # Renormalize the selected probabilities so each row sums to 1.
        sum_prob = tl.where(sum_prob < 1e-8, 1e-8, sum_prob)
        topk_offd = tl.arange(0, BLOCK_TOPK)
        topk_mask = topk_offd < top_k
        prob = tl.load(row_weights_ptr + topk_offd, mask=topk_mask, other=0.0)
        prob = prob / sum_prob
        tl.store(row_weights_ptr + topk_offd, prob, mask=topk_mask)
    return


def softmax_topk(gating_output: torch.Tensor, topk: int, renorm: bool = False):
    assert gating_output.dim() == 2, "The dim of gating_output must be 2."
    num_tokens, num_experts = gating_output.shape
    device = gating_output.device

    if gating_output.dtype != torch.float32:
        gating_output = gating_output.to(torch.float32)

    topk_vals = torch.empty((num_tokens, topk), dtype=torch.float32, device=device)
    topk_idxs = torch.empty((num_tokens, topk), dtype=torch.int32, device=device)

    # Pad the expert dimension to the next power of two for tl.arange.
    BLOCK_SIZE = triton.next_power_of_2(num_experts)
    NEED_MASK = BLOCK_SIZE != num_experts

    # Warp-count heuristic: roughly one warp per 256 elements, clamped to [1, 16].
    num_warps = min(max(1, (BLOCK_SIZE // 8 // 32)), 16)

    grid = (num_tokens,)
    softmax_topk_kernel[grid](
        topk_vals,
        topk_idxs,
        gating_output,
        gating_output.stride(0),
        topk_vals.stride(0),
        topk_idxs.stride(0),
        num_tokens,
        num_experts,
        BLOCK_SIZE=BLOCK_SIZE,
        BLOCK_TOPK=triton.next_power_of_2(topk),
        top_k=topk,
        NEED_MASK=NEED_MASK,
        RENORM=renorm,
        num_warps=num_warps,
    )

    return topk_vals, topk_idxs
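
For orientation, here is a minimal usage sketch of the new softmax_topk entry point (not part of the commit; it assumes a CUDA device, and the index check mirrors the native-PyTorch reference used in the test file below):

import torch
from lightllm.common.fused_moe.softmax_topk import softmax_topk

# 16 tokens routed over 8 experts; keep the top 2 experts per token
gating = torch.randn(16, 8, device="cuda")
weights, ids = softmax_topk(gating, 2, renorm=True)  # float32 (16, 2), int32 (16, 2)

# indices should agree with a plain softmax + topk reference
ref = torch.softmax(gating, dim=-1).topk(2, dim=-1)
assert torch.equal(ids.long(), ref.indices)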

lightllm/common/fused_moe/topk_select.py

Lines changed: 3 additions & 6 deletions
@@ -22,6 +22,7 @@
 from lightllm.utils.sgl_utils import sgl_ops
 from lightllm.utils.light_utils import light_ops
 from typing import Callable, List, Optional, Tuple
+from lightllm.common.fused_moe.softmax_topk import softmax_topk

 use_cuda_grouped_topk = os.getenv("LIGHTLLM_CUDA_GROUPED_TOPK", "False").upper() in ["ON", "TRUE", "1"]

@@ -33,11 +34,9 @@ def fused_topk(
     renormalize: bool,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
-    assert (
-        sgl_ops is not None
-    ), "sgl_kernel is not installed, you can't use the cuda fused_topk. \
-        You can solve it by running `pip install sgl_kernel`."

+    if sgl_ops is None:
+        return softmax_topk(gating_output, topk, renorm=renormalize)
     M, _ = hidden_states.shape

     topk_weights = torch.empty(M, topk, dtype=torch.float32, device=hidden_states.device)

@@ -69,7 +68,6 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
 ):
-
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     if scoring_func == "sigmoid":
         scores = torch.sigmoid(gating_output)

@@ -145,7 +143,6 @@ def cuda_grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
 ):
-
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     assert light_ops is not None, "lightllm_kernel is not installed."
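
The net effect of the fused_topk hunk, sketched below. This is not code from the commit; the shapes are illustrative and the keyword names are taken from the signature fragments visible in the hunk above:

import torch
from lightllm.common.fused_moe.topk_select import fused_topk

hidden_states = torch.randn(16, 128, device="cuda")
gating_output = torch.randn(16, 8, device="cuda")

# With sgl_kernel installed this takes the CUDA topk_softmax path as before;
# without it, the call now falls back to the Triton softmax_topk kernel
# instead of raising an AssertionError.
weights, ids = fused_topk(hidden_states, gating_output, topk=2, renormalize=True)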
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
import torch
import time
import pytest
import numpy as np
from lightllm.common.fused_moe.softmax_topk import softmax_topk
from lightllm.utils.log_utils import init_logger

logger = init_logger(__name__)


def benchmark(M, N, K, renorm, runs):
    import sgl_kernel as sgl_ops

    gating = torch.randn(M, N, device="cuda", dtype=torch.float32)
    torch.cuda.synchronize()

    # 1. SGL kernel
    sgl_vals = torch.empty((M, K), dtype=torch.float32, device="cuda")
    sgl_ids = torch.empty((M, K), dtype=torch.int32, device="cuda")
    # Warm-up
    sgl_ops.topk_softmax(sgl_vals, sgl_ids, torch.empty_like(sgl_ids), gating)
    torch.cuda.synchronize()
    start = torch.cuda.Event(True)
    end = torch.cuda.Event(True)
    start.record()
    for _ in range(runs):
        sgl_ops.topk_softmax(sgl_vals, sgl_ids, torch.empty_like(sgl_ids), gating)
        if renorm:
            sgl_vals.div_(sgl_vals.sum(-1, keepdim=True).clamp_min(1e-8))
    end.record()
    torch.cuda.synchronize()
    t_sgl = start.elapsed_time(end) / runs

    # 2. Triton kernel
    t0 = torch.cuda.Event(True)
    t1 = torch.cuda.Event(True)
    # Warm-up
    softmax_topk(gating, K)
    torch.cuda.synchronize()
    t0.record()
    for _ in range(runs):
        triton_vals, triton_ids = softmax_topk(gating, K, renorm)
    t1.record()
    torch.cuda.synchronize()
    t_triton = t0.elapsed_time(t1) / runs

    # 3. Native PyTorch
    # Warm-up
    probs = torch.softmax(gating, dim=-1)
    torch.topk(probs, K, dim=-1)
    torch.cuda.synchronize()

    start, end = torch.cuda.Event(True), torch.cuda.Event(True)
    start.record()
    for _ in range(runs):
        probs = torch.softmax(gating, dim=-1)
        torch_vals, torch_ids = torch.topk(probs, K, dim=-1)
        if renorm:
            torch_vals.div_(torch_vals.sum(-1, keepdim=True).clamp_min(1e-8))
    end.record()
    torch.cuda.synchronize()
    t_torch = start.elapsed_time(end) / runs

    # Compare indices and weights
    # Count mismatches of ordered indices
    diff_sgl_triton_ids = (sgl_ids != triton_ids).sum().item()
    diff_torch_triton_ids = (torch_ids != triton_ids).sum().item()
    # Max absolute difference of weights aligned by position
    max_err_triton_torch = (triton_vals - torch_vals).abs().max().item()
    max_err_sgl_torch = (sgl_vals - torch_vals).abs().max().item()
    max_err_triton_sgl = (triton_vals - sgl_vals).abs().max().item()

    assert diff_sgl_triton_ids == 0, f"Mismatch SGL vs Triton ids: {diff_sgl_triton_ids}"
    assert diff_torch_triton_ids == 0, f"Mismatch Torch vs Triton ids: {diff_torch_triton_ids}"
    assert max_err_triton_torch < 1e-3, f"Max err Triton vs Torch: {max_err_triton_torch}"
    assert max_err_sgl_torch < 1e-3, f"Max err SGL vs Torch: {max_err_sgl_torch}"
    assert max_err_triton_sgl < 1e-3, f"Max err Triton vs SGL: {max_err_triton_sgl}"

    results = {
        "time_sgl": t_sgl,
        "time_triton": t_triton,
        "time_torch": t_torch,
        "mismatch_sgl_triton_ids": diff_sgl_triton_ids,
        "mismatch_torch_triton_ids": diff_torch_triton_ids,
        "max_err_triton_torch": max_err_triton_torch,
        "max_err_triton_sgl": max_err_triton_sgl,
        "max_err_sgl_torch": max_err_sgl_torch,
        "sgl_ids": sgl_ids,
        "triton_ids": triton_ids,
        "torch_ids": torch_ids,
        "sgl_vals": sgl_vals,
        "triton_vals": triton_vals,
        "torch_vals": torch_vals,
    }
    return results


def test_softmax_topk():
    M, N, K = 8192, 1024, 8
    res = benchmark(M, N, K, False, 1000)
    print(f"SGL time: {res['time_sgl']:.6f}ms")
    print(f"Triton time: {res['time_triton']:.6f}ms")
    print(f"PyTorch time: {res['time_torch']:.6f}ms")
    print("Mismatch SGL vs Triton ids:", res["mismatch_sgl_triton_ids"])
    print("Mismatch Torch vs Triton ids:", res["mismatch_torch_triton_ids"])
    print("Max err Triton vs Torch :", res["max_err_triton_torch"])
    print("Max err Triton vs SGL :", res["max_err_triton_sgl"])
    print("Max err SGL vs Torch :", res["max_err_sgl_torch"])
    benchmark(M, N, K, True, 10)
    benchmark(M, 256, 5, True, 10)
    benchmark(M, 127, 5, True, 10)


if __name__ == "__main__":
    pytest.main()
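
For readers skimming the test, the semantics under test boil down to a few lines of plain PyTorch. The reference sketch below is distilled from the native-PyTorch branch of benchmark above; it is not code from the commit:

import torch

def softmax_topk_reference(gating: torch.Tensor, k: int, renorm: bool = False):
    # full softmax over the expert dimension, then the k largest probabilities per row
    probs = torch.softmax(gating.float(), dim=-1)
    vals, ids = torch.topk(probs, k, dim=-1)
    if renorm:
        # renormalize the kept probabilities so each row sums to 1
        vals = vals / vals.sum(-1, keepdim=True).clamp_min(1e-8)
    return vals, ids.to(torch.int32)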
