add grouped_topk_cuda

sangchengmeng · sangchengmeng · commit 0a900c845f6d · 2025-02-11T07:27:10.000Z
diff --git a/lightllm/common/fused_moe/topk_select.py b/lightllm/common/fused_moe/topk_select.py
@@ -17,10 +17,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import torch
 from lightllm.common.vllm_kernel import _custom_ops as ops
 from typing import Callable, List, Optional, Tuple
 
+use_cuda_grouped_topk = os.environ.get("GROUPED_TOPK_CUDA", "false").lower()
 
 def fused_topk(
     hidden_states: torch.Tensor,
@@ -61,7 +63,7 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
 ):
-
+    
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
     if scoring_func == "sigmoid":
         scores = torch.sigmoid(gating_output)
@@ -89,6 +91,43 @@ def grouped_topk(
 
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 
+# This is used by the Deepseek-V2 model
+def grouped_topk_cuda(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+    scoring_func: str = "softmax",
+):
+
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    num_tokens = gating_output.shape[0]
+    num_experts = gating_output.shape[-1]
+    topk_weights = torch.empty(num_tokens, topk, device=hidden_states.device, dtype=torch.float32)
+    topk_indices = torch.empty(num_tokens, topk, device=hidden_states.device, dtype=torch.int32)
+    token_expert_indices = torch.empty(num_tokens, topk_group, device=hidden_states.device, dtype=torch.int32)
+    group_scores  = torch.empty(num_tokens, num_expert_group, device=hidden_states.device, dtype=torch.float32)
+    if correction_bias is None: 
+        correction_bias = torch.zeros_like(gating_output,dtype=torch.float32)
+    ops.grouped_topk(
+            topk_weights, 
+            correction_bias, 
+            topk_indices, 
+            token_expert_indices, 
+            gating_output.float(), 
+            num_expert_group, 
+            topk_group, 
+            topk, 
+            renormalize, 
+            scoring_func,
+            group_scores
+    )
+    
+    return topk_weights, topk_indices
+
 
 def select_experts(
     hidden_states: torch.Tensor,
@@ -103,21 +142,33 @@ def select_experts(
     custom_routing_function: Optional[Callable] = None,
 ):
     from lightllm.common.fused_moe.topk_select import fused_topk, grouped_topk
-
     # DeekSeekv2 uses grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
         assert num_expert_group is not None
-        topk_weights, topk_ids = grouped_topk(
-            hidden_states=hidden_states,
-            gating_output=router_logits,
-            correction_bias=correction_bias,
-            topk=top_k,
-            renormalize=renormalize,
-            num_expert_group=num_expert_group,
-            topk_group=topk_group,
-            scoring_func=scoring_func,
-        )
+        if use_cuda_grouped_topk == "true":
+            from lightllm.common.vllm_kernel import _custom_ops as ops
+            topk_weights, topk_ids = grouped_topk_cuda(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                correction_bias=correction_bias,
+                topk=top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                scoring_func=scoring_func,
+            )
+        else:
+            topk_weights, topk_ids = grouped_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                correction_bias=correction_bias,
+                topk=top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                scoring_func=scoring_func,
+            )
     elif custom_routing_function is None:
         topk_weights, topk_ids = fused_topk(
             hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize
diff --git a/lightllm/common/vllm_kernel/_ops.py b/lightllm/common/vllm_kernel/_ops.py
@@ -760,6 +760,23 @@ def topk_softmax(
 ) -> None:
     torch.ops.vllm_moe.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output)
 
+def grouped_topk(
+    topk_weights: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk_indices: torch.Tensor,
+    group_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    num_expert_group: int,
+    topk_group: int,
+    topk: int,
+    renormalize: bool,
+    scoring_func: str,
+    group_scores: torch.Tensor = None
+) -> None:
+    torch.ops.vllm_moe.grouped_topk(
+        topk_weights, correction_bias, topk_indices, group_indices, gating_output, num_expert_group, 
+        topk_group, topk, renormalize, scoring_func, group_scores
+    )
 
 def reshape_and_cache(
     key: torch.Tensor,