@@ -14,6 +14,7 @@
 from functools import partial
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_global_world_size
+from lightllm.distributed.communication_op import all_gather_into_tensor
 
 logger = init_logger(__name__)
 
@@ -82,6 +83,48 @@ def _get_qkv( |
         )
         return q, cache_kv
 
+    def _tpsp_get_qkv(
+        self,
+        input: torch.Tensor,
+        cache_kv,
+        infer_state: LlamaInferStateInfo,
+        layer_weight: Qwen3MOETransformerLayerWeight,
+    ) -> torch.Tensor:
+        if self.tp_world_size_ > 1:
+            sp_token_num, hidden_dim = input.shape
+            gather_input = self.alloc_tensor(
+                (sp_token_num * self.tp_world_size_, hidden_dim), dtype=input.dtype, device=input.device
+            )
+            all_gather_into_tensor(gather_input, input, group=infer_state.dist_group, async_op=False)
+            input = gather_input[0 : len(infer_state.position_cos), :]
+
+        input = input.view(-1, self.embed_dim_)
+        q = layer_weight.q_proj.mm(input)
+        cache_kv = layer_weight.kv_proj.mm(
+            input, out=cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_) * self.head_dim_)
+        ).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
+
+        rmsnorm_forward(
+            q.view(-1, self.head_dim_),
+            weight=layer_weight.q_norm_weight_.weight,
+            eps=self.eps_,
+            out=q.view(-1, self.head_dim_),
+        )
+
+        cache_kv[:, : self.tp_k_head_num_, :] = rmsnorm_forward(
+            cache_kv[:, : self.tp_k_head_num_, :].reshape(-1, cache_kv.shape[-1]),
+            weight=layer_weight.k_norm_weight_.weight,
+            eps=self.eps_,
+        ).view(-1, self.tp_k_head_num_, cache_kv.shape[-1])
+
+        rotary_emb_fwd(
+            q.view(-1, self.tp_q_head_num_, self.head_dim_),
+            cache_kv[:, : self.tp_k_head_num_, :],
+            infer_state.position_cos,
+            infer_state.position_sin,
+        )
+        return q, cache_kv
+
     def _moe_ffn(
         self, input, infer_state: LlamaInferStateInfo, layer_weight: Qwen3MOETransformerLayerWeight
     ) -> torch.Tensor:
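
Context for the new TP-SP path: each rank holds an equal-sized (possibly padded) shard of the token sequence, so _tpsp_get_qkv first all-gathers the shards into one buffer and trims it to the true token count (len(infer_state.position_cos)) before running the q/kv projections, the q/k RMSNorm, and the rotary embedding. The sketch below isolates just that gather-and-trim step, using plain torch.distributed instead of lightllm's all_gather_into_tensor wrapper; the helper name gather_sp_shard and the true_token_num argument are illustrative only, not part of the codebase.

    # Minimal sketch of the gather-and-trim step, not the lightllm API. Assumes
    # torch.distributed is initialized and tokens are split contiguously across
    # ranks in rank order, with any padding confined to the tail shard.
    import torch
    import torch.distributed as dist

    def gather_sp_shard(shard: torch.Tensor, true_token_num: int, group=None) -> torch.Tensor:
        """All-gather sequence-parallel shards of shape (sp_token_num, hidden_dim),
        then drop the padding rows so downstream projections see only real tokens."""
        world_size = dist.get_world_size(group=group)
        sp_token_num, hidden_dim = shard.shape
        # One contiguous buffer large enough for every rank's shard.
        gathered = torch.empty(
            (sp_token_num * world_size, hidden_dim), dtype=shard.dtype, device=shard.device
        )
        dist.all_gather_into_tensor(gathered, shard, group=group)
        # Shards are padded to equal length; trim back to the real token count,
        # which is the role len(infer_state.position_cos) plays in _tpsp_get_qkv.
        return gathered[:true_token_num, :]

After this gather, every rank projects the full token batch onto its own slice of heads, which is why the RMSNorm and rotary embedding in the new method operate on tp_q_head_num_ / tp_k_head_num_ heads rather than the full head count.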