1204

sangchengmeng · sangchengmeng · commit 3ee963eace62 · 2025-12-04T14:00:51.000Z
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_tp.py
@@ -20,6 +20,8 @@ def create_tp_moe_wegiht_obj(
     network_config: Dict[str, Any],
     layer_num: int,
     quant_cfg: Quantcfg = None,
+    fused_gate_up: bool = False,
+    gate_up_proj_name: str = None,
 ) -> Union["FusedMoeWeightTP", "FusedAWQMARLINMoeWeightTP"]:
     quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
     if quant_method is not None and quant_method.method_name == "awq_marlin":
@@ -36,6 +38,8 @@ def create_tp_moe_wegiht_obj(
             network_config=network_config,
             layer_num=layer_num,
             quant_cfg=quant_cfg,
+            fused_gate_up=fused_gate_up,
+            gate_up_proj_name=gate_up_proj_name,
         )
     else:
         return FusedMoeWeightTP(
@@ -51,6 +55,8 @@ def create_tp_moe_wegiht_obj(
             network_config=network_config,
             layer_num=layer_num,
             quant_cfg=quant_cfg,
+            fused_gate_up=fused_gate_up,
+            gate_up_proj_name=gate_up_proj_name,
         )
 
 
@@ -69,6 +75,8 @@ def __init__(
         network_config: Dict[str, Any],
         layer_num: int,
         quant_cfg: Quantcfg = None,
+        fused_gate_up: bool = False,
+        gate_up_proj_name: str = None,
     ) -> None:
         super().__init__()
         self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
@@ -79,6 +87,8 @@ def __init__(
         self.w1_weight_name = gate_proj_name
         self.w2_weight_name = down_proj_name
         self.w3_weight_name = up_proj_name
+        self.fused_gate_up = fused_gate_up
+        self.gate_up_proj_name = gate_up_proj_name
 
         self.e_score_correction_bias_name = e_score_correction_bias_name
         self.weight_prefix = weight_prefix
@@ -181,8 +191,6 @@ def _fuse(self):
 
                 inter_shape, hidden_size = self.w2_list[0].shape[0], self.w2_list[0].shape[1]
                 w2 = torch._utils._flatten_dense_tensors(self.w2_list).view(len(self.w2_list), inter_shape, hidden_size)
-                if self.fused_gate_up:
-                    w2 = w2.transpose(1, 2).contiguous()
                 if not self.quantized_weight and self.quant_method is not None:
                     qw1, qw1_scale, qw1_zero_point = self.quant_method.quantize(w1)
                     qw2, qw2_scale, qw2_zero_point = self.quant_method.quantize(w2)
@@ -228,51 +236,49 @@ def _fuse_weight_scale(self):
                 delattr(self, "experts_up_proj_scales")
                 delattr(self, "experts_gate_proj_scales")
 
+    def fused_gate_up_weights_load(self, weights):
+        key_gate_up = f"{self.weight_prefix}.{self.gate_up_proj_name}"  # ...experts.gate_up_proj
+        key_down = f"{self.weight_prefix}.{self.w2_weight_name}"
+        if (key_gate_up not in weights) or (key_down not in weights):
+            return
+        gate_up = weights[key_gate_up]
+        down = weights[key_down]
+        _, _, I_double = gate_up.shape
+        I_single = I_double // 2
+        start = self.tp_rank_ * self.split_inter_size
+        end = (self.tp_rank_ + 1) * self.split_inter_size
+
+        for i_experts in range(self.n_routed_experts):
+            gate_up_2d = gate_up[i_experts]
+            self.experts_gate_projs[i_experts] = gate_up_2d[:, :I_single][:, start:end].t().contiguous()
+            self.experts_up_projs[i_experts] = gate_up_2d[:, I_single:][:, start:end].t().contiguous()
+            self.w2_list[i_experts] = down[i_experts].t()[:, start:end].contiguous()
+
+    def normal_weights_load(self, weights):
+        for i_experts in range(self.n_routed_experts):
+
+            w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.weight"
+            w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.weight"
+            w3_weight = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.weight"
+
+            start = self.tp_rank_ * self.split_inter_size
+            end = (self.tp_rank_ + 1) * self.split_inter_size
+
+            if w1_weight in weights:
+                self.experts_gate_projs[i_experts] = weights[w1_weight][start:end, :]
+            if w3_weight in weights:
+                self.experts_up_projs[i_experts] = weights[w3_weight][start:end, :]
+            if w2_weight in weights:
+                self.w2_list[i_experts] = weights[w2_weight][start:end]
+
     def load_hf_weights(self, weights):
         if self.e_score_correction_bias_name in weights:
             self.e_score_correction_bias = self._cuda(weights[self.e_score_correction_bias_name])
-        self.fused_gate_up = self.w3_weight_name is None  # gate_up: [E,H,2I] down: [E,I,H]
-        key_gateup_3d = f"{self.weight_prefix}.{self.w1_weight_name}"  # ...experts.gate_up_proj
-        key_down_3d = f"{self.weight_prefix}.{self.w2_weight_name}"
-
-        if self.fused_gate_up and (key_gateup_3d in weights) and (key_down_3d in weights):
-            gate_up_3d = weights[key_gateup_3d]
-            down_3d = weights[key_down_3d]
-            assert gate_up_3d.dim() == 3 and down_3d.dim() == 3
-
-            E_ckpt, H_, twoE = gate_up_3d.shape
-            assert E_ckpt == self.n_routed_experts, f"experts mismatch: ckpt {E_ckpt} vs cfg {self.n_routed_experts}"
-            Eint_total = twoE // 2
-            start, end = self.tp_rank_ * self.split_inter_size, (self.tp_rank_ + 1) * self.split_inter_size
-            assert end <= Eint_total, "TP split exceeds total expert-intermediate size"
-
-            for i in range(self.n_routed_experts):
-                gu2d = gate_up_3d[i]
-                gate2d = gu2d[:, :Eint_total][:, start:end].t().contiguous()
-                up2d = gu2d[:, Eint_total:][:, start:end].t().contiguous()
-                self.experts_gate_projs[i] = gate2d
-                self.experts_up_projs[i] = up2d
-
-                self.w2_list[i] = down_3d[i][start:end, :].contiguous()
+        if self.fused_gate_up:  # gate_up: [E,H,2I] down: [E,I,H]
+            self.fused_gate_up_weights_load(weights)
         else:
-            for i_experts in range(self.n_routed_experts):
-                w1_weight = f"{self.weight_prefix}.{i_experts}.{self.w1_weight_name}.weight"
-                w2_weight = f"{self.weight_prefix}.{i_experts}.{self.w2_weight_name}.weight"
-                w3_weight = f"{self.weight_prefix}.{i_experts}.{self.w3_weight_name}.weight"
-
-                if w1_weight in weights:
-                    self.experts_gate_projs[i_experts] = weights[w1_weight][
-                        self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1), :
-                    ]
-                if w3_weight in weights:
-                    self.experts_up_projs[i_experts] = weights[w3_weight][
-                        self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1), :
-                    ]
-
-                if w2_weight in weights:
-                    self.w2_list[i_experts] = weights[w2_weight][
-                        :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
-                    ]
+            self.normal_weights_load(weights)
+
         if self.quant_method is not None:
             if self.fused_gate_up:
                 raise ValueError("qwen3_vl_moe not support quant now")
@@ -342,6 +348,8 @@ def __init__(
         network_config: Dict[str, Any],
         layer_num: int,
         quant_cfg: Quantcfg = None,
+        fused_gate_up: bool = False,
+        gate_up_proj_name: str = None,
     ) -> None:
         super().__init__()
         self.quant_method = quant_cfg.get_quant_method(layer_num, "fused_moe")
diff --git a/lightllm/models/qwen3_vl/layer_weights/pre_and_post_layer_weight.py b/lightllm/models/qwen3_vl/layer_weights/pre_and_post_layer_weight.py
@@ -1,4 +1,3 @@
-import torch
 import numpy as np
 from lightllm.common.basemodel import PreAndPostLayerWeight
 
diff --git a/lightllm/models/qwen3_vl/layer_weights/transformers_layer_weight.py b/lightllm/models/qwen3_vl/layer_weights/transformers_layer_weight.py
@@ -1,18 +1,4 @@
-import os
-import torch
-import math
-import numpy as np
 from lightllm.models.qwen3.layer_weights.transformer_layer_weight import Qwen3TransformerLayerWeight
-from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import (
-    ROWMMWeight,
-    MultiROWMMWeight,
-    COLMMWeight,
-    NormWeight,
-    FusedMoeWeightTP,
-    FusedMoeWeightEP,
-    ROWBMMWeight,
-)
 
 
 class Qwen3VLTransformerLayerWeight(Qwen3TransformerLayerWeight):  # 后面看要不要改
diff --git a/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_vl_moe/layer_infer/transformer_layer_infer.py
@@ -35,10 +35,7 @@ def _get_qkv(
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         input = input.view(-1, self.embed_dim_)
         q = layer_weight.q_proj.mm(input)
-        cache_kv = self._pre_cache_kv(infer_state=infer_state, layer_weight=layer_weight)
-        cache_kv = layer_weight.kv_proj.mm(
-            input, out=cache_kv.view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_) * self.head_dim_)
-        ).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
+        cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
         rmsnorm_forward(
             q.view(-1, self.head_dim_),
             weight=layer_weight.q_norm_weight_.weight,
diff --git a/lightllm/models/qwen3_vl_moe/layer_weights/transformers_layer_weight.py b/lightllm/models/qwen3_vl_moe/layer_weights/transformers_layer_weight.py
@@ -1,18 +1,6 @@
 import os
-import torch
-import math
-import numpy as np
-from lightllm.models.qwen3.layer_weights.transformer_layer_weight import Qwen3TransformerLayerWeight
 from lightllm.models.qwen3_moe.layer_weights.transformer_layer_weight import Qwen3MOETransformerLayerWeight
-from lightllm.common.basemodel.layer_weights.meta_weights import (
-    ROWMMWeight,
-    MultiROWMMWeight,
-    COLMMWeight,
-    NormWeight,
-    FusedMoeWeightTP,
-    FusedMoeWeightEP,
-    ROWBMMWeight,
-)
+from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeightEP, create_tp_moe_wegiht_obj
 
 
 class Qwen3VLMOETransformerLayerWeight(Qwen3MOETransformerLayerWeight):
@@ -39,7 +27,7 @@ def _init_weight_names(self):
     def _init_moe(self):
         moe_intermediate_size = self.network_config_["moe_intermediate_size"]
         self.moe_gate = ROWMMWeight(
-            weight_name=f"model.language_model.layers.{self.layer_num_}.mlp.gate.weight",
+            weight_names=f"model.language_model.layers.{self.layer_num_}.mlp.gate.weight",
             data_type=self.data_type_,
             layer_num=self.layer_num_,
             name="moe_gate",
@@ -50,10 +38,10 @@ def _init_moe(self):
         assert moe_mode in ["EP", "TP"]
 
         if moe_mode == "TP":
-            self.experts = FusedMoeWeightTP(
-                gate_proj_name="gate_up_proj",
+            self.experts = create_tp_moe_wegiht_obj(
+                gate_proj_name="gate_proj",
                 down_proj_name="down_proj",
-                up_proj_name=None,
+                up_proj_name="up_proj",
                 e_score_correction_bias_name="",
                 weight_prefix=f"model.language_model.layers.{self.layer_num_}.mlp.experts",
                 n_routed_experts=self.n_routed_experts,
@@ -63,19 +51,23 @@ def _init_moe(self):
                 layer_num=self.layer_num_,
                 quant_cfg=self.quant_cfg,
                 num_fused_shared_experts=0,
+                fused_gate_up=True,
+                gate_up_proj_name="gate_up_proj",
             )
         elif moe_mode == "EP":
             self.experts = FusedMoeWeightEP(
-                gate_proj_name="gate_up_proj",
+                gate_proj_name="gate_proj",
                 down_proj_name="down_proj",
-                up_proj_name=None,
+                up_proj_name="up_proj",
                 e_score_correction_bias_name="",
                 weight_prefix=f"model.language_model.layers.{self.layer_num_}.mlp.experts",
                 n_routed_experts=self.n_routed_experts,
                 data_type=self.data_type_,
                 network_config=self.network_config_,
                 layer_num=self.layer_num_,
                 quant_cfg=self.quant_cfg,
+                fused_gate_up=True,
+                gate_up_proj_name="gate_up_proj",
             )
         else:
             raise ValueError(f"Unsupported moe mode: {moe_mode}")

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-import torch`
`2`	`1`	`import numpy as np`
`3`	`2`	`from lightllm.common.basemodel import PreAndPostLayerWeight`
`4`	`3`