 import numpy as np
 
 from lightllm.common.basemodel.layer_weights.meta_weights.mm_weight.rowmm_weight import ROWMMWeight
-from lightllm.common.basemodel.layer_weights.meta_weights.norm_weight import DummyWeight
-from lightllm.models.bloom import model
+from lightllm.common.basemodel.layer_weights.meta_weights.norm_weight import NormWeight
 from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
 from lightllm.utils.log_utils import init_logger
 
@@ -54,14 +53,20 @@ def _init_moe(self):
             tp_rank=0,
             tp_world_size=1,
         )
-        self.down_proj_bias = DummyWeight(self._down_bias_name, torch.bfloat16)
-        self.down_proj_weight_blocks = DummyWeight(self._down_blocks_name, torch.uint8)
-        self.down_proj_weight_scales = DummyWeight(self._down_scales_name, torch.uint8)
 
-        self.gate_up_proj_bias = DummyWeight(self._gate_up_bias_name, torch.bfloat16)
-        self.gate_up_proj_weight_blocks = DummyWeight(self._gate_up_blocks_name, torch.uint8)
-        self.gate_up_proj_weight_scales = DummyWeight(self._gate_up_scales_name, torch.uint8)
-        self.attn_sinks = DummyWeight(self._attn_sink_name, torch.bfloat16)
+        # Current definition of experts
+        self.down_proj_bias = NormWeight(self._down_bias_name, torch.bfloat16)
+        self.down_proj_weight_blocks = NormWeight(self._down_blocks_name, torch.uint8)
+        self.down_proj_weight_scales = NormWeight(self._down_scales_name, torch.uint8)
+
+        self.gate_up_proj_bias = NormWeight(self._gate_up_bias_name, torch.bfloat16)
+        self.gate_up_proj_weight_blocks = NormWeight(self._gate_up_blocks_name, torch.uint8)
+        self.gate_up_proj_weight_scales = NormWeight(self._gate_up_scales_name, torch.uint8)
+        self.attn_sinks = NormWeight(self._attn_sink_name, torch.bfloat16)
+
+    def load_hf_weights(self, weights):
+        super().load_hf_weights(weights)
+        self._post_weight_process()
 
     def _init_weight_names(self):
         super()._init_weight_names()
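The `load_hf_weights` override added above relies on a load-then-post-process pattern: checkpoint shards can arrive across several `load_hf_weights` calls, so the post-processing step should only touch tensors whose `verify_load()` reports them as fully loaded (see the next hunk). Below is a minimal, self-contained sketch of that pattern; the `PackedWeight` stand-in class, the checkpoint key names, and the toy shapes are illustrative assumptions, not the lightllm NormWeight API.

import torch


class PackedWeight:
    # Stand-in for a single named checkpoint tensor; illustrative only, not lightllm's NormWeight.
    def __init__(self, name, dtype):
        self.name, self.dtype, self.weight = name, dtype, None

    def load_hf_weights(self, weights):
        # A given shard may or may not contain this tensor.
        if self.name in weights:
            self.weight = weights[self.name].to(self.dtype)

    def verify_load(self):
        return self.weight is not None


class MoeLayerSketch:
    def __init__(self):
        # Hypothetical checkpoint keys, only for the sketch.
        self.down_proj_bias = PackedWeight("experts.down_proj_bias", torch.bfloat16)
        self.down_proj_blocks = PackedWeight("experts.down_proj_blocks", torch.uint8)

    def load_hf_weights(self, weights):
        for w in (self.down_proj_bias, self.down_proj_blocks):
            w.load_hf_weights(weights)
        self._post_weight_process()  # may run while some tensors are still missing

    def _post_weight_process(self):
        if self.down_proj_bias.verify_load() and self.down_proj_blocks.verify_load():
            print("all expert tensors present; safe to dequantize and TP-slice")


layer = MoeLayerSketch()
layer.load_hf_weights({"experts.down_proj_bias": torch.zeros(32, 2880, dtype=torch.bfloat16)})
layer.load_hf_weights({"experts.down_proj_blocks": torch.zeros(32, 2880, 1440, dtype=torch.uint8)})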
@@ -105,31 +110,34 @@ def _post_weight_process(self):
         self.moe_intermediate_size = self.network_config_["intermediate_size"]
         self.split_inter_size = self.moe_intermediate_size // self.tp_world_size_
 
-        self.down_proj_weight = self._convert_moe_packed_tensors(
-            blocks=self.down_proj_weight_blocks.weight,
-            scales=self.down_proj_weight_scales.weight,
-            dtype=torch.bfloat16,
-        )[
-            :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1), :
-        ]  # (32, 1440, 2880)
-
-        self.gate_up_proj_weight = self._convert_moe_packed_tensors(
-            blocks=self.gate_up_proj_weight_blocks.weight,
-            scales=self.gate_up_proj_weight_scales.weight,
-            dtype=torch.bfloat16,
-        )  # (32, 2880, 5760)
-        expert_num = self.gate_up_proj_weight.shape[0]
-        self.gate_up_proj_weight = self.gate_up_proj_weight.reshape(expert_num, -1, 2, self.moe_intermediate_size)[
-            :, :, :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
-        ].reshape(
-            expert_num, -1, 2 * self.split_inter_size
-        )  # (32, 2880, 2880)
-
-        self.gate_up_proj_bias.weight = self.gate_up_proj_bias.weight.reshape(
-            expert_num, 2, self.moe_intermediate_size
-        )[:, :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)].reshape(
-            expert_num, 2 * self.split_inter_size
-        )  # (32, 2880)
+        if self.down_proj_weight_blocks.verify_load() and self.down_proj_weight_scales.verify_load():
+            self.down_proj_weight = self._convert_moe_packed_tensors(
+                blocks=self.down_proj_weight_blocks.weight,
+                scales=self.down_proj_weight_scales.weight,
+                dtype=torch.bfloat16,
+            )[
+                :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1), :
+            ]  # (32, 1440, 2880)
+
+        if self.gate_up_proj_weight_blocks.verify_load() and self.gate_up_proj_weight_scales.verify_load():
+            self.gate_up_proj_weight = self._convert_moe_packed_tensors(
+                blocks=self.gate_up_proj_weight_blocks.weight,
+                scales=self.gate_up_proj_weight_scales.weight,
+                dtype=torch.bfloat16,
+            )  # (32, 2880, 5760)
+            expert_num = self.gate_up_proj_weight.shape[0]
+            self.gate_up_proj_weight = self.gate_up_proj_weight.reshape(expert_num, -1, 2, self.moe_intermediate_size)[
+                :, :, :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)
+            ].reshape(
+                expert_num, -1, 2 * self.split_inter_size
+            )  # (32, 2880, 2880)
+
+        if self.gate_up_proj_bias.verify_load():
+            self.gate_up_proj_bias.weight = self.gate_up_proj_bias.weight.reshape(
+                expert_num, 2, self.moe_intermediate_size
+            )[:, :, self.split_inter_size * self.tp_rank_ : self.split_inter_size * (self.tp_rank_ + 1)].reshape(
+                expert_num, 2 * self.split_inter_size
+            )  # (32, 2880)
 
     def _convert_moe_packed_tensors(
         self,
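The least obvious part of the new `_post_weight_process` is the gate_up slice: the last dimension of the dequantized tensor fuses the gate and up projections, so each rank's columns must be taken inside each of the two groups rather than as one contiguous cut. The code does this by viewing the fused axis as (2, moe_intermediate_size), slicing the intermediate axis, and flattening back; the down_proj slice, by contrast, is a plain contiguous cut along dim 1. A scaled-down, self-contained check of that arithmetic (toy dimensions standing in for the real 32 x 2880 x 5760 tensors; not the lightllm code itself):

import torch

expert_num, hidden_size, inter_size = 4, 6, 8    # toy stand-ins for 32, 2880, 2880
tp_world_size, tp_rank = 2, 1
split_inter_size = inter_size // tp_world_size   # 1440 in the real model

# Full fused gate/up weight: (experts, hidden, 2 * intermediate)
gate_up = torch.randn(expert_num, hidden_size, 2 * inter_size)

# View the fused axis as (2, intermediate), slice this rank's columns in each group, flatten back.
sliced = gate_up.reshape(expert_num, -1, 2, inter_size)[
    :, :, :, split_inter_size * tp_rank : split_inter_size * (tp_rank + 1)
].reshape(expert_num, -1, 2 * split_inter_size)
assert sliced.shape == (expert_num, hidden_size, 2 * split_inter_size)

# The fused bias follows the same regrouping on a (experts, 2 * intermediate) tensor.
bias = torch.randn(expert_num, 2 * inter_size)
bias_sliced = bias.reshape(expert_num, 2, inter_size)[
    :, :, split_inter_size * tp_rank : split_inter_size * (tp_rank + 1)
].reshape(expert_num, 2 * split_inter_size)
assert bias_sliced.shape == (expert_num, 2 * split_inter_size)

# down_proj needs no regrouping: a contiguous slice along the intermediate (dim 1) axis.
down = torch.randn(expert_num, inter_size, hidden_size)
down_sliced = down[:, split_inter_size * tp_rank : split_inter_size * (tp_rank + 1), :]
assert down_sliced.shape == (expert_num, split_inter_size, hidden_size)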