 
 logger = init_logger(__name__)
 
+
 class GptOssTransformerLayerInfer(LlamaTransformerLayerInfer):
     def __init__(self, layer_num, network_config, mode=[]):
         super().__init__(layer_num, network_config, mode)
-        self.hidden_size = self.network_config_['hidden_size']
+        self.hidden_size = self.network_config_["hidden_size"]
         self.alpha = 1.702
         self.limit = 7.0
-        self.top_k = network_config['num_experts_per_tok']
-        self.sliding_window = network_config['sliding_window']
+        self.top_k = network_config["num_experts_per_tok"]
+        self.sliding_window = network_config["sliding_window"]
         self.head_dim_ = network_config["head_dim"]
 
     def _bind_attention(self):
         self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
         self._context_attention_kernel = self._conext_sliding_attention_flashattention
         self._token_attention_kernel = self._token_sliding_attention_flashattention
-
+
     def _bind_norm(self):
         self._att_norm = self._att_norm
         self._ffn_norm = self._ffn_norm
         return
 
-    def _experts(self, hidden_states: torch.Tensor, router_indices, routing_weights, layer_weight: GptOssTransformerLayerWeight):
+    def _experts(
+        self, hidden_states: torch.Tensor, router_indices, routing_weights, layer_weight: GptOssTransformerLayerWeight
+    ):
         batch_size = hidden_states.shape[0]
         hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
         num_experts = routing_weights.shape[1]
 
         hidden_states = hidden_states.repeat(num_experts, 1)
         hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
-        gate_up = torch.bmm(hidden_states, layer_weight.gate_up_proj_weight) + layer_weight.gate_up_proj_bias.weight[..., None, :]
+        gate_up = (
+            torch.bmm(hidden_states, layer_weight.gate_up_proj_weight)
+            + layer_weight.gate_up_proj_bias.weight[..., None, :]
+        )
         gate, up = gate_up[..., ::2], gate_up[..., 1::2]
         gate = gate.clamp(min=None, max=self.limit)
         up = up.clamp(min=-self.limit, max=self.limit)
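
The gate/up projection above stores gate and up channels interleaved along the last dimension and clamps them before the activation. A minimal standalone sketch of that split-and-clamp step, with made-up shapes (the limit value matches the constant set in __init__):

import torch

limit = 7.0
gate_up = torch.randn(2, 4, 16)                    # (num_experts, num_tokens, 2 * intermediate), toy shapes
gate, up = gate_up[..., ::2], gate_up[..., 1::2]   # even channels -> gate, odd channels -> up
gate = gate.clamp(min=None, max=limit)             # cap gate activations from above only
up = up.clamp(min=-limit, max=limit)               # cap up activations symmetrically
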
@@ -52,21 +58,17 @@ def _experts(self, hidden_states: torch.Tensor, router_indices, routing_weights,
         next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
         next_states = next_states.sum(dim=0)
         return next_states
-
-    def _att_norm(
-        self, input, infer_state, layer_weight
-    ) -> torch.Tensor:
+
+    def _att_norm(self, input, infer_state, layer_weight) -> torch.Tensor:
         out = self.alloc_tensor(input.shape, input.dtype)
         out = self._gpt_oss_rmsnorm(input, weight=layer_weight.att_norm_weight_.weight, eps=self.eps_)
         return out
-
-    def _ffn_norm(
-        self, input, infer_state, layer_weight
-    ) -> torch.Tensor:
+
+    def _ffn_norm(self, input, infer_state, layer_weight) -> torch.Tensor:
         out = self.alloc_tensor(input.shape, input.dtype)
         out = self._gpt_oss_rmsnorm(input, weight=layer_weight.ffn_norm_weight_.weight, eps=self.eps_)
         return out
-
+
     def _gpt_oss_rmsnorm(self, hidden_states, weight, eps=1e-6):
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)
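
_experts uses a dense dispatch: every token is pushed through every expert with a single bmm, and the per-expert outputs are then scaled by the routing scores and summed. A self-contained sketch of that pattern with toy shapes (the names and sizes here are illustrative only):

import torch

num_experts, num_tokens, hidden = 4, 3, 8
x = torch.randn(num_tokens, hidden)
w = torch.randn(num_experts, hidden, hidden)            # stacked per-expert weight matrices
scores = torch.softmax(torch.randn(num_tokens, num_experts), dim=1)

h = x.repeat(num_experts, 1).view(num_experts, num_tokens, hidden)
out = torch.bmm(h, w)                                   # (num_experts, num_tokens, hidden)
out = out * scores.transpose(0, 1)[..., None]           # weight each expert's output per token
out = out.sum(dim=0)                                    # combine back to (num_tokens, hidden)
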
@@ -81,18 +83,24 @@ def _router(self, hidden_states, layer_weight: GptOssTransformerLayerWeight):
         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
         return router_scores, router_indices
-
-    def _ffn(self, input, infer_state: FlashAttentionStateInfo, layer_weight: GptOssTransformerLayerWeight) -> torch.Tensor:
+
+    def _ffn(
+        self, input, infer_state: FlashAttentionStateInfo, layer_weight: GptOssTransformerLayerWeight
+    ) -> torch.Tensor:
         router_scores, router_indices = self._router(input, layer_weight)  # (num_experts, seq_len)
-        routed_out = self._experts(input, router_indices=router_indices, routing_weights=router_scores, layer_weight=layer_weight)
+        routed_out = self._experts(
+            input, router_indices=router_indices, routing_weights=router_scores, layer_weight=layer_weight
+        )
         return routed_out
-
-    def _conext_sliding_attention_flashattention(self, q, kv, infer_state: FlashAttentionStateInfo, layer_weight, out=None):
-        if self.network_config_['layer_types'][self.layer_num_] == "sliding_attention":
-            window_size = (self.sliding_window - 1, self.sliding_window - 1)
+
+    def _conext_sliding_attention_flashattention(
+        self, q, kv, infer_state: FlashAttentionStateInfo, layer_weight, out=None
+    ):
+        if self.network_config_["layer_types"][self.layer_num_] == "sliding_attention":
+            window_size = (self.sliding_window - 1, self.sliding_window - 1)
         else:
             window_size = (-1, -1)
-
+
         cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :].reshape(
             -1, 1, self.tp_k_head_num_, self.head_dim_
         )
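
_router keeps only the top-k logits per token, softmaxes over just those values, and scatters the probabilities back into a dense (num_tokens, num_experts) score matrix so that non-selected experts contribute zero. A sketch of that pattern (the topk call is assumed, since it falls outside the lines shown; top_k=2 and the shapes are illustrative):

import torch

router_logits = torch.randn(3, 8)                        # (num_tokens, num_experts), toy shapes
top_value, top_idx = router_logits.topk(2, dim=1)        # assumed top-k selection (k = num_experts_per_tok)
top_value = torch.nn.functional.softmax(top_value, dim=1, dtype=top_value.dtype)
router_scores = torch.zeros_like(router_logits).scatter_(1, top_idx, top_value)
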
@@ -114,7 +122,7 @@ def _conext_sliding_attention_flashattention(self, q, kv, infer_state: FlashAtte
             max_seqlen_q=infer_state.q_max_seq_len,
             softmax_scale=sm_scale,
             causal=True,
-            window_size=(-1, -1),
+            window_size=window_size,
             softcap=0.0,
             k_descale=k_descale,
             v_descale=v_descale,
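
The one behavioral change in this hunk: the context-attention call previously hard-coded window_size=(-1, -1) (full causal attention), so the sliding-window branch computed above had no effect; it now passes the per-layer value through. A hypothetical helper mirroring that selection, assuming FlashAttention's convention that -1 disables the window:

def select_window_size(layer_types, layer_num, sliding_window):
    # Layers marked "sliding_attention" attend to at most the previous
    # (sliding_window - 1) tokens plus the current one; under causal masking
    # the right half of the tuple is unused. All other layers get
    # unrestricted causal attention.
    if layer_types[layer_num] == "sliding_attention":
        return (sliding_window - 1, sliding_window - 1)
    return (-1, -1)
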
@@ -124,11 +132,11 @@ def _conext_sliding_attention_flashattention(self, q, kv, infer_state: FlashAtte
         return o
 
     def _token_sliding_attention_flashattention(self, q, infer_state: FlashAttentionStateInfo, layer_weight, out=None):
-        if self.network_config_['layer_types'][self.layer_num_] == "sliding_attention":
-            window_size = (self.sliding_window - 1, self.sliding_window - 1)
+        if self.network_config_["layer_types"][self.layer_num_] == "sliding_attention":
+            window_size = (self.sliding_window - 1, self.sliding_window - 1)
         else:
             window_size = (-1, -1)
-
+
         cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :].reshape(
             -1, 1, self.tp_k_head_num_, self.head_dim_
         )
@@ -157,4 +165,4 @@ def _token_sliding_attention_flashattention(self, q, infer_state: FlashAttention
             return_softmax_lse=False,
             sinks=layer_weight.attn_sinks.weight,
         )
-        return o
+        return o