import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from functools import partial
from typing import Optional

from lightllm.models.gpt_oss.layer_weights.transformer_layer_weight import GptOssTransformerLayerWeight
from lightllm.models.llama.flashattention_infer_struct import FlashAttentionStateInfo
from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer
from lightllm.models.llama.layer_weights.transformer_layer_weight import LlamaTransformerLayerWeight
from lightllm.utils.sgl_utils import flash_attn_with_kvcache
from lightllm.utils.log_utils import init_logger

logger = init_logger(__name__)

class GptOssTransformerLayerInfer(LlamaTransformerLayerInfer):
    """GPT-OSS transformer layer inference: MoE FFN with a clamped SwiGLU activation,
    alternating sliding-window / full attention, and attention sinks."""

    def __init__(self, layer_num, network_config, mode=[]):
        super().__init__(layer_num, network_config, mode)
        self.hidden_size = self.network_config_["hidden_size"]
        # Constants of the GPT-OSS clamped-SwiGLU activation used in the experts.
        self.alpha = 1.702
        self.limit = 7.0
        self.top_k = network_config["num_experts_per_tok"]
        self.sliding_window = network_config["sliding_window"]
        self.head_dim_ = network_config["head_dim"]

    def _bind_attention(self):
        self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
        self._context_attention_kernel = self._context_sliding_attention_flashattention
        self._token_attention_kernel = self._token_sliding_attention_flashattention

    def _bind_norm(self):
        # The norm methods are overridden directly on this class, so no rebinding of
        # the Llama norm kernels is needed here.
        self._att_norm = self._att_norm
        self._ffn_norm = self._ffn_norm
        return

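    # _experts evaluates every token against every expert (dense MoE) and relies on
    # the routing scores to zero out non-selected experts. A sketch of the per-expert
    # math as implemented below (E experts, T tokens; the intermediate size I is an
    # assumed name, not defined in this file):
    #   gate, up = split(x @ W_gate_up[e] + b_gate_up[e])    # interleaved columns, each (T, I)
    #   gate = clamp(gate, max=limit); up = clamp(up, -limit, limit)
    #   glu = gate * sigmoid(alpha * gate)                   # clamped SwiGLU
    #   out += scores[:, e, None] * (((up + 1) * glu) @ W_down[e] + b_down[e])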
    def _experts(
        self, hidden_states: torch.Tensor, router_indices, routing_weights, layer_weight: GptOssTransformerLayerWeight
    ):
        batch_size = hidden_states.shape[0]
        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
        num_experts = routing_weights.shape[1]

        # Replicate all tokens for every expert and run the expert MLPs as batched matmuls.
        hidden_states = hidden_states.repeat(num_experts, 1)
        hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
        gate_up = torch.bmm(hidden_states, layer_weight.gate_up_proj_weight) + layer_weight.gate_up_proj_bias.weight[..., None, :]
        gate, up = gate_up[..., ::2], gate_up[..., 1::2]
        gate = gate.clamp(min=None, max=self.limit)
        up = up.clamp(min=-self.limit, max=self.limit)
        glu = gate * torch.sigmoid(gate * self.alpha)
        next_states = torch.bmm(((up + 1) * glu), layer_weight.down_proj_weight)
        next_states = next_states + layer_weight.down_proj_bias.weight[..., None, :]
        next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
        # Weight each expert's output by its routing score and sum over experts.
        next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
        next_states = next_states.sum(dim=0)
        return next_states

    def _att_norm(self, input, infer_state, layer_weight) -> torch.Tensor:
        return self._gpt_oss_rmsnorm(input, weight=layer_weight.att_norm_weight_.weight, eps=self.eps_)

    def _ffn_norm(self, input, infer_state, layer_weight) -> torch.Tensor:
        return self._gpt_oss_rmsnorm(input, weight=layer_weight.ffn_norm_weight_.weight, eps=self.eps_)

    def _gpt_oss_rmsnorm(self, hidden_states, weight, eps=1e-6):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + eps)
        # Main difference from the Llama RMSNorm: the weight is applied in float32
        # before casting back to the input dtype.
        return (weight * hidden_states).to(input_dtype)

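    # _router produces a dense (seq_len, num_experts) score matrix: the top_k logits
    # are softmax-normalized and scattered back into a zero tensor, so non-selected
    # experts get a score of exactly zero and drop out of the weighted sum in _experts.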
    def _router(self, hidden_states, layer_weight: GptOssTransformerLayerWeight):
        hidden_states = hidden_states.reshape(-1, self.hidden_size)
        router_logits = layer_weight.moe_gate.mm(hidden_states)  # (seq_len, num_experts)
        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)  # (seq_len, top_k)
        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
        return router_scores, router_indices

    def _ffn(self, input, infer_state: FlashAttentionStateInfo, layer_weight: GptOssTransformerLayerWeight) -> torch.Tensor:
        router_scores, router_indices = self._router(input, layer_weight)  # (seq_len, num_experts)
        routed_out = self._experts(input, router_indices=router_indices, routing_weights=router_scores, layer_weight=layer_weight)
        return routed_out

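    # The two attention kernels below differ only in their query layout (prefill vs.
    # single-token decode). Layers marked "sliding_attention" in
    # network_config["layer_types"] attend to at most `sliding_window` previous tokens;
    # all other layers use full causal attention. The attention-sink weights
    # (layer_weight.attn_sinks) are forwarded to FlashAttention via the `sinks` argument.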
    def _context_sliding_attention_flashattention(self, q, kv, infer_state: FlashAttentionStateInfo, layer_weight, out=None):
        if self.network_config_["layer_types"][self.layer_num_] == "sliding_attention":
            window_size = (self.sliding_window - 1, self.sliding_window - 1)
        else:
            window_size = (-1, -1)

        cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :].reshape(
            -1, 1, self.tp_k_head_num_, self.head_dim_
        )
        cache_v = infer_state.mem_manager.kv_buffer[self.layer_num_][
            :, self.tp_k_head_num_ : self.tp_k_head_num_ + self.tp_v_head_num_, :
        ].reshape(-1, 1, self.tp_v_head_num_, self.head_dim_)
        q = q.reshape(-1, self.tp_q_head_num_, self.head_dim_)
        k_descale, v_descale = None, None  # disable quantization
        Lq = q.shape[-1]
        sm_scale = 1.0 / (Lq**0.5)
        o = flash_attn_with_kvcache(
            q=q,
            k_cache=cache_k,
            v_cache=cache_v,
            page_table=infer_state.page_table,
            cache_seqlens=infer_state.b_seq_len,
            cu_seqlens_q=infer_state.cu_seqlens_q,
            cu_seqlens_k_new=infer_state.cu_seqlens_k,
            max_seqlen_q=infer_state.q_max_seq_len,
            softmax_scale=sm_scale,
            causal=True,
            window_size=window_size,
            softcap=0.0,
            k_descale=k_descale,
            v_descale=v_descale,
            return_softmax_lse=False,
            sinks=layer_weight.attn_sinks.weight,
        )
        return o

    def _token_sliding_attention_flashattention(self, q, infer_state: FlashAttentionStateInfo, layer_weight, out=None):
        if self.network_config_["layer_types"][self.layer_num_] == "sliding_attention":
            window_size = (self.sliding_window - 1, self.sliding_window - 1)
        else:
            window_size = (-1, -1)

        cache_k = infer_state.mem_manager.kv_buffer[self.layer_num_][:, 0 : self.tp_k_head_num_, :].reshape(
            -1, 1, self.tp_k_head_num_, self.head_dim_
        )
        cache_v = infer_state.mem_manager.kv_buffer[self.layer_num_][
            :, self.tp_k_head_num_ : self.tp_k_head_num_ + self.tp_v_head_num_, :
        ].reshape(-1, 1, self.tp_v_head_num_, self.head_dim_)
        q = q.reshape(-1, self.tp_q_head_num_, self.head_dim_)
        k_descale, v_descale = None, None  # disable quantization
        Lq = q.shape[-1]
        sm_scale = 1.0 / (Lq**0.5)
        o = flash_attn_with_kvcache(
            q=q,
            k_cache=cache_k,
            v_cache=cache_v,
            page_table=infer_state.page_table,
            cache_seqlens=infer_state.b_seq_len,
            cu_seqlens_q=infer_state.cu_seqlens_q,
            cu_seqlens_k_new=infer_state.cu_seqlens_k,
            max_seqlen_q=1,
            softmax_scale=sm_scale,
            causal=True,
            window_size=window_size,
            softcap=0.0,
            k_descale=k_descale,
            v_descale=v_descale,
            return_softmax_lse=False,
            sinks=layer_weight.attn_sinks.weight,
        )
        return o