adding Starcoder2 PyTorch flow support

yibinl-nvidia · yibinl-nvidia · commit adbefc243a50 · 2025-11-04T21:13:57.000Z
Signed-off-by: Yibin Li &lt;109242046+yibinl-nvidia@users.noreply.github.com&gt;
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
@@ -30,6 +30,7 @@
 from .modeling_qwen_moe import Qwen2MoeForCausalLM
 from .modeling_seedoss import SeedOssForCausalLM
 from .modeling_siglip import SiglipVisionModel
+from .modeling_starcoder2 import Starcoder2ForCausalLM
 from .modeling_utils import get_model_architecture
 from .modeling_vila import VilaModel
 
@@ -61,6 +62,7 @@
     "Qwen2ForRewardModel",
     "Qwen2MoeForCausalLM",
     "SiglipVisionModel",
+    "Starcoder2ForCausalLM",
     "get_model_architecture",
     "VilaModel",
     "Qwen2VLModel",
diff --git a/tensorrt_llm/_torch/models/modeling_starcoder2.py b/tensorrt_llm/_torch/models/modeling_starcoder2.py
@@ -0,0 +1,268 @@
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import Starcoder2Config
+
+from tensorrt_llm._torch.attention_backend import AttentionMetadata
+from tensorrt_llm._torch.attention_backend.interface import (
+    PositionalEmbeddingParams, RopeParams)
+from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.modeling_utils import (DecoderModel,
+                                                       DecoderModelForCausalLM,
+                                                       register_auto_model)
+from tensorrt_llm._torch.modules.attention import Attention
+from tensorrt_llm._torch.modules.decoder_layer import DecoderLayer
+from tensorrt_llm._torch.modules.embedding import Embedding
+from tensorrt_llm._torch.modules.gated_mlp import GatedMLP
+from tensorrt_llm._torch.modules.linear import Linear, TensorParallelMode
+from tensorrt_llm._torch.modules.mlp import MLP
+from tensorrt_llm._torch.speculative import SpecMetadata
+from tensorrt_llm.functional import PositionEmbeddingType
+
+
+class Starcoder2LayerNorm(nn.LayerNorm):
+    """
+    Custom LayerNorm that skips weight initialization to support meta tensor initialization.
+    
+    StarCoder2ForCausalLM inherits from DecoderModelForCausalLM which uses the PostInitCaller
+    metaclass to enable meta tensor initialization (memory optimization). During model construction
+    with meta tensors, PyTorch's nn.LayerNorm.reset_parameters() tries to initialize weights with
+    ones_() which fails on meta tensors. This class skips that initialization step.
+    
+    The weights will be properly initialized later when loaded from the HuggingFace checkpoint.
+    """
+    
+    def reset_parameters(self) -> None:
+        # Skip initialization operations that conflict with meta tensor initialization
+        pass
+
+
+class Starcoder2Attention(Attention):
+    """
+    StarCoder2 Attention with Grouped Query Attention and Sliding Window support.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig[Starcoder2Config],
+        layer_idx: Optional[int] = None,
+    ):
+        config = model_config.pretrained_config
+        super().__init__(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            num_key_value_heads=config.num_key_value_heads,
+            max_position_embeddings=config.max_position_embeddings,
+            bias=config.use_bias,
+            pos_embd_params=PositionalEmbeddingParams(
+                type=PositionEmbeddingType.rope_gpt_neox,
+                rope=RopeParams.from_config(config),
+            ),
+            layer_idx=layer_idx,
+            dtype=config.torch_dtype,
+            config=model_config,
+        )
+
+
+class Starcoder2DecoderLayer(DecoderLayer):
+    """
+    StarCoder2 Decoder Layer.
+    
+    Architecture:
+        - Layer normalization before attention (with bias)
+        - Self-attention with GQA and sliding window
+        - Layer normalization before MLP (with bias)
+        - MLP with GELU activation
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig[Starcoder2Config],
+        layer_idx: int,
+    ):
+        super().__init__()
+        config = model_config.pretrained_config
+        self.layer_idx = layer_idx
+
+        self.self_attn = Starcoder2Attention(
+            model_config,
+            layer_idx=layer_idx,
+        )
+
+        if config.mlp_type == "default":
+            self.mlp = MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                bias=config.use_bias,
+                activation=nn.GELU(),
+                dtype=config.torch_dtype,
+                config=model_config,
+            )
+        else:
+            raise ValueError(f"Unsupported mlp_type: {config.mlp_type}")
+
+        norm_eps = getattr(config, 'norm_epsilon', 1e-5)
+        self.input_layernorm = Starcoder2LayerNorm(
+            config.hidden_size,
+            eps=norm_eps,
+            dtype=config.torch_dtype,
+        )
+
+        self.post_attention_layernorm = Starcoder2LayerNorm(
+            config.hidden_size,
+            eps=norm_eps,
+            dtype=config.torch_dtype,
+        )
+
+    def forward(
+        self,
+        position_ids: torch.IntTensor,
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: Optional[torch.Tensor] = None,
+        spec_metadata: Optional[SpecMetadata] = None,
+        **kwargs,
+    ):
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states + residual), hidden_states + residual
+
+        # Self Attention
+        hidden_states = self.self_attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            attn_metadata=attn_metadata,
+            **kwargs,
+        )
+
+        # Fully Connected (MLP)
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        
+        if spec_metadata is not None:
+            spec_metadata.maybe_capture_hidden_states(self.layer_idx,
+                                                     hidden_states, residual)
+        
+        return hidden_states, residual
+
+
+class Starcoder2Model(DecoderModel):
+    """
+    StarCoder2 Transformer Model.
+    """
+
+    def __init__(self, model_config: ModelConfig[Starcoder2Config]):
+        super().__init__(model_config)
+        config = self.model_config.pretrained_config
+
+        self.embed_tokens = Embedding(
+            config.vocab_size,
+            config.hidden_size,
+            dtype=config.torch_dtype,
+            mapping=model_config.mapping,
+            tensor_parallel_mode=TensorParallelMode.COLUMN,
+            gather_output=True,
+        )
+        
+        self.layers = nn.ModuleList([
+            Starcoder2DecoderLayer(
+                model_config,
+                layer_idx,
+            ) for layer_idx in range(config.num_hidden_layers)
+        ])
+        
+        # Use norm_epsilon (Starcoder2Config attribute name)
+        norm_eps = getattr(config, 'norm_epsilon', 1e-5)
+        self.norm = Starcoder2LayerNorm(
+            config.hidden_size,
+            eps=norm_eps,
+            dtype=config.torch_dtype,
+        )
+
+    def forward(
+        self,
+        attn_metadata: AttentionMetadata,
+        input_ids: Optional[torch.IntTensor] = None,
+        position_ids: Optional[torch.IntTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        spec_metadata: Optional[SpecMetadata] = None,
+        lora_params=None,
+    ) -> torch.Tensor:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, "
+                "and must specify either one"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        hidden_states = inputs_embeds
+
+        residual = None
+        for decoder_layer in self.layers:
+            hidden_states, residual = decoder_layer(
+                position_ids=position_ids,
+                hidden_states=hidden_states,
+                attn_metadata=attn_metadata,
+                residual=residual,
+                spec_metadata=spec_metadata,
+                lora_params=lora_params,
+            )
+
+        hidden_states = self.norm(hidden_states + residual)
+        return hidden_states
+
+
+@register_auto_model("Starcoder2ForCausalLM")
+class Starcoder2ForCausalLM(DecoderModelForCausalLM[Starcoder2Model, Starcoder2Config]):
+    
+    def __init__(
+        self,
+        model_config: ModelConfig[Starcoder2Config],
+    ):
+        # Ensure torch_dtype is set on pretrained_config (StarCoder2 uses bfloat16)
+        if model_config.pretrained_config.torch_dtype is None:
+            model_config.pretrained_config.torch_dtype = torch.bfloat16
+        
+        super().__init__(
+            Starcoder2Model(model_config),
+            config=model_config,
+            hidden_size=model_config.pretrained_config.hidden_size,
+            vocab_size=model_config.pretrained_config.vocab_size,
+        )
+
+    def load_weights(self, weights, weight_mapper=None, skip_modules=[]):
+        """
+        Load weights with custom mapping for StarCoder2.
+        
+        StarCoder2 uses GPT-2 style MLP naming (c_fc, c_proj)
+        while our MLP module expects (up_proj, down_proj).
+        """
+        # Map HuggingFace StarCoder2 weight names to TensorRT-LLM names
+        params_map = {
+            r'(.*?)\.mlp\.c_fc\.(.*)': r'\1.mlp.up_proj.\2',
+            r'(.*?)\.mlp\.c_proj\.(.*)': r'\1.mlp.down_proj.\2',
+        }
+        
+        if weight_mapper is None:
+            # Use _load_weights_impl for non-weight-mapper path
+            from tensorrt_llm._torch.models.modeling_utils import _load_weights_impl
+            preload_weight_modules = getattr(self, "preload_weight_modules", None)
+            _load_weights_impl(self, weights, skip_modules, 
+                             params_map=params_map,
+                             preload_weight_modules=preload_weight_modules)
+        else:
+            # Use _load_weights_impl_v2 for weight-mapper path  
+            from tensorrt_llm._torch.models.modeling_utils import _load_weights_impl_v2
+            preload_weight_modules = getattr(self, "preload_weight_modules", None)
+            _load_weights_impl_v2(self, weights, weight_mapper, skip_modules,
+                                params_map=params_map,
+                                preload_weight_modules=preload_weight_modules)
+