Skip to content

Commit aab4685

Browse files
Authored commit: Add starcoder2 model (#1066)
1 parent 0e4a168 · commit aab4685

File tree

9 files changed

+1348
-2
lines changed

9 files changed

+1348
-2
lines changed

mindone/transformers/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,13 @@
391391
SpeechT5Model,
392392
SpeechT5PreTrainedModel,
393393
)
394+
from .models.starcoder2 import (
395+
Starcoder2ForCausalLM,
396+
Starcoder2ForSequenceClassification,
397+
Starcoder2ForTokenClassification,
398+
Starcoder2Model,
399+
Starcoder2PreTrainedModel,
400+
)
394401
from .models.switch_transformers import (
395402
SwitchTransformersEncoderModel,
396403
SwitchTransformersForConditionalGeneration,

mindone/transformers/integrations/sdpa_attention.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def repeat_kv(hidden_states: ms.Tensor, n_rep: int) -> ms.Tensor:
1414
batch, num_key_value_heads, slen, head_dim = hidden_states.shape # BNSD format
1515
if n_rep == 1:
1616
return hidden_states
17-
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
17+
hidden_states = hidden_states[:, :, None, :, :].expand((batch, num_key_value_heads, n_rep, slen, head_dim))
1818
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
1919

2020

@@ -33,7 +33,7 @@ def sdpa_attention_forward(
3333
value_states = repeat_kv(value, module.num_key_value_groups)
3434

3535
attn_weights = mint.matmul(query, key_states.transpose(2, 3)) * scaling
36-
if attention_mask is not None:
36+
if attention_mask is not None and attention_mask.dim() == 4:
3737
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
3838
attn_weights = attn_weights + causal_mask
3939

mindone/transformers/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
roberta,
6868
siglip,
6969
speecht5,
70+
starcoder2,
7071
switch_transformers,
7172
t5,
7273
umt5,

mindone/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
("mistral", "MistralConfig"),
7777
("mobilebert", "MobileBertConfig"),
7878
("mpt", "MptConfig"),
79+
("starcoder2", "Starcoder2Config"),
7980
("mt5", "MT5Config"),
8081
("megatron-bert", "MegatronBertConfig"),
8182
("mixtral", "MixtralConfig"),
@@ -119,6 +120,7 @@
119120
("chameleon", "Chameleon"),
120121
("clap", "CLAP"),
121122
("clip", "CLIP"),
123+
("starcoder2", "Starcoder2"),
122124
("clip_vision_model", "CLIPVisionModel"),
123125
("deberta", "DeBERTa"),
124126
("deberta-v2", "DeBERTa-v2"),

mindone/transformers/models/auto/modeling_auto.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
("bit", "BitModel"),
4242
("blip", "BlipModel"),
4343
("blip-2", "Blip2Model"),
44+
("starcoder2", "Starcoder2Model"),
4445
("chameleon", "ChameleonModel"),
4546
("clap", "ClapModel"),
4647
("clip", "CLIPModel"),
@@ -163,6 +164,7 @@
163164
("bert-generation", "BertGenerationDecoder"),
164165
("gemma", "GemmaForCausalLM"),
165166
("gemma2", "Gemma2ForCausalLM"),
167+
("starcoder2", "Starcoder2ForCausalLM"),
166168
("gemma3", "Gemma3ForCausalLM"),
167169
("gemma3_text", "Gemma3ForCausalLM"),
168170
("granite", "GraniteForCausalLM"),
@@ -355,6 +357,7 @@
355357
("glm", "GlmForSequenceClassification"),
356358
("helium", "HeliumForSequenceClassification"),
357359
("led", "LEDForSequenceClassification"),
360+
("starcoder2", "Starcoder2ForSequenceClassification"),
358361
("llama", "LlamaForSequenceClassification"),
359362
("persimmon", "PersimmonForSequenceClassification"),
360363
("mobilebert", "MobileBertForSequenceClassification"),
@@ -417,6 +420,7 @@
417420
("camembert", "CamembertForTokenClassification"),
418421
("deberta", "DebertaForTokenClassification"),
419422
("deberta-v2", "DebertaV2ForTokenClassification"),
423+
("starcoder2", "Starcoder2ForTokenClassification"),
420424
("glm", "GlmForTokenClassification"),
421425
("helium", "HeliumForTokenClassification"),
422426
("mistral", "MistralForTokenClassification"),
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright 2024 The HuggingFace Team. All rights reserved.
2+
#
3+
# This code is adapted from https://github.com/huggingface/transformers
4+
# with modifications to run transformers on mindspore.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
from .modeling_starcoder2 import (
18+
Starcoder2ForCausalLM,
19+
Starcoder2ForSequenceClassification,
20+
Starcoder2ForTokenClassification,
21+
Starcoder2Model,
22+
Starcoder2PreTrainedModel,
23+
)

0 commit comments

Comments (0)