Skip to content

Commit f52e327

Browse files
authored
[model] Support MiMo-VL (#4429)
1 parent 1454e1f commit f52e327

File tree

11 files changed

+53
-6
lines changed

11 files changed

+53
-6
lines changed

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,7 @@
540540
|[XiaomiMiMo/MiMo-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-SFT)|mimo|qwen|transformers>=4.37|✘|-|[XiaomiMiMo/MiMo-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-7B-SFT)|
541541
|[XiaomiMiMo/MiMo-7B-RL-Zero](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-Zero)|mimo|qwen|transformers>=4.37|✘|-|[XiaomiMiMo/MiMo-7B-RL-Zero](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-Zero)|
542542
|[XiaomiMiMo/MiMo-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL)|mimo|qwen|transformers>=4.37|✘|-|[XiaomiMiMo/MiMo-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL)|
543+
|[XiaomiMiMo/MiMo-7B-RL-0530](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-0530)|mimo_rl|mimo_rl|transformers>=4.37|✘|-|[XiaomiMiMo/MiMo-7B-RL-0530](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-0530)|
543544
|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
544545
|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|✘|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
545546
|[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|✘|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)|
@@ -614,6 +615,8 @@
614615
|[AIDC-AI/Ovis2-8B](https://modelscope.cn/models/AIDC-AI/Ovis2-8B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-8B](https://huggingface.co/AIDC-AI/Ovis2-8B)|
615616
|[AIDC-AI/Ovis2-16B](https://modelscope.cn/models/AIDC-AI/Ovis2-16B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-16B](https://huggingface.co/AIDC-AI/Ovis2-16B)|
616617
|[AIDC-AI/Ovis2-34B](https://modelscope.cn/models/AIDC-AI/Ovis2-34B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-34B](https://huggingface.co/AIDC-AI/Ovis2-34B)|
618+
|[XiaomiMiMo/MiMo-VL-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-SFT)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-SFT)|
619+
|[XiaomiMiMo/MiMo-VL-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-RL)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL)|
617620
|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42,<4.45|&#x2718;|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
618621
|[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|&#x2718;|-|[THUDM/cogagent-9b-20241220](https://huggingface.co/THUDM/cogagent-9b-20241220)|
619622
|[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|&#x2718;|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,7 @@ The table below introduces the models integrated with ms-swift:
540540
|[XiaomiMiMo/MiMo-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-SFT)|mimo|qwen|transformers>=4.37|&#x2718;|-|[XiaomiMiMo/MiMo-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-7B-SFT)|
541541
|[XiaomiMiMo/MiMo-7B-RL-Zero](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-Zero)|mimo|qwen|transformers>=4.37|&#x2718;|-|[XiaomiMiMo/MiMo-7B-RL-Zero](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-Zero)|
542542
|[XiaomiMiMo/MiMo-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL)|mimo|qwen|transformers>=4.37|&#x2718;|-|[XiaomiMiMo/MiMo-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL)|
543+
|[XiaomiMiMo/MiMo-7B-RL-0530](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-0530)|mimo_rl|mimo_rl|transformers>=4.37|&#x2718;|-|[XiaomiMiMo/MiMo-7B-RL-0530](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-0530)|
543544
|[answerdotai/ModernBERT-base](https://modelscope.cn/models/answerdotai/ModernBERT-base)|modern_bert|dummy|transformers>=4.48|&#x2718;|bert|[answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)|
544545
|[answerdotai/ModernBERT-large](https://modelscope.cn/models/answerdotai/ModernBERT-large)|modern_bert|dummy|transformers>=4.48|&#x2718;|bert|[answerdotai/ModernBERT-large](https://huggingface.co/answerdotai/ModernBERT-large)|
545546
|[iic/gte-modernbert-base](https://modelscope.cn/models/iic/gte-modernbert-base)|modern_bert_gte|dummy|transformers>=4.48|&#x2718;|bert, embedding|[Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base)|
@@ -614,6 +615,8 @@ The table below introduces the models integrated with ms-swift:
614615
|[AIDC-AI/Ovis2-8B](https://modelscope.cn/models/AIDC-AI/Ovis2-8B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-8B](https://huggingface.co/AIDC-AI/Ovis2-8B)|
615616
|[AIDC-AI/Ovis2-16B](https://modelscope.cn/models/AIDC-AI/Ovis2-16B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-16B](https://huggingface.co/AIDC-AI/Ovis2-16B)|
616617
|[AIDC-AI/Ovis2-34B](https://modelscope.cn/models/AIDC-AI/Ovis2-34B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-34B](https://huggingface.co/AIDC-AI/Ovis2-34B)|
618+
|[XiaomiMiMo/MiMo-VL-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-SFT)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-SFT)|
619+
|[XiaomiMiMo/MiMo-VL-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-RL)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL)|
617620
|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42,<4.45|&#x2718;|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
618621
|[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|&#x2718;|-|[THUDM/cogagent-9b-20241220](https://huggingface.co/THUDM/cogagent-9b-20241220)|
619622
|[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|&#x2718;|vision|[THUDM/glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b)|

swift/llm/model/constant.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class LLMModelType:
114114
aya = 'aya'
115115
moonlight = 'moonlight'
116116
mimo = 'mimo'
117+
mimo_rl = 'mimo_rl'
117118

118119

119120
class BertModelType:
@@ -143,6 +144,7 @@ class MLLMModelType:
143144
ovis1_6 = 'ovis1_6'
144145
ovis1_6_llama3 = 'ovis1_6_llama3'
145146
ovis2 = 'ovis2'
147+
mimo_vl = 'mimo_vl'
146148

147149
glm4v = 'glm4v'
148150
glm_edge_v = 'glm_edge_v'

swift/llm/model/model/llm.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,3 +317,14 @@ def forward(self, **kwargs):
317317
model_arch=ModelArch.llama,
318318
architectures=['MiMoForCausalLM'],
319319
requires=['transformers>=4.37']))
320+
321+
register_model(
322+
ModelMeta(
323+
LLMModelType.mimo_rl, [ModelGroup([
324+
Model('XiaomiMiMo/MiMo-7B-RL-0530', 'XiaomiMiMo/MiMo-7B-RL-0530'),
325+
])],
326+
TemplateType.mimo_rl,
327+
get_model_tokenizer_with_flash_attn,
328+
model_arch=ModelArch.llama,
329+
architectures=['MiMoForCausalLM'],
330+
requires=['transformers>=4.37']))

swift/llm/model/model/microsoft.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ def get_model_tokenizer_phi(model_dir: str,
195195
Model('LLM-Research/Phi-3-medium-128k-instruct', 'microsoft/Phi-3-medium-128k-instruct'),
196196
Model('LLM-Research/Phi-3.5-mini-instruct', 'microsoft/Phi-3.5-mini-instruct'),
197197
]),
198-
ModelGroup(Model('LLM-Research/Phi-4-mini-instruct', 'microsoft/Phi-4-mini-instruct'))
198+
ModelGroup([Model('LLM-Research/Phi-4-mini-instruct', 'microsoft/Phi-4-mini-instruct')])
199199
],
200200
TemplateType.phi3,
201201
get_model_tokenizer_with_flash_attn,

swift/llm/model/model/qwen.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,21 @@ def get_model_tokenizer_qwen2_5_vl(*args, **kwargs):
685685
requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'],
686686
tags=['vision', 'video']))
687687

688+
register_model(
689+
ModelMeta(
690+
MLLMModelType.mimo_vl, [
691+
ModelGroup([
692+
Model('XiaomiMiMo/MiMo-VL-7B-SFT', 'XiaomiMiMo/MiMo-VL-7B-SFT'),
693+
Model('XiaomiMiMo/MiMo-VL-7B-RL', 'XiaomiMiMo/MiMo-VL-7B-RL'),
694+
])
695+
],
696+
TemplateType.mimo_vl,
697+
get_model_tokenizer_qwen2_5_vl,
698+
model_arch=ModelArch.qwen2_vl,
699+
architectures=['Qwen2_5_VLForConditionalGeneration'],
700+
requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'],
701+
tags=['vision', 'video']))
702+
688703

689704
def get_model_tokenizer_qwen2_5_omni(model_dir, *args, **kwargs):
690705
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, Qwen2_5OmniConfig

swift/llm/model/register.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,7 @@ class ModelGroup:
4848
tags: List[str] = field(default_factory=list)
4949

5050
def __post_init__(self):
51-
if not isinstance(self.models, (tuple, list)):
52-
self.models = [self.models]
51+
assert isinstance(self.models, (tuple, list)), f'self.models: {self.models}'
5352

5453

5554
@dataclass

swift/llm/template/constant.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ class LLMTemplateType:
8080
bluelm = 'bluelm'
8181
orion = 'orion'
8282
moonlight = 'moonlight'
83+
mimo_rl = 'mimo_rl'
8384

8485
aya = 'aya'
8586
c4ai = 'c4ai'
@@ -102,6 +103,7 @@ class MLLMTemplateType:
102103
ovis1_6 = 'ovis1_6'
103104
ovis1_6_llama3 = 'ovis1_6_llama3'
104105
ovis2 = 'ovis2'
106+
mimo_vl = 'mimo_vl'
105107

106108
llama3_1_omni = 'llama3_1_omni'
107109
llama3_2_vision = 'llama3_2_vision'

swift/llm/template/template/llm.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,3 +272,9 @@ class TeleChatTemplateMeta(TemplateMeta):
272272
chat_sep=[],
273273
suffix=['<|endoftext|>'],
274274
))
275+
276+
register_template(
277+
QwenTemplateMeta(
278+
LLMTemplateType.mimo_rl,
279+
default_system='You are MiMo, an AI assistant developed by Xiaomi.',
280+
))

swift/llm/template/template/qwen.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,12 @@ class Qwen2_5VLTemplate(Qwen2VLTemplate):
397397

398398
register_template(QwenTemplateMeta(MLLMTemplateType.qwen2_5_vl, template_cls=Qwen2_5VLTemplate))
399399

400+
register_template(
401+
QwenTemplateMeta(
402+
MLLMTemplateType.mimo_vl,
403+
template_cls=Qwen2_5VLTemplate,
404+
default_system='You are MiMo, an AI assistant developed by Xiaomi.'))
405+
400406

401407
class Qwen2_5OmniTemplate(Qwen2_5VLTemplate):
402408
version = 'omni'

0 commit comments

Comments
 (0)