Commit a33857a

JimmyMa99 authored and Jintao-Huang committed
[model] support Midashenglm (#5325)
1 parent 4096a6a commit a33857a

File tree

23 files changed: +155 -57 lines

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 1 addition & 0 deletions
@@ -693,6 +693,7 @@
 |[AIDC-AI/Ovis2-34B](https://modelscope.cn/models/AIDC-AI/Ovis2-34B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-34B](https://huggingface.co/AIDC-AI/Ovis2-34B)|
 |[XiaomiMiMo/MiMo-VL-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-SFT)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-SFT)|
 |[XiaomiMiMo/MiMo-VL-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-RL)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL)|
+|[mispeech/midashenglm-7b](https://modelscope.cn/models/mispeech/midashenglm-7b)|midashenglm|midashenglm|transformers>=4.52, soundfile|&#x2718;|audio|[mispeech/midashenglm-7b](https://huggingface.co/mispeech/midashenglm-7b)|
 |[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42,<4.45|&#x2718;|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|&#x2718;|-|[THUDM/cogagent-9b-20241220](https://huggingface.co/THUDM/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|&#x2718;|-|[THUDM/GLM-4.1V-9B-Base](https://huggingface.co/THUDM/GLM-4.1V-9B-Base)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
@@ -693,6 +693,7 @@ The table below introduces the models integrated with ms-swift:
 |[AIDC-AI/Ovis2-34B](https://modelscope.cn/models/AIDC-AI/Ovis2-34B)|ovis2|ovis2|transformers>=4.46.2, moviepy<2|&#x2718;|vision|[AIDC-AI/Ovis2-34B](https://huggingface.co/AIDC-AI/Ovis2-34B)|
 |[XiaomiMiMo/MiMo-VL-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-SFT)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-SFT)|
 |[XiaomiMiMo/MiMo-VL-7B-RL](https://modelscope.cn/models/XiaomiMiMo/MiMo-VL-7B-RL)|mimo_vl|mimo_vl|transformers>=4.49, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[XiaomiMiMo/MiMo-VL-7B-RL](https://huggingface.co/XiaomiMiMo/MiMo-VL-7B-RL)|
+|[mispeech/midashenglm-7b](https://modelscope.cn/models/mispeech/midashenglm-7b)|midashenglm|midashenglm|transformers>=4.52, soundfile|&#x2718;|audio|[mispeech/midashenglm-7b](https://huggingface.co/mispeech/midashenglm-7b)|
 |[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b)|glm4v|glm4v|transformers>=4.42,<4.45|&#x2718;|-|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|&#x2718;|-|[THUDM/cogagent-9b-20241220](https://huggingface.co/THUDM/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|&#x2718;|-|[THUDM/GLM-4.1V-9B-Base](https://huggingface.co/THUDM/GLM-4.1V-9B-Base)|

swift/llm/model/constant.py

Lines changed: 1 addition & 0 deletions
@@ -158,6 +158,7 @@ class MLLMModelType:
     ovis1_6_llama3 = 'ovis1_6_llama3'
     ovis2 = 'ovis2'
     mimo_vl = 'mimo_vl'
+    midashenglm = 'midashenglm'

     glm4v = 'glm4v'
     glm4_1v = 'glm4_1v'

swift/llm/model/model/qwen.py

Lines changed: 23 additions & 0 deletions
@@ -800,6 +800,29 @@ def get_model_tokenizer_qwen2_5_omni(model_dir, *args, **kwargs):
     ))


+def get_model_tokenizer_midashenglm(*args, **kwargs):
+    model, tokenizer = get_model_tokenizer_multimodal(*args, **kwargs)
+    if model is not None:
+        model.audio_encoder.float()
+        patch_output_clone(model.decoder.model.embed_tokens)
+    return model, tokenizer
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.midashenglm,
+        [ModelGroup([
+            Model('mispeech/midashenglm-7b', 'mispeech/midashenglm-7b'),
+        ])],
+        TemplateType.midashenglm,
+        get_model_tokenizer_midashenglm,
+        model_arch=ModelArch.midashenglm,
+        architectures=['MiDashengLMModel'],
+        requires=['transformers>=4.52', 'soundfile'],
+        tags=['audio'],
+    ))
+
+
 def get_model_tokenizer_qwen2_audio(*args, **kwargs):
     from transformers import Qwen2AudioForConditionalGeneration
     kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen2AudioForConditionalGeneration
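
With this registration in place, the checkpoint can be pulled through ms-swift's standard loading path. A minimal sketch, assuming ms-swift at this commit is installed and the ModelScope checkpoint is reachable (loading downloads the full model; the printed attribute is an assumption based on how other registered models behave):

# Sketch: load MiDashengLM via the registry entry added above.
from swift.llm import get_model_tokenizer

model, processor = get_model_tokenizer('mispeech/midashenglm-7b')
print(model.model_meta.model_type)  # should report 'midashenglm' if the registration resolved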

swift/llm/model/model_arch.py

Lines changed: 10 additions & 0 deletions
@@ -76,6 +76,8 @@ class MLLMModelArch:
     mistral_2503 = 'mistral_2503'
     keye_vl = 'keye_vl'

+    midashenglm = 'midashenglm'
+

 class ModelArch(LLMModelArch, MLLMModelArch):
     pass

@@ -517,6 +519,14 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> None:
         generator=['talker', 'token2wav'],
     ))

+register_model_arch(
+    MultiModelKeys(
+        MLLMModelArch.midashenglm,
+        language_model='decoder',
+        aligner=['audio_projector'],
+        vision_tower=['audio_encoder'],
+    ))
+
 register_model_arch(
     MultiModelKeys(
         MLLMModelArch.glm4v,
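
The MultiModelKeys entry maps ms-swift's generic multimodal slots onto attribute-path prefixes of the loaded MiDashengLM model: decoder as the language model, audio_projector as the aligner, and audio_encoder filling the tower slot. A rough sketch of how such prefixes can be used to bucket parameters by component; the helper below is hypothetical and illustrative, not part of ms-swift:

# Hypothetical helper: group named parameters by the component prefixes registered above.
def group_params_by_component(model):
    prefixes = {
        'language_model': ('decoder',),
        'aligner': ('audio_projector',),
        'vision_tower': ('audio_encoder',),
    }
    groups = {key: [] for key in prefixes}
    groups['other'] = []
    for name, _ in model.named_parameters():
        for key, pres in prefixes.items():
            if name.startswith(pres):
                groups[key].append(name)
                break
        else:
            groups['other'].append(name)
    return groups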

swift/llm/template/base.py

Lines changed: 7 additions & 15 deletions
@@ -298,31 +298,23 @@ def _replace_start_image_tags(inputs: StdTemplateInputs):
         inputs.generate_mode = generate_mode

     @staticmethod
-    def _extend_tokens(input_ids: List[int], labels: Optional[List[int]], replace_idx_list: List[int],
-                       get_new_tokens: Callable[[int], List[int]]) -> Tuple[List[int], Optional[List[int]]]:
+    def _extend_tokens(
+            input_ids: List[int], labels: Optional[List[int]], loss_scale: Optional[List[float]],
+            replace_idx_list: List[int],
+            get_new_tokens: Callable[[int], List[int]]) -> Tuple[List[int], Optional[List[int]], Optional[List[float]]]:
         added_tokens_len = 0
         for i, idx in enumerate(replace_idx_list):
             new_tokens = get_new_tokens(i)
             token_len = len(new_tokens)
             input_ids = input_ids[:idx + added_tokens_len] + new_tokens + input_ids[added_tokens_len + idx + 1:]
             if labels:
                 labels = labels[:idx + added_tokens_len] + [-100] * token_len + labels[added_tokens_len + idx + 1:]
-            added_tokens_len += token_len - 1
-        return input_ids, labels
-
-    @staticmethod
-    def _extend_loss_scale(loss_scale: Optional[List[float]], replace_idx_list: List[int],
-                           get_new_tokens: Callable[[int], List[int]]) -> Optional[List[float]]:
-        if loss_scale:
-            added_tokens_len = 0
-            for i, idx in enumerate(replace_idx_list):
-                new_tokens = get_new_tokens(i)
-                token_len = len(new_tokens)
+            if loss_scale:
                 scale_idx = loss_scale[idx + added_tokens_len]
                 loss_scale = loss_scale[:idx + added_tokens_len] + [scale_idx] * token_len + loss_scale[added_tokens_len
                                                                                                         + idx + 1:]
-                added_tokens_len += token_len - 1
-        return loss_scale
+            added_tokens_len += token_len - 1
+        return input_ids, labels, loss_scale

     def forward_context(self, model, inputs):
         return nullcontext()
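
For context, this refactor folds the old _extend_loss_scale pass into _extend_tokens, so input_ids, labels, and loss_scale are all expanded in a single walk over the placeholder positions. A standalone sketch of the same splicing logic, outside the Template class and purely illustrative:

# Standalone sketch of the merged splicing logic (not the Template method itself).
from typing import Callable, List, Optional, Tuple

def extend_tokens(input_ids: List[int], labels: Optional[List[int]], loss_scale: Optional[List[float]],
                  replace_idx_list: List[int],
                  get_new_tokens: Callable[[int], List[int]]) -> Tuple[List[int], Optional[List[int]], Optional[List[float]]]:
    added = 0  # extra length introduced by earlier replacements
    for i, idx in enumerate(replace_idx_list):
        new_tokens = get_new_tokens(i)
        n = len(new_tokens)
        input_ids = input_ids[:idx + added] + new_tokens + input_ids[idx + added + 1:]
        if labels:
            labels = labels[:idx + added] + [-100] * n + labels[idx + added + 1:]  # mask the expanded span
        if loss_scale:
            s = loss_scale[idx + added]
            loss_scale = loss_scale[:idx + added] + [s] * n + loss_scale[idx + added + 1:]  # repeat the placeholder's scale
        added += n - 1
    return input_ids, labels, loss_scale

# Example: token id 9 is a multimodal placeholder expanded into three feature tokens.
ids, lab, ls = extend_tokens([1, 9, 2], [1, -100, 2], [1.0, 1.0, 1.0], [1], lambda _: [101, 102, 103])
assert ids == [1, 101, 102, 103, 2]
assert lab == [1, -100, -100, -100, 2]
assert ls == [1.0, 1.0, 1.0, 1.0, 1.0]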

swift/llm/template/constant.py

Lines changed: 1 addition & 0 deletions
@@ -117,6 +117,7 @@ class MLLMTemplateType:
     ovis1_6_llama3 = 'ovis1_6_llama3'
     ovis2 = 'ovis2'
     mimo_vl = 'mimo_vl'
+    midashenglm = 'midashenglm'

     llama3_1_omni = 'llama3_1_omni'
     llama3_2_vision = 'llama3_2_vision'

swift/llm/template/template/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,3 +1,3 @@
 from . import (baidu, bert, deepseek, emu3, gemma, glm, idefics3, internlm, internvl, kwai, llama, llava, llm, megrez,
-               microsoft, minicpm, minimax, mistral, molmo, moonshot, mplug, openbuddy, pixtral, qwen, stepfun, valley,
-               yi)
+               microsoft, midashenglm, minicpm, minimax, mistral, molmo, moonshot, mplug, openbuddy, pixtral, qwen,
+               stepfun, valley, yi)

swift/llm/template/template/emu3.py

Lines changed: 2 additions & 2 deletions
@@ -179,8 +179,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             image_prompts.append(self.tokenizer.encode(image_prompt))

         # Insert image tokens into input_ids
-        input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, lambda i: image_prompts[i])
-        loss_scale = self._extend_loss_scale(loss_scale, idx_list, lambda i: image_prompts[i])
+        input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
+                                                            lambda i: image_prompts[i])
         return {'input_ids': input_ids, 'labels': labels, 'loss_scale': loss_scale}



swift/llm/template/template/gemma.py

Lines changed: 6 additions & 6 deletions
@@ -112,8 +112,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         loss_scale = encoded.get('loss_scale', None)
         idx_list = findall(input_ids, self.boi_token_id)
         img_tokens = self._tokenize(self.processor.full_image_sequence)
-        input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, lambda _: img_tokens)
-        loss_scale = self._extend_loss_scale(loss_scale, idx_list, lambda _: img_tokens)
+        input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
+                                                            lambda _: img_tokens)

         # TODO: customize
         processor_kwargs = Gemma3ProcessorKwargs._defaults['images_kwargs']

@@ -171,8 +171,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         if inputs.images:
             idx_list = findall(input_ids, self.boi_token_id)
             img_tokens = self._tokenize(processor.full_image_sequence)
-            input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, lambda _: img_tokens)
-            loss_scale = self._extend_loss_scale(loss_scale, idx_list, lambda _: img_tokens)
+            input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list,
+                                                                lambda _: img_tokens)

             # Process images
             processor_kwargs = Gemma3nProcessorKwargs._defaults.get('images_kwargs', {})

@@ -188,8 +188,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         if audio_idx_list:
             # Get audio token sequence from processor
             audio_tokens = self._tokenize(processor.full_audio_sequence)
-            input_ids, labels = self._extend_tokens(input_ids, labels, audio_idx_list, lambda _: audio_tokens)
-            loss_scale = self._extend_loss_scale(loss_scale, audio_idx_list, lambda _: audio_tokens)
+            input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, audio_idx_list,
+                                                                lambda _: audio_tokens)

             # Process audios
             processor_kwargs = Gemma3nProcessorKwargs._defaults.get('audio_kwargs', {})
