Skip to content

Commit d8bdeeb

Browse files
authored
[model] Support ZhipuAI/GLM-4.5V (#5346)
1 parent 1a08750 commit d8bdeeb

File tree

17 files changed

+246
-152
lines changed

17 files changed

+246
-152
lines changed

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,7 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还
735735
- HD_NUM: 图片数量为1时,默认值为24。大于1,默认为6。参考[这里](https://modelscope.cn/models/AI-ModelScope/internlm-xcomposer2d5-7b/file/view/master?fileName=modeling_internlm_xcomposer2.py&status=1#L254)
736736

737737
### video_cogvlm2
738-
- NUM_FRAMES: 默认为24,参考[这里](https://github.com/THUDM/CogVLM2/blob/main/video_demo/inference.py#L22)
738+
- NUM_FRAMES: 默认为24,参考[这里](https://github.com/zai-org/CogVLM2/blob/main/video_demo/inference.py#L22)
739739

740740
### phi3_vision
741741
- NUM_CROPS: 默认为4,参考[这里](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct)

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 42 additions & 40 deletions
Large diffs are not rendered by default.

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,7 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
752752
- HD_NUM: Default is 24 when the number of images is 1. Greater than 1, the default is 6. Refer to [here](https://modelscope.cn/models/AI-ModelScope/internlm-xcomposer2d5-7b/file/view/master?fileName=modeling_internlm_xcomposer2.py&status=1#L254)
753753

754754
### video_cogvlm2
755-
- NUM_FRAMES: Default is 24, refer to [here](https://github.com/THUDM/CogVLM2/blob/main/video_demo/inference.py#L22)
755+
- NUM_FRAMES: Default is 24, refer to [here](https://github.com/zai-org/CogVLM2/blob/main/video_demo/inference.py#L22)
756756

757757
### phi3_vision
758758
- NUM_CROPS: Default is 4, refer to [here](https://modelscope.cn/models/LLM-Research/Phi-3.5-vision-instruct)

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 42 additions & 40 deletions
Large diffs are not rendered by default.

swift/llm/dataset/data/dataset_info.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,7 @@
574574
{
575575
"ms_dataset_id": "ZhipuAI/LongWriter-6k",
576576
"tags": ["long", "chat", "sft", "🔥"],
577-
"hf_dataset_id": "THUDM/LongWriter-6k"
577+
"hf_dataset_id": "zai-org/LongWriter-6k"
578578
},
579579
{
580580
"ms_dataset_id": "swift/longwriter-6k-filtered",

swift/llm/model/constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ class MLLMModelType:
162162

163163
glm4v = 'glm4v'
164164
glm4_1v = 'glm4_1v'
165+
glm4_5v = 'glm4_5v'
165166
glm_edge_v = 'glm_edge_v'
166167
cogvlm = 'cogvlm'
167168
cogagent_vqa = 'cogagent_vqa'

swift/llm/model/model/glm.py

Lines changed: 62 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,12 @@ def cross_entropy_forward(self, inputs: torch.Tensor, target: torch.Tensor) -> t
7676
ModelMeta(
7777
LLMModelType.chatglm2, [
7878
ModelGroup([
79-
Model('ZhipuAI/chatglm2-6b', 'THUDM/chatglm2-6b'),
80-
Model('ZhipuAI/chatglm2-6b-32k', 'THUDM/chatglm2-6b-32k')
79+
Model('ZhipuAI/chatglm2-6b', 'zai-org/chatglm2-6b'),
80+
Model('ZhipuAI/chatglm2-6b-32k', 'zai-org/chatglm2-6b-32k')
8181
],
8282
requires=['transformers<4.42']),
8383
ModelGroup(
84-
[Model('ZhipuAI/codegeex2-6b', 'THUDM/codegeex2-6b')],
84+
[Model('ZhipuAI/codegeex2-6b', 'zai-org/codegeex2-6b')],
8585
requires=['transformers<4.34'],
8686
tags=['coding'],
8787
),
@@ -95,10 +95,10 @@ def cross_entropy_forward(self, inputs: torch.Tensor, target: torch.Tensor) -> t
9595
ModelMeta(
9696
LLMModelType.chatglm3, [
9797
ModelGroup([
98-
Model('ZhipuAI/chatglm3-6b', 'THUDM/chatglm3-6b'),
99-
Model('ZhipuAI/chatglm3-6b-base', 'THUDM/chatglm3-6b-base'),
100-
Model('ZhipuAI/chatglm3-6b-32k', 'THUDM/chatglm3-6b-32k'),
101-
Model('ZhipuAI/chatglm3-6b-128k', 'THUDM/chatglm3-6b-128k'),
98+
Model('ZhipuAI/chatglm3-6b', 'zai-org/chatglm3-6b'),
99+
Model('ZhipuAI/chatglm3-6b-base', 'zai-org/chatglm3-6b-base'),
100+
Model('ZhipuAI/chatglm3-6b-32k', 'zai-org/chatglm3-6b-32k'),
101+
Model('ZhipuAI/chatglm3-6b-128k', 'zai-org/chatglm3-6b-128k'),
102102
])
103103
],
104104
TemplateType.glm4,
@@ -128,12 +128,12 @@ def get_model_tokenizer_glm4(model_dir: str,
128128
LLMModelType.glm4,
129129
[
130130
ModelGroup([
131-
Model('ZhipuAI/glm-4-9b-chat', 'THUDM/glm-4-9b-chat'),
132-
Model('ZhipuAI/glm-4-9b', 'THUDM/glm-4-9b'),
133-
Model('ZhipuAI/glm-4-9b-chat-1m', 'THUDM/glm-4-9b-chat-1m'),
131+
Model('ZhipuAI/glm-4-9b-chat', 'zai-org/glm-4-9b-chat'),
132+
Model('ZhipuAI/glm-4-9b', 'zai-org/glm-4-9b'),
133+
Model('ZhipuAI/glm-4-9b-chat-1m', 'zai-org/glm-4-9b-chat-1m'),
134134
]),
135135
ModelGroup([
136-
Model('ZhipuAI/LongWriter-glm4-9b', 'THUDM/LongWriter-glm4-9b'),
136+
Model('ZhipuAI/LongWriter-glm4-9b', 'zai-org/LongWriter-glm4-9b'),
137137
])
138138
],
139139
TemplateType.glm4,
@@ -148,11 +148,11 @@ def get_model_tokenizer_glm4(model_dir: str,
148148
LLMModelType.glm4_0414,
149149
[
150150
ModelGroup([
151-
Model('ZhipuAI/GLM-4-9B-0414', 'THUDM/GLM-4-9B-0414'),
152-
Model('ZhipuAI/GLM-4-32B-0414', 'THUDM/GLM-4-32B-0414'),
153-
Model('ZhipuAI/GLM-4-32B-Base-0414', 'THUDM/GLM-4-32B-Base-0414'),
154-
Model('ZhipuAI/GLM-Z1-9B-0414', 'THUDM/GLM-Z1-9B-0414'),
155-
Model('ZhipuAI/GLM-Z1-32B-0414', 'THUDM/GLM-Z1-32B-0414'),
151+
Model('ZhipuAI/GLM-4-9B-0414', 'zai-org/GLM-4-9B-0414'),
152+
Model('ZhipuAI/GLM-4-32B-0414', 'zai-org/GLM-4-32B-0414'),
153+
Model('ZhipuAI/GLM-4-32B-Base-0414', 'zai-org/GLM-4-32B-Base-0414'),
154+
Model('ZhipuAI/GLM-Z1-9B-0414', 'zai-org/GLM-Z1-9B-0414'),
155+
Model('ZhipuAI/GLM-Z1-32B-0414', 'zai-org/GLM-Z1-32B-0414'),
156156
])
157157
],
158158
TemplateType.glm4_0414,
@@ -166,7 +166,7 @@ def get_model_tokenizer_glm4(model_dir: str,
166166
ModelMeta(
167167
LLMModelType.glm4_z1_rumination,
168168
[ModelGroup([
169-
Model('ZhipuAI/GLM-Z1-Rumination-32B-0414', 'THUDM/GLM-Z1-Rumination-32B-0414'),
169+
Model('ZhipuAI/GLM-Z1-Rumination-32B-0414', 'zai-org/GLM-Z1-Rumination-32B-0414'),
170170
])],
171171
TemplateType.glm4_z1_rumination,
172172
get_model_tokenizer_with_flash_attn,
@@ -179,7 +179,7 @@ def get_model_tokenizer_glm4(model_dir: str,
179179
ModelMeta(
180180
LLMModelType.longwriter_llama3_1,
181181
[ModelGroup([
182-
Model('ZhipuAI/LongWriter-llama3.1-8b', 'THUDM/LongWriter-llama3.1-8b'),
182+
Model('ZhipuAI/LongWriter-llama3.1-8b', 'zai-org/LongWriter-llama3.1-8b'),
183183
])],
184184
TemplateType.longwriter_llama,
185185
get_model_tokenizer_with_flash_attn,
@@ -192,7 +192,7 @@ def get_model_tokenizer_glm4(model_dir: str,
192192
ModelMeta(
193193
LLMModelType.codegeex4,
194194
[ModelGroup([
195-
Model('ZhipuAI/codegeex4-all-9b', 'THUDM/codegeex4-all-9b'),
195+
Model('ZhipuAI/codegeex4-all-9b', 'zai-org/codegeex4-all-9b'),
196196
])],
197197
TemplateType.codegeex4,
198198
get_model_tokenizer_glm4,
@@ -231,13 +231,13 @@ def get_model_tokenizer_glm4v(model_dir: str,
231231
[
232232
ModelGroup(
233233
[
234-
Model('ZhipuAI/glm-4v-9b', 'THUDM/glm-4v-9b'),
234+
Model('ZhipuAI/glm-4v-9b', 'zai-org/glm-4v-9b'),
235235
],
236236
requires=['transformers>=4.42,<4.45'],
237237
),
238238
ModelGroup(
239239
[
240-
Model('ZhipuAI/cogagent-9b-20241220', 'THUDM/cogagent-9b-20241220'),
240+
Model('ZhipuAI/cogagent-9b-20241220', 'zai-org/cogagent-9b-20241220'),
241241
],
242242
requires=['transformers>=4.42'],
243243
)
@@ -268,8 +268,8 @@ def get_model_tokenizer_glm4_1v(*args, **kwargs):
268268
[
269269
ModelGroup(
270270
[
271-
Model('ZhipuAI/GLM-4.1V-9B-Base', 'THUDM/GLM-4.1V-9B-Base'),
272-
Model('ZhipuAI/GLM-4.1V-9B-Thinking', 'THUDM/GLM-4.1V-9B-Thinking'),
271+
Model('ZhipuAI/GLM-4.1V-9B-Base', 'zai-org/GLM-4.1V-9B-Base'),
272+
Model('ZhipuAI/GLM-4.1V-9B-Thinking', 'zai-org/GLM-4.1V-9B-Thinking'),
273273
],
274274
requires=['transformers>=4.53'],
275275
),
@@ -301,7 +301,7 @@ def get_model_tokenizer_cogvlm(model_dir: str,
301301
ModelMeta(
302302
MLLMModelType.cogvlm, [
303303
ModelGroup([
304-
Model('ZhipuAI/cogvlm-chat', 'THUDM/cogvlm-chat-hf'),
304+
Model('ZhipuAI/cogvlm-chat', 'zai-org/cogvlm-chat-hf'),
305305
]),
306306
],
307307
TemplateType.cogvlm,
@@ -314,7 +314,7 @@ def get_model_tokenizer_cogvlm(model_dir: str,
314314
ModelMeta(
315315
MLLMModelType.cogagent_chat, [
316316
ModelGroup([
317-
Model('ZhipuAI/cogagent-chat', 'THUDM/cogagent-chat-hf'),
317+
Model('ZhipuAI/cogagent-chat', 'zai-org/cogagent-chat-hf'),
318318
]),
319319
],
320320
TemplateType.cogagent_chat,
@@ -326,7 +326,7 @@ def get_model_tokenizer_cogvlm(model_dir: str,
326326
register_model(
327327
ModelMeta(
328328
MLLMModelType.cogagent_vqa, [ModelGroup([
329-
Model('ZhipuAI/cogagent-vqa', 'THUDM/cogagent-vqa-hf'),
329+
Model('ZhipuAI/cogagent-vqa', 'zai-org/cogagent-vqa-hf'),
330330
])],
331331
TemplateType.cogagent_vqa,
332332
get_model_tokenizer_cogvlm,
@@ -353,8 +353,8 @@ def get_model_tokenizer_cogvlm2(*args, **kwargs):
353353
ModelMeta(
354354
MLLMModelType.cogvlm2, [
355355
ModelGroup([
356-
Model('ZhipuAI/cogvlm2-llama3-chat-19B', 'THUDM/cogvlm2-llama3-chat-19B'),
357-
Model('ZhipuAI/cogvlm2-llama3-chinese-chat-19B', 'THUDM/cogvlm2-llama3-chinese-chat-19B'),
356+
Model('ZhipuAI/cogvlm2-llama3-chat-19B', 'zai-org/cogvlm2-llama3-chat-19B'),
357+
Model('ZhipuAI/cogvlm2-llama3-chinese-chat-19B', 'zai-org/cogvlm2-llama3-chinese-chat-19B'),
358358
]),
359359
],
360360
TemplateType.cogvlm2,
@@ -368,7 +368,7 @@ def get_model_tokenizer_cogvlm2(*args, **kwargs):
368368
MLLMModelType.cogvlm2_video,
369369
[
370370
ModelGroup([
371-
Model('ZhipuAI/cogvlm2-video-llama3-chat', 'THUDM/cogvlm2-video-llama3-chat'),
371+
Model('ZhipuAI/cogvlm2-video-llama3-chat', 'zai-org/cogvlm2-video-llama3-chat'),
372372
]),
373373
],
374374
TemplateType.cogvlm2_video,
@@ -384,8 +384,8 @@ def get_model_tokenizer_cogvlm2(*args, **kwargs):
384384
LLMModelType.glm_edge,
385385
[
386386
ModelGroup([
387-
Model('ZhipuAI/glm-edge-1.5b-chat', 'THUDM/glm-edge-1.5b-chat'),
388-
Model('ZhipuAI/glm-edge-4b-chat', 'THUDM/glm-edge-4b-chat'),
387+
Model('ZhipuAI/glm-edge-1.5b-chat', 'zai-org/glm-edge-1.5b-chat'),
388+
Model('ZhipuAI/glm-edge-4b-chat', 'zai-org/glm-edge-4b-chat'),
389389
]),
390390
],
391391
TemplateType.glm4,
@@ -408,8 +408,8 @@ def get_model_tokenizer_glm_edge_v(model_dir: str, *args, **kwargs):
408408
MLLMModelType.glm_edge_v,
409409
[
410410
ModelGroup([
411-
Model('ZhipuAI/glm-edge-v-2b', 'THUDM/glm-edge-v-2b'),
412-
Model('ZhipuAI/glm-edge-4b-chat', 'THUDM/glm-edge-4b-chat'),
411+
Model('ZhipuAI/glm-edge-v-2b', 'zai-org/glm-edge-v-2b'),
412+
Model('ZhipuAI/glm-edge-4b-chat', 'zai-org/glm-edge-4b-chat'),
413413
]),
414414
],
415415
TemplateType.glm_edge_v,
@@ -425,16 +425,39 @@ def get_model_tokenizer_glm_edge_v(model_dir: str, *args, **kwargs):
425425
LLMModelType.glm4_5,
426426
[
427427
ModelGroup([
428-
Model('ZhipuAI/GLM-4.5-Air-Base', 'THUDM/GLM-4.5-Air-Base'),
429-
Model('ZhipuAI/GLM-4.5-Air', 'THUDM/GLM-4.5-Air'),
430-
Model('ZhipuAI/GLM-4.5-Air-FP8', 'THUDM/GLM-4.5-Air-FP8'),
431-
Model('ZhipuAI/GLM-4.5-Base', 'THUDM/GLM-4.5-Base'),
432-
Model('ZhipuAI/GLM-4.5', 'THUDM/GLM-4.5'),
433-
Model('ZhipuAI/GLM-4.5-FP8', 'THUDM/GLM-4.5-FP8'),
428+
Model('ZhipuAI/GLM-4.5-Air-Base', 'zai-org/GLM-4.5-Air-Base'),
429+
Model('ZhipuAI/GLM-4.5-Air', 'zai-org/GLM-4.5-Air'),
430+
Model('ZhipuAI/GLM-4.5-Air-FP8', 'zai-org/GLM-4.5-Air-FP8'),
431+
Model('ZhipuAI/GLM-4.5-Base', 'zai-org/GLM-4.5-Base'),
432+
Model('ZhipuAI/GLM-4.5', 'zai-org/GLM-4.5'),
433+
Model('ZhipuAI/GLM-4.5-FP8', 'zai-org/GLM-4.5-FP8'),
434434
]),
435435
],
436436
TemplateType.glm4_5,
437437
get_model_tokenizer_with_flash_attn,
438438
architectures=['Glm4MoeForCausalLM'],
439439
requires=['transformers>=4.54'],
440440
))
441+
442+
443+
def get_model_tokenizer_glm4_5v(*args, **kwargs):
444+
from transformers import Glm4vMoeForConditionalGeneration
445+
kwargs['automodel_class'] = kwargs['automodel_class'] or Glm4vMoeForConditionalGeneration
446+
return get_model_tokenizer_multimodal(*args, **kwargs)
447+
448+
449+
register_model(
450+
ModelMeta(
451+
MLLMModelType.glm4_5v,
452+
[
453+
ModelGroup([
454+
Model('ZhipuAI/GLM-4.5V', 'zai-org/GLM-4.5V'),
455+
Model('ZhipuAI/GLM-4.5V-FP8', 'zai-org/GLM-4.5V-FP8'),
456+
]),
457+
],
458+
TemplateType.glm4_5v,
459+
get_model_tokenizer_glm4_5v,
460+
model_arch=ModelArch.glm4_1v,
461+
architectures=['Glm4vMoeForConditionalGeneration'],
462+
requires=['transformers>=4.56.0.dev'],
463+
))

swift/llm/template/base.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1694,6 +1694,11 @@ def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
16941694
pixel_values_videos = [b['pixel_values_videos'] for b in batch if b.get('pixel_values_videos') is not None]
16951695
if len(pixel_values_videos) > 0:
16961696
res['pixel_values_videos'] = torch.concat(pixel_values_videos)
1697+
1698+
for media_type in ['image', 'video']:
1699+
grid_thw = self.concat_tensor(batch, f'{media_type}_grid_thw', 0)
1700+
if grid_thw is not None:
1701+
res[f'{media_type}_grid_thw'] = grid_thw
16971702
return res
16981703

16991704
def _sp_data_collator(self, res, padding_to, tokenizer, padding_side):

swift/llm/template/constant.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ class MLLMTemplateType:
160160
glm4v = 'glm4v'
161161
glm4_1v = 'glm4_1v'
162162
glm_edge_v = 'glm_edge_v'
163+
glm4_5v = 'glm4_5v'
163164

164165
minicpmv = 'minicpmv'
165166
minicpmv2_5 = 'minicpmv2_5'

swift/llm/template/template/dots.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,6 @@ def _get_new_tokens(i):
4949
encoded['loss_scale'] = loss_scale
5050
return encoded
5151

52-
def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
53-
res = super()._data_collator_mm_data(batch)
54-
grid_thw = self.concat_tensor(batch, 'image_grid_thw', 0)
55-
if grid_thw is not None:
56-
res['image_grid_thw'] = grid_thw
57-
return res
58-
5952

6053
register_template(
6154
TemplateMeta(

0 commit comments

Comments (0)