modelscope
diff --git a/‎docs/source/Instruction/命令行参数.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/Instruction/命令行参数.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/Instruction/支持的模型和数据集.md‎
Lines changed: 2 additions & 1 deletion b/‎docs/source/Instruction/支持的模型和数据集.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/source/Megatron-SWIFT/多模态模型.md‎
Lines changed: 1 addition & 3 deletions b/‎docs/source/Megatron-SWIFT/多模态模型.md‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎docs/source_en/Instruction/Command-line-parameters.md‎
Lines changed: 2 additions & 1 deletion b/‎docs/source_en/Instruction/Command-line-parameters.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/source_en/Instruction/Supported-models-and-datasets.md‎
Lines changed: 2 additions & 1 deletion b/‎docs/source_en/Instruction/Supported-models-and-datasets.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/source_en/Megatron-SWIFT/Multimodal-Model.md‎
Lines changed: 1 addition & 3 deletions b/‎docs/source_en/Megatron-SWIFT/Multimodal-Model.md‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎examples/megatron/multimodal/moe/glm4_5v.sh‎
Lines changed: 45 additions & 0 deletions b/‎examples/megatron/multimodal/moe/glm4_5v.sh‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎examples/megatron/multimodal/moe/lora.sh‎
Lines changed: 0 additions & 1 deletion b/‎examples/megatron/multimodal/moe/lora.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎swift/llm/model/model/glm.py‎
Lines changed: 4 additions & 1 deletion b/‎swift/llm/model/model/glm.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎swift/llm/model/model/moonshot.py‎
Lines changed: 3 additions & 0 deletions b/‎swift/llm/model/model/moonshot.py‎
Lines changed: 3 additions & 0 deletions
@@ -148,6 +148,7 @@
   - 注意：多模态模型且是LoRA训练时，当设置了`--freeze_vit false`，且命令行中出现以下警告：`UserWarning: None of the inputs have requires_grad=True. Gradients will be None`，请设置`--vit_gradient_checkpointing false`，或提相关issue。全参数训练则不会出现该问题。
 - 🔥deepspeed: 默认为None。可以设置为'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'来使用ms-swift内置的deepspeed配置文件。你也可以传入自定义deepspeed配置文件的路径。
 - zero_hpz_partition_size: 默认为None，这个参数是ZeRO++的特性，即node内模型分片，node间数据分片，如果遇到grad_norm NaN，请尝试使用`--torch_dtype float16`。
+- deepspeed_autotp_size: DeepSpeed张量并行大小，默认为1。使用DeepSpeed AutoTP时需将参数`--deepspeed`设置为'zero0'、'zero1'或'zero2'。（注意：该功能只支持全参数训练）
 - 🔥per_device_train_batch_size: 默认值1。
 - 🔥per_device_eval_batch_size: 默认值1。
 - 🔥gradient_accumulation_steps: 梯度累加，默认为None，即设置gradient_accumulation_steps使得total_batch_size>=16。total_batch_size等于`per_device_train_batch_size * gradient_accumulation_steps * world_size`, 在GRPO训练中，默认为1。
 
@@ -589,6 +589,7 @@
 |[moonshotai/Moonlight-16B-A3B-Instruct](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Moonlight-16B-A3B-Instruct](https://huggingface.co/moonshotai/Moonlight-16B-A3B-Instruct)|
 |[moonshotai/Kimi-K2-Base](https://modelscope.cn/models/moonshotai/Kimi-K2-Base)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Kimi-K2-Base](https://huggingface.co/moonshotai/Kimi-K2-Base)|
 |[moonshotai/Kimi-K2-Instruct](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct)|
+|[moonshotai/Kimi-K2-Instruct-0905](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct-0905)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Kimi-K2-Instruct-0905](https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905)|
 |[XiaomiMiMo/MiMo-7B-Base](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-Base)|mimo|qwen|transformers>=4.37|&#x2714;|-|[XiaomiMiMo/MiMo-7B-Base](https://huggingface.co/XiaomiMiMo/MiMo-7B-Base)|
 |[XiaomiMiMo/MiMo-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-SFT)|mimo|qwen|transformers>=4.37|&#x2714;|-|[XiaomiMiMo/MiMo-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-7B-SFT)|
 |[XiaomiMiMo/MiMo-7B-RL-Zero](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-Zero)|mimo|qwen|transformers>=4.37|&#x2714;|-|[XiaomiMiMo/MiMo-7B-RL-Zero](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-Zero)|
@@ -709,7 +710,7 @@
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|&#x2718;|-|[zai-org/cogagent-9b-20241220](https://huggingface.co/zai-org/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|&#x2718;|-|[zai-org/GLM-4.1V-9B-Base](https://huggingface.co/zai-org/GLM-4.1V-9B-Base)|
 |[ZhipuAI/GLM-4.1V-9B-Thinking](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Thinking)|glm4_1v|glm4_1v|transformers>=4.53|&#x2718;|-|[zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)|
-|[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56.0.dev|&#x2718;|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
+|[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56.0.dev|&#x2714;|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
 |[ZhipuAI/GLM-4.5V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5V-FP8)|glm4_5v|glm4_5v|transformers>=4.56.0.dev|&#x2718;|-|[zai-org/GLM-4.5V-FP8](https://huggingface.co/zai-org/GLM-4.5V-FP8)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|&#x2718;|vision|[zai-org/glm-edge-v-2b](https://huggingface.co/zai-org/glm-edge-v-2b)|
 |[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge_v|glm_edge_v|transformers>=4.46|&#x2718;|vision|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
 
@@ -1,6 +1,6 @@
 # 多模态模型
 
-ms-swift引入了Megatron的并行技术来加速多模态大模型的训练。目前支持Qwen2.5-VL, Qwen2.5-Omni等模型的CPT/SFT/DPO。完整支持的模型可以参考[支持的模型与数据集文档](../Instruction/支持的模型和数据集.md)。
+ms-swift引入了Megatron的并行技术来加速多模态大模型的训练。目前支持Qwen2.5-VL, Qwen2.5-Omni, InternVL3.5, GLM-4.5V等模型的CPT/SFT/DPO。完整支持的模型可以参考[支持的模型与数据集文档](../Instruction/支持的模型和数据集.md)。
 
 环境准备请参考Megatron-SWIFT的[快速开始文档](./快速开始.md)。
 
@@ -165,7 +165,6 @@ Moe模型的模型转换步骤和Dense模型一致（请参考Dense进行修改
 # 2 * 43GiB, 8s/it
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=2 \
-MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0,1 \
 megatron sft \
     --load InternVL3_5-30B-A3B-mcore \
@@ -210,7 +209,6 @@ megatron sft \
 
 训练结束后，我们使用生成的HF格式权重对验证集进行推理：
 ```shell
-MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0 \
 swift infer \
     --model megatron_output/InternVL3_5-30B-A3B/vx-xxx-hf \
 
@@ -150,7 +150,8 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 - 🔥vit_gradient_checkpointing: Whether to enable gradient_checkpointing for the vit part during multi-modal model training. Defaults to None, meaning it is set to `gradient_checkpointing`. For an example, please refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/vit_gradient_checkpointing.sh).
   - Note: For multimodal models using LoRA training, when `--freeze_vit false` is set and the following warning appears in the command line: `UserWarning: None of the inputs have requires_grad=True. Gradients will be None`, please set `--vit_gradient_checkpointing false`, or raise a related issue. This problem does not occur during full-parameter training.
 - 🔥deepspeed: Defaults to None. It can be set to 'zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload' to use the built-in deepspeed configuration file of ms-swift. You can also provide a path to a custom DeepSpeed configuration file.
-- zero_hpz_partition_size: Default is `None`. This parameter is a feature of `ZeRO++`, which implements model sharding within nodes and data sharding between nodes. If you encounter grad_norm `NaN` issues, please try using `--torch_dtype float16`
+- zero_hpz_partition_size: Default is `None`. This parameter is a feature of `ZeRO++`, which implements model sharding within nodes and data sharding between nodes. If you encounter grad_norm `NaN` issues, please try using `--torch_dtype float16`.
+- deepspeed_autotp_size: DeepSpeed tensor parallelism size, default is 1. When using DeepSpeed AutoTP, the argument `--deepspeed` must be set to 'zero0', 'zero1', or 'zero2'. (Note: This feature only supports full-parameter training.)
 - 🔥per_device_train_batch_size: Default is 1.
 - 🔥per_device_eval_batch_size: Default is 1.
 - 🔥gradient_accumulation_steps: Gradient accumulation, default is None, meaning set gradient_accumulation_steps such that total_batch_size >= 16. The total_batch_size equals `per_device_train_batch_size * gradient_accumulation_steps * world_size`. In GRPO Training, the default is 1.
 
@@ -589,6 +589,7 @@ The table below introduces the models integrated with ms-swift:
 |[moonshotai/Moonlight-16B-A3B-Instruct](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Moonlight-16B-A3B-Instruct](https://huggingface.co/moonshotai/Moonlight-16B-A3B-Instruct)|
 |[moonshotai/Kimi-K2-Base](https://modelscope.cn/models/moonshotai/Kimi-K2-Base)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Kimi-K2-Base](https://huggingface.co/moonshotai/Kimi-K2-Base)|
 |[moonshotai/Kimi-K2-Instruct](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct)|
+|[moonshotai/Kimi-K2-Instruct-0905](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct-0905)|moonlight|moonlight|transformers<4.49|&#x2714;|-|[moonshotai/Kimi-K2-Instruct-0905](https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905)|
 |[XiaomiMiMo/MiMo-7B-Base](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-Base)|mimo|qwen|transformers>=4.37|&#x2714;|-|[XiaomiMiMo/MiMo-7B-Base](https://huggingface.co/XiaomiMiMo/MiMo-7B-Base)|
 |[XiaomiMiMo/MiMo-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-SFT)|mimo|qwen|transformers>=4.37|&#x2714;|-|[XiaomiMiMo/MiMo-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-7B-SFT)|
 |[XiaomiMiMo/MiMo-7B-RL-Zero](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-Zero)|mimo|qwen|transformers>=4.37|&#x2714;|-|[XiaomiMiMo/MiMo-7B-RL-Zero](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-Zero)|
@@ -709,7 +710,7 @@ The table below introduces the models integrated with ms-swift:
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|&#x2718;|-|[zai-org/cogagent-9b-20241220](https://huggingface.co/zai-org/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|&#x2718;|-|[zai-org/GLM-4.1V-9B-Base](https://huggingface.co/zai-org/GLM-4.1V-9B-Base)|
 |[ZhipuAI/GLM-4.1V-9B-Thinking](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Thinking)|glm4_1v|glm4_1v|transformers>=4.53|&#x2718;|-|[zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)|
-|[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56.0.dev|&#x2718;|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
+|[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56.0.dev|&#x2714;|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
 |[ZhipuAI/GLM-4.5V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5V-FP8)|glm4_5v|glm4_5v|transformers>=4.56.0.dev|&#x2718;|-|[zai-org/GLM-4.5V-FP8](https://huggingface.co/zai-org/GLM-4.5V-FP8)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|&#x2718;|vision|[zai-org/glm-edge-v-2b](https://huggingface.co/zai-org/glm-edge-v-2b)|
 |[ZhipuAI/glm-edge-4b-chat](https://modelscope.cn/models/ZhipuAI/glm-edge-4b-chat)|glm_edge_v|glm_edge_v|transformers>=4.46|&#x2718;|vision|[zai-org/glm-edge-4b-chat](https://huggingface.co/zai-org/glm-edge-4b-chat)|
 
@@ -1,6 +1,6 @@
 # Multimodal Models
 
-ms-swift introduces Megatron's parallelization techniques to accelerate the training of large multimodal models. Currently, it supports CPT/SFT/DPO for models such as Qwen2.5-VL, Qwen2.5-Omni. For a complete list of supported models, please refer to the [Supported Models and Datasets documentation](../Instruction/Supported-models-and-datasets.md).
+ms-swift introduces Megatron's parallelization techniques to accelerate the training of large multimodal models. Currently, it supports CPT/SFT/DPO for models such as Qwen2.5-VL, Qwen2.5-Omni, InternVL3.5, and GLM-4.5V. For a complete list of supported models, please refer to the [Supported Models and Datasets documentation](../Instruction/Supported-models-and-datasets.md).
 
 For environment setup, please refer to the Megatron-SWIFT [Quick Start guide](./Quick-start.md).
 
@@ -167,7 +167,6 @@ The model conversion steps for MoE models are the same as those for Dense models
 # 2 * 43GiB, 8s/it
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=2 \
-MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0,1 \
 megatron sft \
     --load InternVL3_5-30B-A3B-mcore \
@@ -212,7 +211,6 @@ megatron sft \
 
 After training is completed, we use the generated Hugging Face format weights to perform inference on the validation set:
 ```shell
-MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0 \
 swift infer \
     --model megatron_output/InternVL3_5-30B-A3B/vx-xxx-hf \
 
@@ -0,0 +1,45 @@
+# 4 * 66GiB, 6.4s/it
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+megatron sft \
+    --load GLM-4.5V-mcore \
+    --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+    --train_type lora \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --sequence_parallel true \
+    --freeze_llm false \
+    --freeze_vit true \
+    --freeze_aligner true \
+    --packing true \
+    --split_dataset_ratio 0.01 \
+    --tensor_model_parallel_size 4 \
+    --expert_tensor_parallel_size 1 \
+    --expert_model_parallel_size 4 \
+    --moe_permute_fusion true \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-3 \
+    --micro_batch_size 1 \
+    --global_batch_size 2 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --lr 1e-4 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-5 \
+    --max_epochs 1 \
+    --save megatron_output/GLM-4.5V-mcore \
+    --eval_interval 200 \
+    --save_interval 200 \
+    --vit_gradient_checkpointing true \
+    --max_length 2048 \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --attention_backend flash
@@ -1,7 +1,6 @@
 # 2 * 43GiB, 8s/it
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=2 \
-MAX_PIXELS=1003520 \
 CUDA_VISIBLE_DEVICES=0,1 \
 megatron sft \
     --load InternVL3_5-30B-A3B-mcore \
 
@@ -443,7 +443,10 @@ def get_model_tokenizer_glm_edge_v(model_dir: str, *args, **kwargs):
 def get_model_tokenizer_glm4_5v(*args, **kwargs):
     from transformers import Glm4vMoeForConditionalGeneration
     kwargs['automodel_class'] = kwargs['automodel_class'] or Glm4vMoeForConditionalGeneration
-    return get_model_tokenizer_multimodal(*args, **kwargs)
+    model, processor = get_model_tokenizer_multimodal(*args, **kwargs)
+    if model is not None:
+        patch_get_input_embeddings(model.visual, 'patch_embed')
+    return model, processor
 
 
 register_model(
 
@@ -18,6 +18,9 @@
                 Model('moonshotai/Kimi-K2-Base', 'moonshotai/Kimi-K2-Base'),
                 Model('moonshotai/Kimi-K2-Instruct', 'moonshotai/Kimi-K2-Instruct'),
             ]),
+            ModelGroup([
+                Model('moonshotai/Kimi-K2-Instruct-0905', 'moonshotai/Kimi-K2-Instruct-0905'),
+            ]),
         ],
         TemplateType.moonlight,
         get_model_tokenizer_with_flash_attn,