7 files changed: +50 −5 lines changed (including a new example under examples/megatron/multimodal/moe)
@@ -589,6 +589,7 @@
| [moonshotai/Moonlight-16B-A3B-Instruct](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Moonlight-16B-A3B-Instruct](https://huggingface.co/moonshotai/Moonlight-16B-A3B-Instruct) |
| [moonshotai/Kimi-K2-Base](https://modelscope.cn/models/moonshotai/Kimi-K2-Base) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Kimi-K2-Base](https://huggingface.co/moonshotai/Kimi-K2-Base) |
| [moonshotai/Kimi-K2-Instruct](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct) |
+| [moonshotai/Kimi-K2-Instruct-0905](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct-0905) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Kimi-K2-Instruct-0905](https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905) |
| [XiaomiMiMo/MiMo-7B-Base](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-Base) | mimo | qwen | transformers>=4.37 | ✔ | - | [XiaomiMiMo/MiMo-7B-Base](https://huggingface.co/XiaomiMiMo/MiMo-7B-Base) |
| [XiaomiMiMo/MiMo-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-SFT) | mimo | qwen | transformers>=4.37 | ✔ | - | [XiaomiMiMo/MiMo-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-7B-SFT) |
| [XiaomiMiMo/MiMo-7B-RL-Zero](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-Zero) | mimo | qwen | transformers>=4.37 | ✔ | - | [XiaomiMiMo/MiMo-7B-RL-Zero](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-Zero) |
@@ -165,7 +165,6 @@ The model conversion steps for MoE models are the same as for Dense models (refer to Dense and modify accordingly)
# 2 * 43GiB, 8s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
-MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
    --load InternVL3_5-30B-A3B-mcore \
@@ -210,7 +209,6 @@ megatron sft \

After training finishes, we run inference on the validation set with the generated HF-format weights:
```shell
-MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --model megatron_output/InternVL3_5-30B-A3B/vx-xxx-hf \
@@ -589,6 +589,7 @@ The table below introduces the models integrated with ms-swift:
| [moonshotai/Moonlight-16B-A3B-Instruct](https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Moonlight-16B-A3B-Instruct](https://huggingface.co/moonshotai/Moonlight-16B-A3B-Instruct) |
| [moonshotai/Kimi-K2-Base](https://modelscope.cn/models/moonshotai/Kimi-K2-Base) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Kimi-K2-Base](https://huggingface.co/moonshotai/Kimi-K2-Base) |
| [moonshotai/Kimi-K2-Instruct](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct) |
+| [moonshotai/Kimi-K2-Instruct-0905](https://modelscope.cn/models/moonshotai/Kimi-K2-Instruct-0905) | moonlight | moonlight | transformers<4.49 | ✔ | - | [moonshotai/Kimi-K2-Instruct-0905](https://huggingface.co/moonshotai/Kimi-K2-Instruct-0905) |
| [XiaomiMiMo/MiMo-7B-Base](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-Base) | mimo | qwen | transformers>=4.37 | ✔ | - | [XiaomiMiMo/MiMo-7B-Base](https://huggingface.co/XiaomiMiMo/MiMo-7B-Base) |
| [XiaomiMiMo/MiMo-7B-SFT](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-SFT) | mimo | qwen | transformers>=4.37 | ✔ | - | [XiaomiMiMo/MiMo-7B-SFT](https://huggingface.co/XiaomiMiMo/MiMo-7B-SFT) |
| [XiaomiMiMo/MiMo-7B-RL-Zero](https://modelscope.cn/models/XiaomiMiMo/MiMo-7B-RL-Zero) | mimo | qwen | transformers>=4.37 | ✔ | - | [XiaomiMiMo/MiMo-7B-RL-Zero](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL-Zero) |
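With the new row in place, the checkpoint can be addressed directly by its ModelScope ID. A minimal smoke-test sketch follows; the pip pin and streaming flag are illustrative, and the Kimi-K2 checkpoints are trillion-parameter MoE models that additionally need a suitably large multi-GPU deployment:

```shell
# Hedged sketch: exercise the newly listed checkpoint by its model ID.
# Kimi-K2 needs transformers<4.49 (per the table) and far more GPU memory than
# a single device; treat this as the shape of the command, not a tested recipe.
pip install 'transformers<4.49'
swift infer \
    --model moonshotai/Kimi-K2-Instruct-0905 \
    --stream true
```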
@@ -167,7 +167,6 @@ The model conversion steps for MoE models are the same as those for Dense models
# 2 * 43GiB, 8s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
-MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
    --load InternVL3_5-30B-A3B-mcore \
@@ -212,7 +211,6 @@ megatron sft \

After training is completed, we use the generated Hugging Face format weights to perform inference on the validation set:
```shell
-MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --model megatron_output/InternVL3_5-30B-A3B/vx-xxx-hf \
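# Hedged note, not from the original docs: the MAX_PIXELS cap removed above
# remains available as an opt-in when single-GPU memory is tight, e.g.
#   MAX_PIXELS=1003520 swift infer --model megatron_output/InternVL3_5-30B-A3B/vx-xxx-hf
# (1003520 is the value these examples previously hard-coded.)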
+# 4 * 66GiB, 6.4s/it
+PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+megatron sft \
+    --load GLM-4.5V-mcore \
+    --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#5000' \
+    --train_type lora \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --sequence_parallel true \
+    --freeze_llm false \
+    --freeze_vit true \
+    --freeze_aligner true \
+    --packing true \
+    --split_dataset_ratio 0.01 \
+    --tensor_model_parallel_size 4 \
+    --expert_tensor_parallel_size 1 \
+    --expert_model_parallel_size 4 \
+    --moe_permute_fusion true \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-3 \
+    --micro_batch_size 1 \
+    --global_batch_size 2 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --lr 1e-4 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-5 \
+    --max_epochs 1 \
+    --save megatron_output/GLM-4.5V-mcore \
+    --eval_interval 200 \
+    --save_interval 200 \
+    --vit_gradient_checkpointing true \
+    --max_length 2048 \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --attention_backend flash
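The new script loads `GLM-4.5V-mcore`, so an HF-to-mcore conversion is assumed to have been run beforehand; that step is not part of this diff. A sketch of the prerequisite, following the `swift export --to_mcore` pattern of the other Megatron examples (the `ZhipuAI/GLM-4.5V` model ID and the dtype are assumptions):

```shell
# Assumed prerequisite (not in this PR): convert the HF checkpoint to Megatron
# (mcore) format so that `--load GLM-4.5V-mcore` above resolves.
swift export \
    --model ZhipuAI/GLM-4.5V \
    --to_mcore true \
    --torch_dtype bfloat16 \
    --output_dir GLM-4.5V-mcore
```

On the 4-GPU layout, `--tensor_model_parallel_size 4` shards attention and dense layers 4-way, while `--expert_model_parallel_size 4` with `--expert_tensor_parallel_size 1` spreads whole experts across the same ranks, which is presumably what keeps usage near the quoted 66GiB per card.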
# 2 * 43GiB, 8s/it
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
NPROC_PER_NODE=2 \
-MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1 \
megatron sft \
    --load InternVL3_5-30B-A3B-mcore \
@@ -18,6 +18,9 @@
            Model('moonshotai/Kimi-K2-Base', 'moonshotai/Kimi-K2-Base'),
            Model('moonshotai/Kimi-K2-Instruct', 'moonshotai/Kimi-K2-Instruct'),
        ]),
+        ModelGroup([
+            Model('moonshotai/Kimi-K2-Instruct-0905', 'moonshotai/Kimi-K2-Instruct-0905'),
+        ]),
    ],
    TemplateType.moonlight,
    get_model_tokenizer_with_flash_attn,
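For context, the hunk above sits inside ms-swift's model registry. A minimal reconstruction of the enclosing registration is sketched below; the import paths, the `'moonlight'` model-type string, and the `requires` pin are assumptions inferred from the hunk and the docs table, not code taken from this PR:

```python
# Hedged sketch of the enclosing registration (reconstructed, not the PR's file).
# Import locations and ModelMeta's exact signature are assumptions.
from swift.llm import (Model, ModelGroup, ModelMeta, TemplateType,
                       get_model_tokenizer_with_flash_attn, register_model)

register_model(
    ModelMeta(
        'moonlight',  # model type, matching the "moonlight" column in the docs table
        [
            ModelGroup([
                Model('moonshotai/Kimi-K2-Instruct-0905', 'moonshotai/Kimi-K2-Instruct-0905'),
            ]),
        ],
        TemplateType.moonlight,
        get_model_tokenizer_with_flash_attn,
        requires=['transformers<4.49'],  # matches the requires column above
    ))
```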