
Commit 96c6492

[megatron] update to mcore 0.13 (#4903)

1 parent 7fa95c6

File tree: 7 files changed, 18 insertions(+), 6 deletions(-)

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 0 deletions

@@ -727,3 +727,4 @@ In addition to the model-specific parameters of qwen2_5_vl and qwen2_audio, qwen2_5_omni also
 - NODE_RANK: Pass-through of torchrun's `--node_rank` parameter.
 - LOG_LEVEL: The log level; defaults to 'INFO'. You can set it to 'WARNING', 'ERROR', etc.
 - SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', the contents of input_ids and generate_ids are printed.
+- VLLM_USE_V1: Used to switch between the V0 and V1 versions of vLLM.

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions

@@ -744,3 +744,4 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - NODE_RANK: Pass-through for the `--node_rank` parameter in torchrun.
 - LOG_LEVEL: The log level, default is 'INFO'. You can set it to 'WARNING', 'ERROR', etc.
 - SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', the content of input_ids and generate_ids will be printed.
+- VLLM_USE_V1: Used to switch between V0 and V1 versions of vLLM.
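The variable is read from the environment when vLLM is set up, so it should be exported before the engine is constructed. A minimal usage sketch (the Python-side assignment is illustrative, not part of this commit):

    import os

    # Select the vLLM V0 engine; set to '1' for the V1 engine.
    # Set this before vLLM is imported or an engine is created.
    os.environ['VLLM_USE_V1'] = '0'

Equivalently, it can be prefixed onto the launch command, e.g. `VLLM_USE_V1=0 swift infer ...`.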

examples/train/megatron/dense/72b_offload.sh

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 # 8 * 65GiB. 80s/it
+# use mcore==0.13
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=8 \
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
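Both this script and qwen3_moe_offload.sh below now carry the `# use mcore==0.13` note. To confirm the installed Megatron-Core is new enough, the same check this commit adds to swift/megatron/init.py can be run standalone:

    import megatron.core
    from packaging import version

    # True when the installed Megatron-Core is at least 0.13 (rc builds included).
    print(version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0'))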

examples/train/megatron/moe/deepseek_v3.sh

Lines changed: 3 additions & 2 deletions

@@ -1,4 +1,4 @@
-# 8 * 56GiB
+# 8 * 60GiB; 9s/it
 # For ease of use, we use moonshotai/Moonlight-16B-A3B-Instruct, which is also based on the DeepseekV3ForCausalLM architecture.
 # https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct/file/view/master/config.json?status=1
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \

@@ -8,7 +8,8 @@ megatron sft \
     --load Moonlight-16B-A3B-Instruct-mcore \
     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
     --split_dataset_ratio 0.01 \
-    --tensor_model_parallel_size 2 \
+    --pipeline_model_parallel_size 2 \
+    --decoder_last_pipeline_num_layers 13 \
     --expert_model_parallel_size 4 \
     --moe_grouped_gemm true \
     --moe_shared_expert_overlap true \
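This script now uses pipeline rather than tensor parallelism, with an uneven stage split. Assuming the model has 27 decoder layers (taken from Moonlight-16B-A3B's config.json linked above, not stated in this diff), the arithmetic behind `--decoder_last_pipeline_num_layers 13` works out as in this sketch:

    # Sketch of the stage split implied by the new flags; num_layers is an
    # assumption from Moonlight-16B-A3B's config.json, not from this diff.
    num_layers = 27
    pipeline_model_parallel_size = 2
    decoder_last_pipeline_num_layers = 13

    # The last stage holds the explicitly requested layer count; the rest
    # are divided evenly across the earlier stages.
    remaining = num_layers - decoder_last_pipeline_num_layers
    earlier_stages = pipeline_model_parallel_size - 1
    assert remaining % earlier_stages == 0
    stages = [remaining // earlier_stages] * earlier_stages + [decoder_last_pipeline_num_layers]
    print(stages)  # [14, 13]

An odd layer count cannot be split evenly across two stages, which is why the explicit last-stage count is needed here.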

examples/train/megatron/moe/qwen3_moe_offload.sh

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 # 28s/it; 4 * 75GiB
+# use mcore==0.13
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=4 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \

examples/train/megatron/rlhf/dpo/moe.sh

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-# 8 * 64GiB
+# 8 * 62GiB; 0.9s/it
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=8 \
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \

@@ -7,7 +7,7 @@ megatron rlhf \
     --load Qwen1.5-MoE-A2.7B-mcore \
     --dataset 'hjh0119/shareAI-Llama3-DPO-zh-en-emoji#20000' \
     --split_dataset_ratio 0.01 \
-    --tensor_model_parallel_size 2 \
+    --pipeline_model_parallel_size 2 \
     --expert_model_parallel_size 4 \
     --moe_grouped_gemm true \
     --moe_shared_expert_overlap true \

swift/megatron/init.py

Lines changed: 9 additions & 2 deletions

@@ -3,8 +3,10 @@
 import sys
 from datetime import datetime
 
+import megatron.core
 import torch
 import torch.nn.functional as F
+from packaging import version
 
 from swift.llm import git_clone_github
 from swift.utils import get_logger, is_megatron_available, safe_ddp_context, subprocess_run

@@ -334,8 +336,13 @@ def forward(
         # Adjust key, value for inference
         # ===================================================
         # rotary_pos_emb = None
-        query, key, value, _, attn_mask_type = self._adjust_key_value_for_inference(
-            inference_context, query, key, value, rotary_pos_emb=None)
+        megatron_core_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
+        if megatron_core_013:
+            query, key, value, _, attn_mask_type, _ = self._adjust_key_value_for_inference(
+                inference_context, query, key, value, rotary_pos_emb=None)
+        else:
+            query, key, value, _, attn_mask_type = self._adjust_key_value_for_inference(
+                inference_context, query, key, value, rotary_pos_emb=None)
 
         # TODO: Currently, TE can only accept contiguous tensors for MLA
         query = query.contiguous()
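The gate is needed because mcore 0.13 appends a sixth element to the tuple returned by `_adjust_key_value_for_inference`. The same pattern, factored into a standalone helper for illustration (the helper name is hypothetical, not from this commit):

    import megatron.core
    from packaging import version

    # mcore >= 0.13 returns one extra trailing value; decide once at import time.
    MCORE_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')

    def unpack_adjusted(result):
        # Normalize the 5-tuple (mcore < 0.13) or 6-tuple (mcore >= 0.13).
        if MCORE_013:
            query, key, value, _, attn_mask_type, _ = result
        else:
            query, key, value, _, attn_mask_type = result
        return query, key, value, attn_mask_type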
