
Commit 96c6492

[megatron] update to mcore 0.13 (#4903)

1 parent 7fa95c6

File tree: 7 files changed, 18 insertions(+), 6 deletions(-)

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 0 deletions

@@ -727,3 +727,4 @@ In addition to the model-specific parameters of qwen2_5_vl and qwen2_audio, qwen2_5_omni also
 - NODE_RANK: Pass-through of torchrun's `--node_rank` parameter.
 - LOG_LEVEL: The log level; defaults to 'INFO'. You can set it to 'WARNING', 'ERROR', etc.
 - SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', the contents of input_ids and generate_ids are printed.
+- VLLM_USE_V1: Used to switch between the V0 and V1 versions of vLLM.

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 0 deletions

@@ -744,3 +744,4 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - NODE_RANK: Pass-through for the `--node_rank` parameter in torchrun.
 - LOG_LEVEL: The log level, default is 'INFO'. You can set it to 'WARNING', 'ERROR', etc.
 - SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', the content of input_ids and generate_ids will be printed.
+- VLLM_USE_V1: Used to switch between V0 and V1 versions of vLLM.
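The variable is read from the environment when vLLM is set up, so it should be exported before the engine is constructed. A minimal usage sketch (the Python-side assignment is illustrative, not part of this commit):

    import os

    # Select the vLLM V0 engine; set to '1' for the V1 engine.
    # Set this before vLLM is imported or an engine is created.
    os.environ['VLLM_USE_V1'] = '0'

Equivalently, it can be prefixed onto the launch command, e.g. `VLLM_USE_V1=0 swift infer ...`.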

examples/train/megatron/dense/72b_offload.sh

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 # 8 * 65GiB. 80s/it
+# use mcore==0.13
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=8 \
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
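Both this script and qwen3_moe_offload.sh below now carry the `# use mcore==0.13` note. To confirm the installed Megatron-Core is new enough, the same check this commit adds to swift/megatron/init.py can be run standalone:

    import megatron.core
    from packaging import version

    # True when the installed Megatron-Core is at least 0.13 (rc builds included).
    print(version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0'))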

examples/train/megatron/moe/deepseek_v3.sh

Lines changed: 3 additions & 2 deletions

@@ -1,4 +1,4 @@
-# 8 * 56GiB
+# 8 * 60GiB; 9s/it
 # For ease of use, we use moonshotai/Moonlight-16B-A3B-Instruct, which is also based on the DeepseekV3ForCausalLM architecture.
 # https://modelscope.cn/models/moonshotai/Moonlight-16B-A3B-Instruct/file/view/master/config.json?status=1
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \

@@ -8,7 +8,8 @@ megatron sft \
     --load Moonlight-16B-A3B-Instruct-mcore \
     --dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
     --split_dataset_ratio 0.01 \
-    --tensor_model_parallel_size 2 \
+    --pipeline_model_parallel_size 2 \
+    --decoder_last_pipeline_num_layers 13 \
     --expert_model_parallel_size 4 \
     --moe_grouped_gemm true \
     --moe_shared_expert_overlap true \
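This script now uses pipeline rather than tensor parallelism, with an uneven stage split. Assuming the model has 27 decoder layers (taken from Moonlight-16B-A3B's config.json linked above, not stated in this diff), the arithmetic behind `--decoder_last_pipeline_num_layers 13` works out as in this sketch:

    # Sketch of the stage split implied by the new flags; num_layers is an
    # assumption from Moonlight-16B-A3B's config.json, not from this diff.
    num_layers = 27
    pipeline_model_parallel_size = 2
    decoder_last_pipeline_num_layers = 13

    # The last stage holds the explicitly requested layer count; the rest
    # are divided evenly across the earlier stages.
    remaining = num_layers - decoder_last_pipeline_num_layers
    earlier_stages = pipeline_model_parallel_size - 1
    assert remaining % earlier_stages == 0
    stages = [remaining // earlier_stages] * earlier_stages + [decoder_last_pipeline_num_layers]
    print(stages)  # [14, 13]

An odd layer count cannot be split evenly across two stages, which is why the explicit last-stage count is needed here.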

examples/train/megatron/moe/qwen3_moe_offload.sh

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 # 28s/it; 4 * 75GiB
+# use mcore==0.13
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=4 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \

examples/train/megatron/rlhf/dpo/moe.sh

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-# 8 * 64GiB
+# 8 * 62GiB; 0.9s/it
 PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
 NPROC_PER_NODE=8 \
 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \

@@ -7,7 +7,7 @@ megatron rlhf \
     --load Qwen1.5-MoE-A2.7B-mcore \
     --dataset 'hjh0119/shareAI-Llama3-DPO-zh-en-emoji#20000' \
     --split_dataset_ratio 0.01 \
-    --tensor_model_parallel_size 2 \
+    --pipeline_model_parallel_size 2 \
     --expert_model_parallel_size 4 \
     --moe_grouped_gemm true \
     --moe_shared_expert_overlap true \

swift/megatron/init.py

Lines changed: 9 additions & 2 deletions

@@ -3,8 +3,10 @@
 import sys
 from datetime import datetime
 
+import megatron.core
 import torch
 import torch.nn.functional as F
+from packaging import version
 
 from swift.llm import git_clone_github
 from swift.utils import get_logger, is_megatron_available, safe_ddp_context, subprocess_run

@@ -334,8 +336,13 @@ def forward(
         # Adjust key, value for inference
         # ===================================================
         # rotary_pos_emb = None
-        query, key, value, _, attn_mask_type = self._adjust_key_value_for_inference(
-            inference_context, query, key, value, rotary_pos_emb=None)
+        megatron_core_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
+        if megatron_core_013:
+            query, key, value, _, attn_mask_type, _ = self._adjust_key_value_for_inference(
+                inference_context, query, key, value, rotary_pos_emb=None)
+        else:
+            query, key, value, _, attn_mask_type = self._adjust_key_value_for_inference(
+                inference_context, query, key, value, rotary_pos_emb=None)
 
         # TODO: Currently, TE can only accept contiguous tensors for MLA
         query = query.contiguous()
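The gate is needed because mcore 0.13 appends a sixth element to the tuple returned by `_adjust_key_value_for_inference`. The same pattern, factored into a standalone helper for illustration (the helper name is hypothetical, not from this commit):

    import megatron.core
    from packaging import version

    # mcore >= 0.13 returns one extra trailing value; decide once at import time.
    MCORE_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')

    def unpack_adjusted(result):
        # Normalize the 5-tuple (mcore < 0.13) or 6-tuple (mcore >= 0.13).
        if MCORE_013:
            query, key, value, _, attn_mask_type, _ = result
        else:
            query, key, value, _, attn_mask_type = result
        return query, key, value, attn_mask_type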
