Skip to content

Commit 5712d6a

Browse files
authored
update megatron shell (#4773)
1 parent 382be6a commit 5712d6a

File tree

18 files changed: +65 -24 lines changed

docs/source/BestPractices/Qwen3最佳实践.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ megatron sft \
354354
--finetune true \
355355
--cross_entropy_loss_fusion true \
356356
--lr 1e-5 \
357-
--lr_warmup_iters 100 \
357+
--lr_warmup_fraction 0.05 \
358358
--min_lr 1e-6 \
359359
--save megatron_output/Qwen3-30B-A3B-Base \
360360
--eval_interval 200 \

docs/source/Instruction/Megatron-SWIFT训练.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ megatron sft \
6969
--finetune true \
7070
--cross_entropy_loss_fusion true \
7171
--lr 1e-5 \
72-
--lr_warmup_iters 10 \
72+
--lr_warmup_fraction 0.05 \
7373
--min_lr 1e-6 \
7474
--max_epochs 1 \
7575
--save megatron_output/Qwen2.5-7B-Instruct \

docs/source_en/BestPractices/Qwen3-Best-Practice.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ megatron sft \
358358
--finetune true \
359359
--cross_entropy_loss_fusion true \
360360
--lr 1e-5 \
361-
--lr_warmup_iters 100 \
361+
--lr_warmup_fraction 0.05 \
362362
--min_lr 1e-6 \
363363
--save megatron_output/Qwen3-30B-A3B-Base \
364364
--eval_interval 200 \

docs/source_en/Instruction/Megatron-SWIFT-Training.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ megatron sft \
7070
--finetune true \
7171
--cross_entropy_loss_fusion true \
7272
--lr 1e-5 \
73-
--lr_warmup_iters 10 \
73+
--lr_warmup_fraction 0.05 \
7474
--min_lr 1e-6 \
7575
--max_epochs 1 \
7676
--save megatron_output/Qwen2.5-7B-Instruct \

examples/train/megatron/base_to_chat.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ megatron sft \
1414
--finetune true \
1515
--cross_entropy_loss_fusion true \
1616
--lr 1e-5 \
17-
--lr_warmup_iters 100 \
17+
--lr_warmup_fraction 0.05 \
1818
--min_lr 1e-6 \
1919
--save megatron_output/Qwen2.5-14B \
2020
--eval_interval 200 \

examples/train/megatron/dense/72b.sh renamed to examples/train/megatron/dense/72b_offload.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ megatron sft \
1818
--finetune true \
1919
--cross_entropy_loss_fusion true \
2020
--lr 1e-5 \
21-
--lr_warmup_iters 100 \
21+
--lr_warmup_fraction 0.05 \
2222
--min_lr 1e-6 \
2323
--save megatron_output/Qwen2.5-72B-Instruct \
2424
--eval_interval 500 \

examples/train/megatron/dense/qwen3_32b.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ megatron sft \
1818
--finetune true \
1919
--cross_entropy_loss_fusion true \
2020
--lr 1e-5 \
21-
--lr_warmup_iters 100 \
21+
--lr_warmup_fraction 0.05 \
2222
--min_lr 1e-6 \
2323
--save megatron_output/Qwen3-32B \
2424
--eval_interval 500 \

examples/train/megatron/long_text.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ megatron sft \
1919
--finetune true \
2020
--cross_entropy_loss_fusion true \
2121
--lr 1e-5 \
22-
--lr_warmup_iters 100 \
22+
--lr_warmup_fraction 0.05 \
2323
--min_lr 1e-6 \
2424
--save megatron_output/Qwen2.5-7B \
2525
--eval_interval 200 \

examples/train/megatron/moe/qwen3_moe.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,11 @@ megatron sft \
1818
--recompute_granularity full \
1919
--recompute_method uniform \
2020
--recompute_num_layers 1 \
21-
--train_iters 2000 \
22-
--eval_iters 50 \
21+
--max_epochs 3 \
2322
--finetune true \
2423
--cross_entropy_loss_fusion true \
2524
--lr 1e-5 \
26-
--lr_warmup_iters 100 \
25+
--lr_warmup_fraction 0.05 \
2726
--min_lr 1e-6 \
2827
--save megatron_output/Qwen3-30B-A3B-Base \
2928
--eval_interval 200 \
(new file — path not captured in this view; separate from the qwen3_moe.sh diff above)

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# 28s/it; 4 * 75GiB
2+
NPROC_PER_NODE=4 \
3+
CUDA_VISIBLE_DEVICES=0,1,2,3 \
4+
megatron sft \
5+
--load Qwen3-30B-A3B-Base-mcore \
6+
--dataset 'liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT' \
7+
--expert_model_parallel_size 4 \
8+
--moe_grouped_gemm true \
9+
--moe_shared_expert_overlap true \
10+
--moe_aux_loss_coeff 0.01 \
11+
--micro_batch_size 1 \
12+
--global_batch_size 16 \
13+
--packing true \
14+
--recompute_granularity full \
15+
--recompute_method uniform \
16+
--recompute_num_layers 1 \
17+
--finetune true \
18+
--cross_entropy_loss_fusion true \
19+
--lr 1e-5 \
20+
--lr_warmup_fraction 0.05 \
21+
--min_lr 1e-6 \
22+
--save megatron_output/Qwen3-30B-A3B-Base \
23+
--eval_interval 200 \
24+
--save_interval 200 \
25+
--max_length 8192 \
26+
--max_epochs 3 \
27+
--num_workers 8 \
28+
--dataset_num_proc 8 \
29+
--no_save_optim true \
30+
--no_save_rng true \
31+
--sequence_parallel true \
32+
--optimizer_cpu_offload true \
33+
--use_precision_aware_optimizer true \
34+
--attention_backend flash

0 commit comments

Comments
 (0)