
Commit 72dc153

add final config (#10933)
1 parent 38751cb commit 72dc153

4 files changed: +104 -37 lines

llm/config/deepseek-v3/pretrain_argument.json

Lines changed: 11 additions & 6 deletions
@@ -4,20 +4,20 @@
     "input_dir": "./data",
     "output_dir": "./checkpoints/pretrain_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 24,
+    "gradient_accumulation_steps": 60,
     "per_device_eval_batch_size": 1,
     "tensor_parallel_degree": 1,
-    "pipeline_parallel_degree": 4,
+    "pipeline_parallel_degree": 8,
     "pipeline_parallel_config": "use_dualpipev",
-    "sharding_parallel_degree": 2,
+    "sharding_parallel_degree": 64,
     "sharding_parallel_config": "split_param enable_fuse_optimizer_states",
     "sharding_comm_buffer_size_MB": 4096,
-    "expert_parallel_degree": 2,
+    "expert_parallel_degree": 64,
     "sharding": "stage1",
     "virtual_pp_degree": 1,
     "sequence_parallel": 0,
     "use_flash_attention": true,
-    "max_seq_length": 4096,
+    "max_seq_length": 4097,
     "learning_rate": 3e-05,
     "min_learning_rate": 3e-06,
     "warmup_steps": 30,
@@ -44,5 +44,10 @@
     "skip_profile_timer": false,
     "use_fused_rms_norm": true,
     "fuse_attention_ffn": true,
-    "use_fused_rope": true
+    "use_fused_rope": true,
+    "save_sharded_model": false,
+    "load_sharded_model": false,
+    "unified_checkpoint": true,
+    "use_expert_parallel": true,
+    "unified_checkpoint_config": "skip_save_model_weight"
 }
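A quick sanity check on the degrees above (an illustration, not part of the commit): in PaddleNLP-style hybrid parallelism, and assuming no extra pure data-parallel dimension, the world size is the product of the tensor-, pipeline-, and sharding-parallel degrees, with expert parallelism laid out across those same cards. A minimal Python sketch under that assumption:

import json

# Minimal sketch: infer the GPU count this config expects, assuming that
# world size = tp * pp * sharding and that expert parallelism reuses those cards.
with open("llm/config/deepseek-v3/pretrain_argument.json") as f:
    cfg = json.load(f)

world = (
    cfg["tensor_parallel_degree"]      # 1
    * cfg["pipeline_parallel_degree"]  # 8
    * cfg["sharding_parallel_degree"]  # 64
)
print(world)                                   # 512 GPUs
print(cfg["expert_parallel_degree"] <= world)  # True: the EP degree of 64 fits inside the world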

llm/model_config/DeepSeek-V3/config.json

Lines changed: 9 additions & 6 deletions
@@ -24,14 +24,14 @@
     "moe_intermediate_size": 2048,
     "moe_layer_freq": 1,
     "n_group": 8,
-    "n_routed_experts": 8,
+    "n_routed_experts": 256,
     "n_shared_experts": 1,
     "norm_topk_prob": true,
     "num_attention_heads": 128,
     "num_experts_per_tok": 8,
-    "num_hidden_layers": 13,
+    "num_hidden_layers": 61,
     "num_key_value_heads": 128,
-    "num_nextn_predict_layers": 0,
+    "num_nextn_predict_layers": 1,
     "pretraining_tp": 1,
     "q_lora_rank": 1536,
     "qk_nope_head_dim": 128,
@@ -64,7 +64,10 @@
     "fuse_attention_ffn": true,
     "use_fused_rope": true,
     "token_drop_steps": 0,
-    "recompute_fwd_gate_up": false,
-    "is_split_group_gemm": true,
-    "use_dualpipev": true
+    "recompute_fwd_gate_up": true,
+    "adaptive_remained_O1_recompute_ratio": 2.0,
+    "using_post_norm_recompute": true,
+    "is_split_group_gemm": false,
+    "use_dualpipev": true,
+    "send_mtp_embed": false
 }
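For orientation on the MoE sizing (again just an illustration): with 256 routed experts and the expert_parallel_degree of 64 set in pretrain_argument.json, an even split places 4 routed experts on each expert-parallel rank, while every token is still dispatched to its top 8 of the 256:

# Back-of-the-envelope MoE layout, assuming experts are split evenly
# across the expert_parallel_degree configured in pretrain_argument.json.
n_routed_experts = 256       # from config.json above
expert_parallel_degree = 64  # from pretrain_argument.json above
num_experts_per_tok = 8      # top-k routing fan-out, from config.json above

local_experts = n_routed_experts // expert_parallel_degree
print(local_experts)         # 4 routed experts hosted per EP rank
print(num_experts_per_tok)   # each token is routed to 8 of the 256 experts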

llm/script/selective_launch.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Selective launch script.

Usage: python script/selective_launch.py <port> <ranks> <ranks> <ranks> ...
"""
import os
import sys


def parse_ranks(ranks_strs):
    """
    Parse the command-line rank arguments into a list of node indices,
    or return None to select every node.
    """
    # NOTE: Returning ranks directly here changes which nodes
    # script/train_gpu.sh and script/kill_process.sh act on, keeping the two in sync.

    # Example 1: Use contiguous nodes [8, 16)
    # return range(8, 16)

    # Example 2: Use non-contiguous nodes [4, 8) + {10} + [30, 32), i.e., [4, 5, 6, 7, 10, 30, 31]
    # return list(range(4, 8)) + [10] + list(range(30, 32))

    # Example 3:
    # Just Python code, return any nodes you want!

    if not ranks_strs:
        return None

    ranks = []
    for r in ranks_strs:
        r = eval(r)  # each argument is a Python expression, e.g. "3" or "range(8, 16)"
        if isinstance(r, int):
            ranks.append(r)
        else:
            ranks.extend(r)
    return ranks


def main(port, ranks):
    """
    Print the --master/--rank/--nnodes arguments for paddle.distributed.launch
    when the current node is selected; print nothing otherwise.
    """
    ips = [ip.strip() for ip in os.getenv("TRAINER_INSTANCES").split(",") if ip.strip()]
    if ranks is None:
        ranks = list(range(len(ips)))
    ranks = sorted(list(set(ranks)))
    my_rank = int(os.getenv("POD_INDEX", "0"))
    if my_rank not in ranks:
        return

    rank = ranks.index(my_rank)
    nranks = len(ranks)

    master = ips[ranks[0]]
    print(f"--master {master}:{port} --rank {rank} --nnodes {nranks}")


if __name__ == "__main__":
    main(int(sys.argv[1]), parse_ranks(sys.argv[2:]))
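A minimal sketch of how the script behaves, assuming the launcher environment provides TRAINER_INSTANCES and POD_INDEX as the code expects and that the script directory is importable; the IP addresses and index are made up for illustration:

import os

# Hypothetical environment, as the cluster launcher would set it.
os.environ["TRAINER_INSTANCES"] = "10.0.0.1,10.0.0.2,10.0.0.3,10.0.0.4"
os.environ["POD_INDEX"] = "2"

from selective_launch import main, parse_ranks

ranks = parse_ranks(["range(1, 4)"])  # select nodes 1, 2 and 3
main(36677, ranks)
# On this node (POD_INDEX=2, second of the three selected nodes) this prints:
#   --master 10.0.0.2:36677 --rank 1 --nnodes 3
# On node 0 it prints nothing, so train_gpu.sh exits early there.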

llm/script/train_gpu.sh

Lines changed: 11 additions & 25 deletions
@@ -35,46 +35,32 @@ export NVSHMEM_IB_TRAFFIC_CLASS=162
 #export NVSHMEM_IB_ENABLE_IBGDA=true
 ##export NVSHMEM_DISABLE_P2P=1
 export NVSHMEM_BOOTSTRAP=UID
-export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME==eth0

-export FLAGS_cudnn_deterministic=1
-export FLAGS_embedding_deterministic=1
+unset NVSHMEM_HCA_LIST
+unset NVSHMEM_ENABLE_NIC_PE_MAPPING

-# Use nodes in the range [START_RANK, END_RANK)
-START_RANK=0
-END_RANK=1
-
-if [[ $rank -lt $START_RANK ]]; then
-    exit 0
-fi
-
-if [[ $rank -ge $END_RANK ]]; then
+LAUNCH_CMD=`python script/selective_launch.py 36677`
+if [[ -z "$LAUNCH_CMD" ]]; then
     exit 0
 fi

-rank=$(($rank-$START_RANK))
-nnodes=$(($END_RANK-$START_RANK))
-
-master=`hostname -i`
-port=36679
 export PYTHONPATH=../:$PYTHONPATH
-export PATH=/opt/nvidia/nsight-systems/2025.1.1/bin/:$PATH
+export CUDA_PATH=/usr/local/cuda-12.9

 export DSV3_USE_FP8_GEMM=true
 export DSV3_USE_ATTEN_RECOMPUTE=true
-# export FA_VERSION=3
-export CUDA_PATH=/usr/local/cuda-12.9
+export FA_VERSION=3
 export FLAGS_share_tensor_for_grad_tensor_holder=1
-export DSV3_USE_FP8_DISPATCH=False
+export FLAGS_use_default_stream=false
+export DSV3_USE_FP8_DISPATCH=true
+export USE_DS_GEMM=false
+

 bash script/kill_process.sh

-# /opt/nvidia/nsight-compute/2025.2.0/host/target-linux-x64/nsys profile --stats=true -t cuda,nvtx -o fp8_overlap_quant --force-overwrite true \
 python3.10 -m paddle.distributed.launch \
     --log_dir output/paddle_distributed_logs \
-    --master $master:$port \
-    --nnodes $nnodes \
-    --rank $rank \
+    $LAUNCH_CMD \
     --run_mode=collective \
     ${script:-run_pretrain.py} \
     $@
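As a rough illustration of the change (not part of the commit): the hard-coded [START_RANK, END_RANK) window removed above can be reproduced by passing a range expression to selective_launch.py, since train_gpu.sh simply exits when the script prints nothing. This assumes TRAINER_INSTANCES and POD_INDEX are set as in the previous sketch, and the window values below are the old defaults rather than anything this commit configures:

import subprocess

# Illustration only: reproduce the removed [START_RANK, END_RANK) shell logic.
START_RANK, END_RANK = 0, 1  # the defaults that used to live in train_gpu.sh
cmd = ["python", "script/selective_launch.py", "36677", f"range({START_RANK}, {END_RANK})"]
out = subprocess.run(cmd, capture_output=True, text=True).stdout.strip()
# Inside the window this yields something like
#   --master <first-selected-ip>:36677 --rank 0 --nnodes 1
# and an empty string elsewhere, which makes train_gpu.sh exit 0 on unselected nodes.
print(out or "(not selected)")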
