Commit 17089c8

[Test Config] Add default test config for N1C8 dsv3 (#10898)
* feat(config): add default N1C8 dsv3 config
1 parent ac58fa4 commit 17089c8

4 files changed (+38 additions, -18 deletions)

llm/config/deepseek-v3/pretrain_argument.json

Lines changed: 9 additions & 8 deletions
@@ -1,22 +1,23 @@
 {
-    "model_name_or_path": "./model_config/DeepSeek-V3-test",
+    "model_name_or_path": "./model_config/DeepSeek-V3",
     "tokenizer_name_or_path": "deepseek-ai/DeepSeek-V3",
     "input_dir": "./data",
     "output_dir": "./checkpoints/pretrain_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 120,
+    "gradient_accumulation_steps": 24,
     "per_device_eval_batch_size": 1,
     "tensor_parallel_degree": 1,
-    "pipeline_parallel_degree": 1,
-    "sharding_parallel_degree": 64,
+    "pipeline_parallel_degree": 4,
+    "pipeline_parallel_config": "use_dualpipev",
+    "sharding_parallel_degree": 2,
     "sharding_parallel_config": "split_param enable_fuse_optimizer_states",
-    "sharding_comm_buffer_size_MB": 2048,
-    "expert_parallel_degree": 64,
+    "sharding_comm_buffer_size_MB": 4096,
+    "expert_parallel_degree": 2,
     "sharding": "stage1",
     "virtual_pp_degree": 1,
     "sequence_parallel": 0,
     "use_flash_attention": true,
-    "max_seq_length": 4097,
+    "max_seq_length": 4096,
     "learning_rate": 3e-05,
     "min_learning_rate": 3e-06,
     "warmup_steps": 30,
@@ -44,4 +45,4 @@
     "use_fused_rms_norm": true,
     "fuse_attention_ffn": true,
     "use_fused_rope": true
-}
+}
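
The new degrees target a single node with 8 GPUs (N1C8): 4-way pipeline parallelism with dualpipev and 2-way sharding replace the old 64-way sharding/expert layout, and expert parallelism drops to 2. Below is a minimal sketch (not part of the commit) that checks the degree product against the 8-GPU node; it assumes it is run from the llm/ directory, a data-parallel degree of 1, and expert parallelism mapped onto the sharding/data-parallel ranks rather than adding a separate dimension.

import json

# Sanity-check the N1C8 layout: tp x pp x sharding should cover all 8 GPUs
# (assumption: dp = 1 and expert parallelism reuses the sharding/dp groups).
with open("config/deepseek-v3/pretrain_argument.json") as f:
    cfg = json.load(f)

tp = cfg["tensor_parallel_degree"]          # 1
pp = cfg["pipeline_parallel_degree"]        # 4
sharding = cfg["sharding_parallel_degree"]  # 2

world_size = tp * pp * sharding
print(f"tp={tp} x pp={pp} x sharding={sharding} = {world_size} GPUs")
assert world_size == 8, "degrees do not match an N1C8 (8-GPU) layout"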

llm/model_config/DeepSeek-V3/config.json

Lines changed: 7 additions & 4 deletions
@@ -24,14 +24,14 @@
     "moe_intermediate_size": 2048,
     "moe_layer_freq": 1,
     "n_group": 8,
-    "n_routed_experts": 256,
+    "n_routed_experts": 8,
     "n_shared_experts": 1,
     "norm_topk_prob": true,
     "num_attention_heads": 128,
     "num_experts_per_tok": 8,
-    "num_hidden_layers": 61,
+    "num_hidden_layers": 13,
     "num_key_value_heads": 128,
-    "num_nextn_predict_layers": 1,
+    "num_nextn_predict_layers": 0,
     "pretraining_tp": 1,
     "q_lora_rank": 1536,
     "qk_nope_head_dim": 128,
@@ -63,5 +63,8 @@
     "use_fused_rms_norm": true,
     "fuse_attention_ffn": true,
     "use_fused_rope": true,
-    "token_drop_steps": 0
+    "token_drop_steps": 0,
+    "recompute_fwd_gate_up": false,
+    "is_split_group_gemm": true,
+    "use_dualpipev": true
 }
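
The model config is shrunk from the full DeepSeek-V3 architecture (61 hidden layers, 256 routed experts, 1 next-token-prediction layer) to a small test variant (13 layers, 8 routed experts, no MTP layer) so the pretrain test fits on one node, and it enables the dualpipev/grouped-GEMM options used by the new pipeline config. A minimal sketch (not part of the commit, assuming it is run from the llm/ directory) that prints the trimmed values next to the full-model values they replace:

import json

# Compare the trimmed test architecture with the full DeepSeek-V3 values
# removed by this commit (61 layers, 256 routed experts, 1 MTP layer).
with open("model_config/DeepSeek-V3/config.json") as f:
    cfg = json.load(f)

for key, full_value in [
    ("num_hidden_layers", 61),
    ("n_routed_experts", 256),
    ("num_nextn_predict_layers", 1),
]:
    print(f"{key}: test={cfg[key]}  full={full_value}")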

llm/run.sh

Lines changed: 3 additions & 3 deletions
@@ -17,7 +17,7 @@
 # wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
 # wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx
 
-mpirun sh script/kill_process.sh
-mpirun rm -rf output
-nohup mpirun sh script/train_gpu.sh config/deepseek-v3/pretrain_argument.json &
+# mpirun sh script/kill_process.sh
+# mpirun rm -rf output
+nohup bash script/train_gpu.sh config/deepseek-v3/pretrain_argument.json &
 
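
run.sh now skips the mpirun-based cleanup (a single-node test run no longer needs it) and launches script/train_gpu.sh directly in the background with nohup. A small sketch (not part of the commit) for checking on such a run; it assumes the per-rank log name workerlog.<rank> that paddle.distributed.launch conventionally writes under the --log_dir set in train_gpu.sh:

from pathlib import Path

# Print the tail of the first worker's log, if the launcher has created it.
log = Path("output/paddle_distributed_logs/workerlog.0")
if log.exists():
    print("".join(log.read_text().splitlines(keepends=True)[-20:]))
else:
    print(f"{log} not found yet; the launcher may still be starting")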

llm/script/train_gpu.sh

Lines changed: 19 additions & 3 deletions
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,10 +35,14 @@ export NVSHMEM_IB_TRAFFIC_CLASS=162
 #export NVSHMEM_IB_ENABLE_IBGDA=true
 ##export NVSHMEM_DISABLE_P2P=1
 export NVSHMEM_BOOTSTRAP=UID
-export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME==xgbe0
+export NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME==eth0
+
+export FLAGS_cudnn_deterministic=1
+export FLAGS_embedding_deterministic=1
 
+# Use nodes in the range [START_RANK, END_RANK)
 START_RANK=0
-END_RANK=8
+END_RANK=1
 
 if [[ $rank -lt $START_RANK ]]; then
     exit 0
@@ -49,11 +55,21 @@ fi
 rank=$(($rank-$START_RANK))
 nnodes=$(($END_RANK-$START_RANK))
 
-master=`cat /root/paddlejob/workspace/hostfile | head -n 1 | awk '{print $1}'`
+master=`hostname -i`
 port=36679
 export PYTHONPATH=../:$PYTHONPATH
 export PATH=/opt/nvidia/nsight-systems/2025.1.1/bin/:$PATH
 
+export DSV3_USE_FP8_GEMM=true
+export DSV3_USE_ATTEN_RECOMPUTE=true
+# export FA_VERSION=3
+export CUDA_PATH=/usr/local/cuda-12.9
+export FLAGS_share_tensor_for_grad_tensor_holder=1
+export DSV3_USE_FP8_DISPATCH=False
+
+bash script/kill_process.sh
+
+# /opt/nvidia/nsight-compute/2025.2.0/host/target-linux-x64/nsys profile --stats=true -t cuda,nvtx -o fp8_overlap_quant --force-overwrite true \
 python3.10 -m paddle.distributed.launch \
     --log_dir output/paddle_distributed_logs \
     --master $master:$port \
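
Beyond the networking fixes (eth0 instead of xgbe0, and the master address resolved via hostname -i instead of a hostfile), the script now pins determinism flags and the DeepSeek-V3 FP8/recompute toggles through environment variables before starting paddle.distributed.launch. A minimal sketch (not part of the commit) that a training run could use to confirm those variables actually reached the Python process:

import os

# Print the toggles exported by script/train_gpu.sh; unset ones show "<unset>".
for name in (
    "FLAGS_cudnn_deterministic",
    "FLAGS_embedding_deterministic",
    "DSV3_USE_FP8_GEMM",
    "DSV3_USE_ATTEN_RECOMPUTE",
    "DSV3_USE_FP8_DISPATCH",
    "FLAGS_share_tensor_for_grad_tensor_holder",
):
    print(f"{name}={os.environ.get(name, '<unset>')}")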
