Skip to content

Commit dc1b823

Browse files
authored
add scripts (#2398)
1 parent d911bf2 commit dc1b823

File tree

8 files changed

+236
-0
lines changed

8 files changed

+236
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
# Benchmark launcher: gpt3_moe, fp16, single card (N1C1), MoE data parallel.
model_item=gpt3_moe
dp_degree=1
bs_item=8
fp_item=fp16
run_mode=DP_MoE_C1
device_num=N1C1

model=gpt

# get data
cd ./tests || exit 1
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${bs_item} ${run_mode} ${device_num} 2>&1
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Benchmark launcher: gpt3_moe, fp32, single card (N1C1), MoE data parallel.
model_item=gpt3_moe
fp_item=fp32
dp_degree=1
bs_item=8
run_mode=DP_MoE_C1
device_num=N1C1

model=gpt

# get data
cd ./tests || exit 1
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${bs_item} ${run_mode} ${device_num} 2>&1
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Benchmark launcher: gpt3_moe, fp16, 8 cards on one node (N1C8), MoE data parallel.
model_item=gpt3_moe
dp_degree=8
bs_item=8
fp_item=fp16
run_mode=DP_MoE_C8
device_num=N1C8

model=gpt

# get data
cd ./tests || exit 1
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${bs_item} ${run_mode} ${device_num} 2>&1
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Benchmark launcher: gpt3_moe, fp32, 8 cards on one node (N1C8), MoE data parallel.
model_item=gpt3_moe
dp_degree=8
bs_item=8
fp_item=fp32
run_mode=DP_MoE_C8
device_num=N1C8

model=gpt

# get data
cd ./tests || exit 1
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${bs_item} ${run_mode} ${device_num} 2>&1
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Benchmark launcher: gpt3_moe, fp16, 32 cards across 4 nodes (N4C32), MoE data parallel.
model_item=gpt3_moe
dp_degree=32
bs_item=8
fp_item=fp16
run_mode=DP_MoE_C32
device_num=N4C32

model=gpt

# get data
cd ./tests || exit 1
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${bs_item} ${run_mode} ${device_num} 2>&1
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Benchmark launcher: gpt3_moe, fp32, 32 cards across 4 nodes (N4C32), MoE data parallel.
model_item=gpt3_moe
dp_degree=32
bs_item=8
fp_item=fp32
run_mode=DP_MoE_C32
device_num=N4C32

model=gpt

# get data
cd ./tests || exit 1
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/prepare.sh
# run
bash ./test_tipc/dygraph/moe/${model}/benchmark_common/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${bs_item} ${run_mode} ${device_num} 2>&1
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Environment preparation for the gpt-moe dygraph benchmark:
# pin the data_tools Makefile to python3.7, install dependencies, download data.
cd ../examples/language_model/gpt-3/data_tools/ || exit 1
sed -i "s/python3/python3.7/g" Makefile
cd - || exit 1

python3 -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
unset http_proxy https_proxy
python3 -m pip install -r ../requirements.txt #-i https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install pybind11 regex sentencepiece tqdm visualdl #-i https://mirror.baidu.com/pypi/simple
python3 -m pip install --upgrade paddlenlp
# get data
cd ../examples/language_model/gpt-moe/dygraph/ || exit 1
rm -rf data
mkdir data && cd data || exit 1
wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/train.data.json_ids.npz
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
#!/usr/bin/env bash
# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${local_batch_size} ${run_mode} ${device_num} [${num_experts}] [${max_iter}] [${use_sharding}]
# Populate the global configuration variables from positional args and env.
# Arguments: see the Usage line at the top of this file.
# Globals written: model_item fp_item dp_degree local_batch_size run_mode
#   device_num profiling model_repo speed_unit skip_steps keyword
#   convergence_key num_experts max_iter use_sharding num_workers
#   base_batch_size model_name num_gpu_devices *_log_file OUTPUT_PATH
function _set_params(){
    model_item=${1:-"model_item"}      # (required) model item name
    fp_item=${2:-"fp32"}               # (required) fp32|fp16
    dp_degree=${3:-"1"}                # (required) dp / MoE data-parallel degree
    local_batch_size=${4:-"2"}         # (required) per-card batch size
    run_mode=${5:-"DP"}                # (required) DP_MoE_C1|DP_MoE_C8|DP_MoE_C32|...
    device_num=${6:-"N1C1"}            # (required) N1C1|N1C8|N4C32 (4 nodes, 32 cards)
    profiling=${PROFILING:-"false"}    # (required) profiling switch, passed via env var
    model_repo="PaddleNLP"             # (required) model suite name
    speed_unit="tokens/s"              # (required) unit of the reported speed metric
    skip_steps=0                       # (required) unstable warm-up steps skipped by log parsing
    keyword="ips:"                     # (required) keyword locating speed lines in the log
    convergence_key="loss:"            # (optional) keyword locating convergence lines in the log
    num_experts=${7:-8}                # (optional) experts per card
    max_iter=${8:-500}                 # (optional) keep total run time under ~5 minutes
    use_sharding=${9:-"false"}         # (optional) whether to use ShardingOptimizer
    num_workers=0                      # (optional)
    base_batch_size=$local_batch_size
    # Generic bookkeeping below; normally no changes needed.
    # model_name format must not change — it is aligned with competitor benchmarks.
    model_name=${model_item}_bs${local_batch_size}_${fp_item}_${run_mode}
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})                    # intentional word-splitting: count visible GPUs
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}            # TRAIN_LOG_DIR is set by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # PROFILING_LOG_DIR is set by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}

    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed

    OUTPUT_PATH=${run_log_path}/output
}
37+
38+
# Build the training command for the selected run_mode, launch it with a
# 15-minute timeout, and leave a single parseable log at ${log_file}.
# Globals read: the configuration set by _set_params.
function _train(){
    batch_size=${local_batch_size}  # for multi-card single-process models, compute the multi-card bs here

    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ "${profiling}" = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    # NOTE(review): use_fp16_cmd is assigned but never referenced in this
    # script; possibly consumed by the sourced run_model.sh — confirm before removing.
    if [ "$fp_item" = "fp16" ]; then
        use_fp16_cmd="--use_amp true"
    fi

    use_pure_fp16=False
    if [ "fp16" = "${fp_item}" ]; then use_pure_fp16=True; fi
    train_cmd="${add_options} \
            --model_type gpt \
            --model_name_or_path gpt2-small-en \
            --input_dir ./data \
            --output_dir output \
            --weight_decay 0.01 \
            --grad_clip 2 \
            --max_steps ${max_iter} \
            --save_steps 100000 \
            --decay_steps 320000 \
            --device gpu \
            --eval_freq 100000 \
            --warmup_rate 0.01 \
            --local_batch_size ${local_batch_size} \
            --micro_batch_size ${local_batch_size} \
            --dp_degree ${dp_degree} \
            --mp_degree 1 \
            --pp_degree 1 \
            --expert_mode True \
            --logging_freq 1 \
            --num_experts ${num_experts} \
            --use_pure_fp16 ${use_pure_fp16} \
            --use_recompute False \
            --recompute_partition False \
            --recompute_offload False \
            --scale_loss 32768 \
            --gate gshard \
            --balance_loss_weight 1.0"

    # Generic launch wiring below; normally no changes needed.
    case ${run_mode} in
    DP_MoE_C1) echo "run run_mode: DP_MoE_C1"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0 \
            run_moe_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    DP_MoE_C8) echo "run run_mode: DP_MoE_C8"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 \
            run_moe_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    DP_MoE_C32) echo "run run_mode: DP_MoE_C32"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 \
            run_moe_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "choose run_mode "; exit 1;
    esac

    cd ../examples/language_model/moe/dygraph/ || exit 1
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    python -c "import paddlenlp"
    timeout 15m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    # Multi-card runs write per-worker logs under mylog/; replace the aggregate
    # log with the selected worker's log so the parser sees one clean stream.
    if [ "${device_num}" != "N1C1" ] && [ -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.${workerlog_id} ${log_file}
    fi
}
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH

# run_model.sh parses benchmark-conformant logs with analysis.py; to produce
# only raw training logs locally, comment out the source/_run pair below
# (restore before submitting).
source ${BENCHMARK_ROOT}/scripts/run_model.sh
_set_params "$@"
#_train        # uncomment to produce training logs without parsing
_run           # defined in run_model.sh; calls _train internally

0 commit comments

Comments
 (0)