
Commit 2a5e5a6

add qwen benchmark (PaddlePaddle#7758)
* add qwen benchmark
* update qwen benchmark scripts
* qwen 7b benchmark
* arg change
* fix wrong args
* fix args
* update

1 parent 688adb8 commit 2a5e5a6

File tree

5 files changed: +320, -0 lines changed

llm/fused_layers.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
(the single added line is blank)

llama/fused_layers.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

param="model_name_or_path=qwen/qwen-7b "
param+="per_device_train_batch_size=2 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=2 "
param+="pipeline_parallel_degree=2 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=0 "
param+="sharding_parallel_degree=2 "
param+="sharding=stage1 "
param+="recompute=1 "
param+="recompute_granularity=core_attn "
param+="run_mode=MP2-PP2-mbs2-acc4-recompute "
param+="device_num=N1C8 "
param+="global_batch_size=16 "
param+="model_item=qwen-qwen-7b_seqlen2048_pretrain "
param+="max_steps=150 "
param+="gradient_accumulation_steps=4 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity, "
param+="recompute_use_reentrant=true "

cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
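
Both launch scripts rely on the same shell idiom: the accumulated param string is prepended to the inner command inside bash -c, so every key=value pair becomes an environment variable visible only to run_benchmark.sh, which in turn reads it with a ${var:-default} fallback in _set_params. A minimal sketch of the pattern (demo_benchmark.sh and its single variable are hypothetical, not part of this commit):

# hypothetical stand-in for run_benchmark.sh
cat > demo_benchmark.sh <<'EOF'
#!/usr/bin/env bash
tensor_parallel_degree=${tensor_parallel_degree:-1}   # default used when the caller sets nothing
echo "tensor_parallel_degree=${tensor_parallel_degree}"
EOF

param="tensor_parallel_degree=2 "
param+="sharding=stage1 "
bash -c "${param} bash ./demo_benchmark.sh"   # prints: tensor_parallel_degree=2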

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

param="model_name_or_path=qwen/qwen-7b "
param+="per_device_train_batch_size=2 "
param+="data_parallel_degree=1 "
param+="tensor_parallel_degree=4 "
param+="pipeline_parallel_degree=1 "
param+="virtual_pp_degree=1 "
param+="sequence_parallel=0 "
param+="sharding_parallel_degree=2 "
param+="sharding=stage1 "
param+="recompute=1 "
param+="recompute_granularity=core_attn "
param+="run_mode=MP4-PP2-vpp1-mbs2-acc4-recompute "
param+="device_num=N1C8 "
param+="global_batch_size=16 "
param+="model_item=qwen-qwen-7b_seqlen2048_pretrain "
param+="max_steps=150 "
param+="gradient_accumulation_steps=4 "
param+="pp_recompute_interval=1 "
param+="tensor_parallel_config=enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add "
param+="recompute_use_reentrant=true "

cd ./tests
bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/prepare.sh

bash -c "${param} bash ./test_tipc/dygraph/hybrid_parallelism/qwen/benchmark_common/run_benchmark.sh"
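
As a sanity check on the parameters (an inference from the values above, not stated in the commit): on N1C8 each model replica spans tensor_parallel_degree × pipeline_parallel_degree = 4 GPUs (2×2 in the first script, 4×1 here), leaving 8 / 4 = 2 sharding replicas, so the effective global batch is per_device_train_batch_size (2) × gradient_accumulation_steps (4) × 2 replicas = 16, matching global_batch_size=16 in both scripts. run_mode itself is only a naming label that run_benchmark.sh folds into log and file names.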

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

python -m pip install -r ../requirements.txt

python -m pip install tiktoken

# install fused_ln custom ops
cd ../model_zoo/gpt-3/external_ops/
python setup.py install

# install tool_helpers
cd ../../../llm/qwen
python -m pip install tool_helpers

wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz

mkdir data
mv llama_openwebtext_100k_ids.npy ./data
mv llama_openwebtext_100k_idx.npz ./data
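
A small optional check, not part of the commit, that can be run after the prepare script to confirm the two OpenWebText token files actually landed in ./data before launching the benchmark:

for f in ./data/llama_openwebtext_100k_ids.npy ./data/llama_openwebtext_100k_idx.npz; do
    [ -f "$f" ] || { echo "missing $f; re-run prepare.sh" >&2; exit 1; }
done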

Lines changed: 207 additions & 0 deletions
@@ -0,0 +1,207 @@
#!/usr/bin/env bash

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num}
function _set_params(){
    model_name_or_path=${model_name_or_path:-"qwen/qwen-7b"}
    per_device_train_batch_size=${per_device_train_batch_size:-1}
    tensor_parallel_degree=${tensor_parallel_degree:-1}
    data_parallel_degree=${data_parallel_degree:-1}
    pipeline_parallel_degree=${pipeline_parallel_degree:-4}
    virtual_pp_degree=${virtual_pp_degree:-10}
    sequence_parallel=${sequence_parallel:-0}
    sharding_parallel_degree=${sharding_parallel_degree:-2}
    sharding=${sharding:-"stage1"}
    recompute=${recompute:-1}
    run_mode=${run_mode:-"DP1-MP1-PP4-mbs1-acc8-recompute"}
    device_num=${device_num:-"N1C8"}
    global_batch_size=${global_batch_size:-16}
    model_item=${model_item:-"qwen-qwen-7b_seqlen2048_pretrain"}
    max_steps=${max_steps:-150}
    gradient_accumulation_steps=${gradient_accumulation_steps:-8}
    pp_recompute_interval=${pp_recompute_interval:-1}
    tensor_parallel_config=${tensor_parallel_config:-"enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add"}
    pipeline_parallel_config=${pipeline_parallel_config:-""}
    recompute_use_reentrant=${recompute_use_reentrant:-"true"}
    recompute_granularity=${recompute_granularity:-"full"}

    base_batch_size=${global_batch_size}

    profiling=${PROFILING:-"false"}      # (required) profiling switch; off by default, passed in as a global variable
    model_repo="PaddleNLP"               # (required) name of the model suite
    speed_unit="tokens/s"                # (required) unit of the speed metric
    skip_steps=0                         # (required) when parsing the log, skip the first few steps whose performance is unstable
    keyword="ips:"                       # (required) when parsing the log, keyword that selects the lines containing performance data

    convergence_key="loss:"              # (optional) when parsing the log, keyword that selects the lines containing convergence data, e.g. convergence_key="loss:"

    fp_item="bf16"
    # The commands below are generic; no changes are needed unless your case is special.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}  # (required) do not change this format; it is aligned with the competing framework's naming
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    mkdir -p $(dirname ${train_log_file})

    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    mkdir -p $(dirname ${profiling_log_file})

    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
    mkdir -p $(dirname ${speed_log_file})

    OUTPUT_PATH=${run_log_path}/output
    is_large_model=True
}

function _train(){
    batch_size=${per_device_train_batch_size}  # if the model runs multiple GPUs in a single process, compute the multi-GPU batch size here in _train

    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ ${profiling} = "true" ];then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    if [ $fp_item = "fp16" ]; then
        use_fp16_cmd="--use_amp true"
    fi

    # Disabled because of a hang bug
    # if [ "${tensor_parallel_degree}" != "1" ]; then
    #     export CUDA_DEVICE_MAX_CONNECTIONS=1
    # fi

    if [ "${pipeline_parallel_config}" != "" ]; then
        pipeline_parallel_config_args="--pipeline_parallel_config ${pipeline_parallel_config}"
    else
        pipeline_parallel_config_args=""
    fi

    use_pure_fp16=False
    train_cmd="--model_name_or_path ${model_name_or_path} \
        --tokenizer_name_or_path ${model_name_or_path} \
        --input_dir ./qwen/data \
        --output_dir ./output \
        --split 949,50,1 \
        --max_seq_length 2048 \
        --per_device_train_batch_size ${per_device_train_batch_size} \
        --gradient_accumulation_steps ${gradient_accumulation_steps} \
        --use_flash_attention 1 \
        --use_fused_rms_norm 1 \
        --bf16 \
        --fp16_opt_level O2 \
        --amp_master_grad \
        --tensor_parallel_degree ${tensor_parallel_degree} \
        --pipeline_parallel_degree ${pipeline_parallel_degree} \
        --virtual_pp_degree ${virtual_pp_degree} \
        --pp_recompute_interval ${pp_recompute_interval} \
        --learning_rate 0.00001 \
        --min_learning_rate 0.000001 \
        --max_steps ${max_steps} \
        --save_steps 50000 \
        --weight_decay 0.01 \
        --warmup_ratio 0.01 \
        --max_grad_norm 1.0 \
        --logging_steps 1 \
        --dataloader_num_workers 1 \
        --eval_steps 1001 \
        --sharding ${sharding} \
        --disable_tqdm true \
        --continue_training 0 \
        --do_train \
        --device gpu \
        --use_fused_rope true \
        --enable_linear_fused_grad_add true \
        --fuse_attention_qkv true \
        --fuse_attention_ffn true \
        --tensor_parallel_config ${tensor_parallel_config} ${pipeline_parallel_config_args} \
        --recompute ${recompute} \
        --recompute_use_reentrant ${recompute_use_reentrant} \
        --skip_memory_metrics 0 \
        --data_cache ./data_cache"

    if [ ${PADDLE_TRAINER_ID} ]
    then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi
    # The commands below are generic; no changes are needed unless your case is special.
    case ${device_num} in
    N1C1) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    N1C2) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    N1C4) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    esac
    cd ../llm/
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    python -c "import paddlenlp"
    if [[ ${model_name_or_path} =~ "CE" ]];then  # CE accuracy run: no execution time limit
        ${train_cmd} > ${log_file} 2>&1
    else
        timeout 30m ${train_cmd} > ${log_file} 2>&1
        # echo ${train_cmd}
    fi
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        case_path=$PWD && cd - && mkdir -p mylog  # PaddleNLP/tests/mylog
        cp -r ${case_path}/mylog/workerlog.* ./mylog/
        rm ${log_file}
        cp ${case_path}/mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH

source ${BENCHMARK_ROOT}/scripts/run_model.sh  # this script runs analysis.py to parse performance data from logs that follow the benchmark conventions; if you only want to produce the training log without the full pipeline, comment this line out, but it must be enabled when submitting
_set_params $@
#_train  # uncomment to produce only the training log without parsing
_run  # defined in run_model.sh; it calls _train during execution; if you only want the training log without the full pipeline, comment this line out, but it must be enabled when submitting
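
run_benchmark.sh delegates the actual log parsing to ${BENCHMARK_ROOT}/scripts/run_model.sh and analysis.py; the keyword and convergence_key variables set in _set_params tell that parser which log lines to read. A rough manual equivalent, assuming the trainer prints lines of the form "... loss: 2.31 ... ips: 1234.5 tokens/s ..." (the exact log format is an assumption, not shown in this commit):

# hypothetical manual extraction of the average speed metric from a finished training log
grep "ips:" "${log_file}" \
  | sed -n 's/.*ips: \([0-9.]*\).*/\1/p' \
  | awk -v skip="${skip_steps:-0}" 'NR > skip { s += $1; n++ } END { if (n) printf "avg ips: %.2f tokens/s\n", s / n }'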

0 commit comments
