#!/usr/bin/env bash

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh ${model_name_or_path} ${per_device_train_batch_size} ${tensor_parallel_degree} ${pipeline_parallel_degree} ${virtual_pp_degree} ${sequence_parallel} ${sharding_parallel_degree} ${sharding} ${recompute} ${run_mode} ${device_num}
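# Note: _set_params below reads every parameter from the environment with a default value,
# so a run is usually configured by exporting variables before invoking the script.
# Illustrative example (hypothetical values, not a tuned configuration):
#   export model_item=qwen-qwen-7b_seqlen2048_pretrain per_device_train_batch_size=1
#   export tensor_parallel_degree=1 pipeline_parallel_degree=4 sharding_parallel_degree=2
#   export device_num=N1C8 run_mode=DP1-MP1-PP4-mbs1-acc8-recompute
#   bash benchmark/run_benchmark.sh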
function _set_params(){
    model_name_or_path=${model_name_or_path:-"qwen/qwen-7b"}
    per_device_train_batch_size=${per_device_train_batch_size:-1}
    tensor_parallel_degree=${tensor_parallel_degree:-1}
    data_parallel_degree=${data_parallel_degree:-1}
    pipeline_parallel_degree=${pipeline_parallel_degree:-4}
    virtual_pp_degree=${virtual_pp_degree:-10}
    sequence_parallel=${sequence_parallel:-0}
    sharding_parallel_degree=${sharding_parallel_degree:-2}
    sharding=${sharding:-"stage1"}
    recompute=${recompute:-1}
    run_mode=${run_mode:-"DP1-MP1-PP4-mbs1-acc8-recompute"}
    device_num=${device_num:-"N1C8"}
    global_batch_size=${global_batch_size:-16}
    model_item=${model_item:-"qwen-qwen-7b_seqlen2048_pretrain"}
    max_steps=${max_steps:-150}
    gradient_accumulation_steps=${gradient_accumulation_steps:-8}
    pp_recompute_interval=${pp_recompute_interval:-1}
    tensor_parallel_config=${tensor_parallel_config:-"enable_mp_async_allreduce,enable_mp_skip_c_identity,enable_mp_fused_linear_param_grad_add"}
    pipeline_parallel_config=${pipeline_parallel_config:-""}
    recompute_use_reentrant=${recompute_use_reentrant:-"true"}
    recompute_granularity=${recompute_granularity:-"full"}

    base_batch_size=${global_batch_size}
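    # Consistency check on the defaults above, assuming the usual identity
    # global batch = per-device batch x gradient accumulation steps x data-parallel (sharding) replicas:
    # 1 x 8 x 2 = 16, matching global_batch_size=16.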

    profiling=${PROFILING:-"false"}    # (required) profiling switch, off by default; passed in as a global variable
    model_repo="PaddleNLP"             # (required) name of the model suite
    speed_unit="tokens/s"              # (required) unit of the speed metric
    skip_steps=0                       # (required) number of initial unstable steps to skip when parsing the log
    keyword="ips:"                     # (required) keyword that marks the lines containing speed data in the log

    convergence_key="loss:"            # (optional) keyword that marks the lines containing convergence data in the log, e.g. convergence_key="loss:"

    fp_item="bf16"
    # The commands below are generic; no changes are needed unless there are special requirements.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}    # (required) do not change this format; it keeps the name aligned with competing frameworks
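    # With the defaults above this expands to, for illustration:
    # qwen-qwen-7b_seqlen2048_pretrain_bs16_bf16_DP1-MP1-PP4-mbs1-acc8-recompute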
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
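    # e.g. CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" yields num_gpu_devices=8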
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}              # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}    # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    mkdir -p $(dirname ${train_log_file})

    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    mkdir -p $(dirname ${profiling_log_file})

    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
    mkdir -p $(dirname ${speed_log_file})

    OUTPUT_PATH=${run_log_path}/output
    is_large_model=True
}

function _train(){
    batch_size=${per_device_train_batch_size}    # If the model runs on multiple GPUs in a single process, compute the multi-GPU batch size here in _train

    if [ -d $OUTPUT_PATH ]; then
        rm -rf $OUTPUT_PATH
    fi
    mkdir $OUTPUT_PATH

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ "${profiling}" = "true" ]; then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    if [ "${fp_item}" = "fp16" ]; then
        use_fp16_cmd="--use_amp true"
    fi

    # Disabled due to a hanging bug
    # if [ "${tensor_parallel_degree}" != "1" ]; then
    #     export CUDA_DEVICE_MAX_CONNECTIONS=1
    # fi

    if [ "${pipeline_parallel_config}" != "" ]; then
        pipeline_parallel_config_args="--pipeline_parallel_config ${pipeline_parallel_config}"
    else
        pipeline_parallel_config_args=""
    fi

    use_pure_fp16=False
    train_cmd="--model_name_or_path ${model_name_or_path} \
        --tokenizer_name_or_path ${model_name_or_path} \
        --input_dir ./qwen/data \
        --output_dir ./output \
        --split 949,50,1 \
        --max_seq_length 2048 \
        --per_device_train_batch_size ${per_device_train_batch_size} \
        --gradient_accumulation_steps ${gradient_accumulation_steps} \
        --use_flash_attention 1 \
        --use_fused_rms_norm 1 \
        --bf16 \
        --fp16_opt_level O2 \
        --amp_master_grad \
        --tensor_parallel_degree ${tensor_parallel_degree} \
        --pipeline_parallel_degree ${pipeline_parallel_degree} \
        --virtual_pp_degree ${virtual_pp_degree} \
        --pp_recompute_interval ${pp_recompute_interval} \
        --learning_rate 0.00001 \
        --min_learning_rate 0.000001 \
        --max_steps ${max_steps} \
        --save_steps 50000 \
        --weight_decay 0.01 \
        --warmup_ratio 0.01 \
        --max_grad_norm 1.0 \
        --logging_steps 1 \
        --dataloader_num_workers 1 \
        --eval_steps 1001 \
        --sharding ${sharding} \
        --disable_tqdm true \
        --continue_training 0 \
        --do_train \
        --device gpu \
        --use_fused_rope true \
        --enable_linear_fused_grad_add true \
        --fuse_attention_qkv true \
        --fuse_attention_ffn true \
        --tensor_parallel_config ${tensor_parallel_config} ${pipeline_parallel_config_args} \
        --recompute ${recompute} \
        --recompute_use_reentrant ${recompute_use_reentrant} \
        --skip_memory_metrics 0 \
        --data_cache ./data_cache"

    if [ ${PADDLE_TRAINER_ID} ]
    then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi
    # The commands below are generic; no changes are needed unless there are special requirements.
    case ${device_num} in
    N1C1) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    N1C2) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    N1C4) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            run_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    esac
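    # With the default device_num=N1C8 the fallthrough (*) branch is taken, so the assembled
    # command is roughly:
    #   python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 run_pretrain.py <flags from train_cmd above>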
    cd ../llm/
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    python -c "import paddlenlp"
    if [[ ${model_name_or_path} =~ "CE" ]]; then    # CE accuracy run: no execution time limit
        ${train_cmd} > ${log_file} 2>&1
    else
        timeout 30m ${train_cmd} > ${log_file} 2>&1
        # echo ${train_cmd}
    fi
    if [ $? -ne 0 ]; then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    # kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    if [ ${device_num} != "N1C1" -a -d mylog ]; then
        case_path=$PWD && cd - && mkdir -p mylog    # PaddleNLP/tests/mylog
        cp -r ${case_path}/mylog/workerlog.* ./mylog/
        rm ${log_file}
        cp ${case_path}/mylog/workerlog.${workerlog_id} ${log_file}
    fi
}

export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
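# Assumption: the script is launched from PaddleNLP/tests (see the mylog handling above),
# so dirname "$PWD" puts the repository root, which contains the paddlenlp package, on PYTHONPATH.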

source ${BENCHMARK_ROOT}/scripts/run_model.sh    # run_model.sh uses analysis.py to parse performance data from benchmark-compliant logs; it may be commented out to only produce training logs locally, but must be enabled for submission
_set_params $@
#_train    # Uncomment to only produce the training log without parsing
_run       # _run is defined in run_model.sh and calls _train; it may be commented out to only produce training logs locally, but must be enabled for submission