|
| 1 | +#!/usr/bin/env bash |
| 2 | +# Test training benchmark for a model. |
# Usage: bash benchmark/run_benchmark.sh ${model_item} ${fp_item} ${dp_degree} ${local_batch_size} ${run_mode} ${device_num} ${num_experts} ${max_iter} ${use_sharding}
function _set_params(){
    # Parse positional arguments and framework environment variables into the
    # globals consumed by _train and by the benchmark framework (run_model.sh /
    # analysis.py). Positional args: see the Usage line at the top of the file.
    model_item=${1:-"model_item"}       # (required) model item name
    fp_item=${2:-"fp32"}                # (required) fp32|fp16
    dp_degree=${3:-"1"}                 # (required) dp/MoE data-parallel degree
    local_batch_size=${4:-"2"}          # (required) batch size per card
    run_mode=${5:-"DP"}                 # (required) MP model-parallel|DP data-parallel|PP pipeline|hybrid DP1-MP1-PP1|DP1-MP4-PP1|DP_MoE_C1
    device_num=${6:-"N1C1"}             # (required) cards used: N1C1|N1C8|N4C32 (4 nodes, 32 cards)
    profiling=${PROFILING:-"false"}     # (required) profiling switch, off by default, passed in via global env var
    model_repo="PaddleNLP"              # (required) name of the model suite
    speed_unit="tokens/s"               # (required) unit of the speed metric
    skip_steps=0                        # (required) log parsing: skip the first few unstable steps
    keyword="ips:"                      # (required) log parsing: keyword of the line containing speed data
    convergence_key="loss:"             # (optional) log parsing: keyword of the line containing convergence data, e.g. convergence_key="loss:"
    num_experts=${7:-8}                 # (optional) number of experts per card
    max_iter=${8:-500}                  # (optional) keep model run time under 5 minutes; submit a PR to the suite if early termination needs code changes, or use a max_epoch parameter
    use_sharding=${9:-"false"}          # (optional) whether to use ShardingOptimizer
    num_workers=0                       # (optional)
    base_batch_size=$local_batch_size
    # Generic setup below; normally no need to modify.
    model_name=${model_item}_bs${local_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; aligned with competitor naming
    device=${CUDA_VISIBLE_DEVICES//,/ }   # turn "0,1,2" into "0 1 2" so it can be word-split
    arr=(${device})                       # intentional word-splitting: one array element per GPU id
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}           # (required) TRAIN_LOG_DIR is set as a global by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)} # (required) PROFILING_LOG_DIR is set as a global by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    #
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed

    OUTPUT_PATH=${run_log_path}/output
}
| 37 | + |
function _train(){
    # Build and launch the MoE pre-training command, redirecting its output to
    # the benchmark log file. Relies on the globals set by _set_params.
    batch_size=${local_batch_size}  # per-process bs; if the model runs multi-card single-process, compute the multi-card bs here

    # Recreate the output directory from scratch.
    # FIX: quote "$OUTPUT_PATH" (was unquoted in tests and rm/mkdir) and use
    # mkdir -p so a race with another process does not abort the run.
    if [ -d "$OUTPUT_PATH" ]; then
        rm -rf "$OUTPUT_PATH"
    fi
    mkdir -p "$OUTPUT_PATH"

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    # FIX: quote "${profiling}" so an empty value cannot break the test.
    if [ "${profiling}" = "true" ]; then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    # NOTE(review): the original also set use_fp16_cmd="--use_amp true" here, but
    # nothing in this script ever read it (AMP is driven by --use_pure_fp16
    # below); removed as dead code — confirm run_model.sh does not consume it.
    use_pure_fp16=False
    if [ "${fp_item}" = "fp16" ]; then use_pure_fp16=True; fi
    train_cmd="${add_options} \
            --model_type gpt \
            --model_name_or_path gpt2-small-en \
            --input_dir ./data\
            --output_dir output\
            --weight_decay 0.01\
            --grad_clip 2\
            --max_steps ${max_iter}\
            --save_steps 100000\
            --decay_steps 320000\
            --device gpu\
            --eval_freq 100000\
            --warmup_rate 0.01\
            --local_batch_size ${local_batch_size} \
            --micro_batch_size ${local_batch_size} \
            --dp_degree ${dp_degree}\
            --mp_degree 1\
            --pp_degree 1\
            --expert_mode True\
            --logging_freq 1 \
            --num_experts ${num_experts}\
            --use_pure_fp16 ${use_pure_fp16} \
            --use_recompute False\
            --recompute_partition False\
            --recompute_offload False\
            --scale_loss 32768 \
            --gate gshard \
            --balance_loss_weight 1.0"

    # Generic launch logic below; normally no need to modify.
    # Each mode wraps train_cmd in a paddle.distributed.launch invocation and
    # records which worker's log to keep for parsing.
    case ${run_mode} in
    DP_MoE_C1) echo "run run_mode: DP_MoE_C1"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0 \
            run_moe_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    DP_MoE_C8) echo "run run_mode: DP_MoE_C8"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 \
            run_moe_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    DP_MoE_C32) echo "run run_mode: DP_MoE_C32"
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=0,1,2,3,4,5,6,7 \
            run_moe_pretrain.py ${train_cmd}"
        workerlog_id=0
        ;;
    *) echo "choose run_mode "; exit 1
        ;;
    esac
    # FIX: abort if the script directory is missing instead of launching
    # training from the wrong working directory.
    cd ../examples/language_model/moe/dygraph/ || { echo "cannot cd to moe/dygraph"; exit 1; }
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    python -c "import paddlenlp"   # sanity check that the suite is importable before the long run
    # train_cmd is deliberately unquoted: it is a whitespace-joined command line.
    timeout 15m ${train_cmd} > "${log_file}" 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
    else
        echo -e "${model_name}, SUCCESS"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    # Multi-card runs log per worker under mylog/; replace the launcher log with
    # the chosen worker's log so the parser sees real step timings.
    # FIX: deprecated/ambiguous `-a` inside [ ] replaced by two tests with &&,
    # and all expansions quoted.
    if [ "${device_num}" != "N1C1" ] && [ -d mylog ]; then
        rm "${log_file}"
        cp "mylog/workerlog.${workerlog_id}" "${log_file}"
    fi
}
| 133 | + |
# Make the suite importable by the training script.
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH

# run_model.sh parses benchmark-format logs with analysis.py; if you only want
# the raw training log without parsing, comment this line out locally, but
# re-enable it before submitting.
source "${BENCHMARK_ROOT}/scripts/run_model.sh"
# FIX: "$@" (quoted) so arguments containing spaces are forwarded intact;
# the original `$@` word-splits and re-globs every argument.
_set_params "$@"
#_train        # uncomment to produce only the training log, without parsing
_run           # defined in run_model.sh; it invokes _train. Comment out for log-only debugging; re-enable before submitting.
0 commit comments