set -x
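
# Slurm job and batch-size settings. Every value below can be overridden from
# the environment when launching the script, e.g. (hypothetical invocation,
# adjust the partition and GPU count to your cluster):
#   GPUS=16 PER_DEVICE_BATCH_SIZE=2 PARTITION=my_partition bash <this_script>.sh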
PARTITION=${PARTITION:-"INTERN2"}
GPUS=${GPUS:-32}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
QUOTA_TYPE=${QUOTA_TYPE:-"reserved"}
NODES=$((GPUS / GPUS_PER_NODE))
CPUS_PER_TASK=${CPUS_PER_TASK:-10}
SRUN_ARGS=${SRUN_ARGS:-""}
BATCH_SIZE=${BATCH_SIZE:-128}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}
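# Derive gradient accumulation so the effective global batch size stays at
# BATCH_SIZE: BATCH_SIZE = PER_DEVICE_BATCH_SIZE * GPUS * GRADIENT_ACC
# (with the defaults above: 128 / 1 / 32 = 4).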
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
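
# Make the repository importable (internvl package), pin the port used by the
# distributed launcher, and silence TensorFlow's C++ log output.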
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export MASTER_PORT=34229
export TF_CPP_MIN_LOG_LEVEL=3
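
# Checkpoints, TensorBoard event files, and the console log (tee'd at the end)
# are all written under this directory.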
OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full'

if [ ! -d "$OUTPUT_DIR" ]; then
  mkdir -p "$OUTPUT_DIR"
fi

# number of gpus: 32
# batch size per gpu: 1
# gradient accumulation steps: 4
# total batch size: 128
# epoch: 1
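# One srun task is launched per GPU rank. --quotatype is a non-standard,
# cluster-specific Slurm option; drop or adapt it if your scheduler does not
# support it. The flags below keep the vision backbone frozen
# (--freeze_backbone True) while fully fine-tuning the LLM and MLP projector
# under DeepSpeed ZeRO-3; output is appended to ${OUTPUT_DIR}/training_log.txt
# via `tee -a`.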
srun -p ${PARTITION} \
  --gres=gpu:${GPUS_PER_NODE} \
  --nodes=${NODES} \
  --ntasks=${GPUS} \
  --ntasks-per-node=${GPUS_PER_NODE} \
  --cpus-per-task=${CPUS_PER_TASK} \
  --kill-on-bad-exit=1 \
  --quotatype=${QUOTA_TYPE} \
  ${SRUN_ARGS} \
  python -u internvl/train/internvl_chat_finetune.py \
  --model_name_or_path "./pretrained/InternVL2-Llama3-76B" \
  --conv_style "internlm2-chat" \
  --output_dir ${OUTPUT_DIR} \
  --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \
  --overwrite_output_dir True \
  --force_image_size 448 \
  --max_dynamic_patch 6 \
  --down_sample_ratio 0.5 \
  --drop_path_rate 0.4 \
  --freeze_llm False \
  --freeze_mlp False \
  --freeze_backbone True \
  --vision_select_layer -1 \
  --dataloader_num_workers 4 \
  --bf16 True \
  --num_train_epochs 1 \
  --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \
  --gradient_accumulation_steps ${GRADIENT_ACC} \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 200 \
  --save_total_limit 1 \
  --learning_rate 2e-5 \
  --weight_decay 0.05 \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --max_seq_length 4096 \
  --do_train True \
  --grad_checkpoint True \
  --group_by_length True \
  --dynamic_image_size True \
  --use_thumbnail True \
  --ps_version 'v2' \
  --deepspeed "zero_stage3_config_100b.json" \
  --report_to "tensorboard" \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"