#!/usr/bin/env bash
#
# Launches the "Stage 2" fine-tuning run of llava/train/train_mem.py through
# torchrun, starting from a previous-stage checkpoint.
#
# Required environment variables (supplied by the cluster launcher):
#   NUM_GPUS  value for torchrun --nproc_per_node
#   NNODES    value for torchrun --nnodes
#   RANK      value for torchrun --node_rank
#   ADDR      value for torchrun --master_addr
#   PORT      value for torchrun --master_port
#
# Exit status: that of torchrun (non-zero if training fails), or non-zero if
# a required variable is missing.

# Abort on the first failing command, on unset variables, and on failures
# inside pipelines, so a broken run is never reported as a success.
set -euo pipefail

# Fail early with a clear message instead of handing torchrun empty arguments.
: "${NUM_GPUS:?NUM_GPUS must be set (torchrun --nproc_per_node)}"
: "${NNODES:?NNODES must be set (torchrun --nnodes)}"
: "${RANK:?RANK must be set (torchrun --node_rank)}"
: "${ADDR:?ADDR must be set (torchrun --master_addr)}"
: "${PORT:?PORT must be set (torchrun --master_port)}"

# Runtime settings for OpenMP and NCCL, inherited by the training processes.
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO

LLM_VERSION="Qwen/Qwen2-7B-Instruct"
# for 7b model we recommend bs=1, accum=2, 16 nodes, 128 gpus, lr=1e-5, warmup=0.03
# for 72b model we recommend bs=1, accum=1, 32 nodes, 256 gpus, lr=1e-5, warmup=0.03
# The *_CLEAN variants replace every "/" with "_" so the model ids can be
# embedded in run names and directory names.
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"

############### Pretrain ################

# Derived from the sanitised model ids so it stays in sync when the models
# above are changed; only echoed for reference, not passed to the trainer.
BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"

############### Finetune ################

# Stage 2
PROMPT_VERSION="qwen_1_5"
RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_stage_am9"
# Replace this with your last checkpoint from training on the single-image collection.
PREV_STAGE_CHECKPOINT="/mnt/bn/vl-research/checkpoints/onevision/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mid_to_final_next_3m_am9_july14"
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${RUN_NAME}"

# Arguments for torchrun itself (process topology and rendezvous endpoint).
launcher_args=(
  --nproc_per_node="${NUM_GPUS}"
  --nnodes="${NNODES}"
  --node_rank="${RANK}"
  --master_addr="${ADDR}"
  --master_port="${PORT}"
)

# Arguments for the training entry point. Kept in an array so every value is
# passed as exactly one word regardless of its content.
train_args=(
  --deepspeed scripts/zero3.json
  --model_name_or_path "${PREV_STAGE_CHECKPOINT}"
  --version "${PROMPT_VERSION}"
  --data_path /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/scripts/i18n/scale_llms/next_ov_stage_july21.yaml
  --image_folder /mnt/bn/vl-research/data/llava_data
  --video_folder /mnt/bn/vl-research/data/llava_video
  --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model"
  --mm_vision_tower_lr=2e-6
  --vision_tower "${VISION_MODEL_VERSION}"
  --mm_projector_type mlp2x_gelu
  --mm_vision_select_layer -2
  --mm_use_im_start_end False
  --mm_use_im_patch_token False
  --group_by_modality_length True
  --image_aspect_ratio anyres_max_9
  --image_grid_pinpoints "(1x1),...,(6x6)"
  --mm_patch_merge_type spatial_unpad
  --bf16 True
  --run_name "${RUN_NAME}"
  --output_dir "/mnt/bn/vl-research/checkpoints/onevision/${RUN_NAME}"
  --num_train_epochs 1
  --per_device_train_batch_size 1
  --per_device_eval_batch_size 4
  --gradient_accumulation_steps 2
  --evaluation_strategy "no"
  --save_strategy "steps"
  --save_steps 1000
  --save_total_limit 1
  --learning_rate 1e-5
  --weight_decay 0.
  --warmup_ratio 0.03
  --lr_scheduler_type "cosine"
  --logging_steps 1
  --tf32 True
  --model_max_length 32768
  --gradient_checkpointing True
  --dataloader_num_workers 4
  --lazy_preprocess True
  --report_to wandb
  --torch_compile True
  --torch_compile_backend "inductor"
  --dataloader_drop_last True
  --frames_upbound 32
)

# ACCELERATE_CPU_AFFINITY is set only in the environment of this one command.
# The script's exit status is torchrun's, so a failed run is visible to the
# caller instead of being masked by an unconditional "exit 0".
ACCELERATE_CPU_AFFINITY=1 torchrun "${launcher_args[@]}" \
  llava/train/train_mem.py "${train_args[@]}"

# NOTE(review): the original script ended with the remark "You can delete the
# sdpa attn_implementation if you want to use flash attn", but no
# attn_implementation option is passed above - confirm which attention
# backend is intended before relying on that remark.