@@ -13,22 +13,26 @@ VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"
 
 # ############## Pretrain ################
 
-PROMPT_VERSION="qwen_1_5"
-
-BASE_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-mlp2x_gelu-pretrain_blip558k_plain"
+BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain"
 echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
 
-CKPT_PATH=$LLM_VERSION # this could also be the previous stage checkpoint
+# ############## Finetune ################
+
+# Stage 2
+PROMPT_VERSION="qwen_1_5"
+MID_RUN_NAME="llava-onevision-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_stage_am9"
+PREV_STAGE_CHECKPOINT="/mnt/bn/vl-research/checkpoints/onevision/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mid_to_final_next_3m_am9_july14" # replace this with your last checkpoint trained on the single-image collection
+echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
+echo "MID_RUN_NAME: ${MID_RUN_NAME}"
 
 ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NNODES}" --node_rank="${RANK}" --master_addr="${ADDR}" --master_port="${PORT}" \
     llava/train/train_mem.py \
     --deepspeed scripts/zero3.json \
-    --model_name_or_path ${CKPT_PATH} \
-    --version ${PROMPT_VERSION} \
-    --data_path ./onevision_data.yaml \
-    --image_folder ./onevision_data/images \
-    --video_folder ./onevision_data/videos \
-    --pretrain_mm_mlp_adapter="/checkpoints/projectors/${BASE_RUN_NAME}/mm_projector.bin" \
+    --model_name_or_path $PREV_STAGE_CHECKPOINT \
+    --version $PROMPT_VERSION \
+    --data_path /mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/scripts/i18n/scale_llms/next_ov_stage_july21.yaml \
+    --image_folder /mnt/bn/vl-research/data/llava_data \
+    --video_folder /mnt/bn/vl-research/data/llava_video \
     --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
     --mm_vision_tower_lr=2e-6 \
     --vision_tower ${VISION_MODEL_VERSION} \
@@ -42,7 +46,7 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NN
     --mm_patch_merge_type spatial_unpad \
     --bf16 True \
     --run_name $MID_RUN_NAME \
-    --output_dir "/checkpoints/${MID_RUN_NAME}" \
+    --output_dir /mnt/bn/vl-research/checkpoints/onevision/$MID_RUN_NAME \
     --num_train_epochs 1 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
@@ -66,5 +70,6 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node="${NUM_GPUS}" --nnodes="${NN
     --torch_compile_backend "inductor" \
     --dataloader_drop_last True \
     --frames_upbound 32
+exit 0;
 
-# You can delete the sdpa attn_implementation if you want to use flash attn
+# You can delete the sdpa attn_implementation if you want to use flash attn
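For reference, the comment above refers to a flag that sits outside the hunks shown here. A minimal, hypothetical sketch of the choice, assuming the full launch command passes an "--attn_implementation sdpa" argument to llava/train/train_mem.py and that flash-attn is installed in the environment:

    --attn_implementation sdpa \    # keep this line to use PyTorch SDPA attention
    # per the comment above, deleting this line makes the run use flash attention instead

The flag name is taken from the script's own comment; check the complete finetune script for its exact position in the argument list.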