#!/bin/bash
#
# Stage-2 training launcher.
#
# Starts llava/train/train_mem.py through torchrun (DeepSpeed config
# scripts/zero3.json), initialising from PREV_STAGE_CHECKPOINT and writing
# checkpoints to ./work_dirs/<MID_RUN_NAME>.
#
# Environment variables read by this script (none of them is set here, so
# they must be provided by the caller / job scheduler):
#   ARNOLD_WORKER_GPU    value passed to torchrun --nproc_per_node
#   ARNOLD_WORKER_NUM    value passed to torchrun --nnodes
#   ARNOLD_ID            value passed to torchrun --node_rank
#   METIS_WORKER_0_HOST  value passed to torchrun --master_addr
#   port_in_cmd          value passed to torchrun --master_port
#
# Before running, replace the XXX placeholders below with real paths.

# Set up the data folder
IMAGE_FOLDER="XXX"
VIDEO_FOLDER="XXX"
DATA_YAML="XXX" # e.g. exp.yaml

############### Prepare Envs #################
python3 -m pip install flash-attn --no-build-isolation
# NOTE: Bash does not expand aliases in non-interactive shells unless
# `shopt -s expand_aliases` is set, and this script never invokes `python`,
# so this line has no effect here. Kept for parity with the original.
alias python=python3

############### Show Envs ####################

nvidia-smi

################ Arnold Jobs ################

LLM_VERSION="Qwen/Qwen2-72B-Instruct"
# Replace every "/" with "_" so the model id can be embedded in a run name
# and in an output directory name without creating nested paths.
LLM_VERSION_CLEAN="${LLM_VERSION//\//_}"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="${VISION_MODEL_VERSION//\//_}"

# These two values are not used by the torchrun command below
# (PROMPT_VERSION is reassigned in the "Stage 2" section before use).
PROMPT_VERSION=plain
PRETRAIN_DATA_VERSION="blip558k"

############### Pretrain ################

# Only echoed for reference; not passed to the training command.
BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-72B-Instruct-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"

# Stage 2
PROMPT_VERSION="qwen_1_5"
MID_RUN_NAME="llavanext-${VISION_MODEL_VERSION_CLEAN}-${LLM_VERSION_CLEAN}-ov_to_video_am9"
PREV_STAGE_CHECKPOINT="lmms-lab/llava-onevision-qwen2-72b-ov"
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${MID_RUN_NAME}"

# All expansions are quoted so that paths or names containing whitespace or
# glob characters reach the trainer as single, unmodified arguments.
ACCELERATE_CPU_AFFINITY=1 torchrun \
    --nproc_per_node="${ARNOLD_WORKER_GPU}" \
    --nnodes="${ARNOLD_WORKER_NUM}" \
    --node_rank="${ARNOLD_ID}" \
    --master_addr="${METIS_WORKER_0_HOST}" \
    --master_port="${port_in_cmd}" \
    llava/train/train_mem.py \
    --deepspeed scripts/zero3.json \
    --model_name_or_path "${PREV_STAGE_CHECKPOINT}" \
    --version "${PROMPT_VERSION}" \
    --data_path "${DATA_YAML}" \
    --image_folder "${IMAGE_FOLDER}" \
    --video_folder "${VIDEO_FOLDER}" \
    --mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
    --mm_vision_tower_lr=2e-6 \
    --vision_tower "${VISION_MODEL_VERSION}" \
    --mm_projector_type mlp2x_gelu \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --group_by_modality_length True \
    --image_aspect_ratio anyres_max_9 \
    --image_grid_pinpoints "(1x1),...,(6x6)" \
    --mm_patch_merge_type spatial_unpad \
    --bf16 True \
    --run_name "${MID_RUN_NAME}" \
    --output_dir "./work_dirs/${MID_RUN_NAME}" \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 500 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 32768 \
    --gradient_checkpointing True \
    --dataloader_num_workers 2 \
    --lazy_preprocess True \
    --report_to wandb \
    --torch_compile True \
    --torch_compile_backend "inductor" \
    --dataloader_drop_last True \
    --frames_upbound 32 \
    --mm_newline_position grid \
    --add_time_instruction True \
    --force_sample True \
    --mm_spatial_pool_stride 2
train_status=$?

# The script has always exited 0 regardless of the training result, and that
# exit status is preserved for callers. A failure is at least reported on
# stderr so it is not lost silently.
# NOTE(review): consider `exit "${train_status}"` if the scheduler should
# see training failures.
if ((train_status != 0)); then
    printf 'WARNING: torchrun exited with status %d\n' "${train_status}" >&2
fi
exit 0
0 commit comments