#!/bin/bash
# Environment setup for LLaVA-OneVision LoRA fine-tuning (single node, 4 GPUs).
# NOTE: values must NOT contain leading/trailing whitespace — " eth0" or " 8"
# would be taken literally and break NCCL / OpenMP configuration.
set -euo pipefail

# GPU / CPU-threading configuration.
export CUDA_VISIBLE_DEVICES="0,1,2,3"   # use the first four GPUs
export OMP_NUM_THREADS="8"              # OpenMP threads per process

# NCCL communication settings.
export NCCL_IB_DISABLE="0"              # keep InfiniBand enabled
export NCCL_IB_GID_INDEX="3"
export NCCL_SOCKET_IFNAME="eth0"        # NIC used for NCCL socket transport
export NCCL_DEBUG="INFO"                # verbose NCCL logging
export ACCELERATE_CPU_AFFINITY="1"
# export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"

# SECURITY: never hardcode API keys in scripts (the previous key committed here
# must be considered leaked and revoked). Supply the key via the environment:
#   export WANDB_API_KEY=...   # before invoking this script
if [[ -z "${WANDB_API_KEY:-}" ]]; then
  printf 'WARNING: WANDB_API_KEY is not set; wandb logging may fail.\n' >&2
fi

# Launch distributed training with torchrun: 1 node x 4 processes (one per GPU).
# All stdout/stderr is captured in train_kitchen_lora_0.5b_new.out.
# NOTE(review): flag values are passed verbatim to train_mem.py — no stray
# whitespace is allowed inside quoted values (e.g. --image_grid_pinpoints).
torchrun --nproc_per_node=4 \
    --nnodes=1 \
    --node_rank=0 \
    --master_addr=127.0.0.1 \
    --master_port=29500 \
    llava/train/train_mem.py \
    --deepspeed scripts/zero3.json \
    --model_name_or_path lmms-lab/llava-onevision-qwen2-0.5b-ov \
    --version qwen_1_5 \
    --data_path scripts/train/onevision.yaml \
    --image_folder /media/data/haozhe/VFM/onevision/llava_data/geo3k/ \
    --video_folder /media/data/haozhe/VFM/onevision/llava_video \
    --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
    --mm_vision_tower_lr 2e-6 \
    --vision_tower google/siglip-so400m-patch14-384 \
    --mm_projector_type mlp2x_gelu \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --group_by_modality_length True \
    --image_aspect_ratio anyres_max_9 \
    --image_grid_pinpoints "(1x1),...,(6x6)" \
    --mm_patch_merge_type spatial_unpad \
    --bf16 True \
    --run_name EK100_lora_05b_new \
    --output_dir experiments/EK100_lora_05b_new \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --evaluation_strategy no \
    --save_strategy steps \
    --save_steps 1000 \
    --save_total_limit 1 \
    --learning_rate 1e-4 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type cosine \
    --logging_steps 1 \
    --tf32 True \
    --model_max_length 32768 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb \
    --torch_compile True \
    --torch_compile_backend inductor \
    --dataloader_drop_last True \
    --frames_upbound 32 \
    --lora_enable True \
    --lora_r 128 \
    --lora_alpha 256 \
    --mm_projector_lr 2e-5 > train_kitchen_lora_0.5b_new.out 2>&1