
Commit bb7d8e8

make training work for lora
1 parent 27e2fa1 commit bb7d8e8

File tree: 5 files changed, +81 −10 lines


action/llava_ov_inference.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def llava_inference(video_frames, tokenizer, model, image_processor, max_length,
     model.eval()
     device = "cuda"
     video_frames = video_frames[0]
-    temporal_stride = 16 // num_frames
+    temporal_stride = 32 // num_frames
     video_frames = video_frames[::temporal_stride]
     image_tensors = []
     frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].half().cuda()
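
Note: the only functional change is the assumed clip length used to compute the temporal stride (16 → 32 frames), keeping this hunk consistent with the --clip-length 32 and --frames_upbound 32 settings in the scripts below. A minimal sketch of the sampling behaviour, assuming a 32-frame clip and a hypothetical subsample_frames helper (not part of this repo):

# Minimal sketch (hypothetical helper): how the stride above selects
# num_frames evenly spaced frames from a longer clip.
def subsample_frames(video_frames, num_frames, clip_len=32):
    # Mirrors `temporal_stride = 32 // num_frames`; clip_len should be a
    # multiple of num_frames so the stride is an exact integer.
    stride = clip_len // num_frames
    return video_frames[::stride]

# Example: a 32-frame clip sampled down to 8 frames -> indices 0, 4, ..., 28.
assert subsample_frames(list(range(32)), 8) == [0, 4, 8, 12, 16, 20, 24, 28]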

run.sh

Lines changed: 3 additions & 3 deletions
@@ -36,8 +36,8 @@ torchrun --nproc_per_node=4 \
     --image_grid_pinpoints "(1x1),...,(6x6)" \
     --mm_patch_merge_type spatial_unpad \
     --bf16 True \
-    --run_name EK100_test \
-    --output_dir experiments/EK100_test \
+    --run_name EK100_test_new \
+    --output_dir experiments/EK100_test_new \
     --num_train_epochs 1 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
@@ -60,4 +60,4 @@ torchrun --nproc_per_node=4 \
     --torch_compile True \
     --torch_compile_backend inductor \
     --dataloader_drop_last True \
-    --frames_upbound 32 > train_kitchen_0.5b.out 2>&1
+    --frames_upbound 32 > train_kitchen_0.5b_new.out 2>&1

run_EK100.sh

Lines changed: 4 additions & 4 deletions
@@ -3,7 +3,7 @@ python3 action/dataset.py \
     --train-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv \
     --val-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
     --llm_size 0.5b \
-    --llava_num_frames 16 \
-    --llava_checkpoint experiments/EK100_test/checkpoint-8402 \
-    --action_predictions action/avaion_predictions.json \
-    --topk_predictions 10 > kitchen_test.out 2>&1
+    --llava_num_frames 32 --clip-length 32 \
+    --llava_checkpoint experiments/EK100_test_new \
+    --action_predictions /media/data/haozhe/VFM/EK100/predictions.json \
+    --topk_predictions 10 > EK100_test_new.out 2>&1

run_EK100_2.sh

Lines changed: 6 additions & 2 deletions
@@ -1,6 +1,10 @@
 python3 action/dataset.py \
-    --root /media/data/haozhe/VFM/EK100/EK100_320p_15sec_30fps_libx264 \
+    --root /media/data/haozhe/VFM/onevision/llava_video/EK100 \
     --train-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv \
     --val-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
     --llm_size 0.5b \
-    --llava_num_frames 16 > kitchen_test_2.out 2>&1 \
+    --llava_num_frames 32 --clip-length 32 \
+    --llava_checkpoint experiments/EK100_lora_05b_new \
+    --action_predictions /media/data/haozhe/VFM/EK100/predictions.json \
+    --topk_predictions 10 > EK100_lora_05b_new.out 2>&1
+

run_lora_05b.sh

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# Export environment variables
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export OMP_NUM_THREADS="8"
+export NCCL_IB_DISABLE="0"
+export NCCL_IB_GID_INDEX="3"
+export NCCL_SOCKET_IFNAME="eth0"
+export NCCL_DEBUG="INFO"
+export ACCELERATE_CPU_AFFINITY="1"
+# export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"
+export WANDB_API_KEY="65aeda82a75f1eed29c8e9250b175fcc73dca0d7"
+
+# Run the command using torchrun
+torchrun --nproc_per_node=4 \
+    --nnodes=1 \
+    --node_rank=0 \
+    --master_addr=127.0.0.1 \
+    --master_port=29500 \
+    llava/train/train_mem.py \
+    --deepspeed scripts/zero3.json \
+    --model_name_or_path lmms-lab/llava-onevision-qwen2-0.5b-ov \
+    --version qwen_1_5 \
+    --data_path scripts/train/onevision.yaml \
+    --image_folder /media/data/haozhe/VFM/onevision/llava_data/geo3k/ \
+    --video_folder /media/data/haozhe/VFM/onevision/llava_video \
+    --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
+    --mm_vision_tower_lr 2e-6 \
+    --vision_tower google/siglip-so400m-patch14-384 \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --group_by_modality_length True \
+    --image_aspect_ratio anyres_max_9 \
+    --image_grid_pinpoints "(1x1),...,(6x6)" \
+    --mm_patch_merge_type spatial_unpad \
+    --bf16 True \
+    --run_name EK100_lora_05b_new \
+    --output_dir experiments/EK100_lora_05b_new \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 2 \
+    --evaluation_strategy no \
+    --save_strategy steps \
+    --save_steps 1000 \
+    --save_total_limit 1 \
+    --learning_rate 1e-4 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type cosine \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 32768 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 4 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --torch_compile True \
+    --torch_compile_backend inductor \
+    --dataloader_drop_last True \
+    --frames_upbound 32 \
+    --lora_enable True \
+    --lora_r 128 \
+    --lora_alpha 256 \
+    --mm_projector_lr 2e-5 > train_kitchen_lora_0.5b_new.out 2>&1
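
The new script reuses the run.sh recipe and adds LoRA fine-tuning via --lora_enable True with rank 128 and alpha 256 (adapter scaling alpha / r = 2), plus a separate projector learning rate. A rough sketch of what those LoRA knobs correspond to in Hugging Face peft terms; the base model and target modules below are illustrative assumptions, not necessarily what llava/train/train_mem.py wires up internally:

# Conceptual sketch only, using Hugging Face peft; base model and target
# modules are illustrative assumptions, not what train_mem.py necessarily does.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")  # stand-in base LLM
lora_cfg = LoraConfig(
    r=128,                # mirrors --lora_r 128
    lora_alpha=256,       # mirrors --lora_alpha 256 (adapter scaling = alpha / r = 2)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # assumed attention projections
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # only the low-rank adapter weights remain trainable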
