
Commit 6c671f2

Author: Haozhe Qi
Message: add todi runfile, delete some old runfiles
Parent: 3e4bba9

10 files changed, +343 -84 lines


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -73,4 +73,5 @@ data_processing/
 
 
 experiments/
-*.out
+*.out
+pretrained_models/
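
The new `pretrained_models/` entry keeps locally downloaded checkpoints out of version control, alongside the experiment outputs already ignored above. A quick way to confirm a path is covered (the checkpoint directory name here is hypothetical; the output should resemble the commented line):

    git check-ignore -v pretrained_models/LLaVA-Video-7B-Qwen2
    # .gitignore:77:pretrained_models/    pretrained_models/LLaVA-Video-7B-Qwen2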

docs/LLaVA_OneVision_Tutorials.py

Lines changed: 9 additions & 5 deletions
@@ -71,14 +71,18 @@
 import warnings
 from decord import VideoReader, cpu
 
+os.environ["HF_HOME"] = "/capstor/scratch/cscs/hqi/huggingface"
+
 warnings.filterwarnings("ignore")
 # Load the OneVision model
-# pretrained = "/mnt/SV_storage/VFM/huggingface/hub/models--lmms-lab--llava-onevision-qwen2-0.5b-ov/snapshots/381d9947148efb1e58a577f451c05705ceec666e"
+pretrained = "lmms-lab/LLaVA-Video-72B-Qwen2"
 # pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_quick_config"
-# model_base = None
-pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_lora_quick_check"
-model_base = "/mnt/SV_storage/VFM/huggingface/hub/models--lmms-lab--llava-onevision-qwen2-0.5b-ov/snapshots/381d9947148efb1e58a577f451c05705ceec666e"
-model_name = "lora_llava_qwen"
+model_base = None
+model_name = "llava_qwen"
+
+# pretrained = "/mnt/SV_storage/VFM/LLaVA-NeXT/experiments/EK100_lora_quick_check"
+# model_base = "/mnt/SV_storage/VFM/huggingface/hub/models--lmms-lab--llava-onevision-qwen2-0.5b-ov/snapshots/381d9947148efb1e58a577f451c05705ceec666e"
+# model_name = "lora_llava_qwen"
 device = "cuda"
 device_map = "auto"
 tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, model_base, model_name, device_map=device_map, attn_implementation="sdpa")
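
Two things change in this loading block: the tutorial now loads the released `lmms-lab/LLaVA-Video-72B-Qwen2` checkpoint directly, so `model_base` becomes `None` and `model_name` drops the `lora_` prefix that `load_pretrained_model` uses to detect a LoRA checkpoint needing a merge onto a base model; and `HF_HOME` is redirected to scratch so the very large checkpoint download lands outside the home quota. Since `HF_HOME` is read when the Hugging Face libraries initialize, it must be set before they are imported; exporting it in the shell before launching works just as well. A minimal sketch of that workflow (paths taken from the diff; the invocation itself is illustrative):

    export HF_HOME=/capstor/scratch/cscs/hqi/huggingface   # cache checkpoints on scratch
    python docs/LLaVA_OneVision_Tutorials.py               # weights download under $HF_HOME/hub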

run.sh

Lines changed: 52 additions & 47 deletions
@@ -3,10 +3,10 @@
 # Export environment variables
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 export OMP_NUM_THREADS="8"
-export NCCL_IB_DISABLE="0"
-export NCCL_IB_GID_INDEX="3"
-export NCCL_SOCKET_IFNAME="eth0"
-export NCCL_DEBUG="INFO"
+# export NCCL_IB_DISABLE="0"
+# export NCCL_IB_GID_INDEX="3"
+# export NCCL_SOCKET_IFNAME="eth0"
+# export NCCL_DEBUG="INFO"
 export ACCELERATE_CPU_AFFINITY="1"
 # export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"
 export WANDB_API_KEY="65aeda82a75f1eed29c8e9250b175fcc73dca0d7"
@@ -18,46 +18,51 @@ torchrun --nproc_per_node=4 \
     --master_addr=127.0.0.1 \
     --master_port=29500 \
     llava/train/train_mem.py \
-    --deepspeed scripts/zero3.json \
-    --model_name_or_path lmms-lab/llava-onevision-qwen2-0.5b-ov \
-    --version qwen_1_5 \
-    --data_path scripts/train/onevision.yaml \
-    --image_folder /media/data/haozhe/VFM/onevision/llava_data/geo3k/ \
-    --video_folder /media/data/haozhe/VFM/onevision/llava_video \
-    --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
-    --mm_vision_tower_lr 2e-6 \
-    --vision_tower google/siglip-so400m-patch14-384 \
-    --mm_projector_type mlp2x_gelu \
-    --mm_vision_select_layer -2 \
-    --mm_use_im_start_end False \
-    --mm_use_im_patch_token False \
-    --group_by_modality_length True \
-    --image_aspect_ratio anyres_max_9 \
-    --image_grid_pinpoints "(1x1),...,(6x6)" \
-    --mm_patch_merge_type spatial_unpad \
-    --bf16 True \
-    --run_name EK100_test_new \
-    --output_dir experiments/EK100_test_new \
-    --num_train_epochs 1 \
-    --per_device_train_batch_size 1 \
-    --per_device_eval_batch_size 4 \
-    --gradient_accumulation_steps 2 \
-    --evaluation_strategy no \
-    --save_strategy steps \
-    --save_steps 1000 \
-    --save_total_limit 1 \
-    --learning_rate 1e-5 \
-    --weight_decay 0. \
-    --warmup_ratio 0.03 \
-    --lr_scheduler_type cosine \
-    --logging_steps 1 \
-    --tf32 True \
-    --model_max_length 32768 \
-    --gradient_checkpointing True \
-    --dataloader_num_workers 4 \
-    --lazy_preprocess True \
-    --report_to wandb \
-    --torch_compile True \
-    --torch_compile_backend inductor \
-    --dataloader_drop_last True \
-    --frames_upbound 32 > train_kitchen_0.5b_new.out 2>&1
+    --deepspeed scripts/zero3.json \
+    --model_name_or_path pretrained_models/LLaVA-Video-7B-Qwen2 \
+    --version qwen_1_5 \
+    --data_path scripts/train/EK100_avion_mc_top10.yaml \
+    --video_folder /capstor/scratch/cscs/hqi/llava/onevision/llava_video \
+    --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
+    --mm_vision_tower_lr 2e-6 \
+    --vision_tower pretrained_models/siglip-so400m-patch14-384 \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --group_by_modality_length True \
+    --image_aspect_ratio anyres_max_9 \
+    --image_grid_pinpoints "(1x1),...,(6x6)" \
+    --mm_patch_merge_type spatial_unpad \
+    --bf16 True \
+    --run_name todi_llava_video_7b_avion_mc_top10_5epochs_test \
+    --output_dir experiments/todi_llava_video_7b_avion_mc_top10_5epochs_test \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 2 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 2 \
+    --evaluation_strategy steps \
+    --eval_steps 2000 \
+    --save_strategy steps \
+    --save_steps 1000 \
+    --learning_rate 1e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type cosine \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 32768 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 4 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --torch_compile True \
+    --torch_compile_backend inductor \
+    --dataloader_drop_last True \
+    --frames_upbound 32 \
+    --root /capstor/scratch/cscs/hqi/llava/onevision/llava_video/EK100 \
+    --action_predictions /capstor/scratch/cscs/hqi/llava/EK100/avion_predictions_test.json \
+    --val_metadata /capstor/scratch/cscs/hqi/llava/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
+    --llava_num_frames 32 \
+    --clip_length 32 \
+    --topk_predictions 10
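
Beyond swapping in the EPIC-KITCHENS-100 multiple-choice data and the 7B LLaVA-Video base, the rewrite drops the old `> train_kitchen_0.5b_new.out 2>&1` redirect (stray `*.out` files are now gitignored, and the Slurm variant below logs to `logs/` instead). One number worth sanity-checking after the flag changes: with 4 visible GPUs, `--per_device_train_batch_size 2`, and `--gradient_accumulation_steps 2`, the effective global batch size is 16. A throwaway check (variable names are illustrative, not part of the script):

    # ZeRO-3 shards optimizer/parameter state but is still data-parallel,
    # so global batch = GPUs x per-device batch x grad-accumulation steps.
    GPUS=4; PER_DEVICE_BS=2; GRAD_ACCUM=2
    echo $(( GPUS * PER_DEVICE_BS * GRAD_ACCUM ))   # -> 16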

run_EK100.sh

Lines changed: 0 additions & 9 deletions
This file was deleted.

run_EK100_2.sh

Lines changed: 0 additions & 10 deletions
This file was deleted.

run_demo.sh

Lines changed: 0 additions & 8 deletions
This file was deleted.

run_todi.sbatch

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+#!/bin/bash
+#SBATCH --job-name multinode
+#SBATCH --account a03
+#SBATCH --reservation=sai-a03
+#SBATCH --hint nomultithread
+#SBATCH --cpus-per-task 288
+#SBATCH --no-requeue
+#SBATCH --nodes 8             # number of Nodes
+#SBATCH --ntasks-per-node 1   # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
+#SBATCH --gres gpu:4          # Number of GPUs
+#SBATCH --time 05:00:00       # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
+#SBATCH --output logs/R-%x.%j_train_llavavideo_kitchen_7b_avion_mc_32f_top10_5epochs.out
+#SBATCH --error logs/R-%x.%j_train_llavavideo_kitchen_7b_avion_mc_32f_top10_5epochs.err
+
+mkdir -p logs
+
+echo "START TIME: $(date)"
+
+# auto-fail on any errors in this script
+# set -eo pipefail
+
+# log this script's variables/commands for future debugging needs
+set -x
+
+######################
+### Set environment ##
+######################
+# module purge
+# module load singularity
+
+GPUS_PER_NODE=4
+echo "NODES: $SLURM_NNODES"
+######################
+
+######################
+#### Set network #####
+######################
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+######################
+
+# Note that we don't want to interpolate `\$SLURM_PROCID` until `srun` runs, since otherwise all
+# nodes would get rank 0 and the launcher would hang.
+#
+# The same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time.
+LAUNCHER="torchrun \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $SLURM_NNODES \
+    --node_rank \$SLURM_PROCID \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --tee 3 \
+    "
+
+PYTHON_FILE=llava/train/train_mem.py
+PYTHON_ARGS=" \
+    --deepspeed scripts/zero3.json \
+    --model_name_or_path pretrained_models/LLaVA-Video-7B-Qwen2 \
+    --version qwen_1_5 \
+    --data_path scripts/train/EK100_avion_mc_top10.yaml \
+    --video_folder /capstor/scratch/cscs/hqi/llava/onevision/llava_video \
+    --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
+    --mm_vision_tower_lr 2e-6 \
+    --vision_tower google/siglip-so400m-patch14-384 \
+    --mm_projector_type mlp2x_gelu \
+    --mm_vision_select_layer -2 \
+    --mm_use_im_start_end False \
+    --mm_use_im_patch_token False \
+    --group_by_modality_length True \
+    --image_aspect_ratio anyres_max_9 \
+    --image_grid_pinpoints \"(1x1),...,(6x6)\" \
+    --mm_patch_merge_type spatial_unpad \
+    --bf16 True \
+    --run_name todi_llava_video_7b_avion_mc_top10_5epochs \
+    --output_dir experiments/todi_llava_video_7b_avion_mc_top10_5epochs \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 2 \
+    --per_device_eval_batch_size 4 \
+    --gradient_accumulation_steps 2 \
+    --evaluation_strategy steps \
+    --eval_steps 2000 \
+    --save_strategy steps \
+    --save_steps 1000 \
+    --learning_rate 1e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type cosine \
+    --logging_steps 1 \
+    --tf32 True \
+    --model_max_length 32768 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers 4 \
+    --lazy_preprocess True \
+    --report_to wandb \
+    --torch_compile True \
+    --torch_compile_backend inductor \
+    --dataloader_drop_last True \
+    --frames_upbound 32 \
+    --root /capstor/scratch/cscs/hqi/llava/onevision/llava_video/EK100 \
+    --action_predictions /capstor/scratch/cscs/hqi/llava/EK100/avion_predictions_test.json \
+    --val_metadata /capstor/scratch/cscs/hqi/llava/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
+    --llava_num_frames 32 \
+    --clip_length 32 \
+    --topk_predictions 10 \
+    "
+
+export CMD="$LAUNCHER $PYTHON_FILE $PYTHON_ARGS"
+export HF_HOME=$SCRATCH/huggingface
+export OMP_NUM_THREADS="8"
+export ACCELERATE_CPU_AFFINITY="1"
+export WANDB_API_KEY="65aeda82a75f1eed29c8e9250b175fcc73dca0d7"
+
+echo $CMD
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+SRUN_ARGS=" \
+    -ul \
+    --cpus-per-task $SLURM_CPUS_PER_TASK \
+    --jobid $SLURM_JOB_ID \
+    --wait 60 \
+    --environment=llava-env \
+    --container-workdir=$PWD \
+    "
+# SINGULARITY_CONTAINER=/path/to/singularity/.sif/file
+# SINGULARITY_ARGS=" \
+#    --bind /path/to/bind/folder \
+#    $SINGULARITY_CONTAINER \
+#    "
+
+# bash -c is needed for the delayed interpolation of env vars to work
+srun $SRUN_ARGS bash -c "
+    source /capstor/scratch/cscs/hqi/llava/llava_dependency/llava-venv/bin/activate
+    $CMD"
+
+echo "END TIME: $(date)"
