11#! /bin/bash
22# SBATCH --job-name multinode
3- # SBATCH --account a03
4- # SBATCH --reservation=sai-a03
3+ # SBATCH -A a-a03
54# SBATCH --hint nomultithread
65# SBATCH --cpus-per-task 288
76# SBATCH --no-requeue
87# SBATCH --nodes 8 # number of Nodes
98# SBATCH --ntasks-per-node 1 # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
109# SBATCH --gres gpu:4 # Number of GPUs
11- # SBATCH --time 05:00:00 # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
12- # SBATCH --output logs/R-%x.%j_train_llavavideo_kitchen_7b_avion_mc_32f_top10_5epochs.out
13- # SBATCH --error logs/R-%x.%j_train_llavavideo_kitchen_7b_avion_mc_32f_top10_5epochs.err
10+ # SBATCH --time 23:00:00 # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
11+ # SBATCH --output logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.out
12+ # SBATCH --error logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.err
1413
1514mkdir -p logs
1615
@@ -56,10 +55,10 @@ LAUNCHER="torchrun \
5655PYTHON_FILE=llava/train/train_mem.py
5756PYTHON_ARGS=" \
5857 --deepspeed scripts/zero3.json \
59- --model_name_or_path pretrained_models/LLaVA-Video-7B-Qwen2 \
58+ --model_name_or_path lmms-lab/LLaVA-Video-7B-Qwen2 \
6059 --version qwen_1_5 \
61- --data_path scripts/train/EK100_avion_mc_top10.yaml \
62- --video_folder /capstor/scratch/cscs/hqi/llava/onevision/llava_video \
60+ --data_path scripts/train/llava_video.yaml \
61+ --video_folder /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video \
6362 --mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
6463 --mm_vision_tower_lr 2e-6 \
6564 --vision_tower google/siglip-so400m-patch14-384 \
@@ -72,16 +71,16 @@ PYTHON_ARGS=" \
7271 --image_grid_pinpoints \" (1x1),...,(6x6)\" \
7372 --mm_patch_merge_type spatial_unpad \
7473 --bf16 True \
75- --run_name todi_llava_video_7b_avion_mc_top10_5epochs \
76- --output_dir experiments/todi_llava_video_7b_avion_mc_top10_5epochs \
77- --num_train_epochs 5 \
78- --per_device_train_batch_size 2 \
79- --per_device_eval_batch_size 4 \
74+ --run_name dev_7b_4f_llavavideo_test_haozhe \
75+ --output_dir experiments/dev_7b_4f_llavavideo_test_haozhe \
76+ --num_train_epochs 1 \
77+ --per_device_train_batch_size 1 \
78+ --per_device_eval_batch_size 1 \
8079 --gradient_accumulation_steps 2 \
81- --evaluation_strategy steps \
82- --eval_steps 2000\
80+ --evaluation_strategy no \
81+ --eval_steps 2000 \
8382 --save_strategy steps \
84- --save_steps 1000 \
83+ --save_steps 2000 \
8584 --learning_rate 1e-5 \
8685 --weight_decay 0. \
8786 --warmup_ratio 0.03 \
@@ -96,13 +95,15 @@ PYTHON_ARGS=" \
9695 --torch_compile True \
9796 --torch_compile_backend inductor \
9897 --dataloader_drop_last True \
99- --frames_upbound 32 \
100- --root /capstor/scratch/cscs/hqi/llava/onevision/llava_video/EK100 \
101- --action_predictions /capstor/scratch/cscs/hqi/llava/EK100/avion_predictions_test.json \
102- --val_metadata /capstor/scratch/cscs/hqi/llava/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
103- --llava_num_frames 32 \
104- --clip_length 32 \
105- --topk_predictions 10 \
98+ --frames_upbound 4 \
99+ --root /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/EK100 \
100+ --action_predictions /iopsstor/scratch/cscs/hqi/VFM/llava_data/TIM_PREDS/tim_pred_ids_val.json \
101+ --val_metadata /iopsstor/scratch/cscs/hqi/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
102+ --add_time_instruction False \
103+ --llava_num_frames 4 \
104+ --clip_length 4 \
105+ --action_representation official_key \
106+ --topk_predictions 5 \
106107 "
107108
108109export CMD=" $LAUNCHER $PYTHON_FILE $PYTHON_ARGS "
@@ -130,8 +131,8 @@ SRUN_ARGS=" \
130131# "
131132
132133# bash -c is needed for the delayed interpolation of env vars to work
133- srun $SRUN_ARGS bash -c "
134- source /capstor/scratch/cscs/hqi/llava/llava_dependency/llava-venv/bin/activate
134+ srun $SRUN_ARGS numactl --membind=0-3 bash -c "
135+ source /iopsstor/scratch/cscs/hqi/VFM/llava_dependency/llava-venv/bin/activate
135136 $CMD "
136137
137138echo " END TIME: $( date) "
0 commit comments