Skip to content

Commit 693d553

Browse files
author
Haozhe Qi
committed
add training and evaluation script on clariden
1 parent ef08304 commit 693d553

File tree

7 files changed

+539
-173
lines changed

7 files changed

+539
-173
lines changed

lmms_eval_CSCS.sh

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
2+
export OMP_NUM_THREADS="8"
3+
# export NCCL_IB_DISABLE="0"
4+
# export NCCL_IB_GID_INDEX="3"
5+
# export NCCL_SOCKET_IFNAME="eth0"
6+
# export NCCL_DEBUG="INFO"
7+
export ACCELERATE_CPU_AFFINITY="1"
8+
export WANDB_API_KEY="4474ec79de023b0c3ffb43588ab6163264f875db"
9+
export HF_HOME=/iopsstor/scratch/cscs/hqi/huggingface
10+
# export HF_HOME=/mnt/SV_storage/VFM/huggingface
11+
export PYTHONPATH=/iopsstor/scratch/cscs/hqi/VFM/haozhe/LLaVA-NeXT:$PYTHONPATH
12+
# export PYTHONPATH=/mnt/SV_storage/VFM/LLaVA-NeXT:$PYTHONPATH
13+
export OPENAI_API_KEY=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
14+
15+
# source /media/data/haozhe/VFM/llmseval-venv/bin/activate && \
16+
accelerate launch --num_processes=4 \
17+
-m lmms_eval \
18+
--model llava_onevision \
19+
--model_args pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen \
20+
--tasks video_dc499 \
21+
--batch_size 1 \
22+
--log_samples \
23+
--log_samples_suffix llava_onevision \
24+
--output_path ./logs/ \
25+
--verbosity=DEBUG
26+
27+
28+
# source /media/data/haozhe/VFM/llmseval-venv/bin/activate && \
29+
# accelerate launch --num_processes=4 \
30+
# -m lmms_eval \
31+
# --model llava_vid \
32+
# --model_args pretrained=experiments/dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent/checkpoint-15000,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average,model_name=llava_qwen \
33+
# --tasks videomme \
34+
# --batch_size 1 \
35+
# --log_samples \
36+
# --log_samples_suffix dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent_checkpoint_15000 \
37+
# --output_path ./logs/ \
38+
# --verbosity=DEBUG > ./logs/dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent_checkpoint_15000.log 2>&1
39+
40+
41+
# # source /media/data/haozhe/VFM/llmseval-venv/bin/activate && \
42+
# accelerate launch --num_processes=4 \
43+
# -m lmms_eval \
44+
# --model llava_vid \
45+
# --model_args pretrained=lmms-lab/LLaVA-Video-7B-Qwen2,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average \
46+
# --tasks videochatgpt \
47+
# --batch_size 1 \
48+
# --log_samples \
49+
# --log_samples_suffix llava_vid_7b \
50+
# --output_path ./logs/
51+
# --verbosity=DEBUG

run_todi.sbatch renamed to run_clariden.sbatch

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
#!/bin/bash
22
#SBATCH --job-name multinode
3-
#SBATCH --account a03
4-
#SBATCH --reservation=sai-a03
3+
#SBATCH -A a-a03
54
#SBATCH --hint nomultithread
65
#SBATCH --cpus-per-task 288
76
#SBATCH --no-requeue
87
#SBATCH --nodes 8 # number of Nodes
98
#SBATCH --ntasks-per-node 1 # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
109
#SBATCH --gres gpu:4 # Number of GPUs
11-
#SBATCH --time 05:00:00 # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
12-
#SBATCH --output logs/R-%x.%j_train_llavavideo_kitchen_7b_avion_mc_32f_top10_5epochs.out
13-
#SBATCH --error logs/R-%x.%j_train_llavavideo_kitchen_7b_avion_mc_32f_top10_5epochs.err
10+
#SBATCH --time 23:00:00 # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
11+
#SBATCH --output logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.out
12+
#SBATCH --error logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.err
1413

1514
mkdir -p logs
1615

@@ -56,10 +55,10 @@ LAUNCHER="torchrun \
5655
PYTHON_FILE=llava/train/train_mem.py
5756
PYTHON_ARGS=" \
5857
--deepspeed scripts/zero3.json \
59-
--model_name_or_path pretrained_models/LLaVA-Video-7B-Qwen2 \
58+
--model_name_or_path lmms-lab/LLaVA-Video-7B-Qwen2 \
6059
--version qwen_1_5 \
61-
--data_path scripts/train/EK100_avion_mc_top10.yaml \
62-
--video_folder /capstor/scratch/cscs/hqi/llava/onevision/llava_video \
60+
--data_path scripts/train/llava_video.yaml \
61+
--video_folder /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video \
6362
--mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
6463
--mm_vision_tower_lr 2e-6 \
6564
--vision_tower google/siglip-so400m-patch14-384 \
@@ -72,16 +71,16 @@ PYTHON_ARGS=" \
7271
--image_grid_pinpoints \"(1x1),...,(6x6)\" \
7372
--mm_patch_merge_type spatial_unpad \
7473
--bf16 True \
75-
--run_name todi_llava_video_7b_avion_mc_top10_5epochs \
76-
--output_dir experiments/todi_llava_video_7b_avion_mc_top10_5epochs \
77-
--num_train_epochs 5 \
78-
--per_device_train_batch_size 2 \
79-
--per_device_eval_batch_size 4 \
74+
--run_name dev_7b_4f_llavavideo_test_haozhe \
75+
--output_dir experiments/dev_7b_4f_llavavideo_test_haozhe \
76+
--num_train_epochs 1 \
77+
--per_device_train_batch_size 1 \
78+
--per_device_eval_batch_size 1 \
8079
--gradient_accumulation_steps 2 \
81-
--evaluation_strategy steps \
82-
--eval_steps 2000\
80+
--evaluation_strategy no \
81+
--eval_steps 2000 \
8382
--save_strategy steps \
84-
--save_steps 1000 \
83+
--save_steps 2000 \
8584
--learning_rate 1e-5 \
8685
--weight_decay 0. \
8786
--warmup_ratio 0.03 \
@@ -96,13 +95,15 @@ PYTHON_ARGS=" \
9695
--torch_compile True \
9796
--torch_compile_backend inductor \
9897
--dataloader_drop_last True \
99-
--frames_upbound 32 \
100-
--root /capstor/scratch/cscs/hqi/llava/onevision/llava_video/EK100 \
101-
--action_predictions /capstor/scratch/cscs/hqi/llava/EK100/avion_predictions_test.json \
102-
--val_metadata /capstor/scratch/cscs/hqi/llava/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
103-
--llava_num_frames 32 \
104-
--clip_length 32 \
105-
--topk_predictions 10 \
98+
--frames_upbound 4 \
99+
--root /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/EK100 \
100+
--action_predictions /iopsstor/scratch/cscs/hqi/VFM/llava_data/TIM_PREDS/tim_pred_ids_val.json \
101+
--val_metadata /iopsstor/scratch/cscs/hqi/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
102+
--add_time_instruction False \
103+
--llava_num_frames 4 \
104+
--clip_length 4 \
105+
--action_representation official_key \
106+
--topk_predictions 5 \
106107
"
107108

108109
export CMD="$LAUNCHER $PYTHON_FILE $PYTHON_ARGS"
@@ -130,8 +131,8 @@ SRUN_ARGS=" \
130131
# "
131132

132133
# bash -c is needed for the delayed interpolation of env vars to work
133-
srun $SRUN_ARGS bash -c "
134-
source /capstor/scratch/cscs/hqi/llava/llava_dependency/llava-venv/bin/activate
134+
srun $SRUN_ARGS numactl --membind=0-3 bash -c "
135+
source /iopsstor/scratch/cscs/hqi/VFM/llava_dependency/llava-venv/bin/activate
135136
$CMD"
136137

137138
echo "END TIME: $(date)"

run_llmseval_clariden.sbatch

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
#!/bin/bash
# Slurm batch script: run lmms-eval on LLaVA-Video-7B across standard video
# benchmarks on the Clariden cluster (1 node, 4 GPUs, containerized env).
#
# Required environment variables (previous revisions committed live keys to
# version control; those keys must be rotated and supplied by the caller):
#   WANDB_API_KEY   - Weights & Biases API key
#   OPENAI_API_KEY  - OpenAI key (used by GPT-based judging tasks)
#SBATCH --job-name multinode
#SBATCH -A a-a03
#SBATCH --hint nomultithread
#SBATCH --cpus-per-task 288
#SBATCH --no-requeue
#SBATCH --nodes 1                   # number of Nodes
#SBATCH --ntasks-per-node 1         # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
#SBATCH --gres gpu:4                # Number of GPUs
#SBATCH --time 23:00:00             # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
#SBATCH --output logs/R-%x.%j-lmmseval-dev_7b_4f_llavavideo_test_haozhe.out
#SBATCH --error logs/R-%x.%j-lmmseval-dev_7b_4f_llavavideo_test_haozhe.err

mkdir -p logs

echo "START TIME: $(date)"

# auto-fail on any errors in this script
# set -eo pipefail

# logging script's variables/commands for future debug needs
set -x

######################
### Set environment ##
######################
# module purge
# module load singularity

GPUS_PER_NODE=4
echo "NODES: $SLURM_NNODES"
######################

######################
#### Set network #####
######################
# First hostname in the allocation acts as the rendezvous master.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
######################

# note that we don't want to interpolate `\$SLURM_PROCID` till `srun` since otherwise all nodes will get
# 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
LAUNCHER="accelerate launch \
    --num_processes=$GPUS_PER_NODE \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    "

PYTHON_FILE="-m lmms_eval"
# PYTHON_ARGS=" \
#     --model llava_onevision \
#     --model_args pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen \
#     --tasks video_dc499 \
#     --batch_size 1 \
#     --log_samples_suffix llava_onevision \
#     --output_path ./logs/ \
#     --verbosity=DEBUG \
#     "

# NOTE: every argument line ends with `\` so the quoted string stays one
# logical line (the original dropped the `\` after --output_path).
PYTHON_ARGS=" \
    --model llava_vid \
    --model_args pretrained=lmms-lab/LLaVA-Video-7B-Qwen2,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average \
    --tasks activitynetqa,videochatgpt,nextqa_mc_test,egoschema,video_dc499,videomme,videomme_w_subtitle,perceptiontest_val_mc \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix llava_vid \
    --output_path ./logs/ \
    --verbosity=DEBUG \
    "

export CMD="$LAUNCHER $PYTHON_FILE $PYTHON_ARGS"
export HF_HOME=$SCRATCH/huggingface
export OMP_NUM_THREADS="8"
export ACCELERATE_CPU_AFFINITY="1"

# Fail fast with a clear message if the secrets are not supplied by the caller.
: "${WANDB_API_KEY:?set WANDB_API_KEY in the environment}"
: "${OPENAI_API_KEY:?set OPENAI_API_KEY in the environment}"
export WANDB_API_KEY OPENAI_API_KEY

echo $CMD

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
SRUN_ARGS=" \
    -ul \
    --cpus-per-task $SLURM_CPUS_PER_TASK \
    --jobid $SLURM_JOB_ID \
    --wait 60 \
    --environment=llava-env \
    --container-workdir=$PWD \
    "
# SINGULARITY_CONTAINER=/path/to/singularity/.sif/file
# SINGULARITY_ARGS=" \
#     --bind /path/to/bind/folder \
#     $SINGULARITY_CONTAINER \
#     "

# bash -c is needed for the delayed interpolation of env vars to work
srun $SRUN_ARGS numactl --membind=0-3 bash -c "
source /iopsstor/scratch/cscs/hqi/VFM/llava_dependency/llava-venv/bin/activate
$CMD"

echo "END TIME: $(date)"

0 commit comments

Comments
 (0)