#!/bin/bash
#SBATCH --job-name multinode
#SBATCH -A a-a03
#SBATCH --hint nomultithread
#SBATCH --cpus-per-task 288
#SBATCH --no-requeue
#SBATCH --nodes 1            # number of nodes
#SBATCH --ntasks-per-node 1  # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
#SBATCH --gres gpu:4         # number of GPUs
#SBATCH --time 23:00:00      # maximum execution time (DD-HH:MM:SS); mandatory field on MN5
#SBATCH --output logs/R-%x.%j-lmmseval-dev_7b_4f_llavavideo_test_haozhe.out
#SBATCH --error logs/R-%x.%j-lmmseval-dev_7b_4f_llavavideo_test_haozhe.err

mkdir -p logs

echo "START TIME: $(date)"

# auto-fail on any errors in this script
# set -eo pipefail

# log the script's variables/commands for future debugging needs
set -x

######################
## Set environment ###
######################
# module purge
# module load singularity

GPUS_PER_NODE=4
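# NOTE (editor sketch): GPUS_PER_NODE could instead be derived from Slurm so it
# stays in sync with the --gres directive above, e.g.:
# GPUS_PER_NODE=${SLURM_GPUS_ON_NODE:-4}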
echo "NODES: $SLURM_NNODES"
######################

######################
#### Set network #####
######################
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
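# NOTE (editor sketch): a fixed port can collide if two jobs ever share a node;
# deriving the port from the job id is a common alternative:
# MASTER_PORT=$((10000 + SLURM_JOB_ID % 20000))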
######################

# note that we don't want to interpolate `\$SLURM_PROCID` till `srun` since otherwise all nodes will get
# 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
LAUNCHER="accelerate launch \
    --num_processes=$GPUS_PER_NODE \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    "

PYTHON_FILE="-m lmms_eval"
# PYTHON_ARGS=" \
#     --model llava_onevision \
#     --model_args pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen \
#     --tasks video_dc499 \
#     --batch_size 1 \
#     --log_samples_suffix llava_onevision \
#     --output_path ./logs/ \
#     --verbosity=DEBUG \
#     "

PYTHON_ARGS=" \
    --model llava_vid \
    --model_args pretrained=lmms-lab/LLaVA-Video-7B-Qwen2,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average \
    --tasks ai2d \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix llava_vid \
    --output_path ./logs/ \
    --verbosity=DEBUG \
    "
export CMD="$LAUNCHER $PYTHON_FILE $PYTHON_ARGS"
export HF_HOME=$SCRATCH/huggingface
export OMP_NUM_THREADS="8"
export ACCELERATE_CPU_AFFINITY="1"
# Do not commit real credentials; the keys below are placeholders. Export them
# outside the script or load them from a file kept out of version control.
export WANDB_API_KEY="<your-wandb-api-key>"
export OPENAI_API_KEY="<your-openai-api-key>"
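# A minimal sketch of loading secrets from a private env file instead (assumes
# you create ~/.secrets.env yourself, with one KEY=value pair per line):
# set -a; source ~/.secrets.env; set +a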

echo "$CMD"

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
SRUN_ARGS=" \
    -ul \
    --cpus-per-task $SLURM_CPUS_PER_TASK \
    --jobid $SLURM_JOB_ID \
    --wait 60 \
    --environment=llava-env \
    --container-workdir=$PWD \
    "
# SINGULARITY_CONTAINER=/path/to/singularity/.sif/file
# SINGULARITY_ARGS=" \
#     --bind /path/to/bind/folder \
#     $SINGULARITY_CONTAINER \
#     "

# bash -c is needed for the delayed interpolation of env vars to work
srun $SRUN_ARGS numactl --membind=0-3 bash -c "
    source /iopsstor/scratch/cscs/hqi/VFM/llava_dependency/llava-venv/bin/activate
    $CMD"
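
# NOTE (editor sketch): surface srun's exit status so failures are easy to spot
# at the end of the log ($? must be read immediately after the srun call):
EXIT_CODE=$?
echo "srun exited with code: $EXIT_CODE"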

echo "END TIME: $(date)"