Skip to content

Commit 11910e9

Browse files
author
Haozhe Qi
committed
Fixed a bug
1 parent 254588b commit 11910e9

File tree

2 files changed

+107
-4
lines changed

2 files changed

+107
-4
lines changed

llava/train/train.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,10 +1293,9 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
12931293
meta_data = None
12941294

12951295
if self.eval_args.learn_neighbor_actions and 'EK100' in video_file:
1296-
vid = video_info
1297-
1298-
start_timestamp = round(self.list_data_dict[i]['start_timestamp'], 2)
1299-
end_timestamp = round(self.list_data_dict[i]['end_timestamp'], 2)
1296+
vid = video_info
1297+
start_timestamp = round(float(self.list_data_dict[i]['start_timestamp']), 2)
1298+
end_timestamp = round(float(self.list_data_dict[i]['end_timestamp']), 2)
13001299
uid = f"{vid}_{start_timestamp}_{end_timestamp}"
13011300
meta_data = self.train_triple_lookup.get(uid, None)
13021301

run_llmseval_clariden.sbatch

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
#!/bin/bash
# Slurm batch script: run lmms-eval (LLaVA-Video-7B-Qwen2) on one node with 4 GPUs
# via `accelerate launch`, inside the `llava-env` container environment.
# Required env vars (supplied by the submitting shell, NOT hardcoded here):
#   WANDB_API_KEY, OPENAI_API_KEY
#SBATCH --job-name multinode
#SBATCH -A a-a03
#SBATCH --hint nomultithread
#SBATCH --cpus-per-task 288
#SBATCH --no-requeue
#SBATCH --nodes 1                    # number of nodes
#SBATCH --ntasks-per-node 1          # IMPORTANT: the accelerate/torchrun launcher is just 1 Slurm task
#SBATCH --gres gpu:4                 # number of GPUs
#SBATCH --time 23:00:00              # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
#SBATCH --output logs/R-%x.%j-lmmseval-dev_7b_4f_llavavideo_test_haozhe.out
#SBATCH --error logs/R-%x.%j-lmmseval-dev_7b_4f_llavavideo_test_haozhe.err

mkdir -p logs

echo "START TIME: $(date)"

# auto-fail on any errors in this script
# set -eo pipefail

# SECURITY: never commit API keys in this script — the previous revision leaked
# live WANDB/OpenAI keys, which must be rotated. Require them from the caller's
# environment instead. Checked BEFORE `set -x` so the trace never prints them.
if [[ -z "${WANDB_API_KEY:-}" || -z "${OPENAI_API_KEY:-}" ]]; then
  echo "ERROR: WANDB_API_KEY and OPENAI_API_KEY must be set in the environment" >&2
  exit 1
fi
export WANDB_API_KEY OPENAI_API_KEY

# logging script's variables/commands for future debug needs
set -x

######################
### Set enviroment ###
######################
# module purge
# module load singularity

GPUS_PER_NODE=4
echo "NODES: $SLURM_NNODES"
######################

######################
#### Set network #####
######################
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
######################

# note that we don't want to interpolate `\$SLURM_PROCID` till `srun` since otherwise all nodes will get
# 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
LAUNCHER="accelerate launch \
    --num_processes=$GPUS_PER_NODE \
    --rdzv_backend c10d \
    --max_restarts 0 \
    --tee 3 \
    "

PYTHON_FILE="-m lmms_eval"
# PYTHON_ARGS=" \
#     --model llava_onevision \
#     --model_args pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen \
#     --tasks video_dc499 \
#     --batch_size 1 \
#     --log_samples_suffix llava_onevision \
#     --output_path ./logs/ \
#     --verbosity=DEBUG \
#     "

# BUGFIX: `--output_path ./logs/` previously lacked the trailing `\`, so the
# embedded newline survived into $CMD and `bash -c` executed `--verbosity=DEBUG`
# as a separate (failing) command instead of passing it to lmms_eval.
PYTHON_ARGS=" \
    --model llava_vid \
    --model_args pretrained=lmms-lab/LLaVA-Video-7B-Qwen2,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average \
    --tasks ai2d \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix llava_vid \
    --output_path ./logs/ \
    --verbosity=DEBUG \
    "

export CMD="$LAUNCHER $PYTHON_FILE $PYTHON_ARGS"
export HF_HOME=$SCRATCH/huggingface
export OMP_NUM_THREADS="8"
export ACCELERATE_CPU_AFFINITY="1"

echo "$CMD"

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
SRUN_ARGS=" \
    -ul \
    --cpus-per-task $SLURM_CPUS_PER_TASK \
    --jobid $SLURM_JOB_ID \
    --wait 60 \
    --environment=llava-env \
    --container-workdir=$PWD \
    "
# SINGULARITY_CONTAINER=/path/to/singularity/.sif/file
# SINGULARITY_ARGS=" \
#     --bind /path/to/bind/folder \
#     $SINGULARITY_CONTAINER \
#     "

# bash -c is needed for the delayed interpolation of env vars to work
srun $SRUN_ARGS numactl --membind=0-3 bash -c "
source /iopsstor/scratch/cscs/hqi/VFM/llava_dependency/llava-venv/bin/activate
$CMD"

echo "END TIME: $(date)"

0 commit comments

Comments
 (0)