
Commit a296605

fix the instruction generation
1 parent e22fd9e commit a296605

File tree

5 files changed: +204 additions, -119 deletions

.vscode/launch.json

Lines changed: 101 additions & 98 deletions
@@ -1,96 +1,11 @@
-// {
-//     "version": "0.2.0",
-//     "configurations": [
-//         {
-//             "name": "Run LLAVA Training with torchrun",
-//             "type": "debugpy",
-//             "request": "launch",
-//             "module": "torch.distributed.run",
-//             "env": {
-//                 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
-//                 "OMP_NUM_THREADS": "8",
-//                 "NCCL_IB_DISABLE": "0",
-//                 "NCCL_IB_GID_INDEX": "3",
-//                 "NCCL_SOCKET_IFNAME": "eth0",
-//                 "NCCL_DEBUG": "INFO",
-//                 "ACCELERATE_CPU_AFFINITY": "1",
-//                 "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
-//                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
-//                 "CUDA_LAUNCH_BLOCKING": "1",
-//                 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
-//             },
-//             "args": [
-//                 "--nproc_per_node=4",
-//                 "--nnodes=1",
-//                 "llava/train/train_mem.py",
-//                 "--deepspeed", "scripts/zero3.json",
-//                 "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-//                 "--version", "qwen_1_5",
-//                 "--data_path", "scripts/train/llava_video.yaml",
-//                 "--video_folder", "/media/data/haozhe/VFM/onevision/llava_video",
-//                 "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-//                 "--mm_vision_tower_lr", "2e-6",
-//                 "--vision_tower", "google/siglip-so400m-patch14-384",
-//                 "--mm_projector_type", "mlp2x_gelu",
-//                 "--mm_vision_select_layer", "-2",
-//                 "--mm_use_im_start_end", "False",
-//                 "--mm_use_im_patch_token", "False",
-//                 "--group_by_modality_length", "True",
-//                 "--image_aspect_ratio", "anyres_max_9",
-//                 "--image_grid_pinpoints", "(1x1),...,(6x6)",
-//                 "--mm_patch_merge_type", "spatial_unpad",
-//                 "--bf16", "True",
-//                 "--run_name", "dev_0.5b_4f_llavavideo_haozhe",
-//                 "--output_dir", "experiments/dev_0.5b_4f_llavavideo_haozhe",
-//                 "--num_train_epochs", "1",
-//                 "--per_device_train_batch_size", "8",
-//                 "--per_device_eval_batch_size", "4",
-//                 "--gradient_accumulation_steps", "2",
-//                 "--evaluation_strategy", "epoch",
-//                 "--eval_steps", "1",
-//                 "--save_strategy", "steps",
-//                 "--save_steps", "2000",
-//                 "--learning_rate", "1e-5",
-//                 "--weight_decay", "0.",
-//                 "--warmup_ratio", "0.03",
-//                 "--lr_scheduler_type", "cosine",
-//                 "--logging_steps", "1",
-//                 "--tf32", "True",
-//                 "--model_max_length", "32768",
-//                 "--gradient_checkpointing", "True",
-//                 "--dataloader_num_workers", "4",
-//                 "--lazy_preprocess", "True",
-//                 "--report_to", "wandb",
-//                 "--torch_compile", "True",
-//                 "--torch_compile_backend", "inductor",
-//                 "--dataloader_drop_last", "True",
-//                 "--frames_upbound", "4",
-//                 "--root", "/media/data/haozhe/VFM/onevision/llava_video/EK100",
-//                 "--action_predictions", "/media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_pred_ids_val.json",
-//                 "--val_metadata", "/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
-//                 "--add_time_instruction", "False",
-//                 "--llava_num_frames", "4",
-//                 "--clip_length", "4",
-//                 "--action_representation", "official_key",
-//                 "--topk_predictions", "5"
-//             ],
-//             "console": "integratedTerminal",
-//             "justMyCode": false,
-//             "cwd": "${workspaceFolder}"
-//         }
-//     ]
-// }
-
-
 {
     "version": "0.2.0",
     "configurations": [
         {
             "name": "Run LLAVA Training with torchrun",
             "type": "debugpy",
             "request": "launch",
-            "python": "/media/data/haozhe/VFM/llmseval-venv/bin/python",
-            "module": "accelerate.commands.launch",
+            "module": "torch.distributed.run",
             "env": {
                 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
                 "OMP_NUM_THREADS": "8",
@@ -103,20 +18,64 @@
                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
                 "CUDA_LAUNCH_BLOCKING": "1",
                 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
-                "OPENAI_API_KEY": "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"
             },
             "args": [
-                "--num_processes", "4",
-                "-m", "lmms_eval",
-                // "--model", "llava_vid",
-                "--model", "llava_onevision",
-                // "--model_args", "pretrained=experiments/dev_LLaVA-Video-7B-Qwen2_4f_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average",
-                "--model_args", "pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen",
-                "--tasks", "video_dc499",
-                "--batch_size", "1",
-                "--log_samples",
-                "--log_samples_suffix", "llava_onevision",
-                "--output_path", "./logs/"
+                "--nproc_per_node=4",
+                "--nnodes=1",
+                "llava/train/train_mem.py",
+                "--deepspeed", "scripts/zero3.json",
+                "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+                "--version", "qwen_1_5",
+                "--data_path", "scripts/train/llava_video_RCP.yaml",
+                "--video_folder", "/media/data/haozhe/VFM/onevision/llava_video",
+                "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+                "--mm_vision_tower_lr", "2e-6",
+                "--vision_tower", "google/siglip-so400m-patch14-384",
+                "--mm_projector_type", "mlp2x_gelu",
+                "--mm_vision_select_layer", "-2",
+                "--mm_use_im_start_end", "False",
+                "--mm_use_im_patch_token", "False",
+                "--group_by_modality_length", "True",
+                "--image_aspect_ratio", "anyres_max_9",
+                "--image_grid_pinpoints", "(1x1),...,(6x6)",
+                "--mm_patch_merge_type", "spatial_unpad",
+                "--bf16", "True",
+                "--run_name", "dev_0.5b_llavavideo_haozhe",
+                "--output_dir", "experiments/dev_0.5b_llavavideo_haozhe",
+                "--num_train_epochs", "1",
+                "--per_device_train_batch_size", "1",
+                "--per_device_eval_batch_size", "4",
+                "--gradient_accumulation_steps", "2",
+                "--evaluation_strategy", "epoch",
+                "--eval_steps", "1",
+                "--save_strategy", "steps",
+                "--save_steps", "2000",
+                "--learning_rate", "1e-5",
+                "--weight_decay", "0.",
+                "--warmup_ratio", "0.03",
+                "--lr_scheduler_type", "cosine",
+                "--logging_steps", "1",
+                "--tf32", "True",
+                "--model_max_length", "32768",
+                "--gradient_checkpointing", "True",
+                "--dataloader_num_workers", "4",
+                "--lazy_preprocess", "True",
+                "--report_to", "wandb",
+                "--torch_compile", "True",
+                "--torch_compile_backend", "inductor",
+                "--dataloader_drop_last", "True",
+                "--frames_upbound", "64",
+                "--mm_newline_position", "grid",
+                "--add_time_instruction", "True",
+                "--force_sample", "True",
+                "--mm_spatial_pool_stride", "2",
+                "--root", "/media/data/haozhe/VFM/onevision/llava_video/EK100",
+                "--action_predictions", "/media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_pred_ids_val.json",
+                "--val_metadata", "/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+                "--llava_num_frames", "64",
+                "--clip_length", "64",
+                "--action_representation", "official_key",
+                "--topk_predictions", "5"
             ],
             "console": "integratedTerminal",
             "justMyCode": false,
@@ -125,6 +84,50 @@
     ]
 }
 
+
+// {
+//     "version": "0.2.0",
+//     "configurations": [
+//         {
+//             "name": "Run LLAVA Training with torchrun",
+//             "type": "debugpy",
+//             "request": "launch",
+//             "python": "/media/data/haozhe/VFM/llmseval-venv/bin/python",
+//             "module": "accelerate.commands.launch",
+//             "env": {
+//                 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
+//                 "OMP_NUM_THREADS": "8",
+//                 "NCCL_IB_DISABLE": "0",
+//                 "NCCL_IB_GID_INDEX": "3",
+//                 "NCCL_SOCKET_IFNAME": "eth0",
+//                 "NCCL_DEBUG": "INFO",
+//                 "ACCELERATE_CPU_AFFINITY": "1",
+//                 "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
+//                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
+//                 "CUDA_LAUNCH_BLOCKING": "1",
+//                 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
+//                 "OPENAI_API_KEY": "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"
+//             },
+//             "args": [
+//                 "--num_processes", "4",
+//                 "-m", "lmms_eval",
+//                 // "--model", "llava_vid",
+//                 "--model", "llava_onevision",
+//                 // "--model_args", "pretrained=experiments/dev_LLaVA-Video-7B-Qwen2_4f_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average",
+//                 "--model_args", "pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen",
+//                 "--tasks", "video_dc499",
+//                 "--batch_size", "1",
+//                 "--log_samples",
+//                 "--log_samples_suffix", "llava_onevision",
+//                 "--output_path", "./logs/"
+//             ],
+//             "console": "integratedTerminal",
+//             "justMyCode": false,
+//             "cwd": "${workspaceFolder}"
+//         }
+//     ]
+// }
+
 // {
 //     // Use IntelliSense to learn about possible attributes.
 //     // Hover to view descriptions of existing attributes.

llava/train/train.py

Lines changed: 14 additions & 12 deletions
@@ -1206,7 +1206,8 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
             if not os.path.exists(video_file):
                 print("File {} not exist!".format(video_file))
 
-            try:
+            # try:
+            if True:
                 if "sharegpt4video" in video_folder:
                     frame_files = [os.path.join(video_file, f) for f in os.listdir(video_file) if os.path.isfile(os.path.join(video_file, f))]
                     frame_files.sort()  # Ensure the frames are sorted if they are named sequentially
@@ -1279,11 +1280,7 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
 
                 processor = self.data_args.image_processor
                 image = processor.preprocess(video, return_tensors="pt")["pixel_values"]
-                if 'EK100' not in video_file and 'EKframes' not in video_folder:
-                    if self.data_args.add_time_instruction:
-                        time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. Please answer the following questions related to this video."
-                        sources[0]["conversations"][0]["value"] = f'{DEFAULT_IMAGE_TOKEN}\n{time_instruciton}\n{sources[0]["conversations"][0]["value"].replace(DEFAULT_IMAGE_TOKEN, "")}'
-                else:
+                if 'EK100' in video_file or 'EKframes' in video_folder:
                     # We use our own prompting logic when it's EK100
                     # We turn a string of list to a python list
                     question_type = sources[0]['question_type']
@@ -1306,16 +1303,21 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
                                             include_frame_time = False)
                     sources[0]["conversations"][0]["value"] = llava_prompt
                     # rank0_print (sources[0])
+
+                if self.data_args.add_time_instruction:
+                    time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. Please answer the following questions related to this video."
+                    sources[0]["conversations"][0]["value"] = f'{DEFAULT_IMAGE_TOKEN}\n{time_instruciton}\n{sources[0]["conversations"][0]["value"].replace(DEFAULT_IMAGE_TOKEN, "")}'
+
                 action = torch.tensor([sources[0]['verb_id'], sources[0]['noun_id'], sources[0]['action_id']] if 'verb_id' in sources[0] else [-1, -1, -1]).long()
                 image = [(image, video[0].size, "video", action)]
                 sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
                 # print(sources)
-            except Exception as e:
-                import traceback
-                traceback.print_exc()
-                print(f"Error: {e}")
-                print(f"Failed to read video file: {video_file}")
-                return self._get_item(i + 1)
+            # except Exception as e:
+            #     import traceback
+            #     traceback.print_exc()
+            #     print(f"Error: {e}")
+            #     print(f"Failed to read video file: {video_file}")
+            #     return self._get_item(i + 1)
         else:
             sources = copy.deepcopy([e["conversations"] for e in sources])
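Note on the train.py hunk, which is the change behind the commit message: before this commit the time instruction was prepended only in the non-EK100 branch, so EK100/EKframes samples went through the custom prompt builder without it. After the change, the EK100-specific prompt is built first and the time instruction (when --add_time_instruction is enabled) is prepended for every video. A minimal sketch of the resulting flow, with simplified names; build_ek100_prompt is a hypothetical stand-in for the repo's own prompting logic, not its real API:

# Sketch only: mirrors the reordered logic in _get_item after this commit.
DEFAULT_IMAGE_TOKEN = "<image>"

def apply_instructions(sources, video_file, video_folder, video_time,
                       num_frames_to_sample, add_time_instruction, build_ek100_prompt):
    conv = sources[0]["conversations"][0]

    # 1) EK100 / EKframes samples get the custom prompt first.
    if "EK100" in video_file or "EKframes" in video_folder:
        conv["value"] = build_ek100_prompt(sources[0])  # hypothetical helper

    # 2) The time instruction now applies to all videos, EK100 included
    #    (previously it lived only in the non-EK100 branch).
    if add_time_instruction:
        time_instruction = (
            f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} "
            "frames are uniformly sampled from it. Please answer the following questions related to this video."
        )
        stripped = conv["value"].replace(DEFAULT_IMAGE_TOKEN, "")
        conv["value"] = f"{DEFAULT_IMAGE_TOKEN}\n{time_instruction}\n{stripped}"

    return sources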

run_clariden.sbatch

Lines changed: 6 additions & 7 deletions
@@ -8,8 +8,8 @@
 #SBATCH --ntasks-per-node 1 # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task
 #SBATCH --gres gpu:4 # Number of GPUs
 #SBATCH --time 23:00:00 # maximum execution time (DD-HH:MM:SS). Mandatory field in MN5
-#SBATCH --output logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.out
-#SBATCH --error logs/R-%x.%j-dev_7b_4f_llavavideo_test_haozhe.err
+#SBATCH --output logs/R-%x.%j-dev_7b_64f_10llavavideo_EK100_haozhe.out
+#SBATCH --error logs/R-%x.%j-dev_7b_64f_10llavavideo_EK100_haozhe.err
 
 mkdir -p logs
 
@@ -71,8 +71,8 @@ PYTHON_ARGS=" \
     --image_grid_pinpoints \"(1x1),...,(6x6)\" \
     --mm_patch_merge_type spatial_unpad \
     --bf16 True \
-    --run_name dev_7b_4f_llavavideo_test_haozhe \
-    --output_dir experiments/dev_7b_4f_llavavideo_test_haozhe \
+    --run_name dev_7b_64f_10llavavideo_EK100_haozhe \
+    --output_dir experiments/dev_7b_64f_10llavavideo_EK100_haozhe \
     --num_train_epochs 1 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 4 \
@@ -103,9 +103,8 @@ PYTHON_ARGS=" \
     --root /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/EK100 \
     --action_predictions /iopsstor/scratch/cscs/hqi/VFM/llava_data/TIM_PREDS/tim_pred_ids_val.json \
     --val_metadata /iopsstor/scratch/cscs/hqi/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
-    --add_time_instruction False \
-    --llava_num_frames 4 \
-    --clip_length 4 \
+    --llava_num_frames 64 \
+    --clip_length 64 \
     --action_representation official_key \
     --topk_predictions 5 \
     "

scripts/train/llava_video.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 datasets:
   - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/0_30_s_academic_v0_1/0_30_s_academic_v0_1_cap_processed.json
-    sampling_strategy: "first:10%"
+    sampling_strategy: "first:1%"
   - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/0_30_s_youtube_v0_1/0_30_s_youtube_v0_1_cap_processed.json
     sampling_strategy: "first:10%"
   - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/30_60_s_academic_v0_1/30_60_s_academic_v0_1_cap_processed.json
@@ -78,4 +78,4 @@ datasets:
   # - json_path: /iopsstor/scratch/cscs/hqi/VFM/onevision/llava_video/LLaVA-Video-178K/llava_hound/sharegptvideo_qa_255k_processed.json
   #   sampling_strategy: "first:10%"
   - json_path: /media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_mc_top5_official_key/train_convs_narration_actionids.jsonl
-    sampling_strategy: all
+    sampling_strategy: "first:1%"
