
Commit 0b99bef

Author: Haozhe Qi (committed)
Commit message: conflicts resolved
2 parents f1235c1 + eee2422; commit 0b99bef

29 files changed (+1703, -269 lines)

.vscode/launch.json

Lines changed: 103 additions & 65 deletions
@@ -1,13 +1,101 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Run LLAVA Training with torchrun",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "torch.distributed.run",
+            "env": {
+                "CUDA_VISIBLE_DEVICES": "0,1,2,3",
+                "OMP_NUM_THREADS": "8",
+                "NCCL_IB_DISABLE": "0",
+                "NCCL_IB_GID_INDEX": "3",
+                "NCCL_SOCKET_IFNAME": "eth0",
+                "NCCL_DEBUG": "INFO",
+                "ACCELERATE_CPU_AFFINITY": "1",
+                "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
+                "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
+                "CUDA_LAUNCH_BLOCKING": "1",
+                "HF_HOME": "/media/data/haozhe/VFM/huggingface",
+            },
+            "args": [
+                "--nproc_per_node=4",
+                "--nnodes=1",
+                "llava/train/train_mem.py",
+                "--deepspeed", "scripts/zero3.json",
+                "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+                "--version", "qwen_1_5",
+                "--data_path", "scripts/train/llava_video_RCP.yaml",
+                "--video_folder", "/media/data/haozhe/VFM/onevision/llava_video",
+                "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+                "--mm_vision_tower_lr", "2e-6",
+                "--vision_tower", "google/siglip-so400m-patch14-384",
+                "--mm_projector_type", "mlp2x_gelu",
+                "--mm_vision_select_layer", "-2",
+                "--mm_use_im_start_end", "False",
+                "--mm_use_im_patch_token", "False",
+                "--group_by_modality_length", "True",
+                "--image_aspect_ratio", "anyres_max_9",
+                "--image_grid_pinpoints", "(1x1),...,(6x6)",
+                "--mm_patch_merge_type", "spatial_unpad",
+                "--bf16", "True",
+                "--run_name", "dev_0.5b_llavavideo_haozhe",
+                "--output_dir", "experiments/dev_0.5b_llavavideo_haozhe",
+                "--num_train_epochs", "1",
+                "--per_device_train_batch_size", "1",
+                "--per_device_eval_batch_size", "4",
+                "--gradient_accumulation_steps", "2",
+                "--evaluation_strategy", "epoch",
+                "--eval_steps", "1",
+                "--save_strategy", "steps",
+                "--save_steps", "2000",
+                "--learning_rate", "1e-5",
+                "--weight_decay", "0.",
+                "--warmup_ratio", "0.03",
+                "--lr_scheduler_type", "cosine",
+                "--logging_steps", "1",
+                "--tf32", "True",
+                "--model_max_length", "32768",
+                "--gradient_checkpointing", "True",
+                "--dataloader_num_workers", "4",
+                "--lazy_preprocess", "True",
+                "--report_to", "wandb",
+                "--torch_compile", "True",
+                "--torch_compile_backend", "inductor",
+                "--dataloader_drop_last", "True",
+                "--frames_upbound", "64",
+                "--mm_newline_position", "grid",
+                "--add_time_instruction", "True",
+                "--force_sample", "True",
+                "--mm_spatial_pool_stride", "2",
+                "--root", "/media/data/haozhe/VFM/onevision/llava_video/EK100",
+                "--action_predictions", "/media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_pred_ids_val.json",
+                "--val_metadata", "/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+                "--llava_num_frames", "64",
+                "--clip_length", "64",
+                "--action_representation", "official_key",
+                "--topk_predictions", "5"
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "cwd": "${workspaceFolder}"
+        }
+    ]
+}
+
+
 // {
 //     "version": "0.2.0",
 //     "configurations": [
 //         {
 //             "name": "Run LLAVA Training with torchrun",
 //             "type": "debugpy",
 //             "request": "launch",
-//             "module": "torch.distributed.run",
+//             "python": "/media/data/haozhe/VFM/llmseval-venv/bin/python",
+//             "module": "accelerate.commands.launch",
 //             "env": {
-//                 "CUDA_VISIBLE_DEVICES": "0,2,3",
+//                 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
 //                 "OMP_NUM_THREADS": "8",
 //                 "NCCL_IB_DISABLE": "0",
 //                 "NCCL_IB_GID_INDEX": "3",
@@ -17,70 +105,21 @@
 //                 "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
 //                 "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
 //                 "CUDA_LAUNCH_BLOCKING": "1",
+//                 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
+//                 "OPENAI_API_KEY": "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"
 //             },
 //             "args": [
-//                 "--nproc_per_node=3",
-//                 "--nnodes=1",
-//                 "--node_rank=0",
-//                 "--master_addr=127.0.0.1",
-//                 "--master_port=29500",
-//                 "llava/train/train_mem.py",
-//                 "--deepspeed", "scripts/zero3.json",
-//                 "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-//                 "--version", "qwen_1_5",
-//                 "--data_path", "scripts/train/onevision.yaml",
-//                 // "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
-//                 "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
-//                 "--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
-//                 // "--video_folder", "/home/haozhe/kitchen/AVION/datasets",
-//                 "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-//                 "--mm_vision_tower_lr", "2e-6",
-//                 "--vision_tower", "google/siglip-so400m-patch14-384",
-//                 "--mm_projector_type", "mlp2x_gelu",
-//                 "--mm_vision_select_layer", "-2",
-//                 "--mm_use_im_start_end", "False",
-//                 "--mm_use_im_patch_token", "False",
-//                 "--group_by_modality_length", "True",
-//                 "--image_aspect_ratio", "anyres_max_9",
-//                 "--image_grid_pinpoints", "(1x1),...,(6x6)",
-//                 "--mm_patch_merge_type", "spatial_unpad",
-//                 "--bf16", "True",
-//                 "--run_name", "test1",
-//                 "--output_dir", "experiments/test1",
-//                 "--num_train_epochs", "1",
-//                 "--per_device_train_batch_size", "1",
-//                 "--per_device_eval_batch_size", "4",
-//                 "--gradient_accumulation_steps", "2",
-//                 "--evaluation_strategy", "steps",
-//                 "--eval_steps", "100",
-//                 "--save_strategy", "steps",
-//                 "--save_steps", "2000",
-//                 // "--save_total_limit", "1",
-//                 "--learning_rate", "1e-5",
-//                 "--weight_decay", "0.",
-//                 "--warmup_ratio", "0.03",
-//                 "--lr_scheduler_type", "cosine",
-//                 "--logging_steps", "1",
-//                 "--tf32", "True",
-//                 "--model_max_length", "32768",
-//                 "--gradient_checkpointing", "True",
-//                 "--dataloader_num_workers", "4",
-//                 "--lazy_preprocess", "True",
-//                 "--report_to", "wandb",
-//                 "--torch_compile", "True",
-//                 "--torch_compile_backend", "inductor",
-//                 "--dataloader_drop_last", "True",
-//                 "--frames_upbound", "16",
-//                 "--root", "/mediaPFM/data/haozhe/onevision/llava_video/EK100",
-//                 "--action_predictions", "/mediaPFM/data/haozhe/EK100/EK100_in_LLAVA/avion_pred_ids_val.json",
-//                 "--val_metadata", "/mediaPFM/data/haozhe/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
-//                 "--llava_num_frames", "16",
-//                 "--clip_length", "16",
-//                 "--action_representation", "GT_random_narration",
-//                 "--topk_predictions", "5",
-//                 "--dataset", "ek100_cls",
-//                 "--vision_supervision", "three_tokens",
-//                 "--action_types", "97,300,3806"
+//                 "--num_processes", "4",
+//                 "-m", "lmms_eval",
+//                 // "--model", "llava_vid",
+//                 "--model", "llava_onevision",
+//                 // "--model_args", "pretrained=experiments/dev_LLaVA-Video-7B-Qwen2_4f_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average",
+//                 "--model_args", "pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen",
+//                 "--tasks", "video_dc499",
+//                 "--batch_size", "1",
+//                 "--log_samples",
+//                 "--log_samples_suffix", "llava_onevision",
+//                 "--output_path", "./logs/"
 //             ],
 //             "console": "integratedTerminal",
 //             "justMyCode": false,
@@ -89,7 +128,6 @@
 //             ]
 //         }
 // 

-
 // {
 // // Use IntelliSense to learn about possible attributes.
 // // Hover to view descriptions of existing attributes.
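
For quick reference outside the debugger, the new launch entry above maps onto a plain torchrun invocation. Below is a minimal sketch of that mapping, assuming the same 4-GPU node, environment variables, and paths as in the config; only a representative subset of the training flags is repeated, and the subprocess wrapper itself is illustrative rather than part of this commit.

# Illustrative only: rebuild the debug configuration above as a torchrun call.
# GPU list, env vars, and flags are copied from launch.json; everything else
# (the subprocess wrapper) is an assumption for demonstration.
import os
import subprocess

env = dict(os.environ)
env.update({
    "CUDA_VISIBLE_DEVICES": "0,1,2,3",
    "OMP_NUM_THREADS": "8",
    "HF_HOME": "/media/data/haozhe/VFM/huggingface",
})

cmd = [
    "torchrun", "--nproc_per_node=4", "--nnodes=1",
    "llava/train/train_mem.py",
    "--deepspeed", "scripts/zero3.json",
    "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
    "--data_path", "scripts/train/llava_video_RCP.yaml",
    "--output_dir", "experiments/dev_0.5b_llavavideo_haozhe",
    # ...remaining flags exactly as listed in the launch.json "args" above
]
subprocess.run(cmd, env=env, check=True)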

docs/LLaVA_OneVision_Tutorials.ipynb

Lines changed: 9 additions & 2 deletions
@@ -60,7 +60,11 @@
 "model_name = \"llava_qwen\"\n",
 "device = \"cuda\"\n",
 "device_map = \"auto\"\n",
-"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map) # Add any other thing you want to pass in llava_model_args\n",
+"llava_model_args = {\n",
+" \"multimodal\": True,\n",
+" \"attn_implementation\": \"sdpa\",\n",
+"}\n",
+"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, **llava_model_args) # Add any other thing you want to pass in llava_model_args\n",
 "\n",
 "model.eval()\n",
 "\n",
@@ -322,7 +326,10 @@
 "model_name = \"llava_qwen\"\n",
 "device = \"cuda\"\n",
 "device_map = \"auto\"\n",
-"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, attn_implementation=\"sdpa\")\n",
+"llava_model_args = {\n",
+" \"multimodal\": True,\n",
+"}\n",
+"tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map, attn_implementation=\"sdpa\", **llava_model_args)\n",
 "\n",
 "model.eval()\n",
 "\n",

docs/LLaVA_Video_1003.md

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ print(text_outputs)
 
 ## Training
 
-[[Scripts]](/Users/zhangyuanhan/Desktop/LLaVA-NeXT/scripts/video/train): Start training models on your single-image/multi-image/video data.
+[[Scripts]](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/yhzhang/video_dev/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9_aug6.sh): Start training models on your single-image/multi-image/video data.
 
 
 ## Evaluation Guidance

docs/download_data.py

Lines changed: 32 additions & 0 deletions
@@ -1,9 +1,41 @@
 import os
+os.environ["HF_HOME"] = "/mnt/SV_storage/VFM/huggingface"
 from datasets import load_dataset
+from datasets import get_dataset_config_names, get_dataset_split_names
 from tqdm import tqdm
 import json
 import yaml
 
+dataset_name = "lmms-lab/LLaVA-Video-178K"
+
+save_root = "/mnt/SV_storage/VFM/onevision/llava_video_178k"
+
+subsets = get_dataset_config_names(dataset_name)
+for subset in subsets:
+    # download the dataset
+    data = load_dataset(dataset_name, subset)
+    for da in tqdm(data):
+        json_data = {}
+        json_data["id"] = da["id"]
+        json_data["video"] = da["video"]
+        json_data["conversations"] = da["conversations"]
+        with open(os.path.join(save_root, '{}.json'.format(da["id"])), "w") as f:
+            json.dump(json_data, f, indent=4, ensure_ascii=False)
+    aa = 1
+
+# splits = get_dataset_split_names(dataset_name, subset)
+
+
+# aa = 1
+
+
+# data = load_dataset("lmms-lab/LLaVA-Video-178K", '0_30_s_academic_v0_1', split="caption")
+
+# for da in tqdm(data):
+#     json_data = {}
+#     json_data["id"] = da["id"]
+#     aa = 2
+
 avaliable_datasets = ['CLEVR-Math(MathV360K)', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)',
                       'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'PMC-VQA(MathV360K)',
                       'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)',
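
A note on the new download loop above: load_dataset(dataset_name, subset) returns a DatasetDict, so iterating it directly yields split names (strings) rather than samples. Below is a hedged sketch of the same export done per split; the split handling is my assumption and not part of the commit, while the dataset name, save_root, and field names mirror the committed script.

# Sketch only: iterate each split explicitly before looping over samples.
import json
import os

from datasets import get_dataset_config_names, load_dataset
from tqdm import tqdm

dataset_name = "lmms-lab/LLaVA-Video-178K"
save_root = "/mnt/SV_storage/VFM/onevision/llava_video_178k"
os.makedirs(save_root, exist_ok=True)

for subset in get_dataset_config_names(dataset_name):
    data = load_dataset(dataset_name, subset)  # DatasetDict: {split_name: Dataset}
    for split_name, split in data.items():
        for da in tqdm(split, desc=f"{subset}/{split_name}"):
            record = {
                "id": da["id"],
                "video": da["video"],
                "conversations": da["conversations"],
            }
            with open(os.path.join(save_root, "{}.json".format(da["id"])), "w") as f:
                json.dump(record, f, indent=4, ensure_ascii=False)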
