
Commit 32df8ce

add test
1 parent 381dd49 commit 32df8ce

6 files changed (+175 additions, -100 deletions)

.vscode/launch.json

Lines changed: 91 additions & 91 deletions
@@ -1,101 +1,101 @@
-{
-"version": "0.2.0",
-"configurations": [
-{
-"name": "Run LLAVA Training with torchrun",
-"type": "debugpy",
-"request": "launch",
-"module": "torch.distributed.run",
-"env": {
-"CUDA_VISIBLE_DEVICES": "1,2",
-"OMP_NUM_THREADS": "8",
-"NCCL_IB_DISABLE": "0",
-"NCCL_IB_GID_INDEX": "3",
-"NCCL_SOCKET_IFNAME": "eth0",
-"NCCL_DEBUG": "INFO",
-"ACCELERATE_CPU_AFFINITY": "1",
-"LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
-},
-"args": [
-"--nproc_per_node=2",
-"--nnodes=1",
-"--node_rank=0",
-"--master_addr=127.0.0.1",
-"--master_port=29500",
-"llava/train/train_mem.py",
-"--deepspeed", "scripts/zero3.json",
-"--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-"--version", "qwen_1_5",
-"--data_path", "scripts/train/onevision.yaml",
-// "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
-"--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
-"--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
-"--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-"--mm_vision_tower_lr", "2e-6",
-"--vision_tower", "google/siglip-so400m-patch14-384",
-"--mm_projector_type", "mlp2x_gelu",
-"--mm_vision_select_layer", "-2",
-"--mm_use_im_start_end", "False",
-"--mm_use_im_patch_token", "False",
-"--group_by_modality_length", "True",
-"--image_aspect_ratio", "anyres_max_9",
-"--image_grid_pinpoints", "(1x1),...,(6x6)",
-"--mm_patch_merge_type", "spatial_unpad",
-"--bf16", "True",
-"--run_name", "test",
-"--output_dir", "experiments/test",
-"--num_train_epochs", "1",
-"--per_device_train_batch_size", "1",
-"--per_device_eval_batch_size", "4",
-"--gradient_accumulation_steps", "2",
-"--evaluation_strategy", "no",
-"--save_strategy", "steps",
-"--save_steps", "1000",
-"--save_total_limit", "1",
-"--learning_rate", "1e-5",
-"--weight_decay", "0.",
-"--warmup_ratio", "0.03",
-"--lr_scheduler_type", "cosine",
-"--logging_steps", "1",
-"--tf32", "True",
-"--model_max_length", "32768",
-"--gradient_checkpointing", "True",
-"--dataloader_num_workers", "4",
-"--lazy_preprocess", "True",
-"--report_to", "wandb",
-"--torch_compile", "True",
-"--torch_compile_backend", "inductor",
-"--dataloader_drop_last", "True",
-"--frames_upbound", "32",
-],
-"console": "integratedTerminal",
-"justMyCode": false,
-"cwd": "${workspaceFolder}"
-}
-]
-}
-
-
// {
-// // Use IntelliSense to learn about possible attributes.
-// // Hover to view descriptions of existing attributes.
-// // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
// "version": "0.2.0",
// "configurations": [
// {
-// "name": "Python: Current File",
+// "name": "Run LLAVA Training with torchrun",
// "type": "debugpy",
// "request": "launch",
-// "program": "docs/LLaVA_OneVision_Tutorials.py",
+// "module": "torch.distributed.run",
+// "env": {
+// "CUDA_VISIBLE_DEVICES": "1,2,3",
+// "OMP_NUM_THREADS": "8",
+// "NCCL_IB_DISABLE": "0",
+// "NCCL_IB_GID_INDEX": "3",
+// "NCCL_SOCKET_IFNAME": "eth0",
+// "NCCL_DEBUG": "INFO",
+// "ACCELERATE_CPU_AFFINITY": "1",
+// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
+// "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
+// },
+// "args": [
+// "--nproc_per_node=3",
+// "--nnodes=1",
+// "--node_rank=0",
+// "--master_addr=127.0.0.1",
+// "--master_port=29500",
+// "llava/train/train_mem.py",
+// "--deepspeed", "scripts/zero3.json",
+// "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+// "--version", "qwen_1_5",
+// "--data_path", "scripts/train/onevision.yaml",
+// // "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
+// "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
+// "--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
+// "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+// "--mm_vision_tower_lr", "2e-6",
+// "--vision_tower", "google/siglip-so400m-patch14-384",
+// "--mm_projector_type", "mlp2x_gelu",
+// "--mm_vision_select_layer", "-2",
+// "--mm_use_im_start_end", "False",
+// "--mm_use_im_patch_token", "False",
+// "--group_by_modality_length", "True",
+// "--image_aspect_ratio", "anyres_max_9",
+// "--image_grid_pinpoints", "(1x1),...,(6x6)",
+// "--mm_patch_merge_type", "spatial_unpad",
+// "--bf16", "True",
+// "--run_name", "test",
+// "--output_dir", "experiments/test",
+// "--num_train_epochs", "1",
+// "--per_device_train_batch_size", "1",
+// "--per_device_eval_batch_size", "4",
+// "--gradient_accumulation_steps", "2",
+// "--evaluation_strategy", "no",
+// "--save_strategy", "steps",
+// "--save_steps", "1000",
+// "--save_total_limit", "1",
+// "--learning_rate", "1e-5",
+// "--weight_decay", "0.",
+// "--warmup_ratio", "0.03",
+// "--lr_scheduler_type", "cosine",
+// "--logging_steps", "1",
+// "--tf32", "True",
+// "--model_max_length", "32768",
+// "--gradient_checkpointing", "True",
+// "--dataloader_num_workers", "4",
+// "--lazy_preprocess", "True",
+// "--report_to", "wandb",
+// "--torch_compile", "True",
+// "--torch_compile_backend", "inductor",
+// "--dataloader_drop_last", "True",
+// "--frames_upbound", "32",
+// ],
// "console": "integratedTerminal",
-// "env":{"CUDA_VISIBLE_DEVICES":"0",
-// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
-// "LD_LIBRARY_PATH": "/home/haozhe/miniconda3/envs/llava/lib"},
// "justMyCode": false,
-// // "args": [
-// // "--run_dir_name", "test",
-// // // "--use_big_decoder"
-// // ]
+// "cwd": "${workspaceFolder}"
// }
// ]
-// }
+// }
+
+
+{
+// Use IntelliSense to learn about possible attributes.
+// Hover to view descriptions of existing attributes.
+// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+"version": "0.2.0",
+"configurations": [
+{
+"name": "Python: Current File",
+"type": "debugpy",
+"request": "launch",
+"program": "docs/LLaVA_OneVision_Tutorials.py",
+"console": "integratedTerminal",
+"env":{"CUDA_VISIBLE_DEVICES":"0",
+"LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"},
+"justMyCode": false,
+// "args": [
+// "--run_dir_name", "test",
+// // "--use_big_decoder"
+// ]
+}
+]
+}

docs/LLaVA_OneVision_Tutorials.py

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+import os
+import sys
+sys.path[0] = os.path.dirname(sys.path[0])
+
# from llava.model.builder import load_pretrained_model
# from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
# from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
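The three lines prepended here replace sys.path[0] (which is docs/ when the tutorial script is run directly) with its parent directory, so packages at the repository root such as llava become importable. A minimal sketch of the same idea, shown for clarity only and not part of the commit:

# Sketch (not part of the commit): why the tutorial rewrites sys.path[0].
# When docs/LLaVA_OneVision_Tutorials.py is run directly, sys.path[0] is the
# docs/ directory, so top-level packages such as llava are not importable.
# Swapping in the parent directory points imports at the repository root.
import os
import sys

sys.path[0] = os.path.dirname(sys.path[0])  # docs/ -> repository root
import llava  # assumed to live at the repository root, as in LLaVA-NeXT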

docs/download_data.py

Lines changed: 5 additions & 4 deletions
@@ -28,11 +28,12 @@
'vistext(cauldron)', 'visual7w(cauldron,llava_format)', 'visualmrc(cauldron)',
'vqarad(cauldron,llava_format)', 'vsr(cauldron,llava_format)', 'websight(cauldron)']

-chossen_datasets = ['sharegpt4v(sam)', 'sharegpt4v(llava)']
+# chossen_datasets = ['sharegpt4v(sam)', 'sharegpt4v(llava)']
+chossen_datasets = ['geo3k']

-image_base = "/mediaPFM/data/haozhe/onevision/llava_data"
-json_base = "/mediaPFM/data/haozhe/onevision/llava_instruct"
-dataset_yaml = 'scripts/train/onevision.yaml'
+image_base = "/mnt/SV_storage/VFM/onevision/llava_data"
+json_base = "/mnt/SV_storage/VFM/onevision/llava_instruct"
+# dataset_yaml = 'scripts/train/onevision.yaml'

# # open the yaml file
# with open(dataset_yaml, 'r') as f:

run.sh

Lines changed: 63 additions & 1 deletion
@@ -1 +1,63 @@
-python docs/download_data.py
+#!/bin/bash
+
+# Export environment variables
+export CUDA_VISIBLE_DEVICES="0,1"
+export OMP_NUM_THREADS="8"
+export NCCL_IB_DISABLE="0"
+export NCCL_IB_GID_INDEX="3"
+export NCCL_SOCKET_IFNAME="eth0"
+export NCCL_DEBUG="INFO"
+export ACCELERATE_CPU_AFFINITY="1"
+# export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"
+export WANDB_API_KEY="65aeda82a75f1eed29c8e9250b175fcc73dca0d7"
+
+# Run the command using torchrun
+torchrun --nproc_per_node=2 \
+--nnodes=1 \
+--node_rank=0 \
+--master_addr=127.0.0.1 \
+--master_port=29500 \
+llava/train/train_mem.py \
+--deepspeed scripts/zero3.json \
+--model_name_or_path lmms-lab/llava-onevision-qwen2-7b-ov \
+--version qwen_1_5 \
+--data_path scripts/train/onevision.yaml \
+--image_folder /media/data/haozhe/VFM/onevision/llava_data/geo3k/ \
+--video_folder /media/data/haozhe/VFM/onevision/llava_video \
+--mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
+--mm_vision_tower_lr 2e-6 \
+--vision_tower google/siglip-so400m-patch14-384 \
+--mm_projector_type mlp2x_gelu \
+--mm_vision_select_layer -2 \
+--mm_use_im_start_end False \
+--mm_use_im_patch_token False \
+--group_by_modality_length True \
+--image_aspect_ratio anyres_max_9 \
+--image_grid_pinpoints "(1x1),...,(6x6)" \
+--mm_patch_merge_type spatial_unpad \
+--bf16 True \
+--run_name test \
+--output_dir experiments/test \
+--num_train_epochs 1 \
+--per_device_train_batch_size 1 \
+--per_device_eval_batch_size 4 \
+--gradient_accumulation_steps 2 \
+--evaluation_strategy no \
+--save_strategy steps \
+--save_steps 1000 \
+--save_total_limit 1 \
+--learning_rate 1e-5 \
+--weight_decay 0. \
+--warmup_ratio 0.03 \
+--lr_scheduler_type cosine \
+--logging_steps 1 \
+--tf32 True \
+--model_max_length 32768 \
+--gradient_checkpointing True \
+--dataloader_num_workers 4 \
+--lazy_preprocess True \
+--report_to wandb \
+--torch_compile True \
+--torch_compile_backend inductor \
+--dataloader_drop_last True \
+--frames_upbound 32 > test7b.out 2>&1

run_demo.sh

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Export environment variables
+export CUDA_VISIBLE_DEVICES="0"
+# export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"
+
+# Run the Python script
+python docs/LLaVA_OneVision_Tutorials.py > demo7b.out 2>&1

scripts/train/onevision.yaml

Lines changed: 4 additions & 4 deletions
@@ -67,8 +67,8 @@ datasets:
# sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
# sampling_strategy: "all"
-# - json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json
-# sampling_strategy: "all"
+- json_path: /media/data/haozhe/VFM/onevision/llava_instruct/geo3k.json
+sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_align_converted_60252.json

@@ -181,5 +181,5 @@
# sampling_strategy: all
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/0718_0_30_s_academic_mc_v0_1_all.json # will be released in next version of LLaVA-NeXT-Video
# sampling_strategy: all
-- json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/sharegpt4video.json # download from sharegpt4video
-sampling_strategy: all
+# - json_path: /media/data/haozhe/VFM/onevision/llava_instruct/sharegpt4video.json # download from sharegpt4video
+# sampling_strategy: all
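Each entry in this YAML pairs a json_path with a sampling_strategy ("all" for the geo3k entry enabled above; other, commented entries use forms like "first:10%"). A hypothetical sketch of reading such a file with PyYAML, under the assumption that "first:N%" keeps the leading N percent of samples; the repository's actual loader may differ:

# Hypothetical reader for scripts/train/onevision.yaml (assumes PyYAML);
# the real LLaVA-NeXT data loader may parse these fields differently.
import json
import yaml

with open("scripts/train/onevision.yaml") as f:
    datasets = yaml.safe_load(f)["datasets"]

for entry in datasets:
    with open(entry["json_path"]) as f:
        samples = json.load(f)
    strategy = str(entry.get("sampling_strategy", "all"))
    if strategy.startswith("first:"):
        # e.g. "first:10%" -> keep the leading 10% of samples (assumed semantics)
        pct = float(strategy.split(":", 1)[1].rstrip("%"))
        samples = samples[: int(len(samples) * pct / 100)]
    print(entry["json_path"], len(samples), "samples")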
