
Commit ef08304

test with llava 7B
1 parent a369e23 commit ef08304

4 files changed: +278 −157 lines


.vscode/launch.json

Lines changed: 95 additions & 55 deletions
@@ -1,11 +1,96 @@
+// {
+// "version": "0.2.0",
+// "configurations": [
+// {
+// "name": "Run LLAVA Training with torchrun",
+// "type": "debugpy",
+// "request": "launch",
+// "module": "torch.distributed.run",
+// "env": {
+// "CUDA_VISIBLE_DEVICES": "0,1,2,3",
+// "OMP_NUM_THREADS": "8",
+// "NCCL_IB_DISABLE": "0",
+// "NCCL_IB_GID_INDEX": "3",
+// "NCCL_SOCKET_IFNAME": "eth0",
+// "NCCL_DEBUG": "INFO",
+// "ACCELERATE_CPU_AFFINITY": "1",
+// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
+// "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
+// "CUDA_LAUNCH_BLOCKING": "1",
+// "HF_HOME": "/media/data/haozhe/VFM/huggingface",
+// },
+// "args": [
+// "--nproc_per_node=4",
+// "--nnodes=1",
+// "llava/train/train_mem.py",
+// "--deepspeed", "scripts/zero3.json",
+// "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+// "--version", "qwen_1_5",
+// "--data_path", "scripts/train/llava_video.yaml",
+// "--video_folder", "/media/data/haozhe/VFM/onevision/llava_video",
+// "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+// "--mm_vision_tower_lr", "2e-6",
+// "--vision_tower", "google/siglip-so400m-patch14-384",
+// "--mm_projector_type", "mlp2x_gelu",
+// "--mm_vision_select_layer", "-2",
+// "--mm_use_im_start_end", "False",
+// "--mm_use_im_patch_token", "False",
+// "--group_by_modality_length", "True",
+// "--image_aspect_ratio", "anyres_max_9",
+// "--image_grid_pinpoints", "(1x1),...,(6x6)",
+// "--mm_patch_merge_type", "spatial_unpad",
+// "--bf16", "True",
+// "--run_name", "dev_0.5b_4f_llavavideo_haozhe",
+// "--output_dir", "experiments/dev_0.5b_4f_llavavideo_haozhe",
+// "--num_train_epochs", "1",
+// "--per_device_train_batch_size", "8",
+// "--per_device_eval_batch_size", "4",
+// "--gradient_accumulation_steps", "2",
+// "--evaluation_strategy", "epoch",
+// "--eval_steps", "1",
+// "--save_strategy", "steps",
+// "--save_steps", "2000",
+// "--learning_rate", "1e-5",
+// "--weight_decay", "0.",
+// "--warmup_ratio", "0.03",
+// "--lr_scheduler_type", "cosine",
+// "--logging_steps", "1",
+// "--tf32", "True",
+// "--model_max_length", "32768",
+// "--gradient_checkpointing", "True",
+// "--dataloader_num_workers", "4",
+// "--lazy_preprocess", "True",
+// "--report_to", "wandb",
+// "--torch_compile", "True",
+// "--torch_compile_backend", "inductor",
+// "--dataloader_drop_last", "True",
+// "--frames_upbound", "4",
+// "--root", "/media/data/haozhe/VFM/onevision/llava_video/EK100",
+// "--action_predictions", "/media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_pred_ids_val.json",
+// "--val_metadata", "/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+// "--add_time_instruction", "False",
+// "--llava_num_frames", "4",
+// "--clip_length", "4",
+// "--action_representation", "official_key",
+// "--topk_predictions", "5"
+// ],
+// "console": "integratedTerminal",
+// "justMyCode": false,
+// "cwd": "${workspaceFolder}"
+// }
+// ]
+// }
+
+
 {
 "version": "0.2.0",
 "configurations": [
 {
 "name": "Run LLAVA Training with torchrun",
 "type": "debugpy",
 "request": "launch",
-"module": "torch.distributed.run",
+"python": "/media/data/haozhe/VFM/llmseval-venv/bin/python",
+"module": "accelerate.commands.launch",
 "env": {
 "CUDA_VISIBLE_DEVICES": "0,1,2,3",
 "OMP_NUM_THREADS": "8",
@@ -20,59 +105,15 @@
 "HF_HOME": "/media/data/haozhe/VFM/huggingface",
 },
 "args": [
-"--nproc_per_node=4",
-"--nnodes=1",
-"llava/train/train_mem.py",
-"--deepspeed", "scripts/zero3.json",
-"--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-"--version", "qwen_1_5",
-"--data_path", "scripts/train/llava_video.yaml",
-"--video_folder", "/media/data/haozhe/VFM/onevision/llava_video",
-"--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-"--mm_vision_tower_lr", "2e-6",
-"--vision_tower", "google/siglip-so400m-patch14-384",
-"--mm_projector_type", "mlp2x_gelu",
-"--mm_vision_select_layer", "-2",
-"--mm_use_im_start_end", "False",
-"--mm_use_im_patch_token", "False",
-"--group_by_modality_length", "True",
-"--image_aspect_ratio", "anyres_max_9",
-"--image_grid_pinpoints", "(1x1),...,(6x6)",
-"--mm_patch_merge_type", "spatial_unpad",
-"--bf16", "True",
-"--run_name", "dev_0.5b_4f_llavavideo_haozhe",
-"--output_dir", "experiments/dev_0.5b_4f_llavavideo_haozhe",
-"--num_train_epochs", "1",
-"--per_device_train_batch_size", "8",
-"--per_device_eval_batch_size", "4",
-"--gradient_accumulation_steps", "2",
-"--evaluation_strategy", "epoch",
-"--eval_steps", "1",
-"--save_strategy", "steps",
-"--save_steps", "2000",
-"--learning_rate", "1e-5",
-"--weight_decay", "0.",
-"--warmup_ratio", "0.03",
-"--lr_scheduler_type", "cosine",
-"--logging_steps", "1",
-"--tf32", "True",
-"--model_max_length", "32768",
-"--gradient_checkpointing", "True",
-"--dataloader_num_workers", "4",
-"--lazy_preprocess", "True",
-"--report_to", "wandb",
-"--torch_compile", "True",
-"--torch_compile_backend", "inductor",
-"--dataloader_drop_last", "True",
-"--frames_upbound", "4",
-"--root", "/media/data/haozhe/VFM/onevision/llava_video/EK100",
-"--action_predictions", "/media/data/haozhe/VFM/EK100/EK100_in_LLAVA/TIM/tim_pred_ids_val.json",
-"--val_metadata", "/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
-"--add_time_instruction", "False",
-"--llava_num_frames", "4",
-"--clip_length", "4",
-"--action_representation", "official_key",
-"--topk_predictions", "5"
+"--num_processes", "4",
+"-m", "lmms_eval",
+"--model", "llava_vid",
+"--model_args", "pretrained=experiments/dev_LLaVA-Video-7B-Qwen2_4f_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average",
+"--tasks", "videomme",
+"--batch_size", "1",
+"--log_samples",
+"--log_samples_suffix", "llava_vid_retrained",
+"--output_path", "./logs/"
 ],
 "console": "integratedTerminal",
 "justMyCode": false,
@@ -81,7 +122,6 @@
 ]
 }

-
 // {
 // // Use IntelliSense to learn about possible attributes.
 // // Hover to view descriptions of existing attributes.
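For reference, the updated debug configuration no longer launches training via torch.distributed.run; it now drives lmms_eval through accelerate. Below is a minimal shell sketch of the roughly equivalent command line, with the interpreter path, checkpoint name, and flags taken directly from the diff above (not an official recipe; adjust paths and GPU indices to your environment):

# Rough CLI equivalent of the new launch.json entry (values copied from the diff)
source /media/data/haozhe/VFM/llmseval-venv/bin/activate
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch --num_processes 4 \
    -m lmms_eval \
    --model llava_vid \
    --model_args pretrained=experiments/dev_LLaVA-Video-7B-Qwen2_4f_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average \
    --tasks videomme \
    --batch_size 1 \
    --log_samples \
    --log_samples_suffix llava_vid_retrained \
    --output_path ./logs/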

lmms_eval.sh

Lines changed: 36 additions & 10 deletions
@@ -6,19 +6,45 @@ export NCCL_SOCKET_IFNAME="eth0"
 export NCCL_DEBUG="INFO"
 export ACCELERATE_CPU_AFFINITY="1"
 export WANDB_API_KEY="4474ec79de023b0c3ffb43588ab6163264f875db"
-# export HF_HOME=/media/data/haozhe/VFM/huggingface
-export HF_HOME=/mnt/SV_storage/VFM/huggingface
-# export PYTHONPATH=/media/data/haozhe/VFM/LLaVA-NeXT:$PYTHONPATH
-export PYTHONPATH=/mnt/SV_storage/VFM/LLaVA-NeXT:$PYTHONPATH
+export HF_HOME=/media/data/haozhe/VFM/huggingface
+# export HF_HOME=/mnt/SV_storage/VFM/huggingface
+export PYTHONPATH=/media/data/haozhe/VFM/LLaVA-NeXT:$PYTHONPATH
+# export PYTHONPATH=/mnt/SV_storage/VFM/LLaVA-NeXT:$PYTHONPATH
 export OPENAI_API_KEY=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

+# source /media/data/haozhe/VFM/llmseval-venv/bin/activate && \
+# accelerate launch --num_processes=4 \
+# -m lmms_eval \
+# --model llava_onevision \
+# --model_args pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen \
+# --tasks videomme \
+# --batch_size 1 \
+# --log_samples \
+# --log_samples_suffix llava_onevision \
+# --output_path ./logs/ \
+# --verbosity=DEBUG > ./logs/llava_onevision_clustertest.log 2>&1
+
+
+# source /media/data/haozhe/VFM/llmseval-venv/bin/activate && \
+# accelerate launch --num_processes=4 \
+# -m lmms_eval \
+# --model llava_vid \
+# --model_args pretrained=experiments/dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent/checkpoint-15000,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average,model_name=llava_qwen \
+# --tasks videomme \
+# --batch_size 1 \
+# --log_samples \
+# --log_samples_suffix dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent_checkpoint_15000 \
+# --output_path ./logs/ \
+# --verbosity=DEBUG > ./logs/dev_7b_16f_top5_strong_first_layer_three_tokens_detection_and_direct_llava_video_10percent_checkpoint_15000.log 2>&1
+
+
+source /media/data/haozhe/VFM/llmseval-venv/bin/activate && \
 accelerate launch --num_processes=4 \
 -m lmms_eval \
---model llava_onevision \
---model_args pretrained=lmms-lab/llava-onevision-qwen2-0.5b-ov,conv_template=qwen_1_5,model_name=llava_qwen \
---tasks activitynetqa \
+--model llava_vid \
+--model_args pretrained=experiments/dev_7b_4f_llavavideo_test_haozhe,conv_template=qwen_1_5,max_frames_num=64,mm_spatial_pool_mode=average \
+--tasks videomme \
 --batch_size 1 \
 --log_samples \
---log_samples_suffix llava_onevision \
---output_path ./logs/ \
---verbosity=DEBUG > ./logs/llava_onevision_activitynetqa_1.log 2>&1
+--log_samples_suffix llava_vid_retrained \
+--output_path ./logs/ > ./logs/llava_video7B_retrained_eval.log 2>&1
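The active command at the end of the script redirects all output to ./logs/llava_video7B_retrained_eval.log. A minimal sketch of how one might run the script and follow that log (script and log names come from the diff; the existence of a ./logs directory is an assumption):

# Run the evaluation in the background and tail the log written by lmms_eval.sh
mkdir -p ./logs
nohup bash lmms_eval.sh > /dev/null 2>&1 &
tail -f ./logs/llava_video7B_retrained_eval.log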
