
Commit 32df8ce

add test
1 parent 381dd49 commit 32df8ce

6 files changed (+175 additions, -100 deletions)

.vscode/launch.json

Lines changed: 91 additions & 91 deletions
@@ -1,101 +1,101 @@
-{
-"version": "0.2.0",
-"configurations": [
-{
-"name": "Run LLAVA Training with torchrun",
-"type": "debugpy",
-"request": "launch",
-"module": "torch.distributed.run",
-"env": {
-"CUDA_VISIBLE_DEVICES": "1,2",
-"OMP_NUM_THREADS": "8",
-"NCCL_IB_DISABLE": "0",
-"NCCL_IB_GID_INDEX": "3",
-"NCCL_SOCKET_IFNAME": "eth0",
-"NCCL_DEBUG": "INFO",
-"ACCELERATE_CPU_AFFINITY": "1",
-"LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
-},
-"args": [
-"--nproc_per_node=2",
-"--nnodes=1",
-"--node_rank=0",
-"--master_addr=127.0.0.1",
-"--master_port=29500",
-"llava/train/train_mem.py",
-"--deepspeed", "scripts/zero3.json",
-"--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-"--version", "qwen_1_5",
-"--data_path", "scripts/train/onevision.yaml",
-// "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
-"--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
-"--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
-"--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-"--mm_vision_tower_lr", "2e-6",
-"--vision_tower", "google/siglip-so400m-patch14-384",
-"--mm_projector_type", "mlp2x_gelu",
-"--mm_vision_select_layer", "-2",
-"--mm_use_im_start_end", "False",
-"--mm_use_im_patch_token", "False",
-"--group_by_modality_length", "True",
-"--image_aspect_ratio", "anyres_max_9",
-"--image_grid_pinpoints", "(1x1),...,(6x6)",
-"--mm_patch_merge_type", "spatial_unpad",
-"--bf16", "True",
-"--run_name", "test",
-"--output_dir", "experiments/test",
-"--num_train_epochs", "1",
-"--per_device_train_batch_size", "1",
-"--per_device_eval_batch_size", "4",
-"--gradient_accumulation_steps", "2",
-"--evaluation_strategy", "no",
-"--save_strategy", "steps",
-"--save_steps", "1000",
-"--save_total_limit", "1",
-"--learning_rate", "1e-5",
-"--weight_decay", "0.",
-"--warmup_ratio", "0.03",
-"--lr_scheduler_type", "cosine",
-"--logging_steps", "1",
-"--tf32", "True",
-"--model_max_length", "32768",
-"--gradient_checkpointing", "True",
-"--dataloader_num_workers", "4",
-"--lazy_preprocess", "True",
-"--report_to", "wandb",
-"--torch_compile", "True",
-"--torch_compile_backend", "inductor",
-"--dataloader_drop_last", "True",
-"--frames_upbound", "32",
-],
-"console": "integratedTerminal",
-"justMyCode": false,
-"cwd": "${workspaceFolder}"
-}
-]
-}
-
-
// {
-// // Use IntelliSense to learn about possible attributes.
-// // Hover to view descriptions of existing attributes.
-// // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
// "version": "0.2.0",
// "configurations": [
// {
-// "name": "Python: Current File",
+// "name": "Run LLAVA Training with torchrun",
// "type": "debugpy",
// "request": "launch",
-// "program": "docs/LLaVA_OneVision_Tutorials.py",
+// "module": "torch.distributed.run",
+// "env": {
+// "CUDA_VISIBLE_DEVICES": "1,2,3",
+// "OMP_NUM_THREADS": "8",
+// "NCCL_IB_DISABLE": "0",
+// "NCCL_IB_GID_INDEX": "3",
+// "NCCL_SOCKET_IFNAME": "eth0",
+// "NCCL_DEBUG": "INFO",
+// "ACCELERATE_CPU_AFFINITY": "1",
+// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
+// "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
+// },
+// "args": [
+// "--nproc_per_node=3",
+// "--nnodes=1",
+// "--node_rank=0",
+// "--master_addr=127.0.0.1",
+// "--master_port=29500",
+// "llava/train/train_mem.py",
+// "--deepspeed", "scripts/zero3.json",
+// "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+// "--version", "qwen_1_5",
+// "--data_path", "scripts/train/onevision.yaml",
+// // "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
+// "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
+// "--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
+// "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+// "--mm_vision_tower_lr", "2e-6",
+// "--vision_tower", "google/siglip-so400m-patch14-384",
+// "--mm_projector_type", "mlp2x_gelu",
+// "--mm_vision_select_layer", "-2",
+// "--mm_use_im_start_end", "False",
+// "--mm_use_im_patch_token", "False",
+// "--group_by_modality_length", "True",
+// "--image_aspect_ratio", "anyres_max_9",
+// "--image_grid_pinpoints", "(1x1),...,(6x6)",
+// "--mm_patch_merge_type", "spatial_unpad",
+// "--bf16", "True",
+// "--run_name", "test",
+// "--output_dir", "experiments/test",
+// "--num_train_epochs", "1",
+// "--per_device_train_batch_size", "1",
+// "--per_device_eval_batch_size", "4",
+// "--gradient_accumulation_steps", "2",
+// "--evaluation_strategy", "no",
+// "--save_strategy", "steps",
+// "--save_steps", "1000",
+// "--save_total_limit", "1",
+// "--learning_rate", "1e-5",
+// "--weight_decay", "0.",
+// "--warmup_ratio", "0.03",
+// "--lr_scheduler_type", "cosine",
+// "--logging_steps", "1",
+// "--tf32", "True",
+// "--model_max_length", "32768",
+// "--gradient_checkpointing", "True",
+// "--dataloader_num_workers", "4",
+// "--lazy_preprocess", "True",
+// "--report_to", "wandb",
+// "--torch_compile", "True",
+// "--torch_compile_backend", "inductor",
+// "--dataloader_drop_last", "True",
+// "--frames_upbound", "32",
+// ],
// "console": "integratedTerminal",
-// "env":{"CUDA_VISIBLE_DEVICES":"0",
-// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
-// "LD_LIBRARY_PATH": "/home/haozhe/miniconda3/envs/llava/lib"},
// "justMyCode": false,
-// // "args": [
-// // "--run_dir_name", "test",
-// // // "--use_big_decoder"
-// // ]
+// "cwd": "${workspaceFolder}"
// }
// ]
-// }
+// }
+
+
+{
+// Use IntelliSense to learn about possible attributes.
+// Hover to view descriptions of existing attributes.
+// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+"version": "0.2.0",
+"configurations": [
+{
+"name": "Python: Current File",
+"type": "debugpy",
+"request": "launch",
+"program": "docs/LLaVA_OneVision_Tutorials.py",
+"console": "integratedTerminal",
+"env":{"CUDA_VISIBLE_DEVICES":"0",
+"LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"},
+"justMyCode": false,
+// "args": [
+// "--run_dir_name", "test",
+// // "--use_big_decoder"
+// ]
+}
+]
+}

docs/LLaVA_OneVision_Tutorials.py

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+import os
+import sys
+sys.path[0] = os.path.dirname(sys.path[0])
+
# from llava.model.builder import load_pretrained_model
# from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
# from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
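The three lines prepended here replace sys.path[0] (which is docs/ when the tutorial script is run directly) with its parent directory, so packages at the repository root such as llava become importable. A minimal sketch of the same idea, shown for clarity only and not part of the commit:

# Sketch (not part of the commit): why the tutorial rewrites sys.path[0].
# When docs/LLaVA_OneVision_Tutorials.py is run directly, sys.path[0] is the
# docs/ directory, so top-level packages such as llava are not importable.
# Swapping in the parent directory points imports at the repository root.
import os
import sys

sys.path[0] = os.path.dirname(sys.path[0])  # docs/ -> repository root
import llava  # assumed to live at the repository root, as in LLaVA-NeXT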

docs/download_data.py

Lines changed: 5 additions & 4 deletions
@@ -28,11 +28,12 @@
'vistext(cauldron)', 'visual7w(cauldron,llava_format)', 'visualmrc(cauldron)',
'vqarad(cauldron,llava_format)', 'vsr(cauldron,llava_format)', 'websight(cauldron)']

-chossen_datasets = ['sharegpt4v(sam)', 'sharegpt4v(llava)']
+# chossen_datasets = ['sharegpt4v(sam)', 'sharegpt4v(llava)']
+chossen_datasets = ['geo3k']

-image_base = "/mediaPFM/data/haozhe/onevision/llava_data"
-json_base = "/mediaPFM/data/haozhe/onevision/llava_instruct"
-dataset_yaml = 'scripts/train/onevision.yaml'
+image_base = "/mnt/SV_storage/VFM/onevision/llava_data"
+json_base = "/mnt/SV_storage/VFM/onevision/llava_instruct"
+# dataset_yaml = 'scripts/train/onevision.yaml'

# # open the yaml file
# with open(dataset_yaml, 'r') as f:

run.sh

Lines changed: 63 additions & 1 deletion
@@ -1 +1,63 @@
-python docs/download_data.py
+#!/bin/bash
+
+# Export environment variables
+export CUDA_VISIBLE_DEVICES="0,1"
+export OMP_NUM_THREADS="8"
+export NCCL_IB_DISABLE="0"
+export NCCL_IB_GID_INDEX="3"
+export NCCL_SOCKET_IFNAME="eth0"
+export NCCL_DEBUG="INFO"
+export ACCELERATE_CPU_AFFINITY="1"
+# export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"
+export WANDB_API_KEY="65aeda82a75f1eed29c8e9250b175fcc73dca0d7"
+
+# Run the command using torchrun
+torchrun --nproc_per_node=2 \
+--nnodes=1 \
+--node_rank=0 \
+--master_addr=127.0.0.1 \
+--master_port=29500 \
+llava/train/train_mem.py \
+--deepspeed scripts/zero3.json \
+--model_name_or_path lmms-lab/llava-onevision-qwen2-7b-ov \
+--version qwen_1_5 \
+--data_path scripts/train/onevision.yaml \
+--image_folder /media/data/haozhe/VFM/onevision/llava_data/geo3k/ \
+--video_folder /media/data/haozhe/VFM/onevision/llava_video \
+--mm_tunable_parts mm_vision_tower,mm_mlp_adapter,mm_language_model \
+--mm_vision_tower_lr 2e-6 \
+--vision_tower google/siglip-so400m-patch14-384 \
+--mm_projector_type mlp2x_gelu \
+--mm_vision_select_layer -2 \
+--mm_use_im_start_end False \
+--mm_use_im_patch_token False \
+--group_by_modality_length True \
+--image_aspect_ratio anyres_max_9 \
+--image_grid_pinpoints "(1x1),...,(6x6)" \
+--mm_patch_merge_type spatial_unpad \
+--bf16 True \
+--run_name test \
+--output_dir experiments/test \
+--num_train_epochs 1 \
+--per_device_train_batch_size 1 \
+--per_device_eval_batch_size 4 \
+--gradient_accumulation_steps 2 \
+--evaluation_strategy no \
+--save_strategy steps \
+--save_steps 1000 \
+--save_total_limit 1 \
+--learning_rate 1e-5 \
+--weight_decay 0. \
+--warmup_ratio 0.03 \
+--lr_scheduler_type cosine \
+--logging_steps 1 \
+--tf32 True \
+--model_max_length 32768 \
+--gradient_checkpointing True \
+--dataloader_num_workers 4 \
+--lazy_preprocess True \
+--report_to wandb \
+--torch_compile True \
+--torch_compile_backend inductor \
+--dataloader_drop_last True \
+--frames_upbound 32 > test7b.out 2>&1

run_demo.sh

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# Export environment variables
+export CUDA_VISIBLE_DEVICES="0"
+# export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"
+
+# Run the Python script
+python docs/LLaVA_OneVision_Tutorials.py > demo7b.out 2>&1

scripts/train/onevision.yaml

Lines changed: 4 additions & 4 deletions
@@ -67,8 +67,8 @@ datasets:
# sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
# sampling_strategy: "all"
-# - json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json
-# sampling_strategy: "all"
+- json_path: /media/data/haozhe/VFM/onevision/llava_instruct/geo3k.json
+sampling_strategy: "all"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
# sampling_strategy: "first:10%"
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_align_converted_60252.json

@@ -181,5 +181,5 @@
# sampling_strategy: all
# - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/0718_0_30_s_academic_mc_v0_1_all.json # will be released in next version of LLaVA-NeXT-Video
# sampling_strategy: all
-- json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/sharegpt4video.json # download from sharegpt4video
-sampling_strategy: all
+# - json_path: /media/data/haozhe/VFM/onevision/llava_instruct/sharegpt4video.json # download from sharegpt4video
+# sampling_strategy: all
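Each entry in this YAML pairs a json_path with a sampling_strategy ("all" for the geo3k entry enabled above; other, commented entries use forms like "first:10%"). A hypothetical sketch of reading such a file with PyYAML, under the assumption that "first:N%" keeps the leading N percent of samples; the repository's actual loader may differ:

# Hypothetical reader for scripts/train/onevision.yaml (assumes PyYAML);
# the real LLaVA-NeXT data loader may parse these fields differently.
import json
import yaml

with open("scripts/train/onevision.yaml") as f:
    datasets = yaml.safe_load(f)["datasets"]

for entry in datasets:
    with open(entry["json_path"]) as f:
        samples = json.load(f)
    strategy = str(entry.get("sampling_strategy", "all"))
    if strategy.startswith("first:"):
        # e.g. "first:10%" -> keep the leading 10% of samples (assumed semantics)
        pct = float(strategy.split(":", 1)[1].rstrip("%"))
        samples = samples[: int(len(samples) * pct / 100)]
    print(entry["json_path"], len(samples), "samples")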
