
Commit 03cfaf7

Merge branch 'main' of github.com:HaozheQi/LLaVA-NeXT into main

2 parents: f94b866 + fd63d2a

File tree: 3 files changed, +107 -101 lines

.vscode/launch.json
run_EK100_2.sh
scripts/train/onevision.yaml

.vscode/launch.json

Lines changed: 99 additions & 99 deletions
@@ -1,82 +1,105 @@
+{
+"version": "0.2.0",
+"configurations": [
+{
+"name": "Run LLAVA Training with torchrun",
+"type": "debugpy",
+"request": "launch",
+"module": "torch.distributed.run",
+"env": {
+"CUDA_VISIBLE_DEVICES": "1,2",
+"OMP_NUM_THREADS": "8",
+"NCCL_IB_DISABLE": "0",
+"NCCL_IB_GID_INDEX": "3",
+"NCCL_SOCKET_IFNAME": "eth0",
+"NCCL_DEBUG": "INFO",
+"ACCELERATE_CPU_AFFINITY": "1",
+"LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
+"WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
+},
+"args": [
+"--nproc_per_node=2",
+"--nnodes=1",
+"--node_rank=0",
+"--master_addr=127.0.0.1",
+"--master_port=29500",
+"llava/train/train_mem.py",
+"--deepspeed", "scripts/zero3.json",
+"--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+"--version", "qwen_1_5",
+"--data_path", "scripts/train/onevision.yaml",
+// "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
+"--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
+"--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
+"--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+"--mm_vision_tower_lr", "2e-6",
+"--vision_tower", "google/siglip-so400m-patch14-384",
+"--mm_projector_type", "mlp2x_gelu",
+"--mm_vision_select_layer", "-2",
+"--mm_use_im_start_end", "False",
+"--mm_use_im_patch_token", "False",
+"--group_by_modality_length", "True",
+"--image_aspect_ratio", "anyres_max_9",
+"--image_grid_pinpoints", "(1x1),...,(6x6)",
+"--mm_patch_merge_type", "spatial_unpad",
+"--bf16", "True",
+"--run_name", "test1",
+"--output_dir", "experiments/test1",
+"--num_train_epochs", "1",
+"--per_device_train_batch_size", "1",
+"--per_device_eval_batch_size", "4",
+"--gradient_accumulation_steps", "2",
+"--evaluation_strategy", "no",
+"--save_strategy", "steps",
+"--save_steps", "1000",
+"--save_total_limit", "1",
+"--learning_rate", "1e-5",
+"--weight_decay", "0.",
+"--warmup_ratio", "0.03",
+"--lr_scheduler_type", "cosine",
+"--logging_steps", "1",
+"--tf32", "True",
+"--model_max_length", "32768",
+"--gradient_checkpointing", "True",
+"--dataloader_num_workers", "4",
+"--lazy_preprocess", "True",
+"--report_to", "wandb",
+"--torch_compile", "True",
+"--torch_compile_backend", "inductor",
+"--dataloader_drop_last", "True",
+"--frames_upbound", "32",
+],
+"console": "integratedTerminal",
+"justMyCode": false,
+"cwd": "${workspaceFolder}"
+}
+]
+}
+
+
 // {
+// // Use IntelliSense to learn about possible attributes.
+// // Hover to view descriptions of existing attributes.
+// // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 // "version": "0.2.0",
 // "configurations": [
 // {
-// "name": "Run LLAVA Training with torchrun",
+// "name": "Python: Current File",
 // "type": "debugpy",
 // "request": "launch",
-// "module": "torch.distributed.run",
-// "env": {
-// "CUDA_VISIBLE_DEVICES": "1,2,3",
-// "OMP_NUM_THREADS": "8",
-// "NCCL_IB_DISABLE": "0",
-// "NCCL_IB_GID_INDEX": "3",
-// "NCCL_SOCKET_IFNAME": "eth0",
-// "NCCL_DEBUG": "INFO",
-// "ACCELERATE_CPU_AFFINITY": "1",
-// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7",
-// "WANDB_API_KEY": "65aeda82a75f1eed29c8e9250b175fcc73dca0d7",
-// },
-// "args": [
-// "--nproc_per_node=3",
-// "--nnodes=1",
-// "--node_rank=0",
-// "--master_addr=127.0.0.1",
-// "--master_port=29500",
-// "llava/train/train_mem.py",
-// "--deepspeed", "scripts/zero3.json",
-// "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-7b-ov",
-// "--version", "qwen_1_5",
-// "--data_path", "scripts/train/onevision.yaml",
-// // "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
-// "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data/geo3k/",
-// "--video_folder", "/mediaPFM/data/haozhe/onevision/llava_video",
-// "--mm_tunable_parts", "mm_vision_tower,mm_mlp_adapter,mm_language_model",
-// "--mm_vision_tower_lr", "2e-6",
-// "--vision_tower", "google/siglip-so400m-patch14-384",
-// "--mm_projector_type", "mlp2x_gelu",
-// "--mm_vision_select_layer", "-2",
-// "--mm_use_im_start_end", "False",
-// "--mm_use_im_patch_token", "False",
-// "--group_by_modality_length", "True",
-// "--image_aspect_ratio", "anyres_max_9",
-// "--image_grid_pinpoints", "(1x1),...,(6x6)",
-// "--mm_patch_merge_type", "spatial_unpad",
-// "--bf16", "True",
-// "--run_name", "test1",
-// "--output_dir", "experiments/test1",
-// "--num_train_epochs", "1",
-// "--per_device_train_batch_size", "1",
-// "--per_device_eval_batch_size", "4",
-// "--gradient_accumulation_steps", "2",
-// "--evaluation_strategy", "no",
-// "--save_strategy", "steps",
-// "--save_steps", "1000",
-// "--save_total_limit", "1",
-// "--learning_rate", "1e-5",
-// "--weight_decay", "0.",
-// "--warmup_ratio", "0.03",
-// "--lr_scheduler_type", "cosine",
-// "--logging_steps", "1",
-// "--tf32", "True",
-// "--model_max_length", "32768",
-// "--gradient_checkpointing", "True",
-// "--dataloader_num_workers", "4",
-// "--lazy_preprocess", "True",
-// "--report_to", "wandb",
-// "--torch_compile", "True",
-// "--torch_compile_backend", "inductor",
-// "--dataloader_drop_last", "True",
-// "--frames_upbound", "32",
-// ],
+// "program": "docs/LLaVA_OneVision_Tutorials.py",
 // "console": "integratedTerminal",
+// "env":{"CUDA_VISIBLE_DEVICES":"0",
+// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"},
 // "justMyCode": false,
-// "cwd": "${workspaceFolder}"
+// // "args": [
+// // "--run_dir_name", "test",
+// // // "--use_big_decoder"
+// // ]
 // }
 // ]
 // }

-
 // {
 // // Use IntelliSense to learn about possible attributes.
 // // Hover to view descriptions of existing attributes.
@@ -87,39 +110,16 @@
 // "name": "Python: Current File",
 // "type": "debugpy",
 // "request": "launch",
-// "program": "docs/LLaVA_OneVision_Tutorials.py",
+// "program": "action/dataset.py",
 // "console": "integratedTerminal",
-// "env":{"CUDA_VISIBLE_DEVICES":"0",
-// "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"},
+// "env":{"CUDA_VISIBLE_DEVICES":"0"},
 // "justMyCode": false,
-// // "args": [
-// // "--run_dir_name", "test",
-// // // "--use_big_decoder"
-// // ]
+// "args": [
+// "--root", "/mnt/SV_storage/VFM/EK100/EK100_320p_15sec_30fps_libx264",
+// "--train-metadata", "/mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv",
+// "--val-metadata", "/mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+// // "--use_big_decoder"
+// ]
 // }
 // ]
-// }
-
-{
-// Use IntelliSense to learn about possible attributes.
-// Hover to view descriptions of existing attributes.
-// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-"version": "0.2.0",
-"configurations": [
-{
-"name": "Python: Current File",
-"type": "debugpy",
-"request": "launch",
-"program": "action/dataset.py",
-"console": "integratedTerminal",
-"env":{"CUDA_VISIBLE_DEVICES":"0"},
-"justMyCode": false,
-"args": [
-"--root", "/mnt/SV_storage/VFM/EK100/EK100_320p_15sec_30fps_libx264",
-"--train-metadata", "/mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv",
-"--val-metadata", "/mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
-// "--use_big_decoder"
-]
-}
-]
-}
+// }
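
For orientation, here is a rough shell equivalent of the new "Run LLAVA Training with torchrun" debug configuration. It is a sketch only: the launcher flags, paths, and hyperparameters are copied from the launch.json above (the WANDB_API_KEY export is left out), and it is not a script that ships with the repository.

export CUDA_VISIBLE_DEVICES=1,2 OMP_NUM_THREADS=8 ACCELERATE_CPU_AFFINITY=1
export NCCL_IB_DISABLE=0 NCCL_IB_GID_INDEX=3 NCCL_SOCKET_IFNAME=eth0 NCCL_DEBUG=INFO
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libffi.so.7
python -m torch.distributed.run --nproc_per_node=2 --nnodes=1 --node_rank=0 \
    --master_addr=127.0.0.1 --master_port=29500 \
    llava/train/train_mem.py \
    --deepspeed scripts/zero3.json \
    --model_name_or_path lmms-lab/llava-onevision-qwen2-0.5b-ov \
    --version qwen_1_5 \
    --data_path scripts/train/onevision.yaml \
    --image_folder /mediaPFM/data/haozhe/onevision/llava_data/geo3k/ \
    --video_folder /mediaPFM/data/haozhe/onevision/llava_video \
    --vision_tower google/siglip-so400m-patch14-384 \
    --output_dir experiments/test1 --run_name test1 \
    --bf16 True --tf32 True --gradient_checkpointing True \
    --per_device_train_batch_size 1 --gradient_accumulation_steps 2 \
    --learning_rate 1e-5 --num_train_epochs 1 --report_to wandb
# The remaining --mm_*, scheduler, saving, and dataloader flags mirror the JSON block above.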

run_EK100_2.sh

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+python3 action/dataset.py \
+--root /media/data/haozhe/VFM/EK100/EK100_320p_15sec_30fps_libx264 \
+--train-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv \
+--val-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv \
+--llm_size 0.5b \
+--llava_num_frames 16 > kitchen_test_2.out 2>&1 \
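
One possible way to use the new script (an assumption, since the commit does not say how it is invoked; it presumes the EK100 paths above exist on the host and that the command is run from the repository root):

bash run_EK100_2.sh          # runs action/dataset.py on the EK100 annotations
tail -f kitchen_test_2.out   # the script redirects stdout and stderr into this log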

scripts/train/onevision.yaml

Lines changed: 2 additions & 2 deletions
@@ -67,8 +67,8 @@ datasets:
 # sampling_strategy: "all"
 # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
 # sampling_strategy: "all"
-- json_path: /media/data/haozhe/VFM/onevision/llava_instruct/geo3k.json
-# - json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json
+# - json_path: /media/data/haozhe/VFM/onevision/llava_instruct/geo3k.json
+- json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json
 sampling_strategy: "all"
 # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
 # sampling_strategy: "first:10%"
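
An optional sanity check after flipping the geo3k entry (nothing in the repository mandates this; the grep pattern simply matches the uncommented json_path lines, and the test confirms the new path is reachable on the training host):

grep -nE '^[[:space:]]*- json_path:' scripts/train/onevision.yaml
test -f /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json && echo "geo3k.json reachable"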
