Add test code for EK100

HaozheQi · HaozheQi · commit 0e8073f10c06 · 2024-10-03T12:19:35.000Z
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -25,7 +25,7 @@
 //                 "--master_port=29500",
 //                 "llava/train/train_mem.py",
 //                 "--deepspeed", "scripts/zero3.json",
-//                 "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+//                 "--model_name_or_path", "lmms-lab/llava-onevision-qwen2-7b-ov",
 //                 "--version", "qwen_1_5",
 //                 "--data_path", "scripts/train/onevision.yaml",
 //                 // "--image_folder", "/mediaPFM/data/haozhe/onevision/llava_data",
@@ -43,8 +43,8 @@
 //                 "--image_grid_pinpoints", "(1x1),...,(6x6)",
 //                 "--mm_patch_merge_type", "spatial_unpad",
 //                 "--bf16", "True",
-//                 "--run_name", "test",
-//                 "--output_dir", "experiments/test",
+//                 "--run_name", "test1",
+//                 "--output_dir", "experiments/test1",
 //                 "--num_train_epochs", "1",
 //                 "--per_device_train_batch_size", "1",
 //                 "--per_device_eval_batch_size", "4",
@@ -77,6 +77,29 @@
 // }
 
 
+// {
+//     // Use IntelliSense to learn about possible attributes.
+//     // Hover to view descriptions of existing attributes.
+//     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+//     "version": "0.2.0",
+//     "configurations": [
+//         {
+//             "name": "Python: Current File",
+//             "type": "debugpy",
+//             "request": "launch",
+//             "program": "docs/LLaVA_OneVision_Tutorials.py",
+//             "console": "integratedTerminal",
+//             "env":{"CUDA_VISIBLE_DEVICES":"0",
+//                    "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"},
+//             "justMyCode": false,
+//             // "args": [
+//             //     "--run_dir_name", "test",
+//             //     // "--use_big_decoder"
+//             // ]
+//         }
+//     ]
+// }
+
 {
     // Use IntelliSense to learn about possible attributes.
     // Hover to view descriptions of existing attributes.
@@ -87,15 +110,16 @@
             "name": "Python: Current File",
             "type": "debugpy",
             "request": "launch",
-            "program": "docs/LLaVA_OneVision_Tutorials.py",
+            "program": "action/dataset.py",
             "console": "integratedTerminal",
-            "env":{"CUDA_VISIBLE_DEVICES":"0",
-                   "LD_PRELOAD": "/usr/lib/x86_64-linux-gnu/libffi.so.7"},
+            "env":{"CUDA_VISIBLE_DEVICES":"0"},
             "justMyCode": false,
-            // "args": [
-            //     "--run_dir_name", "test",
-            //     // "--use_big_decoder"
-            // ]
+            "args": [
+                "--root", "/mnt/SV_storage/VFM/EK100/EK100_320p_15sec_30fps_libx264",
+                "--train-metadata", "/mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv",
+                "--val-metadata", "/mnt/SV_storage/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv",
+                // "--use_big_decoder"
+            ]
         }
     ]
 }
diff --git a/action/dataset.py b/action/dataset.py
@@ -10,6 +10,10 @@
 import decord
 from torch.utils.data import DataLoader
 from tqdm import tqdm
+from pathlib import Path
+import sys
+import os
+sys.path[0] = os.path.dirname(sys.path[0])
 
 
 def datetime2sec(str):
@@ -130,8 +134,9 @@ def __init__(self, dataset, root, metadata, is_trimmed=True):
         self.root = root
         self.metadata = metadata
         self.is_trimmed = is_trimmed
-        self.verb_file = f'/data/EK100/epic-kitchens-100-annotations/EPIC_100_verb_classes.csv'
-        self.noun_file = f'/data/EK100/epic-kitchens-100-annotations/EPIC_100_noun_classes.csv'
+        anno_root = Path(metadata).parent
+        self.verb_file = str(anno_root / 'EPIC_100_verb_classes.csv')
+        self.noun_file = str(anno_root / 'EPIC_100_noun_classes.csv')
         self.verb_df = pd.read_csv(self.verb_file)
         self.nouns_df = pd.read_csv(self.noun_file)
         self.nouns = self.nouns_df['key'].to_list()
@@ -484,8 +489,8 @@ def generate_label_map():
     vn_list = []
     mapping_vn2narration = {}
     for f in [
-        '/data/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv',
-        '/data/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv',
+        '/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv',
+        '/media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv',
     ]:
         csv_reader = csv.reader(open(f))
         _ = next(csv_reader)  # skip the header
@@ -617,7 +622,7 @@ def get_args_parser():
     )
 
     val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False) 
-    from llava_ov_inference import llava_inference
+    from action.llava_ov_inference import llava_inference
     gts = []
     preds = []
     running_corrects = 0
@@ -644,5 +649,5 @@ def get_args_parser():
     # get final accuracy 
     accuracy = np.mean(gts == preds)
     print('Final accuracy', accuracy)
-    with open('llava_ov_4f_0.5b_result.txt', 'w') as f:
+    with open('llava_ov_16f_7b_result.txt', 'w') as f:
         f.write(f'Final accuracy: {accuracy:.4f}\n')
diff --git a/action/llava_ov_inference.py b/action/llava_ov_inference.py
@@ -15,7 +15,7 @@
 
 warnings.filterwarnings("ignore")
 # Load the OneVision model
-pretrained = "lmms-lab/llava-onevision-qwen2-0.5b-ov"
+pretrained = "lmms-lab/llava-onevision-qwen2-7b-ov"
 model_name = "llava_qwen"
 device = "cuda"
 device_map = "auto"
@@ -25,7 +25,6 @@
 
 def llava_inference(video_frames, gt):
     video_frames = video_frames[0]
-    video_frames = video_frames[::4]
     image_tensors = []
     frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].half().cuda()
     image_tensors.append(frames)
diff --git a/run.sh b/run.sh
@@ -36,8 +36,8 @@ torchrun --nproc_per_node=2 \
          --image_grid_pinpoints "(1x1),...,(6x6)" \
          --mm_patch_merge_type spatial_unpad \
          --bf16 True \
-         --run_name test \
-         --output_dir experiments/test \
+         --run_name test1 \
+         --output_dir experiments/test1 \
          --num_train_epochs 1 \
          --per_device_train_batch_size 1 \
          --per_device_eval_batch_size 4 \
diff --git a/run_EK100.sh b/run_EK100.sh
@@ -0,0 +1,4 @@
+python3 action/dataset.py \
+    --root /media/data/haozhe/VFM/EK100/EK100_320p_15sec_30fps_libx264 \
+    --train-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv \
+    --val-metadata /media/data/haozhe/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv > kitchen_test.out 2>&1
diff --git a/run_demo.sh b/run_demo.sh
@@ -5,4 +5,4 @@ export CUDA_VISIBLE_DEVICES="0"
 # export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libffi.so.7"
 
 # Run the Python script
-python docs/LLaVA_OneVision_Tutorials.py > demo7b.out 2>&1
+python3 docs/LLaVA_OneVision_Tutorials.py > demo7b.out 2>&1
diff --git a/scripts/train/onevision.yaml b/scripts/train/onevision.yaml
@@ -68,6 +68,7 @@ datasets:
   # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json
   #   sampling_strategy: "all"
   - json_path: /media/data/haozhe/VFM/onevision/llava_instruct/geo3k.json
+  # - json_path: /mediaPFM/data/haozhe/onevision/llava_instruct/geo3k.json
     sampling_strategy: "all"
   # - json_path: /mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json
   #   sampling_strategy: "first:10%"