import base64
from pathlib import Path
import traceback
+import cv2


client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
@@ -348,7 +349,8 @@ def __init__(self,
                 debug=False,
                 topk=10,
                 perspective='first_person',
-                 benchmark_testing=False
+                 benchmark_testing=False,
+                 do_visualization=False
                 ):
        """
        Parameters
@@ -373,17 +375,31 @@ def __init__(self,
        self.perspective = perspective
        self.benchmark_testing = benchmark_testing
        assert gen_type in ['avion', 'tim', 'random']
-
+
        if gen_type == 'avion' or gen_type == 'tim':
            self.mc_generator = ActionMultiChoiceGenerator(self.annotation_root)
+            assert os.path.exists(self.prediction_file)
            with open(self.prediction_file, 'r') as f:
                self.action_model_predictions = json.load(f)
        else:
            self.mc_generator = RandomMultiChoiceGenerator(self.annotation_root)

-
+        self.do_visualization = do_visualization
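+        # one visualization folder per (gpt model, gen type, question type, perspective) configuration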
+        self.vis_folder = f"{self.gpt_model}_{self.gen_type}_{self.question_type}_{self.perspective}"
+        os.makedirs(self.vis_folder, exist_ok=True)
        self.data = self.init_data()
-
+
+    def save_visualization(self, frames, uid):
+        """
+        Save the extracted frames as JPEGs under the visualization folder, one sub-folder per uid.
+        """
+        out_dir = Path(self.vis_folder)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        sub_folder = out_dir / uid
+        sub_folder.mkdir(parents=True, exist_ok=True)
+        for idx, frame in enumerate(frames):
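+            # frames are assumed to be RGB here; cv2.imwrite expects BGR, so swap channels before writing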
+            cv2.imwrite(str(sub_folder / f"{uid}_{idx}.jpg"), cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+

    def init_data(self):
        ret = {}
@@ -435,41 +451,45 @@ def init_data(self):
                'end_second': end_second,
                'vid_path': vid_path
            }
-
        return ret

-    def multi_process_run(self, n_samples=-1):
-        # to initialize it
+    def multi_process_run(self, offset=0, n_samples=-1, disable_api_calling=False):
+        # inside GPT inference annotator

-        if n_samples != -1:
-            indices = list(range(len(self.data)))[:n_samples]
+        if n_samples == -1:
+            # do not use offset if n_samples is -1
+            assert offset == 0

+        if n_samples != -1:
+            indices = list(range(len(self.data)))[offset:offset + n_samples]
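+            # i.e. a window of n_samples items starting at offset (useful for sharding or resuming long runs)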
        num_chunks = os.cpu_count() if not self.debug else 2

        indices_groups = self.split_indices(indices, num_chunks)

        with ProcessPoolExecutor(max_workers=num_chunks) as executor:
            # Pass additional arguments to the function
-            futures = [executor.submit(self.run, group) for group in indices_groups]
+            futures = [executor.submit(self.run, group, disable_api_calling) for group in indices_groups]

            # Wait for all futures to complete
            combined_results = {}
            for future in futures:
                result_dict = future.result()
                combined_results.update(result_dict)
-
+        print(combined_results)
        if self.debug:
            print(combined_results)
-
-        calculation = calculate_gpt_accuracy(data=combined_results)
+        if combined_results and 'mc_' in self.question_type:
+            calculation = calculate_gpt_accuracy(data=combined_results)

        prefix = self.gen_type
        assert n_samples != -1
        checkpoint_name = f"{prefix}_{self.action_representation}_top{self.topk}_{self.clip_length}f_{n_samples}samples.json"

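+        # when visualizing, also keep a copy of the combined results next to the saved frames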
+        if self.do_visualization:
+            self.checkpoint(combined_results, os.path.join(self.vis_folder, checkpoint_name))
        self.checkpoint(combined_results, checkpoint_name)

-    def run(self, indices=None):
+    def run(self, indices=None, disable_api_calling=False):
        if indices is None:
            data_batch = {i: self.data[i] for i in range(len(self.data)) if i in list(range(len(self.data)))}
        else:
@@ -481,22 +501,36 @@ def run(self, indices=None):
            start_timestamp = v['start_second']
            end_timestamp = v['end_second']
            vid_path = v['vid_path']
+            _id = v['vid_path'].replace('/', '-')
+            uid = f"{_id}_{start_timestamp}_{end_timestamp}"
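+            # uid combines the flattened video path with the clip's start/end seconds; it names the per-clip visualization sub-folder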

            frames, time_meta = self.extract_frames(vid_path, start_timestamp, end_timestamp)
-            try:
+
+            if self.do_visualization:
+                # the output folder name reflects the gen type, question type and perspective
+                self.save_visualization(frames, uid)
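+            # skip the GPT call entirely; note this exits the batch loop after the first clip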
+            if disable_api_calling:
+                break
+            try:
                parsed_answer = self.predict_images(frames, v)
            except Exception as e:
                # get full stack trace
-                traceback.print_exc()
-
+                traceback.print_exc()
                print("An exception occurred: ", e)

            predicted_answer = parsed_answer.answer
            gt_name = v['gt_answer']
            ret[k] = {
+                "uid": uid,
                'gt_name': gt_name,
-                'chatgpt_answer': process_raw_pred(predicted_answer),
+                "options": v['options'],
+                'chatgpt_answer': process_raw_pred(predicted_answer) if 'mc_' in self.question_type else predicted_answer
            }
+            if self.do_visualization:
+                # save ret to the output folder
+                self.checkpoint(ret, os.path.join(self.vis_folder, uid, 'inference_results.json'))
+
            if self.debug:
                break

@@ -529,9 +563,7 @@ def predict_images(self, images, parsed_item):

        if 'o1' in self.gpt_model:
            system_prompt += format_prompt
-
-        #print(system_prompt)
-
+
        if self.handobj_root is not None:
            system_prompt += f"""To further assist you, we mark hands and objects when they are visible. The left hand is marked with a bounding box that contains the letter L and the right hand's bounding box contains the letter R. The object is marked as 'O'."""

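For orientation, a minimal usage sketch of the new flags. The class name GPTInferenceAnnotator is an assumption (the constructor definition lies outside the hunks shown), and constructor arguments not visible in this diff are elided:

annotator = GPTInferenceAnnotator(
    gen_type='avion',
    do_visualization=True,   # save frames and per-clip results under vis_folder
    # ... other constructor arguments (annotation_root, prediction_file, ...) omitted
)

# no GPT calls: each worker saves frames for its first clip, then stops
annotator.multi_process_run(offset=200, n_samples=100, disable_api_calling=True)

# run GPT inference on the first 500 clips and checkpoint the combined results
annotator.multi_process_run(n_samples=500)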