
Commit 44ec08f

Fixed bugs in the benchmark code as well; the prediction file pointer was wrong.
1 parent 79829db commit 44ec08f

File tree

5 files changed: +124 -22 lines

llava/action/benchmark.py

Lines changed: 4 additions & 4 deletions

@@ -10,8 +10,8 @@
 n_frames = 4
 topk = 5
 action_representation = 'GT_random_narration'
-#gpt_model = 'gpt-4o-mini-2024-07-18'
-gpt_model = 'gpt-4o-2024-08-06'
+gpt_model = 'gpt-4o-mini-2024-07-18'
+#gpt_model = 'gpt-4o-2024-08-06'
 perspective = 'first_person'
 benchmark_testing = True

@@ -37,7 +37,7 @@ def benchmark_tim_mcq(n_samples):
         root,
         annotation_file,
         gen_type = 'tim',
-        prediction_file = avion_prediction_file,
+        prediction_file = tim_prediction_file,
         clip_length = n_frames,
         question_type = 'mc_',
         action_representation=action_representation,

@@ -63,6 +63,6 @@ def benchmark_random_mcq(n_samples):


 if __name__ == '__main__':
-    benchmark_avion_mcq(100)
+    #benchmark_avion_mcq(100)
     benchmark_tim_mcq(100)
     #benchmark_random_mcq(100)
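
The fix above points benchmark_tim_mcq at the TIM predictions instead of the AVION ones. As a minimal sketch of what the full helper presumably looks like, using the module-level constants from the top of benchmark.py: only the keyword arguments visible in the hunk are taken from the diff; the enclosing GPTInferenceAnnotator construction and the multi_process_run call are assumptions modeled on the visualization helpers added elsewhere in this commit.

def benchmark_tim_mcq(n_samples):
    # Hypothetical reconstruction: the keyword arguments mirror the hunk above,
    # the surrounding call pattern is assumed from this commit's new script.
    inferencer = GPTInferenceAnnotator(gpt_model,
        root,
        annotation_file,
        gen_type = 'tim',
        prediction_file = tim_prediction_file,   # the fix: TIM predictions, not avion_prediction_file
        clip_length = n_frames,
        question_type = 'mc_',
        action_representation = action_representation,
        perspective = perspective,
        benchmark_testing = benchmark_testing,
        topk = topk)
    inferencer.multi_process_run(n_samples)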

llava/action/chatgpt_utils.py

Lines changed: 46 additions & 14 deletions

@@ -19,6 +19,7 @@
 import base64
 from pathlib import Path
 import traceback
+import cv2


 client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

@@ -348,7 +349,8 @@ def __init__(self,
                  debug = False,
                  topk = 10,
                  perspective = 'first_person',
-                 benchmark_testing = False
+                 benchmark_testing = False,
+                 do_visualization = False
                  ):
         """
         Parameters

@@ -373,17 +375,31 @@
         self.perspective = perspective
         self.benchmark_testing = benchmark_testing
         assert gen_type in ['avion', 'tim', 'random']
-
+
         if gen_type == 'avion' or gen_type == 'tim':
             self.mc_generator = ActionMultiChoiceGenerator(self.annotation_root)
+            assert os.path.exists(self.prediction_file)
+            print ('prediction_file'*5, self.prediction_file)
             with open(self.prediction_file, 'r') as f:
                 self.action_model_predictions = json.load(f)
         else:
             self.mc_generator = RandomMultiChoiceGenerator(self.annotation_root)

-
+        self.do_visualization = do_visualization
+        self.vis_folder = f"{self.gpt_model}_{self.gen_type}_{self.question_type}_{self.perspective}"
         self.data = self.init_data()
-
+
+    def save_visualization(self,frames, uid):
+        """
+        Save the frames to the out_dir
+        """
+        out_dir = Path(self.vis_folder)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        sub_folder = out_dir / uid
+        sub_folder.mkdir(parents=True, exist_ok=True)
+        for idx, frame in enumerate(frames):
+            cv2.imwrite(str(sub_folder / f"{uid}_{idx}.jpg"), cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+

     def init_data(self):
         ret = {}

@@ -438,8 +454,8 @@ def init_data(self):

         return ret

-    def multi_process_run(self, n_samples = -1):
-        # to initialize it
+    def multi_process_run(self, n_samples = -1, disable_api_calling = False):
+        # inside GPT inference annotator

         if n_samples != -1:
             indices = list(range(len(self.data)))[:n_samples]

@@ -450,7 +466,7 @@

         with ProcessPoolExecutor(max_workers=num_chunks) as executor:
             # Pass additional arguments to the function
-            futures = [executor.submit(self.run, group) for group in indices_groups]
+            futures = [executor.submit(self.run, group, disable_api_calling) for group in indices_groups]

             # Wait for all futures to complete
             combined_results = {}

@@ -460,16 +476,18 @@

         if self.debug:
             print (combined_results)
-
-        calculation = calculate_gpt_accuracy(data = combined_results)
+        if combined_results and 'mc_' in self.question_type:
+            calculation = calculate_gpt_accuracy(data = combined_results)

         prefix = self.gen_type
         assert n_samples != -1
         checkpoint_name = f"{prefix}_{self.action_representation}_top{self.topk}_{self.clip_length}f_{n_samples}samples.json"

+        if self.do_visualization:
+            self.checkpoint(combined_results, os.path.join(self.vis_folder, checkpoint_name))
         self.checkpoint(combined_results, checkpoint_name)

-    def run(self, indices=None):
+    def run(self, indices=None, disable_api_calling = False):
         if indices is None:
             data_batch = {i : self.data[i] for i in range(len(self.data)) if i in list(range(len(self.data)))}
         else:

@@ -481,22 +499,36 @@
             start_timestamp = v['start_second']
             end_timestamp = v['end_second']
             vid_path = v['vid_path']
+            _id = v['vid_path'].replace('/', '-')
+            uid = f"{_id}_{start_timestamp}_{end_timestamp}"

             frames, time_meta = self.extract_frames(vid_path, start_timestamp, end_timestamp)
-            try:
+
+            if self.do_visualization:
+                # the output folder should reflect the gen type, question type and perspective
+                # and the question type
+                self.save_visualization(frames, uid)
+            if disable_api_calling:
+                break
+            try:
                 parsed_answer = self.predict_images(frames, v)
             except Exception as e:
                 # get full stack trace
-                traceback.print_exc()
-
+                traceback.print_exc()
                 print ("An exception occurred: ", e)

             predicted_answer = parsed_answer.answer
             gt_name = v['gt_answer']
             ret[k] = {
+                "uid": uid,
                 'gt_name': gt_name,
-                'chatgpt_answer': process_raw_pred(predicted_answer),
+                "options": v['options'],
+                'chatgpt_answer': process_raw_pred(predicted_answer) if 'mc_' in self.question_type else predicted_answer
             }
+            if self.do_visualization:
+                # save ret to the output folder
+                self.checkpoint(ret, os.path.join(self.vis_folder, uid, 'inference_results.json'))
+
             if self.debug:
                 break
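Taken together, the new do_visualization and disable_api_calling flags let the sampled frames be dumped for inspection without spending OpenAI credits: each worker saves the frames for the first segment of its chunk and breaks before predict_images is reached. A usage sketch, not part of the commit, mirroring visualize_with_gpt_with_tim from the new script but with API calling disabled:

from llava.action.chatgpt_utils import GPTInferenceAnnotator

inferencer = GPTInferenceAnnotator('gpt-4o-mini-2024-07-18',
    '/data/EK100/EK100_320p_15sec_30fps_libx264',
    '/data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_validation.csv',
    gen_type = 'tim',
    prediction_file = '/data/epic_kitchen/TIM_PREDS/tim_pred_ids_val.json',
    clip_length = 4,
    question_type = 'mc_',
    action_representation = 'GT_random_narration',
    perspective = 'first_person',
    benchmark_testing = True,
    do_visualization = True,   # frames go to "{gpt_model}_{gen_type}_{question_type}_{perspective}/<uid>/"
    topk = 5)
# No call to predict_images is made; only the frame JPEGs are written.
inferencer.multi_process_run(1, disable_api_calling = True)
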
llava/action/ek_eval.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 from llava.action.utils import generate_label_map, match_answer
 from collections import Counter
 import torch.distributed as dist
-from llava.action.dataset import VideoMultiChoiceDataset, VideoTemporalMultiChoiceDataset
+from llava.action.dataset import VideoMultiChoiceDataset
 import torchvision.io as io
 import re

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+"""
+We need to keep track of the following:
+
+The uid of each segment
+
+The GPT inference of corresponding segment
+The LLaVA zero-shot inference of corresponding segment
+The Finetuned LLaVA's inference of corresponding segment
+
+Note that in each inference, we should be able to pick the corresponding prompt and checkpoint folder
+"""
+
+from llava.action.chatgpt_utils import GPTInferenceAnnotator
+
+root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
+annotation_file = '/data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_validation.csv'
+avion_prediction_file = '/data/epic_kitchen/AVION_PREDS/avion_pred_ids_val.json'
+tim_prediction_file = '/data/epic_kitchen/TIM_PREDS/tim_pred_ids_val.json'
+n_frames = 4
+topk = 5
+action_representation = 'GT_random_narration'
+gpt_model = 'gpt-4o-mini-2024-07-18'
+#gpt_model = 'gpt-4o-2024-08-06'
+perspective = 'first_person'
+benchmark_testing = True
+
+
+
+def visualize_with_random(n_samples, question_type = 'mc_'):
+    """
+    Here we should test gpt-4o, gpt-4o-mini with different prompts
+    """
+    inferencer = GPTInferenceAnnotator(gpt_model,
+        root,
+        annotation_file,
+        gen_type = 'random',
+        prediction_file = tim_prediction_file,
+        clip_length = n_frames,
+        question_type = question_type,
+        action_representation=action_representation,
+        perspective = perspective,
+        benchmark_testing = benchmark_testing,
+        do_visualization = True,
+        topk = topk)
+
+    inferencer.multi_process_run(n_samples, disable_api_calling=False)
+
+def visualize_with_gpt_with_tim(n_samples, question_type = 'mc_'):
+    """
+    Here we should test gpt-4o, gpt-4o-mini with different prompts
+    """
+    inferencer = GPTInferenceAnnotator(gpt_model,
+        root,
+        annotation_file,
+        gen_type = 'tim',
+        prediction_file = tim_prediction_file,
+        clip_length = n_frames,
+        question_type = question_type,
+        action_representation=action_representation,
+        perspective = perspective,
+        benchmark_testing = benchmark_testing,
+        do_visualization = True,
+        topk = topk)
+
+    inferencer.multi_process_run(n_samples, disable_api_calling=False)
+
+
+if __name__ == '__main__':
+    #visualize_with_random(1, question_type = "mc_")
+    visualize_with_gpt_with_tim(1, question_type = "mc_")
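
For reference, running visualize_with_gpt_with_tim(1) with the settings above should leave an output tree roughly like the sketch below, assembled from vis_folder, save_visualization and the checkpoint names in chatgpt_utils.py; the uid is the segment's vid_path with '/' replaced by '-', followed by its start and end seconds.

gpt-4o-mini-2024-07-18_tim_mc__first_person/
    <vid_path with '/' replaced by '-'>_<start_second>_<end_second>/
        <uid>_0.jpg ... <uid>_3.jpg                    # the n_frames sampled frames
        inference_results.json                         # uid, gt_name, options, chatgpt_answer
    tim_GT_random_narration_top5_4f_1samples.json      # combined checkpoint, copied here because do_visualization is True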

llava/action/utils.py

Lines changed: 3 additions & 3 deletions

@@ -428,7 +428,7 @@ def generate_multi_choice(self,
         else:
             return self.test_generate(gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps, benchmark_testing = benchmark_testing)

-    def train_generate(self, gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps):
+    def train_generate(self, gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps, benchmark_testing = False):
         # letters as A, B, C, D, .. Note we maximally support 26 letters
         letters = [chr(65+i) for i in range(26)][:k]
         answer_list = [vn for vn in mapping_vn2narration.keys()]

@@ -463,11 +463,11 @@ def train_generate(self, gt_vn, narration, k, action_representation, n_narration
         }
         return mc_data

-    def test_generate(self, gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps):
+    def test_generate(self, gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps, benchmark_testing = False):
         """
         There is no difference between train and test for random generation
         """
-        return self.train_generate(gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps)
+        return self.train_generate(gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps, benchmark_testing = benchmark_testing)

 class AvionMultiChoiceGenerator(MultiChoiceGenerator):
     """
