WIP: add offset to multi_process_run, create vis folder up front, update visualization script settings

Ye Shaokai · Ye Shaokai · commit 030c047e2867 · 2025-02-07T09:02:39.000+01:00
diff --git a/llava/action/chatgpt_utils.py b/llava/action/chatgpt_utils.py
@@ -386,6 +386,7 @@ def __init__(self,
             
         self.do_visualization = do_visualization
         self.vis_folder = f"{self.gpt_model}_{self.gen_type}_{self.question_type}_{self.perspective}"
+        os.makedirs(self.vis_folder, exist_ok = True)
         self.data = self.init_data()
      
     def save_visualization(self,frames, uid):
@@ -450,15 +451,17 @@ def init_data(self):
                 'end_second': end_second,
                 'vid_path': vid_path
             }
-
         return ret
 
-    def multi_process_run(self, n_samples = -1, disable_api_calling = False):
+    def multi_process_run(self, offset= 0, n_samples = -1, disable_api_calling = False):
         # inside GPT inference annotator
 
-        if n_samples != -1:
-            indices = list(range(len(self.data)))[:n_samples]
+        if n_samples == -1:
+            # an offset is only supported together with an explicit n_samples, so it must stay 0 here
+            assert offset == 0
 
+        if n_samples != -1:
+            indices = list(range(len(self.data)))[offset:offset + n_samples]
         num_chunks = os.cpu_count() if not self.debug else 2
 
         indices_groups = self.split_indices(indices, num_chunks)
@@ -472,7 +475,7 @@ def multi_process_run(self, n_samples = -1, disable_api_calling = False):
             for future in futures:
                 result_dict = future.result()
                 combined_results.update(result_dict)
-
+            print (combined_results)
         if self.debug:
             print (combined_results)
         if combined_results and 'mc_' in self.question_type:
@@ -560,9 +563,7 @@ def predict_images(self, images, parsed_item):
      
         if 'o1' in self.gpt_model:
             system_prompt += format_prompt
-     
-        #print (system_prompt)
-
+                    
         if self.handobj_root is not None:
             system_prompt += f"""To further assist you, we mark hands and object when they are visible. The left hand is marked with a bounding box that contains letter L and the right hand's bounding box contains letter R. The object is marked as 'O'."""
         
diff --git a/llava/action/make_visualizations.py b/llava/action/make_visualizations.py
@@ -12,21 +12,27 @@
 
 from llava.action.chatgpt_utils import GPTInferenceAnnotator
 
-root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
-annotation_file = '/data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_validation.csv'
-avion_prediction_file = '/data/epic_kitchen/AVION_PREDS/avion_pred_ids_val.json'
-tim_prediction_file = '/data/epic_kitchen/TIM_PREDS/tim_pred_ids_val.json'
-n_frames = 4
+# root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
+# annotation_file = '/data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_validation.csv'
+# avion_prediction_file = '/data/epic_kitchen/AVION_PREDS/avion_pred_ids_val.json'
+# tim_prediction_file = '/data/epic_kitchen/TIM_PREDS/tim_pred_ids_val.json'
+
+root = '/data/shaokai/EK100_512/EK100'
+annotation_file = '/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv'
+avion_prediction_file = '/data/shaokai/AVION_PREDS/avion_pred_ids_val.json'
+tim_prediction_file = '/data/shaokai/TIM_PREDS/tim_pred_ids_val.json'
+
+n_frames = 32
 topk = 5
 action_representation = 'GT_random_narration'
-gpt_model = 'gpt-4o-mini-2024-07-18'
-#gpt_model = 'gpt-4o-2024-08-06'
+#gpt_model = 'gpt-4o-mini-2024-07-18'
+gpt_model = 'gpt-4o-2024-08-06'
 perspective = 'first_person'
 benchmark_testing = True
 
 
 
-def visualize_with_random(n_samples, question_type = 'mc_'):
+def visualize_with_random(n_samples, offset = 0, question_type = 'mc_'):
     """
     Here we should test gpt-4o, gpt-4o-mini with different prompts
     """
@@ -43,9 +49,9 @@ def visualize_with_random(n_samples, question_type = 'mc_'):
                                         do_visualization = True,
                                         topk = topk) 
     
-    inferencer.multi_process_run(n_samples, disable_api_calling=False)
+    inferencer.multi_process_run(n_samples = n_samples, offset = offset, disable_api_calling=False)
 
-def visualize_with_gpt_with_tim(n_samples, question_type = 'mc_'):
+def visualize_with_gpt_with_tim(n_samples, offset = 0, question_type = 'mc_'):
     """
     Here we should test gpt-4o, gpt-4o-mini with different prompts
     """
@@ -62,10 +68,10 @@ def visualize_with_gpt_with_tim(n_samples, question_type = 'mc_'):
                                         do_visualization = True,
                                         topk = topk) 
     
-    inferencer.multi_process_run(n_samples, disable_api_calling=False)    
+    inferencer.multi_process_run(n_samples = n_samples, offset = offset, disable_api_calling=False)    
 
 
-def visualize_with_gpt_with_avion(n_samples, question_type = 'mc_'):
+def visualize_with_gpt_with_avion(n_samples, offset = 0, question_type = 'mc_'):
     """
     Here we should test gpt-4o, gpt-4o-mini with different prompts
     """
@@ -82,9 +88,11 @@ def visualize_with_gpt_with_avion(n_samples, question_type = 'mc_'):
                                         do_visualization = True,
                                         topk = topk) 
     
-    inferencer.multi_process_run(n_samples, disable_api_calling=False) 
+    inferencer.multi_process_run(n_samples = n_samples, offset = offset, disable_api_calling=False) 
 
 if __name__ == '__main__':
-    #visualize_with_random(1, question_type = "mc_")
-    #visualize_with_gpt_with_tim(1, question_type = "mc_")
-    visualize_with_gpt_with_avion(1, question_type = "mc_")
+    
+    question_type = "gpt-gt-reason"
+    #visualize_with_random(20, offset = 40, question_type = "gpt-gt-reason")
+    #visualize_with_gpt_with_tim(20, offset = 40, question_type = "gpt-gt-reason")
+    visualize_with_gpt_with_avion(20, offset = 40, question_type = "gpt-gt-reason")
diff --git a/llava/action/utils.py b/llava/action/utils.py
@@ -232,7 +232,7 @@ def format_task_related_prompt(question, question_type, meta_data = None, perspe
     """
     
     if perspective == "first_person":
-        perspective_prefix = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing?"
+        perspective_prefix = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing? Note that you need to use first person perspective."
     elif perspective == "third_person":
         perspective_prefix = "The video is taken from egocentric view. The person's hands are sometimes interacting with objects. What action is the person doing?"