WIP: switch visualization to gpt-4o-mini, add "caption" question type, stub visualize_with_llava

Ye Shaokai · Ye Shaokai · commit c6a1a25c635f · 2025-02-07T09:56:17.000+01:00
diff --git a/llava/action/make_visualizations.py b/llava/action/make_visualizations.py
@@ -25,8 +25,8 @@
 n_frames = 32
 topk = 5
 action_representation = 'GT_random_narration'
-#gpt_model = 'gpt-4o-mini-2024-07-18'
-gpt_model = 'gpt-4o-2024-08-06'
+gpt_model = 'gpt-4o-mini-2024-07-18'
+#gpt_model = 'gpt-4o-2024-08-06'
 perspective = 'first_person'
 benchmark_testing = True
 
@@ -89,10 +89,15 @@ def visualize_with_gpt_with_avion(n_samples, offset = 0, question_type = 'mc_'):
                                         topk = topk) 
     
     inferencer.multi_process_run(n_samples = n_samples, offset = offset, disable_api_calling=False) 
+    
+    
+def visualize_with_llava(uid, ):
+    """
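+    TODO: not implemented yet -- placeholder stub; presumably will visualize LLaVA output for the sample identified by `uid` (confirm).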
+    """
+        
+    
 
 if __name__ == '__main__':
     
-    question_type = "gpt-gt-reason"
-    #visualize_with_random(20, offset = 40, question_type = "gpt-gt-reason")
-    #visualize_with_gpt_with_tim(20, offset = 40, question_type = "gpt-gt-reason")
-    visualize_with_gpt_with_avion(20, offset = 40, question_type = "gpt-gt-reason")
+    visualize_with_gpt_with_avion(10, offset = 100, question_type = "caption")
diff --git a/llava/action/utils.py b/llava/action/utils.py
@@ -232,7 +232,7 @@ def format_task_related_prompt(question, question_type, meta_data = None, perspe
     """
     
     if perspective == "first_person":
-        perspective_prefix = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing? Note that you need to use first person perspective."
+        perspective_prefix = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with objects. What action are you doing? "
     elif perspective == "third_person":
         perspective_prefix = "The video is taken from egocentric view. The person's hands are sometimes interacting with objects. What action is the person doing?"
                     
@@ -262,8 +262,8 @@ def format_task_related_prompt(question, question_type, meta_data = None, perspe
             
     elif question_type == "temporal_detection":
         ret = question
-    elif question_type == "gpt-gt-reason":
-        ret = f"{perspective_prefix}Describe in details what you see from the video frames."
+    elif question_type == "gpt-gt-reason" or question_type == "caption":
+        ret = f"{perspective_prefix} Describe in details what you see from the video frames. You must talk in the first person perspective. Try to focus on what you are doing. "
     
     elif question_type == "triple_direct_answer":
         assert meta_data