
Commit e9b81f2

Author: Haozhe Qi
Commit message: better inference behavior
Parent: 5e791f9

3 files changed: 80 additions, 42 deletions


llava/action/llava_inference.py

Lines changed: 12 additions & 17 deletions

@@ -14,7 +14,7 @@ def llava_inference(
                     tokenizer,
                     model,
                     image_processor,
-                    mc_data,
+                    input,
                     clip_length = 16,
                     num_frames = 16,
                     temperature = 0,
@@ -44,35 +44,29 @@ def llava_inference(
         image_tensors.append(frames)
 
     conv_template = "qwen_1_5"
-
-    options = mc_data['options'][0]
+    original_input = input
+    if isinstance(input, dict):
+        input = input['options'][0] if input else None
+
     if test_type == 'base':
         question_type = "mc_top5_official_key"
-    elif test_type == "direct_narration":
-        question_type = "direct_narration"
-    elif test_type == 'caption' or test_type == 'debug':
-        question_type = "caption"
-    elif test_type == 'temporal_cot_pseudo':
-        question_type = 'temporal_cot_pseudo'
-    elif test_type == 'temporal_cot_oracle':
-        question_type = 'temporal_cot_oracle'
-    elif test_type == 'temporal_cot_caption':
-        question_type = 'temporal_cot_caption'
+    else:
+        question_type = test_type
 
     if test_type == 'caption_then_answer':
         caption_answer = llava_inference([video_frames],
                                          tokenizer,
                                          model,
                                          image_processor,
-                                         mc_data,
+                                         original_input,
                                          test_type = 'caption',
                                          clip_length = clip_length,
                                          num_frames = num_frames,
                                          temperature = 0,
                                          time_meta = time_meta)
 
         question = format_llava_prompt(DEFAULT_IMAGE_TOKEN,
-                                       options,
+                                       input,
                                        video_duration,
                                        n_frames,
                                        "mc_top5_official_key",
@@ -85,7 +79,7 @@ def llava_inference(
 
     else:
         question = format_llava_prompt(DEFAULT_IMAGE_TOKEN,
-                                       options,
+                                       input,
                                        video_duration,
                                        n_frames,
                                        question_type,
@@ -102,7 +96,8 @@ def llava_inference(
     conv.append_message(conv.roles[0], question)
     conv.append_message(conv.roles[1], None)
     prompt_question = conv.get_prompt()
-
+    print ("what is the question?", question)
+
 
     input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
     image_sizes = [frame.size for frame in video_frames]
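
After this change, llava_inference accepts either the multiple-choice dict it previously required or a plain question string. A minimal sketch of the two call shapes, assuming tokenizer, model, image_processor, frames, and time_meta already exist from the surrounding pipeline (the option strings and the question text are illustrative, not from the repo):

    # Multiple-choice path: a dict whose 'options' field holds candidate lists;
    # the function unwraps it via input['options'][0].
    mc_data = {'options': [['open drawer', 'close drawer', 'pick up knife']]}
    pred = llava_inference([frames], tokenizer, model, image_processor, mc_data,
                           test_type='base', clip_length=16, num_frames=16,
                           temperature=0, time_meta=time_meta)

    # Open-ended path: a plain string fails the isinstance(input, dict) check and
    # reaches format_llava_prompt unchanged; question_type falls through to test_type.
    question = 'What am I holding in my right hand?'
    pred = llava_inference([frames], tokenizer, model, image_processor, question,
                           test_type='open-ended', clip_length=16, num_frames=16,
                           temperature=0, time_meta=time_meta)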

llava/action/selective_inference.py

Lines changed: 65 additions & 25 deletions

@@ -3,12 +3,21 @@
 """
 from llava.action.ek_eval import prepare_llava
 from llava.action.generate_interval_pred import get_lookup_dict
-from llava.action.inference import llava_inference
-
-val_metadata = '/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv'
-root = '/data/shaokai/EK100_512/EK100'
+from llava.action.llava_inference import llava_inference
+
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+# val_metadata = '/data/shaokai/epic-kitchens-100-annotations/EPIC_100_validation.csv'
+# root = '/data/shaokai/EK100_512/EK100'
+val_metadata = '/iopsstor/scratch/cscs/hqi/VFM/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv'
+root = '/iopsstor/scratch/cscs/hqi/VFM/onevision/EK100_512/EK100'
+
 n_frames = 32
 action_representation = 'GT_random_narration'
+perspective = 'first_person'
+include_time_instruction = False
+image_token = DEFAULT_IMAGE_TOKEN
+
+
 
 def get_frames_by_uid(uid, root):
     from llava.action.utils import avion_video_loader
@@ -29,32 +38,63 @@ def get_frames_by_uid(uid, root):
                                fast_rrc=False,
                                fast_rcc = False,
                                jitter = False)
-    return frames
+    return frames, time_meta
+#
+
+
+
+
 
-def inference_task_by_uid(checkpoint_folder, uid, task):
+# for prior actions
+def get_meta_data():
+    pass
+
+
+def inference_task_by_uid(question, checkpoint_folder, uid, task):
 
     tokenizer, model, image_processor, max_length = prepare_llava(checkpoint_folder)
 
-    frames = get_frames_by_uid(uid, root)
-
+    frames, time_meta = get_frames_by_uid(uid, root)
+
+    meta_data = None
+    learn_neighbor_actions = ""
     if 'temporal_cot' in task:
-        get_lookup_dict(val_metadata,
+        lookup_table = get_lookup_dict(val_metadata,
                         action_representation,
                         test_type = task,
                         pseudo_folder = '')
-    pred = llava_inference(
-        frames,
-        tokenizer,
-        model,
-        image_processor,
-        mc_data,
-        test_type = test_type,
-        clip_length = clip_length,
-        num_frames=num_frames,
-        temperature = temperature,
-        time_meta = time_meta,
-        learn_neighbor_actions = learn_neighbor_actions,
-        meta_data = meta_data,
-        perspective = perspective,
-        include_time_instruction = include_time_instruction
-    )
+        meta_data = lookup_table.get(uid, None)
+        learn_neighbor_actions = "prior"
+
+    video_duration = time_meta['duration']
+
+
+    pred = llava_inference(
+        [frames],
+        tokenizer,
+        model,
+        image_processor,
+        question,
+        test_type = task,
+        clip_length = n_frames,
+        num_frames= n_frames,
+        temperature = 0,
+        time_meta = time_meta,
+        learn_neighbor_actions = learn_neighbor_actions,
+        meta_data = meta_data,
+        perspective = perspective,
+        include_time_instruction = include_time_instruction
+    )
+    print (pred)
+
+if __name__ == '__main__':
+    pretrained_model_folder = 'experiments/dev_LLaVA-Video-7B-Qwen2_64f_top5_gpt4o_avion_tim_last_layer_one_token_detection_direct_neighbor_178K_100percent_time'
+    uid = 'P28-P28_15_50.66_51.69'
+    task = 'open-ended'
+    question = "What is the object that is to the left of the knife?"
+
+    inference_task_by_uid(question,
                          pretrained_model_folder,
                          uid,
                          task)
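
The new __main__ block doubles as a usage example. A sketch of calling the entry point for a different open-ended question, assuming a trained checkpoint folder and the EK100 paths configured at the top of the module (the checkpoint name below is hypothetical):

    from llava.action.selective_inference import inference_task_by_uid

    # get_frames_by_uid now returns (frames, time_meta), so the function can read
    # the clip duration internally; the caller only supplies these four arguments.
    inference_task_by_uid('Which hand reaches the tap first?',
                          'experiments/my_checkpoint',  # hypothetical checkpoint folder
                          'P28-P28_15_50.66_51.69',
                          'open-ended')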

llava/action/utils.py

Lines changed: 3 additions & 0 deletions

@@ -291,6 +291,9 @@ def format_task_related_prompt(question, question_type, meta_data = None, perspe
     elif question_type == "dpo":
         ret = "You are seeing this video from egocentric view and you are the person. Your hands are sometimes interacting with obects. Describe in details what you see and what you are doing."
 
+    elif question_type == "open-ended":
+        ret = f"You are seeing this video from egocentric view and you are the person. {question}"
+
     elif question_type == "gpt-gt-instruct-reason":
         ret = question
     elif question_type == "gpt-hand-object":
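
This branch is what makes task = 'open-ended' in selective_inference.py resolve to a valid prompt. A trivially runnable check of what the template expands to for the question used there:

    question = "What is the object that is to the left of the knife?"
    # Mirrors the new f-string branch above; prints the full prompt on one line.
    ret = f"You are seeing this video from egocentric view and you are the person. {question}"
    print(ret)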
