Added vis utils

Ye Shaokai · Ye Shaokai · commit b84d560afd1d · 2025-02-19T10:51:03.000+01:00
diff --git a/llava/action/benchmark.py b/llava/action/benchmark.py
@@ -5,6 +5,23 @@
 import glob
 import json
 import os
+import re
+
+def process_raw_pred(raw_pred):
+    """Return the answer text of ``raw_pred`` without its option prefix.
+
+    Inputs containing ``None`` only have every ``'None. '`` removed. Otherwise
+    the text captured by the last uppercase-letter prefix match (``A. ``) is
+    returned, cut at its first period; with no match the input is unchanged.
+    """
+    lettered = re.findall(r"[A-Z]\.\s(.+)", raw_pred)
+    if 'None' in raw_pred:
+        return raw_pred.replace('None. ', '')
+    if not lettered:
+        return raw_pred
+    # The substitution drops everything from the first period onward.
+    return re.sub(r"\.\s*.*$", "", lettered[-1])
+
 # root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
 # annotation_file = '/data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_validation.csv'
 # avion_prediction_file = '/data/epic_kitchen/AVION_PREDS/avion_pred_ids_val.json'
@@ -23,7 +40,7 @@
 benchmark_testing = True
 
 
-def benchmark_avion_mcq(n_samples, gpt_model):
+def benchmark_avion_mcq(n_samples, gpt_model, action_representation, benchmark_testing = True, n_frames = 8):
 
     inferencer = GPTInferenceAnnotator(gpt_model,
                                        root,
@@ -39,7 +56,7 @@ def benchmark_avion_mcq(n_samples, gpt_model):
     inferencer.multi_process_run(n_samples = n_samples,
                                  offset = 0)
                                        
-def benchmark_tim_mcq(n_samples, gpt_model):
+def benchmark_tim_mcq(n_samples, gpt_model, action_representation, benchmark_testing = True, n_frames = 8):
     
     inferencer = GPTInferenceAnnotator(gpt_model,
                                         root,
@@ -54,7 +71,7 @@ def benchmark_tim_mcq(n_samples, gpt_model):
                                         topk = topk) 
     inferencer.multi_process_run(n_samples = n_samples, offset = 0)    
 
-def benchmark_random_mcq(n_samples, gpt_model):
+def benchmark_random_mcq(n_samples, gpt_model, action_representation, benchmark_testing = True, n_frames = 8):
     inferencer = GPTInferenceAnnotator(gpt_model,
                                        root,
                                        annotation_file,
@@ -75,18 +92,34 @@ def calcuate_acc_from_jsons(json_folder):
         print (file)
         preds = json.load(open(file))
         correct = 0
+        something = 0
         for k,v in preds.items():
+            options = v['options']
+            options = [process_raw_pred(e) for e in options]
+            
+            #assert v['gt_name'] in options, f"{v['gt_name']} not in {options}"
+            if v['gt_name'] not in options:
+                print ('what?', options)
+                print ('what?', v)
+                break
+            
             if v['gt_name'] == v['chatgpt_answer']:
                 correct+=1
+            else:
+                pass
+                #print ('wrong prediction! pred: gt', v['chatgpt_answer'] + "," + v['gt_name'])
         print ('acc ', correct/len(preds))
+        print ('gt not in options', something)
 
     
     
 if __name__ == '__main__':
-    # benchmark_avion_mcq(-1, 'gpt-4o-mini-2024-07-18')
-    # benchmark_tim_mcq(-1, 'gpt-4o-mini-2024-07-18')
-    # benchmark_random_mcq(-1, 'gpt-4o-mini-2024-07-18')
-    # benchmark_avion_mcq(-1, 'gpt-4o-2024-08-06')
-    # benchmark_tim_mcq(-1, 'gpt-4o-2024-08-06')
-    # benchmark_random_mcq(-1, 'gpt-4o-2024-08-06')    
-    calcuate_acc_from_jsons('gpt_full_benchmark_results')
+    # benchmark_avion_mcq(-1, 'gpt-4o-mini-2024-07-18', 'GT_random_narration', benchmark_testing = True, n_frames = 8)
+    # benchmark_tim_mcq(-1, 'gpt-4o-mini-2024-07-18', 'GT_random_narration', benchmark_testing = True, n_frames = 8)
+    # benchmark_random_mcq(-1, 'gpt-4o-mini-2024-07-18', 'GT_random_narration', benchmark_testing = True, n_frames = 8)
+    # benchmark_avion_mcq(-1, 'gpt-4o-2024-08-06', 'GT_random_narration', benchmark_testing = True, n_frames = 8)
+    # benchmark_tim_mcq(-1, 'gpt-4o-2024-08-06', 'GT_random_narration', benchmark_testing = True, n_frames = 8)
+    # benchmark_random_mcq(-1, 'gpt-4o-2024-08-06', 'GT_random_narration', benchmark_testing = True, n_frames = 8)
+    benchmark_tim_mcq(1, 'gpt-4o-mini-2024-07-18', 'official_key', benchmark_testing = False, n_frames = 16)
+    #benchmark_tim_mcq(-1, 'gpt-4o-mini-2024-07-18', 'GT_random_narration', benchmark_testing = False, n_frames = 16)
+    #calcuate_acc_from_jsons('gpt_EK100_results')
diff --git a/llava/action/chatgpt_utils.py b/llava/action/chatgpt_utils.py
@@ -483,6 +483,9 @@ def multi_process_run(self, offset= 0, n_samples = -1, disable_api_calling = Fal
         if combined_results and 'mc_' in self.question_type:
             calculation = calculate_gpt_accuracy(data = combined_results)
 
+        if n_samples == -1:
+            n_samples = len(self.data)
+        
         checkpoint_name = f"{self.gpt_model}_{self.gen_type}_{self.action_representation}_top{self.topk}_{self.clip_length}f_{n_samples}samples.json"
 
         if self.do_visualization:
diff --git a/llava/action/make_visualizations.py b/llava/action/make_visualizations.py
@@ -150,9 +150,41 @@ def save_visualization(vis_folder, frames, uid):
     out_dir = Path(vis_folder)
     out_dir.mkdir(parents=True, exist_ok=True)        
     sub_folder = out_dir / uid
+    fps = 30
+    height, width = frames[0].shape[:2]
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    video_path = str(sub_folder / f"{uid}.mp4")
+    video_out = cv2.VideoWriter(video_path, fourcc, fps, (width, height))
     sub_folder.mkdir(parents=True, exist_ok=True)
     for idx, frame in enumerate(frames):            
         cv2.imwrite(str(sub_folder / f"{uid}_{idx}.jpg"), cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))    
+        bgr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+        video_out.write(bgr_frame)
+    video_out.release()
+
+def visualize_with_uid(uid):
+    """Load the clip named by ``uid`` and save its frames under ``figure1_vis``.
+
+    ``uid`` looks like ``P28-P28_16_73.84_74.66``: video id, start, end.
+    Reads ``root`` and ``n_frames``, which must be defined elsewhere in the module.
+    """
+    from llava.action.utils import avion_video_loader
+    video_id, start_text, end_text = uid.rsplit('_', 2)  # last two fields are the timestamps
+    vid_path = video_id.replace('-', '/')  # e.g. 'P28-P28_16' -> 'P28/P28_16'
+    start_timestamp, end_timestamp = float(start_text), float(end_text)
+    print (vid_path, start_timestamp, end_timestamp)
+    frames, _ = avion_video_loader(root,
+                                   vid_path,
+                                   'MP4',
+                                   start_timestamp,
+                                   end_timestamp,
+                                   chunk_len = 15,
+                                   clip_length = n_frames,
+                                   threads = 1,
+                                   fast_rrc=False,
+                                   fast_rcc = False,  # NOTE(review): possibly a typo of fast_rrc; confirm against avion_video_loader
+                                   jitter = False)
+    save_visualization("figure1_vis", frames, uid)
     
 def visualize_with_llava(pretrained_path, uid, question_type, gen_type):
     """    
@@ -216,7 +248,9 @@ def visualize_with_llava(pretrained_path, uid, question_type, gen_type):
     
     #visualize_with_gpt_with_avion(10, offset = 100, question_type = "caption")
     #llava_pretrained_path = 'lmms-lab/LLaVA-Video-7B-Qwen2'
-    llava_pretrained_path = 'experiments/LLaVA-Video-7B-Qwen2'
-    uid = 'P01-P01_11_182.65_192.07'
-    visualize_with_llava(llava_pretrained_path, uid, 'caption', 'tim')
-    
+    # llava_pretrained_path = 'experiments/LLaVA-Video-7B-Qwen2'
+    # uid = 'P01-P01_11_182.65_192.07'
+    # visualize_with_llava(llava_pretrained_path, uid, 'caption', 'tim')
+    visualize_with_uid("P28-P28_16_73.84_74.66")
+    visualize_with_uid("P28-P28_15_50.66_51.69")
+    visualize_with_uid("P26-P26_41_113.0_114.1")
diff --git a/llava/action/utils.py b/llava/action/utils.py
@@ -18,10 +18,12 @@
 from collections import defaultdict
 import json
 from llava.utils import rank0_print
-
+import re
 # set random seed
 random.seed(42)
 
+
+
 def remove_sub_nouns(nlp, narration, verb, nouns):
     narration = copy.deepcopy(narration)
     noun_list = ast.literal_eval(nouns)
@@ -433,9 +435,8 @@ def generate_multi_choice(self,
     def train_generate(self, gt_vn, narration, k, action_representation, n_narrations, labels, mapping_vn2narration, verb_maps, noun_maps, benchmark_testing = False):
         # letters as A, B, C, D, .. Note we maximally support 26 letters
         letters = [chr(65+i) for i in range(26)][:k]                
-        answer_list = [vn for vn in mapping_vn2narration.keys()]     
         
-                   
+        answer_list = [vn for vn in mapping_vn2narration.keys()]                                
         wrong_answers = np.random.choice(answer_list, size = k-1, replace = False)        
         answer_ids = [gt_vn] + list(wrong_answers)
         random.shuffle(answer_ids)
@@ -456,7 +457,9 @@ def train_generate(self, gt_vn, narration, k, action_representation, n_narration
 
         gt_letter = letters[answer_ids.index(gt_vn)]
         gt_answer = answers[answer_ids.index(gt_vn)]
-
+        print ('got here')
+        import sys
+        sys.exit()
         mc_data = {
                 'options': {0: options},
                 # the correct letter in mc
diff --git a/llava/action/vis_utils.py b/llava/action/vis_utils.py
diff --git a/llava/model/language_model/llava_qwen.py b/llava/model/language_model/llava_qwen.py