
Commit 6ff8a9f

merge conflict
2 parents 34a8659 + 72db420

File tree: 4 files changed, +36 -12 lines

  action/chatgpt_utils.py         +29 -7
  action/ek_eval.py                +0 -2
  action/llava_ov_inference.py     +6 -2
  llava/train/train.py             +1 -1

action/chatgpt_utils.py

Lines changed: 29 additions & 7 deletions

@@ -150,14 +150,21 @@ class GPTInferenceAnnotator(ChatGPT):
     Given the images, this class will annotate the video frames
     """

-    def __init__(self, root, prediction_save_folder, clip_length = 4, debug = False):
+    def __init__(self,
+                 root,
+                 prediction_save_folder,
+                 clip_length = 4,
+                 debug = False,
+                 topk = 10
+                 ):
         super().__init__(clip_length = clip_length)
         self.root = root
         self.prediction_save_folder = prediction_save_folder
         self.prediction_analysis = PredictionAnalysis(self.prediction_save_folder)
         self.prediction_analysis.load()
         self.data = self.prediction_analysis.data
         self.debug = debug
+        self.topk = topk

     def multi_process_run(self):
         prediction_analysis = PredictionAnalysis(self.prediction_save_folder)
@@ -187,7 +194,14 @@ def multi_process_run(self):
     def parse_item(self, item):

         gt_name = item['gt_name']
-        avion_predictions = item['avion_preds']['predictions']
+        avion_predictions = item['avion_preds']['predictions']
+        assert self.topk <= len(avion_predictions)
+        avion_predictions = avion_predictions[:self.topk]
+        # _avion_predictions = [e.replace(':', ' ', 1) for e in avion_predictions]
+        # if gt_name not in _avion_predictions:
+        #     print ('gt_name not in avion_predictions')
+        # else:
+        #     print ('gt_name in avion_predictions')

         vid_path = item['vid_path'][0]
         start_second = item['start_second']
@@ -453,12 +467,17 @@ def explore_wrong_examples(root, prediction_save_folder, debug = False):
                                       debug = debug)
     annotator.explore_wrong_examples()

-def multi_process_inference(root, prediction_save_folder, debug = False):
+def multi_process_inference(root,
+                            prediction_save_folder,
+                            clip_length = 4,
+                            topk = 10,
+                            debug = False):

     annotator = GPTInferenceAnnotator(root,
                                       prediction_save_folder,
-                                      clip_length = 32,
-                                      debug = debug)
+                                      clip_length = clip_length,
+                                      debug = debug,
+                                      topk = topk)

     annotator.multi_process_run()

@@ -488,5 +507,8 @@ def calculate_gpt_accuracy(path):

     #multi_process_annotate(train_file_path, root)
     #explore_wrong_examples(root, pred_folder)
-    multi_process_inference(root, pred_folder, debug = True)
-    #calculate_gpt_accuracy('valset_chatgpt_inference_results/gpt-4o-avion_top10_4frames.json')
+    multi_process_inference(root,
+                            pred_folder,
+                            debug = False,
+                            clip_length = 4,
+                            topk = 5)
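The change threads a new topk argument from the entry point down to GPTInferenceAnnotator.parse_item, where the ranked AVION predictions are truncated before being handed to GPT. A minimal standalone sketch of that truncation, with a hypothetical item dict (the key layout follows the diff; the sample predictions are invented for illustration):

# Hypothetical data mirroring the item structure used in parse_item.
item = {
    'gt_name': 'move:knife',
    'avion_preds': {
        'predictions': ['move:knife', 'cut:onion', 'open:drawer',
                        'wash:hand', 'pick:plate', 'close:fridge',
                        'stir:pan', 'take:spoon', 'put:cup', 'turn:tap'],
    },
}

topk = 5
avion_predictions = item['avion_preds']['predictions']
assert topk <= len(avion_predictions)         # same guard as in the diff
avion_predictions = avion_predictions[:topk]  # keep only the top-k candidates
print(avion_predictions)
# ['move:knife', 'cut:onion', 'open:drawer', 'wash:hand', 'pick:plate']

Lowering topk (the entry point now passes topk = 5) shrinks the candidate list GPT must choose from, which shortens the prompt at the risk of dropping the ground-truth action from the options; the commented-out block in parse_item checks exactly that case.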

action/ek_eval.py

Lines changed: 0 additions & 2 deletions

@@ -465,9 +465,7 @@ def evaluate_on_EK100(eval_args,
     global_total_samples.add_(1)

     logger.info(f'Process {dist.get_rank()} - local_total_samples: {local_total_samples:.4f}')
-
     logger.info(f'Process {dist.get_rank()} - loca_llava_correct: {llava_correct:.4f}')
-
     logger.info(f'Process {dist.get_rank()} - local_running_corrects: {local_running_corrects:.4f}')


action/llava_ov_inference.py

Lines changed: 6 additions & 2 deletions

@@ -83,8 +83,9 @@ def llava_video_process(
     video_duration = time_meta['duration'].item()
     n_frames = time_meta['n_frames'].item()
     frame_time = time_meta['frame_time']
-    frame_time = [e[0] for e in frame_time]
-    time_instruciton = f"The video lasts for {video_duration:.2f} seconds, and {n_frames} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."
+    print ('frame time', frame_time)
+    frame_time = frame_time[0]
+    time_instruciton = f"You are seeing a video taken from egocentric view. The video lasts for {video_duration:.2f} seconds, and {n_frames} frames are uniformly sampled from it. What is the person doing? Format your answer letter. verb noun such as A. move knife."

     frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)

@@ -97,12 +98,15 @@ def llava_video_process(

     question = DEFAULT_IMAGE_TOKEN + f"{time_instruciton}\n:{options}"

+    print ('what is the question')
+    print (question)

     conv = copy.deepcopy(conv_templates[conv_template])
     conv.append_message(conv.roles[0], question)
     conv.append_message(conv.roles[1], None)
     prompt_question = conv.get_prompt()

+
     input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
     image_sizes = [frame.size for frame in video_frames]

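The new prompt turns the generic time instruction into an egocentric multiple-choice question. A self-contained sketch of how the final question string comes together, with hypothetical metadata and options (DEFAULT_IMAGE_TOKEN is the standard "<image>" placeholder from the LLaVA codebase; only the f-strings are taken from the diff):

# Hypothetical values standing in for time_meta and the candidate options.
DEFAULT_IMAGE_TOKEN = "<image>"
video_duration = 8.53
n_frames = 4
options = "A. move knife\nB. cut onion\nC. open drawer"

# Variable name kept as spelled in the source ('time_instruciton').
time_instruciton = (
    f"You are seeing a video taken from egocentric view. "
    f"The video lasts for {video_duration:.2f} seconds, and {n_frames} "
    f"frames are uniformly sampled from it. What is the person doing? "
    f"Format your answer letter. verb noun such as A. move knife."
)

question = DEFAULT_IMAGE_TOKEN + f"{time_instruciton}\n:{options}"
print(question)  # this is what the added debug prints emit at runtime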

llava/train/train.py

Lines changed: 1 addition & 1 deletion

@@ -1232,7 +1232,7 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:
         processor = self.data_args.image_processor
         image = processor.preprocess(video, return_tensors="pt")["pixel_values"]
         if self.data_args.add_time_instruction:
-            time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."
+            time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. Please answer the following questions related to this video."
             sources[0]["conversations"][0]["value"] = f'{DEFAULT_IMAGE_TOKEN}\n{time_instruciton}\n{sources[0]["conversations"][0]["value"].replace(DEFAULT_IMAGE_TOKEN, "")}'
         image = [(image, video[0].size, "video")]
         sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
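On the training side the only change is dropping the per-frame timestamp list from the instruction. A quick before/after rendering with hypothetical values shows what the model now sees:

video_time = 8.53              # hypothetical clip duration in seconds
num_frames_to_sample = 4
frame_time = "0.00s, 2.13s, 4.27s, 6.40s"  # hypothetical timestamps

old = (f"The video lasts for {video_time:.2f} seconds, and "
       f"{num_frames_to_sample} frames are uniformly sampled from it. "
       f"These frames are located at {frame_time}."
       f"Please answer the following questions related to this video.")

new = (f"The video lasts for {video_time:.2f} seconds, and "
       f"{num_frames_to_sample} frames are uniformly sampled from it. "
       f"Please answer the following questions related to this video.")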
