@@ -212,7 +212,7 @@ def run(self, indices):
212212 ret = {}
213213
214214 for k ,v in tqdm (data_batch .items ()):
215- parsed_item = self .parse_item (v )
215+ parsed_item = self .parse_item (v )
216216 start_timestamp = parsed_item ['start_second' ]
217217 end_timestamp = parsed_item ['end_second' ]
218218 vid_path = parsed_item ['vid_path' ]
@@ -274,11 +274,10 @@ def explore_wrong_examples(self):
274274 def predict_images (self , images , parsed_item ):
275275 """
276276 Predict the action from the images
277- """
278-
277+ """
279278 option_text = parsed_item ['options' ]
280- start_second = parsed_item [ 'start_second' ]
281- end_second = parsed_item ['end_second' ]
279+ start_second = 0
280+ end_second = parsed_item ['end_second' ] - parsed_item [ 'start_second' ]
282281 temperature = 0
283282 system_prompt_prefix = f"""
284283 You are seeing video frames from an egocentric view of a person. Pretend that you are the person. Your task is to describe what action you are performing.
@@ -399,8 +398,8 @@ def annotate(self, images, data_item):
399398 """
400399 gt_answer = data_item ['gt_answer' ]
401400 option_text = data_item ['options' ]
402- start_second = data_item [ 'start_second' ]
403- end_second = data_item ['end_second' ]
401+ start_second = 0
402+ end_second = data_item ['end_second' ] - data_item [ 'start_second' ]
404403 temperature = 0
405404 system_prompt_prefix = f"""
406405You are seeing video frames from an egocentric view of a person.
@@ -489,5 +488,5 @@ def calculate_gpt_accuracy(path):
489488
490489 #multi_process_annotate(train_file_path, root)
491490 #explore_wrong_examples(root, pred_folder)
492- # multi_process_inference(root, pred_folder, debug = False )
493- calculate_gpt_accuracy ('valset_chatgpt_inference_results/gpt-4o-avion_top10_4frames.json' )
491+ multi_process_inference (root , pred_folder , debug = True )
492+ # calculate_gpt_accuracy('valset_chatgpt_inference_results/gpt-4o-avion_top10_4frames.json')
0 commit comments