@@ -70,6 +70,7 @@ def select_train_subset(self):
7070 def init_data (self ):
7171 ret = {}
7272 csv_reader = csv .reader (open (self .annotation_file ))
73+ print ('loading data from ' , self .annotation_file )
7374 _ = next (csv_reader ) # skip the header
7475
7576 indices = self .select_train_subset ()
@@ -141,6 +142,11 @@ def predict_images(self, images, parsed_item):
141142
142143 system_prompt = time_instruction + task_related_prompt
143144
145+ suffix = " Note that you need to use first person perspective. Make sure you do not mention you are watching a video or an image."
146+
147+ system_prompt += suffix
148+
149+
144150 format_prompt = """
145151**Return only a JSON object** with the following two properties:
146152
@@ -150,9 +156,7 @@ def predict_images(self, images, parsed_item):
150156
151157 if 'o1' in self .gpt_model :
152158 system_prompt += format_prompt
153-
154- print (system_prompt )
155-
159+
156160 if 'o1-mini' == self .gpt_model :
157161 system_role = "user"
158162 temperature = 1
@@ -270,13 +274,26 @@ def create_comparison_data(positive_filename, negative_filename, out_filename):
270274 video_root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
271275 anno_root = '/data/shaokai/epic-kitchens-100-annotations/'
272276 clip_length = 8
277+ gpt_model = 'gpt-4o'
273278
274- # cap = CaptionInference(video_root,
275- # os.path.join(anno_root, 'EPIC_100_train.csv'),
276- # clip_length,
277- # debug = False,
278- # fraction = 0.01)
279- # cap.multi_process_run(n_samples = -1, filename = f'gpt4o_inference_{clip_length}frame_1percent.json')
280-
281-
282- create_comparison_data ('gpt4o_inference_8frame_1percent.json' , 'gpt4o_inference_1frame_1percent.json' , 'comparison_data_1percent.jsonl' )
279+ cap = CaptionInference (
280+ gpt_model ,
281+ video_root ,
282+ os .path .join (anno_root , 'EPIC_100_train.csv' ),
283+ clip_length ,
284+ debug = False ,
285+ fraction = 0.1 )
286+ cap .multi_process_run (n_samples = - 1 , filename = f'gpt4o_inference_{ clip_length } frame_10percent.json' )
287+
288+ clip_length = 1
289+ cap = CaptionInference (
290+ gpt_model ,
291+ video_root ,
292+ os .path .join (anno_root , 'EPIC_100_train.csv' ),
293+ clip_length ,
294+ debug = False ,
295+ fraction = 0.1 )
296+ cap .multi_process_run (n_samples = - 1 , filename = f'gpt4o_inference_{ clip_length } frame_10percent.json' )
297+
298+
299+ create_comparison_data ('gpt4o_inference_8frame_10percent.json' , 'gpt4o_inference_1frame_10percent.json' , 'comparison_data_10percent.jsonl' )
0 commit comments