 import numpy as np
 import openai
 from pydantic import BaseModel
-from multiprocessing.pool import Pool
+from concurrent.futures import ProcessPoolExecutor
 from action.utils import avion_video_loader
+import torch
 import cv2
+from pathlib import Path

 client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))  # requires OPENAI_API_KEY in the environment

@@ -25,11 +27,21 @@ class MultiChoiceResponse(BaseModel):
2527 """
2628
2729 explanation : str
28-
29-
3030def split_indices (indices , num_chunks ):
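+    """Split `indices` into `num_chunks` contiguous, near-equal chunks.

+    The first len(indices) % num_chunks chunks get one extra element, e.g.
+    split_indices(list(range(7)), 3) -> [[0, 1, 2], [3, 4], [5, 6]].
+    """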
+    # Calculate the size of each chunk and the remainder
     chunk_size = len(indices) // num_chunks
-    return [indices[i:i + chunk_size] for i in range(0, len(indices), chunk_size)]
+    remainder = len(indices) % num_chunks
+
+    # Create chunks, distributing the remainder across the first few chunks
+    chunks = []
+    start = 0
+    for i in range(num_chunks):
+        # Each of the first 'remainder' chunks will have one extra element
+        end = start + chunk_size + (1 if i < remainder else 0)
+        chunks.append(indices[start:end])
+        start = end
+
+    return chunks

 class GPTAnnotator:
     def __init__(self, ann_file, data_root, clip_length=32):
@@ -51,8 +63,10 @@ def prepare_multiple_images(self, images):

         """
         encoded_image_list = []
-
         for image in images:
+
+            # torch tensors must be moved to CPU and detached before numpy conversion
+            if isinstance(image, torch.Tensor):
+                image = image.cpu().detach().numpy()
             # images from matplotlib etc.
             if isinstance(image, io.BytesIO):
                 image_bytes = image
@@ -82,6 +96,7 @@ def extract_frames(self, data_root, vid_path, start_second, end_second):
             'MP4',
             start_second,
             end_second,
+            chunk_len=15,  # assumes the source videos are stored as 15-second chunks
             clip_length=self.clip_length,
             threads=1,
             fast_rrc=False,
@@ -98,52 +113,69 @@ def parse_conversation(self, item):
         human_dict = conversations[0]

         # the offset is to remove the quote '
-        option_start = human_dict['value'].index['['] + 2
-        option_end = human_dict['value'].index[']'] - 1
+        option_start = human_dict['value'].index('[') + 2
+        option_end = human_dict['value'].index(']') - 1

         option_text = human_dict['value'][option_start:option_end]
         gpt_dict = conversations[1]
         gt_answer = gpt_dict['value']
+        # keep the answer text from the '.' onward, dropping the option letter
+        gt_answer = gt_answer[gt_answer.index('.'):].strip()

         assert human_dict['from'] == 'human' and gpt_dict['from'] == 'gpt'

         ret = {'options': option_text,
                'gt_answer': gt_answer,
                'start_second': item['start_timestamp'],
-               'end_second': item['end_timestemp']}
+               'end_second': item['end_timestamp']}

         return ret

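+    # Illustrative shape of one annotation item consumed above (values are
+    # hypothetical, inferred from the field accesses in this class):
+    # {"video": "P01-P01_01", "start_timestamp": 10.0, "end_timestamp": 12.5,
+    #  "conversations": [
+    #      {"from": "human", "value": "... ['A. cut onion', 'B. wash pan'] ..."},
+    #      {"from": "gpt", "value": "A. cut onion"}]}
+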
     def annotate(self, indices):

         data_batch = [self.data[i] for i in range(len(self.data)) if i in indices]

-        for item in data_batch:
+        ret = {}
+        for index in indices:
+            item = self.data[index]
             start_timestamp = item['start_timestamp']
             end_timestamp = item['end_timestamp']
             vid_path = '{}/{}'.format(item['video'].split('-')[0], item['video'].split('-')[1])
             frames, time_meta = self.extract_frames(self.data_root, vid_path, start_timestamp, end_timestamp)
-            data_item = self.parse_conversation(item)
-            anno = self.annotate_images(frames, data_item)
-            print(anno)
+            parsed_item = self.parse_conversation(item)
+            gpt_answer = self.annotate_images(frames, parsed_item).explanation
+            item['conversations'][1]['value'] = gpt_answer
+            ret[index] = item
+            # NOTE: only the first index is processed because of this break
             break

+        return ret
+
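+    # Usage sketch (hypothetical paths):
+    #   annotator = GPTAnnotator('train_convs_narration.jsonl', '/data/EK100')
+    #   annotated = annotator.annotate([0, 1, 2])  # -> {index: updated item}
+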
     def annotate_images(self, images, data_item):
         """
         Annotate the given frames with the parsed multiple-choice data
         (the dict returned by parse_conversation).
         """
         gt_answer = data_item['gt_answer']
-        option_text = data_item['option_text']
+        option_text = data_item['options']
         start_second = data_item['start_second']
         end_second = data_item['end_second']
         temperature = 0
         system_prompt_prefix = f"""
-You are seeing video frames from an egocentric view. You are determining what action the person is performing.
-The video's start time is {start_second} and the end time is {end_second}.
-In a multi-choice video question answering, you were given following options {option_text} and the correct answer is {gt_answer}.
-Please describe what you see and why wrong answers are wrong and why right answer is right.
+You are seeing video frames from the egocentric view of a person.
+Please speak as if you are the person in the video and describe the action you are performing.
+To help you describe the action, the video's start time is {start_second}, its end time is {end_second}, and its duration is {end_second - start_second} seconds.
+To further assist you, note that in a multiple-choice video question-answering task you were given the options {option_text} and the correct answer is {gt_answer}.
+In addition to describing what you see, explain why the wrong answers are wrong and why the right answer is right, using the following flow of reasoning:
+
+1. What objects need to be visible to support the answer?
+2. What sequence of actions before and after the current action needs to be seen to support the answer?
+3. Does the duration support the answer?
+
+Based on your answers to the questions above, explain why the right answer is right and why the wrong answers are wrong.
+
 """
         system_prompt_suffix = """"""

@@ -165,21 +197,54 @@ def annotate_images(self, images, data_item):
         return response.choices[0].message.parsed


-def annotate_using_chatgpt():
-    """
-    Multi processing to speed up
-    """
-    with Pool() as pool:
-        pass
-        #pool.starmap(annotate, task_args)
+def process_subset(indices_subset, train_file_path, root):
+    # Initialize a new annotator instance within each process
+    annotator = GPTAnnotator(train_file_path, root)
+    return annotator.annotate(indices_subset)
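+
+# NOTE: each worker builds its own GPTAnnotator (and thus its own OpenAI client);
+# this avoids assuming the client object can be pickled across processes.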

-    pass

+if __name__ == '__main__':
+    #train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl'
+    #root = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100'
+    train_file_path = '/data/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl'
+    root = '/data/EK100/EK100_320p_15sec_30fps_libx264'
+
+    num_cores = 2  # os.cpu_count()

+    print(f'Using {num_cores} cores, splitting the data into {num_cores} chunks')

-if __name__ == '__main__':
-    train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl'
-    root = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100'
+    with open(train_file_path, 'r') as f:
+        num_lines = sum(1 for _ in f)

+    print(f'Total number of lines in the file: {num_lines}')
+    indices = list(range(num_lines))
+    print('indices', len(indices))

-    GPTAnnotator(train_file_path, root)
+    indices_groups = split_indices(indices, num_cores)
+
+    print('number of groups', len(indices_groups))
+
+    with ProcessPoolExecutor(max_workers=num_cores) as executor:
+        # Pass the shared file path and data root to each worker
+        futures = [executor.submit(process_subset, group, train_file_path, root) for group in indices_groups]
+
+        # Wait for all futures to complete and merge their {index: item} dicts
+        combined_results = {}
+        for future in futures:
+            combined_results.update(future.result())
+
+    keys = sorted(combined_results.keys())
+
+    print('resulting number of keys', len(keys))
+
+    result = [combined_results[key] for key in keys]
+
+    anno_root = Path(train_file_path).parent
+
+    with open(anno_root / 'gpt_annotated.jsonl', 'w') as f:
+        for item in result:
+            f.write(json.dumps(item) + '\n')
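+
+    # Sanity check (illustrative): reload the annotations that were just written
+    # with open(anno_root / 'gpt_annotated.jsonl') as f:
+    #     annotated = [json.loads(line) for line in f]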