
Commit b560afa

Ye Shaokai committed: WIP
1 parent 21e6b39 commit b560afa

4 files changed: +218 -151 lines changed
action/chatgpt_utils.py

Lines changed: 89 additions & 27 deletions
@@ -2,11 +2,12 @@
 import io
 import json
 import os
-import cv2
 import numpy as np
 import openai
 from pydantic import BaseModel
 from multiprocessing.pool import Pool
+from action.utils import avion_video_loader
+import cv2
 
 client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 
@@ -26,11 +27,24 @@ class MultiChoiceResponse(BaseModel):
     explanation: str
 
 
+def split_indices(indices, num_chunks):
+    chunk_size = len(indices) // num_chunks
+    return [indices[i:i + chunk_size] for i in range(0, len(indices), chunk_size)]
 
 class GPTAnnotator:
-    def __init__(self, prediction_file_path):
-        with open(prediction_file_path, 'r') as f:
-            self.prediction_file = json.load(f)
+    def __init__(self, ann_file, data_root, clip_length=32):
+        self.ann_file = ann_file
+        self.data_root = data_root
+        self.clip_length = clip_length
+        data = []
+        with open(ann_file, 'r') as f:
+            for line in f:
+                # each line of the annotation file is one JSON record
+                _data = json.loads(line)
+                # collect the parsed record
+                data.append(_data)
+        self.data = data
+
 
     def prepare_multiple_images(self, images):
         """
@@ -62,38 +76,84 @@ def prepare_multiple_images(self, images):
         return multi_image_content
 
 
-    def annotate(self, images):
+    def extract_frames(self, data_root, vid_path, start_second, end_second):
+        frames, time_meta = avion_video_loader(data_root,
+                                               vid_path,
+                                               'MP4',
+                                               start_second,
+                                               end_second,
+                                               clip_length=self.clip_length,
+                                               threads=1,
+                                               fast_rrc=False,
+                                               fast_rcc=False,
+                                               jitter=False)
+        return frames, time_meta
+
+    def parse_conversation(self, item):
         """
-        Annotate to do image caption only
+        We should get the time steps and the duration.
+        We should also get the ground-truth and the wrong answers.
         """
-        pass
+        conversations = item['conversations']
+        human_dict = conversations[0]
+
+        # the +2 / -1 offsets skip the bracket and strip the surrounding quotes
+        option_start = human_dict['value'].index('[') + 2
+        option_end = human_dict['value'].index(']') - 1
+
+        option_text = human_dict['value'][option_start:option_end]
+        gpt_dict = conversations[1]
+        gt_answer = gpt_dict['value']
+
+        assert human_dict['from'] == 'human' and gpt_dict['from'] == 'gpt'
 
-    def annotate_with_multichoice(self, images, mc_data):
+        ret = {'options': option_text,
+               'gt_answer': gt_answer,
+               'start_second': item['start_timestamp'],
+               'end_second': item['end_timestamp']}
+
+        return ret
+
+    def annotate(self, indices):
+
+        data_batch = [self.data[i] for i in range(len(self.data)) if i in indices]
+
+        for item in data_batch:
+            start_timestamp = item['start_timestamp']
+            end_timestamp = item['end_timestamp']
+            vid_path = '{}/{}'.format(item['video'].split('-')[0], item['video'].split('-')[1])
+            frames, time_meta = self.extract_frames(self.data_root, vid_path, start_timestamp, end_timestamp)
+            data_item = self.parse_conversation(item)
+            anno = self.annotate_images(frames, data_item)
+            print(anno)
+            break
+
+    def annotate_images(self, images, data_item):
         """
         Annotate with mc_data
-
         {
-
         }
-
         """
-
+        gt_answer = data_item['gt_answer']
+        option_text = data_item['options']
+        start_second = data_item['start_second']
+        end_second = data_item['end_second']
         temperature = 0
-        include_images = True
-
-        system_prompt_prefix = """Inspect the images from the video and explain why the answer of the multi-choice question is D. """
-        system_prompt_suffix = """Yes"""
+        system_prompt_prefix = f"""
+        You are seeing video frames from an egocentric view. You are determining what action the person is performing.
+        The video's start time is {start_second} and the end time is {end_second}.
+        In a multiple-choice video question answering task, you were given the following options {option_text}, and the correct answer is {gt_answer}.
+        Please describe what you see, why the wrong answers are wrong, and why the right answer is right.
+        """
+        system_prompt_suffix = """"""
 
         system_prompt = system_prompt_prefix + system_prompt_suffix
 
         system_message = [{"role": "system", "content": system_prompt}]
 
-        if include_images:
-            multi_image_content = self.prepare_multiple_images(images)
-            multi_modal_content = [{"type": "text", "text": ""}] + multi_image_content
-            user_message = [{"role": "user", "content": multi_modal_content}]
-        else:
-            user_message = [{"role": "user", "content": ""}]
+        multi_image_content = self.prepare_multiple_images(images)
+        multi_modal_content = [{"type": "text", "text": ""}] + multi_image_content
+        user_message = [{"role": "user", "content": multi_modal_content}]
 
         response = client.beta.chat.completions.parse(
             model=GPT_MODEL,
@@ -114,10 +174,12 @@ def annotate_using_chatgpt():
    #pool.starmap(annotate, task_args)

    pass
-
-def annotate_from_train_conv_file(train_file_path):
-    pass
+
+
 
 if __name__ == '__main__':
-    train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai'
-    annotate_from_train_conv_file(train_file_path)
+    train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl'
+    root = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100'
+
+
+    GPTAnnotator(train_file_path, root)
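
Note: the commented-out pool.starmap in annotate_using_chatgpt and the new split_indices helper suggest annotation is meant to be sharded across worker processes. A minimal sketch of that wiring, assuming the constructor arguments from the __main__ block above (annotate_chunk and num_chunks are illustrative names, not part of the commit; annotate() also still breaks after the first item, so this only shows the intended structure):

    from multiprocessing.pool import Pool
    from action.chatgpt_utils import GPTAnnotator, split_indices

    TRAIN_FILE = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl'
    DATA_ROOT = '/storage-rcp-pure/upmwmathis_scratch/shaokai/EK100'

    def annotate_chunk(indices):
        # each worker builds its own annotator (and OpenAI client)
        # and processes only its shard of the dataset indices
        annotator = GPTAnnotator(TRAIN_FILE, DATA_ROOT, clip_length=32)
        annotator.annotate(indices)

    if __name__ == '__main__':
        num_chunks = 4  # illustrative worker count
        annotator = GPTAnnotator(TRAIN_FILE, DATA_ROOT)
        chunks = split_indices(list(range(len(annotator.data))), num_chunks)
        with Pool(num_chunks) as pool:
            pool.map(annotate_chunk, chunks)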

action/ek_eval.py

Lines changed: 2 additions & 121 deletions
@@ -18,7 +18,7 @@
 import json
 import logging
 from llava.utils import rank0_print
-from action.utils import generate_label_map, MultiChoiceGenerator, match_answer, parse_avion_predictions
+from action.utils import generate_label_map, MultiChoiceGenerator, match_answer, parse_avion_predictions, avion_video_loader
 from action.prediction_analysis import PredictionAnalysis
 import copy
 from collections import Counter
@@ -33,125 +33,6 @@ def datetime2sec(str):
     hh, mm, ss = str.split(':')
     return int(hh) * 3600 + int(mm) * 60 + float(ss)
 
-
-def get_frame_ids(start_frame, end_frame, num_segments=32, jitter=True):
-    frame_ids = np.convolve(np.linspace(start_frame, end_frame, num_segments + 1), [0.5, 0.5], mode='valid')
-    if jitter:
-        seg_size = float(end_frame - start_frame - 1) / num_segments
-        shift = (np.random.rand(num_segments) - 0.5) * seg_size
-        frame_ids += shift
-    return frame_ids.astype(int).tolist()
-
-
-def get_video_reader(videoname, num_threads, fast_rrc, rrc_params, fast_rcc, rcc_params):
-    video_reader = None
-    if fast_rrc:
-        video_reader = decord.VideoReader(
-            videoname,
-            num_threads=num_threads,
-            width=rrc_params[0], height=rrc_params[0],
-            use_rrc=True, scale_min=rrc_params[1][0], scale_max=rrc_params[1][1],
-        )
-    elif fast_rcc:
-        video_reader = decord.VideoReader(
-            videoname,
-            num_threads=num_threads,
-            width=rcc_params[0], height=rcc_params[0],
-            use_rcc=True,
-        )
-    else:
-        video_reader = decord.VideoReader(videoname, num_threads=num_threads)
-    return video_reader
-
-
-def video_loader(root, vid, ext, second, end_second,
-                 chunk_len=300, fps=30, clip_length=32,
-                 threads=1,
-                 fast_rrc=False, rrc_params=(224, (0.5, 1.0)),
-                 fast_rcc=False, rcc_params=(224, ),
-                 jitter=False):
-    assert fps > 0, 'fps should be greater than 0'
-    if chunk_len == -1:
-        vr = get_video_reader(
-            osp.join(root, '{}.{}'.format(vid, ext)),
-            num_threads=threads,
-            fast_rrc=fast_rrc, rrc_params=rrc_params,
-            fast_rcc=fast_rcc, rcc_params=rcc_params,
-        )
-        end_second = min(end_second, len(vr) / fps)
-
-        # calculate frame_ids
-        frame_offset = int(np.round(second * fps))
-        total_duration = max(int((end_second - second) * fps), clip_length)
-        frame_ids = get_frame_ids(frame_offset, min(frame_offset + total_duration, len(vr)), num_segments=clip_length, jitter=jitter)
-
-        # load frames
-        assert max(frame_ids) < len(vr)
-        try:
-            frames = vr.get_batch(frame_ids).asnumpy()
-        except decord.DECORDError as error:
-            print(error)
-            frames = vr.get_batch([0] * len(frame_ids)).asnumpy()
-
-        return torch.from_numpy(frames.astype(np.float32))
-
-    else:
-        time_meta = {}
-
-        time_meta['duration'] = end_second - second
-        chunk_start = int(second) // chunk_len * chunk_len
-        chunk_end = int(end_second) // chunk_len * chunk_len
-        while True:
-            video_filename = osp.join(root, '{}.{}'.format(vid, ext), '{}.{}'.format(chunk_end, ext))
-
-            if not osp.exists(video_filename):
-                # print("{} does not exists!".format(video_filename))
-                chunk_end -= chunk_len
-            else:
-                vr = decord.VideoReader(video_filename)
-                end_second = min(end_second, (len(vr) - 1) / fps + chunk_end)
-                assert chunk_start <= chunk_end
-                break
-        # calculate frame_ids
-        frame_ids = get_frame_ids(
-            int(np.round(second * fps)),
-            int(np.round(end_second * fps)),
-            num_segments=clip_length, jitter=jitter
-        )
-        all_frames = []
-        all_frame_ids = []
-        # allocate absolute frame-ids into the relative ones
-        for chunk in range(chunk_start, chunk_end + chunk_len, chunk_len):
-            rel_frame_ids = list(filter(lambda x: int(chunk * fps) <= x < int((chunk + chunk_len) * fps), frame_ids))
-            rel_frame_ids = [int(frame_id - chunk * fps) for frame_id in rel_frame_ids]
-            vr = get_video_reader(
-                osp.join(root, '{}.{}'.format(vid, ext), '{}.{}'.format(chunk, ext)),
-                num_threads=threads,
-                fast_rrc=fast_rrc, rrc_params=rrc_params,
-                fast_rcc=fast_rcc, rcc_params=rcc_params,
-            )
-            try:
-                frames = vr.get_batch(rel_frame_ids).asnumpy()
-            except decord.DECORDError as error:
-                print(error)
-                frames = vr.get_batch([0] * len(rel_frame_ids)).asnumpy()
-            except IndexError:
-                print(root, vid, ext, second, end_second)
-            all_frames.append(frames)
-            all_frame_ids.append(frame_ids)
-            if sum(map(lambda x: x.shape[0], all_frames)) == clip_length:
-                break
-        res = torch.from_numpy(np.concatenate(all_frames, axis=0).astype(np.float32))
-        time_meta['n_frames'] = res.shape[0]
-        all_frame_ids = np.concatenate(all_frame_ids, axis=0)
-        frame_time = [e / fps for e in all_frame_ids]
-        frame_time -= frame_time[0]
-        frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
-        time_meta['frame_time'] = frame_time
-        assert res.shape[0] == clip_length, "{}, {}, {}, {}, {}, {}, {}".format(root, vid, second, end_second, res.shape[0], rel_frame_ids, frame_ids)
-        return res, time_meta
-
-
 class VideoCaptionDatasetBase(torch.utils.data.Dataset):
     def __init__(self, dataset, root, metadata, is_trimmed=True):
         self.dataset = dataset
@@ -216,7 +97,7 @@ def get_raw_item(
         vid_path, start_second, end_second, fps, narration, verb, noun = self.samples[i]
         # chunk length is the chunked video clip length
         # clip length is number of frames we want to sample from the clip
-        frames, time_meta = video_loader(self.root, vid_path, 'MP4',
+        frames, time_meta = avion_video_loader(self.root, vid_path, 'MP4',
                                          start_second, end_second,
                                          chunk_len=chunk_len, fps=fps,
                                          clip_length=clip_length,
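
Note: the ~120 deleted lines are local copies of get_frame_ids, get_video_reader, and video_loader; the file now calls the shared avion_video_loader from action/utils.py instead. A minimal usage sketch, assuming the signature visible at the two call sites in this commit and that time_meta keeps the same keys ('duration', 'n_frames', 'frame_time') as the deleted loader; the path, clip name, and timestamps are illustrative:

    from action.utils import avion_video_loader

    # sample a 32-frame clip between two timestamps of a chunked MP4
    frames, time_meta = avion_video_loader(
        '/path/to/EK100',   # data root (hypothetical)
        'P01/P01_01',       # vid_path, '{participant}/{video}' (hypothetical)
        'MP4',
        10.0,               # start_second
        14.5,               # end_second
        clip_length=32,
        threads=1,
        fast_rrc=False,
        fast_rcc=False,
        jitter=False,
    )
    print(time_meta['duration'], time_meta['n_frames'])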

action/prediction_analysis.py

Lines changed: 8 additions & 3 deletions
@@ -89,12 +89,17 @@ def analysis(self):
             gt_name = items['gt_name']
             # only replacing the first :
             avion_pred = items['avion_preds']['predictions'][0].replace(':', ' ', 1)
-
-            llava_verb, llava_noun = llava_pred.split(' ')
+            avion_preds = items['avion_preds']['predictions'][:5]
+            avion_preds = [e.replace(':', ' ', 1) for e in avion_preds]
+            try:
+                llava_verb, llava_noun = llava_pred.split(' ')
+            except:
+                lst = llava_pred.split(' ')
+                llava_verb, llava_noun = lst[0], lst[1]
             avion_verb, avion_noun = avion_pred.split(' ')
             gt_verb, gt_noun = gt_name.split(' ')
 
-            if llava_pred != gt_name: 
+            if llava_pred != gt_name:
                 wrong_llava_collections[idx] = 0
             else:
                 wrong_llava_collections[idx] = 1
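
Note: the replace(':', ' ', 1) normalization and the new try/except exist because AVION predictions arrive as 'verb:noun' (where the noun may itself contain colons), while LLaVA predictions are space-separated and may contain more than two tokens. A small illustration with made-up prediction strings:

    # AVION: only the first ':' separates verb from noun
    avion_pred = 'open:bottle:wine'.replace(':', ' ', 1)   # 'open bottle:wine'
    avion_verb, avion_noun = avion_pred.split(' ')         # ('open', 'bottle:wine')

    # LLaVA: extra spaces make tuple unpacking raise ValueError,
    # which the new except branch handles by keeping the first two tokens
    llava_pred = 'open bottle of wine'
    try:
        llava_verb, llava_noun = llava_pred.split(' ')
    except ValueError:
        lst = llava_pred.split(' ')
        llava_verb, llava_noun = lst[0], lst[1]
    print(llava_verb, llava_noun)  # open bottle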
