
Commit 819d73a

Author: Ye Shaokai (committed)

refactored the code for consistent prompting
1 parent 6ff8a9f commit 819d73a

File tree

10 files changed: +108 additions, -58 deletions
Lines changed: 10 additions & 1 deletion

@@ -11,7 +11,7 @@
 import cv2
 from pathlib import Path
 from tqdm import tqdm
-from action.prediction_analysis import PredictionAnalysis
+from llava.action.prediction_analysis import PredictionAnalysis

 client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

@@ -145,6 +145,15 @@ def extract_frames(self, vid_path, start_second, end_second):
         return frames, time_meta


+class GPTDataCleaner(ChatGPT):
+    """
+    Cleans the training annotations.
+    Instead of using the first verb that appears in the verb csv, we use the csv file to
+    have ChatGPT select the best one.
+    We also inject rules to correct some confusing conventions in how EK100 names verbs.
+    """
+
+
 class GPTInferenceAnnotator(ChatGPT):
     """
     Given the images, this class will annotate the video frames
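
The GPTDataCleaner class is only stubbed out in this commit (a docstring, no body). A minimal sketch of how such a cleaner might query ChatGPT, reusing the module-level client shown above; the method name, prompt wording, and model choice are hypothetical, not part of the commit:

# Hypothetical sketch, not from the commit: have ChatGPT pick the best verb
# among the EK100 csv candidates for one narration.
class GPTDataCleaner(ChatGPT):

    def pick_best_verb(self, narration, verb_candidates):
        # verb_candidates: all verbs listed for the narration's verb class in the EK100 verb csv
        prompt = (
            f"The narration is: '{narration}'.\n"
            f"Candidate verbs: {', '.join(verb_candidates)}.\n"
            "Reply with the single verb that best describes the action."
        )
        response = client.chat.completions.create(
            model="gpt-4o",  # assumed model; any chat-completion model works
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content.strip()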
Lines changed: 4 additions & 25 deletions

@@ -13,20 +13,16 @@
 from pathlib import Path
 import sys
 import os
-sys.path[0] = os.path.dirname(sys.path[0])
-from action.llava_ov_inference import llava_inference
+from llava.action.llava_inference import llava_inference
 import json
 import logging
 from llava.utils import rank0_print
-from action.utils import generate_label_map, MultiChoiceGenerator, match_answer, parse_avion_predictions, avion_video_loader, create_multi_choice_from_avion_predictions
-from action.prediction_analysis import PredictionAnalysis
-import copy
+from llava.action.utils import generate_label_map, MultiChoiceGenerator, match_answer, parse_avion_predictions, avion_video_loader, create_multi_choice_from_avion_predictions
+from llava.action.prediction_analysis import PredictionAnalysis
 from collections import Counter
 import torch.distributed as dist


-
-
 def setup(rank, world_size):
     # Check if the process group is already initialized
     if not dist.is_initialized():

@@ -229,7 +225,6 @@ def get_args_parser():
     parser.add_argument('--use-multi-epochs-loader', action='store_true')

     # llava related
-    # llm size is type of string and can only be '7b' or '5b' etc.
     parser.add_argument('--pretrained_name', default='', type=str, help='the name in huggingface')
     parser.add_argument('--llava_num_frames', default=16, type=int, help='number of frames for llava')
     ## avion refinement

@@ -467,23 +462,7 @@ def evaluate_on_EK100(eval_args,
     logger.info(f'Process {dist.get_rank()} - local_total_samples: {local_total_samples:.4f}')
     logger.info(f'Process {dist.get_rank()} - local_llava_correct: {llava_correct:.4f}')
     logger.info(f'Process {dist.get_rank()} - local_running_corrects: {local_running_corrects:.4f}')
-
-
-    # Calculate and log running mean accuracy
-    # dist.barrier()
-    # dist.all_reduce(local_running_corrects, op=dist.ReduceOp.SUM)
-    # dist.all_reduce(local_total_samples, op=dist.ReduceOp.SUM)
-    # if eval_args.action_predictions:
-    #     dist.all_reduce(local_avion_correct, op=dist.ReduceOp.SUM)
-    # dist.barrier()
-    # # Calculate global accuracy after reduction
-    # local_running_accuracy = local_running_corrects.item() / local_total_samples.item()
-    # local_avion_accuracy = local_avion_correct.item() / local_total_samples.item()
-
-    # logger.info(f'Process {dist.get_rank()} - Running accuracy: {local_running_accuracy:.4f}')
-    # logger.info(f'Process {dist.get_rank()} - AvionRunning accuracy: {local_avion_accuracy:.4f}')
-
+

     dist.barrier()
     dist.all_reduce(global_running_corrects, op=dist.ReduceOp.SUM)
Lines changed: 3 additions & 2 deletions

@@ -3,7 +3,8 @@
 import csv
 import os
 import argparse
-from action.utils import generate_label_map, MultiChoiceGenerator, AvionMultiChoiceGenerator
+import sys
+from llava.action.utils import generate_label_map, MultiChoiceGenerator, AvionMultiChoiceGenerator, format_task_related_prompt
 from pathlib import Path


@@ -78,7 +79,7 @@ def generate_naive_conversation(vn_str:str):

 def generate_random_mc_conversation(options:list[str], gt_answer_letter, gt_answer_name):
     return [
-        {"from": "human", "value": f"<image>\n the video is taken from egocentric view. What action is the person performing? Please select the letter for the right answer {options}"},
+        {"from": "human", "value": f"{options}"},
         {"from": "gpt", "value": f"{gt_answer_letter}. {gt_answer_name}"}
     ]
Lines changed: 8 additions & 7 deletions

@@ -5,6 +5,7 @@
 import torch
 import numpy as np
 import copy
+from llava.action.utils import format_llava_prompt


 def llava_ov_process(video_frames,

@@ -31,7 +32,7 @@ def llava_ov_process(video_frames,

     question = mc_data['question'][0]
     options = mc_data['options'][0]
-
+
     question = f"{DEFAULT_IMAGE_TOKEN}\n{question}:{options}"

     conv = copy.deepcopy(conv_templates[conv_template])

@@ -82,21 +83,21 @@ def llava_video_process(

     video_duration = time_meta['duration'].item()
     n_frames = time_meta['n_frames'].item()
-    frame_time = time_meta['frame_time']
-    print ('frame time', frame_time)
-    frame_time = frame_time[0]
-    time_instruciton = f"You are seeing a video taken from egocentric view. The video lasts for {video_duration:.2f} seconds, and {n_frames} frames are uniformly sampled from it. What is the person doing? Format your answer letter. verb noun such as A. move knife."

     frames = image_processor.preprocess(video_frames, return_tensors="pt")["pixel_values"].cuda().to(torch.bfloat16)

     image_tensors.append(frames)

     conv_template = "qwen_1_5"

-    question = mc_data['question'][0]
     options = mc_data['options'][0]

-    question = DEFAULT_IMAGE_TOKEN + f"{time_instruciton}\n:{options}"
+    question = format_llava_prompt(DEFAULT_IMAGE_TOKEN,
+                                   options,
+                                   video_duration,
+                                   n_frames,
+                                   include_frame_time=True,
+                                   include_time_instruction=True)

     print ('what is the question')
     print (question)
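
With include_frame_time=True, the prompt sent to LLaVA-Video now also spells out where each sampled frame sits in the clip. The frame times come from format_time_instruction, added in llava/action/utils.py below; a quick check of that arithmetic with made-up numbers:

# Frame times as computed in format_time_instruction: i * (duration / n_frames).
video_duration, n_frames = 2.0, 4
frame_time = [i * (video_duration / n_frames) for i in range(n_frames)]
print(",".join(f"{t:.2f}s" for t in frame_time))
# prints: 0.00s,0.50s,1.00s,1.50s  (offset of each frame from the clip start)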
Lines changed: 2 additions & 2 deletions

@@ -169,9 +169,9 @@ def analysis(self):
 if __name__ == '__main__':

     # at rcp server
-    #save_folder = '/storage-rcp-pure/upmwmathis_scratch/shaokai/LLaVA-NeXT/llavavideo_avion_mc_top10_5epoch_preds'
+    save_folder = '/storage-rcp-pure/upmwmathis_scratch/shaokai/LLaVA-NeXT/llavavideo_avion_mc_top10_5epoch_preds_without_frame_time'
     # at amg0
-    save_folder = '/data/epic_kitchen/llavavideo_avion_mc_top10_5epoch_preds'
+    #save_folder = '/data/epic_kitchen/llavavideo_avion_mc_top10_5epoch_preds'


     prediction_analysis = PredictionAnalysis(save_folder = save_folder,

action/utils.py renamed to llava/action/utils.py

Lines changed: 45 additions & 4 deletions

@@ -43,10 +43,53 @@ def generate_label_map(anno_root):
     return labels, mapping_vn2act, verb_maps, noun_maps


+def format_task_related_prompt(option_list):
+    prefix = "The video is taken from egocentric view. What action is the person performing? Given multiple choices, format your answer as 'option letter. option name', such as 'A. move knife', where A is the option letter and 'move knife' is the option name.\n"
+    assert isinstance(option_list, list)
+    suffix = ",".join(option_list)
+    suffix = "Here are the options you are given:\n" + suffix
+    ret = prefix + suffix
+    return ret
+
+def format_time_instruction(video_duration, n_frames, include_frame_time=False):
+
+    prefix = f"You are seeing a video taken from egocentric view. The video lasts for {video_duration:.2f} seconds, and {n_frames} frames are uniformly sampled from it."
+
+    frame_time = [i * (video_duration / n_frames) for i in range(n_frames)]
+    frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
+
+    suffix = ""
+    if include_frame_time:
+        suffix = f"These frames are located at {frame_time}."
+
+    return prefix + suffix
+
+
+def format_llava_prompt(image_token,
+                        option_list,
+                        video_duration,
+                        n_frames,
+                        include_time_instruction=False,
+                        include_frame_time=False
+                        ):
+    """
+    baseline llava prompt: {image_token}\n{task_related_prompt}
+    with time instruction: {image_token}\n{time_instruction}\n{task_related_prompt}
+    """
+    task_related_prompt = format_task_related_prompt(option_list)
+    time_instruction = format_time_instruction(video_duration, n_frames, include_frame_time)
+
+    if include_time_instruction:
+        ret = f"{image_token}\n{time_instruction}{task_related_prompt}"
+    else:
+        ret = f"{image_token}\n{task_related_prompt}"
+
+    return ret
+
 def match_answer(pred, gt):
     return pred == gt

-
 def parse_avion_predictions(predictions):
     return [pred.replace(':', ' ', 1) for pred in predictions]

@@ -90,7 +133,6 @@ def generate_multi_choice(self, gt_vn, k):

         gt_letter = letters[answers.index(gt_answer)]
         data = {
-            'question': {0: 'the video is an egocentric view of a person. What is the person doing? Pick the the letter that has the correct answer'},
             'options': {0: options},
             # the correct letter in mc
             # for inspecting

@@ -142,8 +184,8 @@ def generate_multi_choice(self, gt_vn, avion_predictions, k):

         gt_letter = letters[answers.index(gt_answer)]

+
         data = {
-            'question': {0: 'the video is an egocentric view of a person. What is the person doing? Pick the the letter that has the correct answer'},
             'options': {0: options},
             # the correct letter in mc
             # for inspecting

@@ -194,7 +236,6 @@ def create_multi_choice_from_avion_predictions(avion_predictions, k):
         options[i] = f'{letters[i]}. {predictions[i]}'

     mc_data = {
-        'question': {0: 'the video is an egocentric view of a person. What is the person doing? Pick the the letter that has the correct answer.'},
         'options': {0: options},
         'valid_letters': letters,
         'avion_pred': predictions[0]
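
Putting the three new helpers together, training and inference now build the same prompt through one code path. An illustrative run with the literal image token and two made-up options:

from llava.action.utils import format_llava_prompt

prompt = format_llava_prompt("<image>",
                             ["A. move knife", "B. open drawer"],
                             video_duration=2.0,
                             n_frames=4,
                             include_time_instruction=True,
                             include_frame_time=True)
print(prompt)
# <image>
# You are seeing a video taken from egocentric view. The video lasts for 2.00 seconds,
# and 4 frames are uniformly sampled from it.These frames are located at
# 0.00s,0.50s,1.00s,1.50s.The video is taken from egocentric view. What action is the
# person performing? ...
# (note that format_time_instruction's output is concatenated without a separator,
# hence "it.These")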

llava/train/llava_trainer.py

Lines changed: 3 additions & 2 deletions

@@ -17,6 +17,8 @@
 from transformers.trainer_pt_utils import AcceleratorConfig
 from typing import List, Optional
 from datetime import timedelta
+import llava
+from llava.action.ek_eval import evaluate_on_EK100

 if is_accelerate_available():
     from accelerate import Accelerator, skip_first_batches, InitProcessGroupKwargs

@@ -248,8 +250,7 @@ def __init__(self, *args, tokenizer = None, eval_args = None, model_max_length =



-    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
-        from action.ek_eval import evaluate_on_EK100
+    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):

         accuracy = evaluate_on_EK100(self.eval_args, self.model, self.tokenizer)
llava/train/train.py

Lines changed: 27 additions & 9 deletions

@@ -36,7 +36,8 @@
 import transformers
 import tokenizers
 import deepspeed
-
+import sys
+import llava
 from transformers import AutoConfig
 from torch.utils.data import Dataset
 from llava.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IMAGE_TOKEN_INDEX

@@ -47,6 +48,7 @@
 from llava.mm_utils import process_highres_image, process_anyres_image, process_highres_image_crop_split, tokenizer_image_token
 from llava.utils import rank0_print, process_video_with_pyav, process_video_with_decord, process_EK100_video_with_decord

+from llava.action.utils import format_llava_prompt

 torch.multiprocessing.set_sharing_strategy("file_system")

@@ -978,10 +980,11 @@ def get_tokenize_len(prompts):


 class LazySupervisedDataset(Dataset):
-    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, data_args: DataArguments):
+    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, data_args: DataArguments, eval_args):
         super(LazySupervisedDataset, self).__init__()
         self.tokenizer = tokenizer
-        self.list_data_dict = []
+        self.list_data_dict = []
+        self.eval_args = eval_args

         # Handle multiple JSON files specified in the data_path
         if "{" in data_path and "}" in data_path:

@@ -1231,9 +1234,24 @@ def _get_item(self, i) -> Dict[str, torch.Tensor]:

         processor = self.data_args.image_processor
         image = processor.preprocess(video, return_tensors="pt")["pixel_values"]
-        if self.data_args.add_time_instruction:
-            time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. Please answer the following questions related to this video."
-            sources[0]["conversations"][0]["value"] = f'{DEFAULT_IMAGE_TOKEN}\n{time_instruciton}\n{sources[0]["conversations"][0]["value"].replace(DEFAULT_IMAGE_TOKEN, "")}'
+        if 'EK100' not in video_file:
+            if self.data_args.add_time_instruction:
+                time_instruction = f"The video lasts for {video_time:.2f} seconds, and {num_frames_to_sample} frames are uniformly sampled from it. Please answer the following questions related to this video."
+                sources[0]["conversations"][0]["value"] = f'{DEFAULT_IMAGE_TOKEN}\n{time_instruction}\n{sources[0]["conversations"][0]["value"].replace(DEFAULT_IMAGE_TOKEN, "")}'
+        else:
+            # We use our own prompting logic when it's EK100
+            options = eval(sources[0]["conversations"][0]["value"])
+            assert isinstance(options, list)
+            assert len(options) == self.eval_args.topk_predictions
+            # We only store the option list in the annotation file, to make it easier to use consistent prompting
+            llava_prompt = format_llava_prompt(DEFAULT_IMAGE_TOKEN,
+                                               options,
+                                               video_time,
+                                               num_frames_to_sample,
+                                               include_time_instruction=self.data_args.add_time_instruction,
+                                               include_frame_time=True)
+            sources[0]["conversations"][0]["value"] = llava_prompt
+
         image = [(image, video[0].size, "video")]
         sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
         # print(sources)

@@ -1322,9 +1340,9 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
         return batch


-def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
+def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args, eval_args) -> Dict:
     """Make dataset and collator for supervised fine-tuning."""
-    train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path, data_args=data_args)
+    train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path, data_args=data_args, eval_args=eval_args)
     data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
     return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)

@@ -1728,7 +1746,7 @@ def make_inputs_require_grad(module, input, output):
        if training_args.bf16 and module.weight.dtype == torch.float32:
            module = module.to(torch.bfloat16)

-    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
+    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args, eval_args=eval_args)

     eval_args.pretrained_name = model_args.model_name_or_path.split('/')[1]
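
For the EK100 branch, the annotation "value" field now stores only the stringified option list, which _get_item parses with eval(). If hardening that parse step is desired, ast.literal_eval is a drop-in replacement for this case (a suggestion, not what the commit does):

import ast

# The EK100 annotation "value" field holds a stringified option list,
# e.g. '["A. move knife", "B. open drawer"]' (contents made up).
raw_value = '["A. move knife", "B. open drawer"]'

# literal_eval only accepts Python literals, so a malformed or malicious
# annotation raises an error instead of executing arbitrary code.
options = ast.literal_eval(raw_value)
assert isinstance(options, list)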

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
 datasets:
-  # - json_path: /data/shaokai/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl
-  - json_path: /capstor/scratch/cscs/hqi/llava/EK100/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl
+  - json_path: /data/shaokai/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl
+  # - json_path: /capstor/scratch/cscs/hqi/llava/EK100/EK100_inst_train/avion_mc_top10/train_convs_narration.jsonl
     sampling_strategy: all

shaokai_generate_train.sh

Lines changed: 4 additions & 4 deletions

@@ -1,7 +1,7 @@
-python3 action/generate_description.py \
-    --train_metadata /data/epic_kitchen/epic-kitchens-100-annotations/EPIC_100_train.csv \
-    --out_folder /data/EK100_inst_train/ \
-    --avion_train_predictions /data/epic_kitchen/avion_predictions_train.json \
+python3 llava/action/generate_description.py \
+    --train_metadata /data/shaokai/epic-kitchens-100-annotations/EPIC_100_train.csv \
+    --out_folder /data/shaokai/EK100_inst_train/ \
+    --avion_train_predictions /data/shaokai/avion_predictions_train.json \
     --gen_type avion_mc \
     --n_options 10