 import logging
 from llava.utils import rank0_print
 from action.utils import generate_label_map, MultiChoiceGenerator, match_answer, parse_avion_predictions
+import copy
+import random
+from collections import Counter
 
 def datetime2sec(str):
     hh, mm, ss = str.split(':')
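
# A minimal sketch of the conversion datetime2sec presumably completes in its
# truncated body (an assumption; the variable names here are illustrative):
ts = '00:01:30.5'
hh, mm, ss = ts.split(':')
total_seconds = int(hh) * 3600 + int(mm) * 60 + float(ss)  # -> 90.5
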
@@ -370,40 +372,93 @@ def prepare_llava(pretrained):
 
     return tokenizer, model, image_processor, max_length
 
-
-def get_topk_predictions(data, idx, k):
+def get_topk_predictions(data, idx, k):
 
     letters = [chr(65 + i) for i in range(26)][:k]
     options = list(range(26))[:k]
 
     predictions = data[str(idx)]['predictions'][:k]
-
     predictions = parse_avion_predictions(predictions)
 
     for i in range(len(options)):
         options[i] = f'{letters[i]}. {predictions[i]}'
 
     mc_data = {
         'question': {0: 'the video is an egocentric view of a person. What is the person doing? Pick the letter that has the correct answer.'},
-        'option': {0: options}
+        'options': {0: options},
+        'valid_letters': letters,
+        'avion_pred': predictions[0]
     }
+
+    return mc_data
+
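# A usage sketch with hypothetical inputs (the prediction strings are made up,
# and the expected output assumes parse_avion_predictions yields readable
# action names):
fake_data = {'0': {'predictions': ['take plate', 'wash plate', 'put plate']}}
mc = get_topk_predictions(fake_data, idx=0, k=3)
# mc['options'][0] -> ['A. take plate', 'B. wash plate', 'C. put plate']
# mc['avion_pred'] -> 'take plate'
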
+def ensemble_llava_evaluation(gt_name,
+                              frames,
+                              tokenizer,
+                              model,
+                              image_processor,
+                              mc_data,
+                              clip_length,
+                              num_frames,
+                              temperature=0,
+                              ensemble_k=1,
+                              is_test=False
+                              ):
+    """
+    Test how consistent the model is when the positions of the answer options
+    are shuffled. This should be run with a higher temperature, so ensembling
+    the sampled predictions may also improve performance.
+    """
+
+    # shuffle the options; note that this mutates mc_data['options'][0]
+    # in place, which is the list llava_inference reads
+    options = mc_data['options'][0]
+    letters = mc_data['valid_letters']
+    avion_pred = mc_data['avion_pred']
+    # each option is in the format '{letter}. {answer}'
+    preds = []
+    for _ in range(ensemble_k):
+        # shuffle the answers, then relabel them so the letters stay in
+        # alphabetical order while the answers move between letters
+        random.shuffle(options)
+        for idx, (option, letter) in enumerate(zip(options, letters)):
+            sep = option.index('.')
+            options[idx] = f'{letter}.{option[sep + 1:]}'
+        rank0_print('generated new option sequence')
+        rank0_print(options)
+
+        pred = llava_inference(frames,
+                               tokenizer,
+                               model,
+                               image_processor,
+                               mc_data,
+                               clip_length=clip_length,
+                               num_frames=num_frames,
+                               temperature=temperature,
+                               is_test=is_test
+                               )
+
+        rank0_print('llava pred', pred, 'avion_pred', avion_pred, 'gt_name', gt_name)
+        # strip the leading '{letter}.' so only the answer text is voted on
+        sep = pred.index('.')
+        pred = pred[sep + 1:].strip()
+        preds.append(pred)
+
+    # majority vote over the sampled predictions
+    counter = Counter(preds)
+    rank0_print('inspecting the counter', counter)
+    rank0_print('most common', counter.most_common(1)[0][0])
+
+    return match_answer(counter.most_common(1)[0][0], gt_name)
+
 
-    return mc_data, predictions[0]
 
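# A self-contained sketch of the shuffle-and-relabel plus majority-vote logic
# above (no model involved; the option strings are made up):
import random
from collections import Counter

opts = ['A. take plate', 'B. wash plate', 'C. put plate']
letters_abc = ['A', 'B', 'C']
random.shuffle(opts)
for i, (opt, letter) in enumerate(zip(opts, letters_abc)):
    opts[i] = f'{letter}.{opt[opt.index(".") + 1:]}'
# the letters stay in order but the answers behind them have moved, e.g.
# ['A. wash plate', 'B. take plate', 'C. put plate']

votes = Counter(['wash plate', 'take plate', 'wash plate'])
assert votes.most_common(1)[0][0] == 'wash plate'
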
 def evaluate_on_EK100(eval_args,
                       model=None,
                       tokenizer=None,
                       image_processor=None):
 
-    if image_processor is None:
+    # only pull the processor from the model when one wasn't supplied
+    if image_processor is None and model is not None:
         image_processor = model.get_vision_tower().image_processor
 
     gpu_val_transform_ls = []
-
     val_transform_gpu = torch.nn.Sequential(*gpu_val_transform_ls)
-
     crop_size = 336
-
     labels, mapping_vn2act, verb_maps, noun_maps = generate_label_map(Path(eval_args.val_metadata).parent)
 
     val_dataset = VideoMultiChoiceDataset(
@@ -468,7 +523,8 @@ def evaluate_on_EK100(eval_args,
         gt_name = mc_data['gt_answer_name'][0][0]
 
         if eval_args.action_predictions:
-            mc_data, avion_pred = get_topk_predictions(predictions, idx, eval_args.topk_predictions)
+            mc_data = get_topk_predictions(predictions, idx, eval_args.topk_predictions)
+            avion_pred = mc_data['avion_pred']
             if gt_name == avion_pred:
                 avaion_correct += 1
 
@@ -477,18 +533,30 @@ def evaluate_on_EK100(eval_args,
         if finish_early and idx > 999:
             break
 
-        pred = llava_inference(frames, tokenizer, model, image_processor, mc_data, clip_length=eval_args.clip_length, num_frames=eval_args.llava_num_frames)
+        # pred = llava_inference(frames, tokenizer, model, image_processor, mc_data, clip_length=eval_args.clip_length, num_frames=eval_args.llava_num_frames)
 
-        # if valid letter is found in the prediction, then we will use that as the prediction
-        rank0_print('llava pred', pred, 'avion_pred', avion_pred, 'gt_name', gt_name)
+        # # if a valid letter is found in the prediction, we will use it as the prediction
+        # rank0_print('llava pred', pred, 'avion_pred', avion_pred, 'gt_name', gt_name)
 
         # Update running corrects and total samples
-        running_corrects += (match_answer(pred, gt_name))
+        # five sampled option orderings at a high temperature, majority-voted
+        running_corrects += ensemble_llava_evaluation(gt_name,
+                                                      frames,
+                                                      tokenizer,
+                                                      model,
+                                                      image_processor,
+                                                      mc_data,
+                                                      eval_args.clip_length,
+                                                      eval_args.llava_num_frames,
+                                                      temperature=2.0,
+                                                      ensemble_k=5,
+                                                      is_test=not finish_early)
+
         total_samples += 1
 
         # Calculate and log running mean accuracy
         running_accuracy = running_corrects / total_samples
 
+        logger.info(f'running accuracy: {running_accuracy:.4f}')
         if eval_args.action_predictions:
             avaion_accuracy = avaion_correct / total_samples
 