fixed evaluation

Ye Shaokai · Ye Shaokai · commit c5fe6d36fa34 · 2024-10-16T15:32:37.000+02:00
diff --git a/action/chatgpt_utils.py b/action/chatgpt_utils.py
@@ -0,0 +1,123 @@
+import base64
+import io
+import json
+import os
+import cv2
+import numpy as np
+import openai
+from pydantic import BaseModel
+from multiprocessing.pool import Pool
+
+client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))  # created at import time; api_key is None when the env var is unset
+# Model name passed as ``model=`` in GPTAnnotator.annotate_with_multichoice.
+GPT_MODEL = "gpt-4o-2024-08-06"
+
+
+class ImageOnlyResponse(BaseModel):
+    """Pydantic model with a single string field, ``explanation``.
+    NOTE(review): not referenced by the code visible in this file yet."""
+    explanation: str
+
+class MultiChoiceResponse(BaseModel):
+    """
+    Response schema passed as ``response_format`` in ``annotate_with_multichoice``.
+    """
+    # presumably holds the model's free-text reasoning — confirm against the prompt
+    explanation: str
+
+
+
+class GPTAnnotator:
+    """Loads a prediction file and queries the OpenAI chat API about images."""
+
+    def __init__(self, prediction_file_path):
+        """Parse the JSON at ``prediction_file_path`` into ``self.prediction_file``."""
+        with open(prediction_file_path, 'r') as f:
+            self.prediction_file = json.load(f)
+
+    def prepare_multiple_images(self, images):
+        """Turn ``images`` into a list of ``image_url`` content parts (base64 data URLs).
+
+        Accepts ``io.BytesIO`` objects (bytes used as-is) and ``np.ndarray`` frames
+        (JPEG-encoded via ``cv2.imencode``). Raises ``TypeError`` for any other type
+        and ``ValueError`` if ``cv2.imencode`` reports failure.
+        """
+        encoded_image_list = []
+
+        for image in images:
+            # images from matplotlib etc.
+            # NOTE(review): the URL below always says image/jpeg — confirm these buffers are JPEG
+            if isinstance(image, io.BytesIO):
+                raw_bytes = image.getvalue()
+            # images from opencv
+            elif isinstance(image, np.ndarray):
+                success, buffer = cv2.imencode(".jpeg", image)
+                if not success:
+                    raise ValueError("cv2.imencode failed to encode the image as JPEG")
+                raw_bytes = buffer.tobytes()
+            else:
+                # without this branch a stale or unbound base64 string was appended
+                raise TypeError(f"unsupported image type: {type(image).__name__}")
+
+            encoded_image_list.append(base64.b64encode(raw_bytes).decode("utf-8"))
+
+        return [
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
+            }
+            for encoded_image in encoded_image_list
+        ]
+
+    def annotate(self, images):
+        """Annotate to do image caption only (not implemented yet; returns None)."""
+        pass
+
+    def annotate_with_multichoice(self, images, mc_data):
+        """Ask ``GPT_MODEL`` for an explanation of the multi-choice answer from ``images``.
+
+        ``images`` is forwarded to ``prepare_multiple_images``; ``mc_data`` is accepted
+        but not read yet; the system prompt is hard-coded (it states the answer is D).
+        Returns ``response.choices[0].message.parsed`` (requested as ``MultiChoiceResponse``).
+        """
+        temperature = 0
+        include_images = True
+
+        system_prompt_prefix = """Inspect the images from the video and explain why the answer of the multi-choice question is D. """
+        system_prompt_suffix = """Yes"""
+        system_prompt = system_prompt_prefix + system_prompt_suffix
+        system_message = [{"role": "system", "content": system_prompt}]
+
+        if include_images:
+            multi_image_content = self.prepare_multiple_images(images)
+            multi_modal_content = [{"type": "text", "text": ""}] + multi_image_content
+            user_message = [{"role": "user", "content": multi_modal_content}]
+        else:
+            user_message = [{"role": "user", "content": ""}]
+
+        response = client.beta.chat.completions.parse(
+            model=GPT_MODEL,
+            messages=system_message + user_message,
+            response_format=MultiChoiceResponse,
+            temperature=temperature,
+        )
+
+        return response.choices[0].message.parsed
+    
+
+def annotate_using_chatgpt():
+    """
+    Placeholder for multiprocessing-based annotation; opens a Pool but submits no work yet.
+    """
+    with Pool() as pool:
+        pass
+        # TODO: dispatch the work, e.g. pool.starmap(annotate, task_args) — neither name exists at module level yet
+
+    pass
+    
+def annotate_from_train_conv_file(train_file_path):
+    pass  # TODO: not implemented; ``train_file_path`` is currently ignored
+
+if __name__ == '__main__':
+    train_file_path = '/storage-rcp-pure/upmwmathis_scratch/shaokai'  # NOTE(review): hard-coded, machine-specific path
+    annotate_from_train_conv_file(train_file_path)  # the callee is still a stub, so this does nothing yet
diff --git a/action/ek_eval.py b/action/ek_eval.py
@@ -525,67 +525,68 @@ def evaluate_on_EK100(eval_args,
 
     for idx, (frames, mc_data, time_meta, global_index) in tqdm(enumerate(val_dataloader)):        
 
-        global_index = global_index.item()
+        with torch.no_grad():
+            global_index = global_index.item()
 
-        gt_name = mc_data['gt_answer_name'][0][0]
-        local_avion_correct = torch.tensor(0.0, device=device)
-        local_running_corrects = torch.tensor(0.0, device=device)
-        local_total_samples = torch.tensor(0.0, device=device)
-              
-        if eval_args.action_predictions:
-            mc_data = get_topk_predictions(predictions, global_index, eval_args.topk_predictions)
-            avion_pred = mc_data['avion_pred']
-            if gt_name == avion_pred:
-                local_avion_correct.add_(1)
-                global_avion_correct.add_(1)
-
-        # we don't want to evaluate the whole thing
-        # let's evaluate 1000 samples to get the complete picture       
-        if finish_early and idx> (1000 / dist.get_world_size()):
-            break                     
-     
-        # Update running corrects and total samples
+            gt_name = mc_data['gt_answer_name'][0][0]
+            local_avion_correct = torch.tensor(0.0, device=device)
+            local_running_corrects = torch.tensor(0.0, device=device)
+            local_total_samples = torch.tensor(0.0, device=device)
+                
+            if eval_args.action_predictions:
+                mc_data = get_topk_predictions(predictions, global_index, eval_args.topk_predictions)
+                avion_pred = mc_data['avion_pred']
+                if gt_name == avion_pred:
+                    local_avion_correct.add_(1)
+                    global_avion_correct.add_(1)
+
+            # we don't want to evaluate the whole thing
+            # let's evaluate 1000 samples to get the complete picture       
+            if finish_early and idx> (1000 / dist.get_world_size()):
+                break                     
+        
+            # Update running corrects and total samples
+            
+            llava_correct, llava_pred = ensemble_llava_evaluation(
+                                                        eval_args.pretrained_name,
+                                                        gt_name,
+                                                        frames, 
+                                                        tokenizer,
+                                                        model,
+                                                        image_processor,
+                                                        mc_data,
+                                                        eval_args.clip_length,
+                                                        eval_args.llava_num_frames,
+                                                        temperature = 0,
+                                                        ensemble_k = 1,
+                                                        time_meta = time_meta,
+                                                        is_test = not finish_early)
+
+            # log the predictions into prediction analysis
         
-        llava_correct, llava_pred = ensemble_llava_evaluation(
-                                                      eval_args.pretrained_name,
-                                                      gt_name,
-                                                      frames, 
-                                                      tokenizer,
-                                                      model,
-                                                      image_processor,
-                                                      mc_data,
-                                                      eval_args.clip_length,
-                                                      eval_args.llava_num_frames,
-                                                      temperature = 0,
-                                                      ensemble_k = 1,
-                                                      time_meta = time_meta,
-                                                      is_test = not finish_early)
-
-        # log the predictions into prediciton analysis
-
-        val_dataset.prediction_analysis.log(global_index,
-                                            llava_pred,
-                                            gt_name,
-                                            predictions[str(global_index)],
-                                            time_meta['start_second'].item(),
-                                            time_meta['end_second'].item(),
-                                            time_meta['vid_path'],
-                                            dataset_name = 'EK100')
+            val_dataset.prediction_analysis.log(global_index,
+                                                llava_pred,
+                                                gt_name,
+                                                predictions[str(global_index)],
+                                                time_meta['start_second'].item(),
+                                                time_meta['end_second'].item(),
+                                                time_meta['vid_path'],
+                                                dataset_name = 'EK100')
 
         
 
 
-        local_running_corrects.add_(llava_correct)
-        global_running_corrects.add_(llava_correct)
-                                                              
-        local_total_samples.add_(1)
-        global_total_samples.add_(1)
+            local_running_corrects.add_(llava_correct)
+            global_running_corrects.add_(llava_correct)
+                                                                
+            local_total_samples.add_(1)
+            global_total_samples.add_(1)
 
-        logger.info(f'Process {dist.get_rank()} - local_total_samples: {local_total_samples:.4f}')
+            logger.info(f'Process {dist.get_rank()} - local_total_samples: {local_total_samples:.4f}')
 
-        logger.info(f'Process {dist.get_rank()} - loca_llava_correct: {llava_correct:.4f}')
+            logger.info(f'Process {dist.get_rank()} - loca_llava_correct: {llava_correct:.4f}')
 
-        logger.info(f'Process {dist.get_rank()} - local_running_corrects: {local_running_corrects:.4f}')
+            logger.info(f'Process {dist.get_rank()} - local_running_corrects: {local_running_corrects:.4f}')
 
 
         # Calculate and log running mean accuracy
diff --git a/action/llava_ov_inference.py b/action/llava_ov_inference.py
@@ -43,15 +43,16 @@ def llava_ov_process(video_frames,
     image_sizes = [frame.size for frame in video_frames]
 
     # Generate response
-    cont = model.generate(
-        input_ids,
-        images=image_tensors,
-        image_sizes=image_sizes,
-        do_sample=False,
-        temperature=temperature,
-        max_new_tokens=4096,
-        modalities=["video"],
-    )
+    with torch.no_grad():
+        cont = model.generate(
+            input_ids,
+            images=image_tensors,
+            image_sizes=image_sizes,
+            do_sample=False,
+            temperature=temperature,
+            max_new_tokens=4096,
+            modalities=["video"],
+        )
 
     text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
     return text_outputs[0]
diff --git a/action/prediction_analysis.py b/action/prediction_analysis.py
@@ -1,6 +1,7 @@
 import json
 import glob
 import os
+import numpy as np
 class PredictionAnalysis:
     """
     We save data that can be used for ad-hoc analysis
@@ -24,7 +25,7 @@ def __init__(self, save_folder = '.', rank = 0):
         self.rank = rank
         self.prefix = 'prediction_analysis_buf'
         self.save_path = os.path.join(save_folder, f'{self.prefix}_rank{rank}.json')       
-        self.data = {}
+        self.data = {}   
     def log(self, 
             global_index,
             llava_pred,
@@ -62,10 +63,11 @@ def load(self):
                 with open(file, 'r') as f:
                     _data = json.load(f)
                     self.data.update(_data)
+            print ('length', len(self.data))
+            assert len(self.data) == 9668
+            #print (sorted(list(self.data.keys()), key = lambda x: int(x)))
 
-            print (sorted(list(self.data.keys()), key = lambda x: int(x)))
-
-    def wrong_verb(self):
+    def analysis(self):
 
         N = len(self.data)
         llava_wrong_verb_collections = []
@@ -76,27 +78,83 @@ def wrong_verb(self):
         avion_wrong_noun_collections = []
         avion_wrong_verb_noun_collections = []
 
-        wrong_llava_collections = []
-        wrong_avion_collections = []
+        wrong_llava_collections = [0] * N
+        wrong_avion_collections = [0] * N
 
         indices = sorted(list(self.data.keys()), key = lambda x: int(x))
 
-        for index in indices:
+        for idx, index in enumerate(indices):
             items = self.data[index]
             llava_pred = items['llava_pred']
             gt_name = items['gt_name']
             # only replacing the first : 
             avion_pred = items['avion_preds']['predictions'][0].replace(':', ' ', 1)
             
+            llava_verb, llava_noun = llava_pred.split(' ')
+            avion_verb, avion_noun = avion_pred.split(' ')
+            gt_verb, gt_noun = gt_name.split(' ')
+
             if llava_pred != gt_name:
-                wrong_llava_collections.append((llava_pred, gt_name))
+                if set(llava_pred).intersection(set(gt_name)) == set(gt_name):
+                    print ('what is going on')
+                    print ('nooo', llava_pred, gt_name)
+                #wrong_llava_collections.append((llava_pred, gt_name))
+                #print (llava_pred, gt_name)
+                wrong_llava_collections[idx] = 0
+            else:
+                wrong_llava_collections[idx] = 1
             if avion_pred!= gt_name:
-                # pred, gt
-                wrong_avion_collections.append((avion_pred, gt_name))
+                wrong_avion_collections[idx] = 0
+            else:
+                wrong_avion_collections[idx] = 1
+
             
+            if llava_verb == gt_verb and llava_noun!=gt_noun:
+                llava_wrong_noun_collections.append((llava_pred, gt_name))
+            if llava_noun == gt_noun and llava_verb!=gt_verb:
+                llava_wrong_verb_collections.append((llava_pred, gt_name))
+            if llava_noun!= gt_noun and llava_verb!=gt_verb:
+                llava_wrong_verb_noun_collections.append((llava_pred, gt_name))
+
+            if avion_verb == gt_verb and avion_noun!=gt_noun:
+                avion_wrong_noun_collections.append((avion_pred, gt_name))
+            if avion_noun == gt_noun and avion_verb!=gt_verb:
+                avion_wrong_verb_collections.append((avion_pred, gt_name))
+            if avion_noun!= gt_noun and avion_verb!=gt_verb:
+                avion_wrong_verb_noun_collections.append((avion_pred, gt_name))
+
+        wrong_llava_collections = np.array(wrong_llava_collections)
+        wrong_avion_collections = np.array(wrong_avion_collections)
+        llava_wrong_noun_collections = np.array(llava_wrong_noun_collections)
+        llava_wrong_verb_collections = np.array(llava_wrong_verb_collections)
+        llava_wrong_verb_noun_collections = np.array(llava_wrong_verb_noun_collections)
+        avion_wrong_noun_collections = np.array(avion_wrong_noun_collections)
+        avion_wrong_verb_collections = np.array(avion_wrong_verb_collections)
+        avion_wrong_verb_noun_collections = np.array(avion_wrong_verb_noun_collections)
+                
+        # first, the correlation between avion and llava
+        correlation = np.corrcoef(wrong_llava_collections, wrong_avion_collections)[0, 1]
+
+        print("Correlation:", correlation)
+
+        print ('llava top1 action accuracy {:.3f}'.format(np.sum(wrong_llava_collections == 1) / len(wrong_llava_collections)))
+        print ('avion top1 action accuracy {:.3f}'.format(np.sum(wrong_avion_collections == 1) / len(wrong_avion_collections)))
+
+        print ('llava percentage of wrong noun {:.2f}'.format(len(llava_wrong_noun_collections) / np.sum(wrong_llava_collections == 0)))
+        print ('llava percentage of wrong verb {:.2f}'.format(len(llava_wrong_verb_collections) / np.sum(wrong_llava_collections == 0)))
+        print ('llava percentage of both verb noun wrong {:.2f}'.format(len(llava_wrong_verb_noun_collections) / np.sum(wrong_llava_collections == 0)))
+
+
+        print ('avion percentage of wrong noun {:.2f}'.format(len(avion_wrong_noun_collections) / np.sum(wrong_avion_collections == 0)))
+        print ('avion percentage of wrong verb {:.2f}'.format(len(avion_wrong_verb_collections) / np.sum(wrong_avion_collections == 0)))
+        print ('avion percentage of both verb noun wrong {:.2f}'.format(len(avion_wrong_verb_noun_collections) / np.sum(wrong_avion_collections == 0)))
+
+
+
 
 if __name__ == '__main__':
 
 
     prediction_analysis = PredictionAnalysis(save_folder = '/storage-rcp-pure/upmwmathis_scratch/shaokai/LLaVA-NeXT')
     prediction_analysis.load()
+    prediction_analysis.analysis()
diff --git a/action/utils.py b/action/utils.py
diff --git a/llava/train/train.py b/llava/train/train.py