diff --git a/llmc/eval/eval_custom_generate_just_infer.py b/llmc/eval/eval_custom_generate_just_infer.py
index 658de5e2..5a720103 100644
--- a/llmc/eval/eval_custom_generate_just_infer.py
+++ b/llmc/eval/eval_custom_generate_just_infer.py
@@ -1,14 +1,8 @@
-import glob
 import json
 import os
 
 import torch
-from human_eval.data import stream_jsonl, write_jsonl
-from human_eval.evaluation import evaluate_functional_correctness
 from loguru import logger
-from tqdm import tqdm
-
-from .eval_base import BaseEval
 
 
 class CustomGenerateJustInfer:
@@ -29,8 +23,67 @@ def eval(self, model, eval_pos=None):
             self.eval_cfg
         )
 
-        with open(os.path.join('custom_samples_ans.json'), 'w') as f:
+        self.eval_answer(custom_samples_ans)
+
+        with open(os.path.join(self.config.save.save_path), 'w') as f:
             json.dump(custom_samples_ans, f, indent=4)
 
         torch.cuda.empty_cache()
         return 'custom gen done.'
+
+    def eval_answer(self, data):
+        """Log two-turn hit statistics for consecutively paired samples.
+
+        ``data`` is read as the pairs ``(data[0], data[1])``,
+        ``(data[2], data[3])``, ... and both samples of a pair must share
+        the same ``'image'`` value. A turn is a hit when the lowercased
+        ``gt[turn]`` is a substring of the lowercased ``answer[turn]``.
+
+        Every sample is scored once as the turn-0 side with its sibling as
+        the turn-1 side: a turn-0 hit increments ``T1V``; if the sibling
+        also hits on turn 1, ``T1V_T2V`` is incremented and the two turns
+        must hold the same question. Both counts and their ratio are
+        logged; nothing is returned.
+
+        Raises:
+            ValueError: if ``data`` has an odd number of samples, a pair
+                mixes images, or two matched turns differ in question.
+        """
+        if len(data) % 2:
+            raise ValueError(
+                f'eval_answer expects paired samples, got {len(data)} items'
+            )
+
+        def is_hit(sample, turn):
+            gt, answer = sample['gt'][turn], sample['answer'][turn]
+            return gt.lower() in answer.lower()
+
+        first_hits = 0
+        both_hits = 0
+
+        samples = iter(data)
+        for first, second in zip(samples, samples):
+            if first['image'] != second['image']:
+                raise ValueError('paired samples must share the same image')
+
+            for lead, follow in ((first, second), (second, first)):
+                if not is_hit(lead, 0):
+                    continue
+                first_hits += 1
+                if not is_hit(follow, 1):
+                    continue
+                both_hits += 1
+                if lead['question'][0] != follow['question'][1]:
+                    raise ValueError(
+                        'turn-0 question of one sample must match the '
+                        'turn-1 question of its sibling'
+                    )
+
+        # With no turn-0 hit (including empty data) the ratio is undefined;
+        # log NaN rather than crash with ZeroDivisionError.
+        ratio = both_hits / first_hits if first_hits else float('nan')
+        logger.info(
+            f'CustomGenerateJustInfer T1V: {first_hits}, '
+            f'T1V_T2V: {both_hits}'
+        )
+        logger.info(f'CustomGenerateJustInfer Possibility: {ratio}')