1 change: 1 addition & 0 deletions eval_anything/benchmarks/__init__.py
@@ -12,6 +12,7 @@
"benchmarks.text_to_text.CEval.eval",
"benchmarks.text_image_to_text.mmmu.eval",
"benchmarks.text_image_to_text.mathvision.eval",
"benchmarks.text_image_to_text.olympiadbench.eval",
"benchmarks.text_audio_to_text.mmau.eval",
"benchmarks.text_video_to_text.mmvu.eval",
]
@@ -0,0 +1,78 @@
# ============================================
# Evaluation config:
# - Dataset HF path
# - Dataset split
# - Dataset size
# - Dataset modality
# - Evaluation method (multiple choice / generation / ...)
# - Evaluation metrics (accuracy / ...)
# - Whether few-shot is supported
# - Option label format (ABCD or 1234 ...) and the markers used to judge correctness (true/false or 1/0 ...)
# ============================================
dataset:
name: olympiadbench
path: Hothan/OlympiadBench
split: train
size:
modality: text-image-to-text
fewshot_data_path: null
fewshot_data_name: null
fewshot_data_split: null
cot_fewshot_data_path: null
cot_fewshot_data_name: null
cot_fewshot_data_split: null
max_shot: 0
default_task_list: ["OE_MM_maths_en_COMP", "OE_MM_maths_zh_CEE", "OE_MM_maths_zh_COMP", "OE_MM_physics_en_COMP", "OE_MM_physics_zh_CEE", "OE_TO_maths_en_COMP", "OE_TO_maths_zh_CEE", "OE_TO_maths_zh_COMP", "OE_TO_physics_en_COMP", "OE_TO_physics_zh_CEE"]
task_defaults: &task_defaults
type: MultiChoice
question_key: question
answer_key: options
ground_truth_key: answer
candidate_labels: ["A", "B", "C", "D", "E"]
avalable_evaluate_tools: ["match_multi-choice_and_open-ended"]
task:
- name: OE_MM_maths_en_COMP
data_files: null
<<: *task_defaults
- name: OE_MM_maths_zh_CEE
data_files: null
<<: *task_defaults
- name: OE_MM_maths_zh_COMP
data_files: null
<<: *task_defaults
- name: OE_MM_physics_en_COMP
data_files: null
<<: *task_defaults
- name: OE_MM_physics_zh_CEE
data_files: null
<<: *task_defaults
- name: OE_TO_maths_en_COMP
data_files: null
<<: *task_defaults
- name: OE_TO_maths_zh_CEE
data_files: null
<<: *task_defaults
- name: OE_TO_maths_zh_COMP
data_files: null
<<: *task_defaults
- name: OE_TO_physics_en_COMP
data_files: null
<<: *task_defaults
- name: OE_TO_physics_zh_CEE
data_files: null
<<: *task_defaults
answer_extractor:
- name: match_multi-choice_and_open-ended
Contributor comment:
The name should match the function.
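A possible way to align them (a sketch only; it assumes the task-level avalable_evaluate_tools entries refer to extractors by their name field, so both spots would change together):

    avalable_evaluate_tools: ["regex_match_latex_math"]
    ...
    answer_extractor:
      - name: regex_match_latex_math
        function: regex_match_latex_math
        args:
          match_index: -1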

function: regex_match_latex_math
args:
match_index: -1 # index of the regex match to use (-1 = last match)
metrics:
- name: accuracy
function: accuracy
args:
overall_metrics:
- name: average
function: average_across_tasks
args:
null

59 changes: 59 additions & 0 deletions eval_anything/benchmarks/text_image_to_text/olympiadbench/eval.py
@@ -0,0 +1,59 @@
"""
Inherits from eval-anything.pipeline.mm_und_benchmark
"""
from eval_anything.pipeline.mm_und_benchmark import MMUndBenchmark
from collections import namedtuple
from eval_anything.utils.data_type import EvaluationResult
from eval_anything.models.base_model import BaseModel
from eval_anything.utils.logger import EvalLogger
from eval_anything.utils.register import BenchmarkRegistry
from eval_anything.utils.cache_manager import CacheManager

@BenchmarkRegistry.register('olympiadbench')
class olympiadbenchBenchmark(MMUndBenchmark):
def __init__(self,
model: BaseModel,
eval_cfgs: namedtuple,
model_cfgs: namedtuple,
infer_cfgs: namedtuple,
output_path: str,
cache_manager: CacheManager,
logger: EvalLogger):
super().__init__(model, eval_cfgs, model_cfgs, infer_cfgs, output_path, cache_manager, logger)
self.benchmark_name = "olympiadbench"
self.benchmark_cfgs = self.get_benchmark_cfgs(self.benchmark_name)

def run(self,
task_list: list[str],) -> tuple[dict[str, list[EvaluationResult]], dict[str, dict[str, float]], dict[str, dict[str, float]]]:
"""Run benchmark
Args:
task_list (list[str]): task list

Returns:
evaluation_details (dict[str, list[EvaluationResult]]): evaluation details
evaluation_results (dict[str, dict[str, float]]): per-task evaluation results
overall_result (dict[str, dict[str, float]]): overall metrics across tasks
"""

self.logger.log('info', f'Evaluating {self.benchmark_name}...')

dataloader = self.init_dataloader(self.eval_cfgs, self.benchmark_cfgs)
input_data = dataloader.load_dataset(task_list) # Input_data: list[InferenceInput]

inference_outputs = self.batch_inference(self.model, input_data)
ref_answers = {task: self.get_ref_answer(input_data[task], inference_outputs[task]) for task in task_list}
evaluation_details = {}
evaluation_results = {}

for task in task_list:
evaluation_details[task], evaluation_results[task] = self.calculate_metrics(self.benchmark_name, inference_outputs[task], ref_answers[task], self.benchmark_cfgs.answer_extractor, self.benchmark_cfgs.metrics, judge_method="judge_equal_list")

if len(task_list) > 1:
overall_result = self.calculate_overall_metrics(self.benchmark_cfgs.overall_metrics, result=evaluation_results)
else:
overall_result = {}

self.display_benchmark_results(self.benchmark_name, evaluation_results)
if overall_result != {}:
self.display_benchmark_results(self.benchmark_name, overall_result)
self.save_benchmark_details(self.output_path, self.benchmark_name, input_data, evaluation_details)
return evaluation_details, evaluation_results, overall_result
5 changes: 4 additions & 1 deletion eval_anything/configs/evaluate.yaml
@@ -19,6 +19,7 @@
"CEval": ['accountant', 'advanced_mathematics', 'art_studies', 'basic_medicine', 'business_administration', 'chinese_language_and_literature', 'civil_servant', 'clinical_medicine', 'college_chemistry', 'college_economics', 'college_physics', 'college_programming', 'computer_architecture', 'computer_network', 'discrete_mathematics', 'education_science', 'electrical_engineer', 'environmental_impact_assessment_engineer', 'fire_engineer', 'high_school_biology', 'high_school_chemistry', 'high_school_chinese', 'high_school_geography', 'high_school_history', 'high_school_mathematics', 'high_school_physics', 'high_school_politics', 'ideological_and_moral_cultivation', 'law', 'legal_professional', 'logic', 'mao_zedong_thought', 'marxism', 'metrology_engineer', 'middle_school_biology', 'middle_school_chemistry', 'middle_school_geography', 'middle_school_history', 'middle_school_mathematics', 'middle_school_physics', 'middle_school_politics', 'modern_chinese_history', 'operating_system', 'physician', 'plant_protection', 'probability_and_statistics', 'professional_tour_guide', 'sports_science', 'tax_accountant', 'teacher_qualification', 'urban_and_rural_planner', 'veterinary_medicine'],
"mmmu": ["Accounting"],
"mathvision": ["default"],
"olympiadbench": ["OE_MM_maths_en_COMP", "OE_MM_maths_zh_CEE", "OE_MM_maths_zh_COMP","OE_MM_physics_en_COMP", "OE_MM_physics_zh_CEE", "OE_TO_maths_en_COMP","OE_TO_maths_zh_CEE", "OE_TO_maths_zh_COMP", "OE_TO_physics_en_COMP","OE_TO_physics_zh_CEE"],
"mmau": ["MMAU-mini"],
"mmvu": ["default"],
}
@@ -35,6 +36,7 @@
"CEval": 5,
"mmmu": 5,
"mathvision": 0,
"olympiadbench": 0,
"mmau": 0,
"mmvu": 0,
}
@@ -51,6 +53,7 @@
"CEval": False,
"mmmu": False,
"mathvision": False,
"olympiadbench": False,
"mmau": False,
"mmvu": False,
}
@@ -68,7 +71,7 @@
# Model type ("LM" or "MM")
model_type: "MM"
# Chat template
chat_template: Llava
chat_template: Qwen2-VL
infer_cfgs:
# Inference backend
infer_backend: "vllm"
148 changes: 148 additions & 0 deletions eval_anything/dataloader/format_mm_dataset.py
@@ -275,6 +275,154 @@ def _to_InferenceInput(self, dataset: Dataset) -> List["InferenceInput"]:
metadata="video",
)
)
return inference_inputs

@MMDatasetRegistry.register("olympiadbench")
class olympiadbenchDataset(BaseMMDataset):
def __init__(self, bench_cfgs: namedtuple, task: namedtuple, enable_cot: bool, num_shot: int):
super().__init__(bench_cfgs, task, enable_cot, num_shot)

def set_few_shot_examples(self, few_shot_dataset: Dataset | None):
raise NotImplementedError("olympiadbench does not support few-shot learning.")

def get_image_indice(self, text: str)->List[int]:
pattern = r'<image (\d+)>'
matches = re.findall(pattern, text)
return [int(num) for num in matches]


# refer: https://github.com/mathllm/MATH-V/blob/main/models/Qwen-VL.py#L19
def _to_InferenceInput(self, dataset: Dataset) -> List["InferenceInput"]:
"""
Convert a dataset to a list of InferenceInput objects.

Args:
dataset: Dataset object containing questions, options, and images

Returns:
List of InferenceInput objects ready for model inference
"""

# refer: https://github.com/OpenBMB/OlympiadBench/blob/main/inference/code/evaluators/evaluator.py
chinese_answer_type_dict = {
'Numerical': '数值',
'Expression': '表达式',
'Equation': '方程',
'Interval': '区间'
}
english_answer_type_dict = {
'Numerical': 'a numerical value',
'Expression': 'an expression',
'Equation': 'an equation',
'Interval': 'an interval'
}

def get_single_answer_type_text(answer_type, is_chinese):
if '-' in answer_type: # No need now
answer_type = answer_type[:answer_type.find('-')]
for t in ['Numerical', 'Expression', 'Equation', 'Interval']:
if t in answer_type:
if is_chinese:
return chinese_answer_type_dict[t]
else:
return english_answer_type_dict[t]
raise ValueError(f'Error parsing answer type {answer_type}!')

def get_answer_type_text(answer_type, is_chinese, multiple_answer):
if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type):
full_answer_text = ''
else:
if not multiple_answer:
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',答案类型为{answer_text}'
else:
full_answer_text = f"The answer of The problem should be {answer_text}. "
else:
if ',' not in answer_type: # Same answer type for all answers
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
answer_types = answer_type.split(',')
answer_types = [get_single_answer_type_text(t, is_chinese) for t in answer_types]
if len(set(answer_types)) == 1:
answer_text = answer_types[0]
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
if is_chinese:
answer_text = '、'.join(answer_types)
full_answer_text = f',题目有多个答案,答案类型分别为{answer_text}'
else:
answer_text = ', '.join(answer_types)
full_answer_text = f'The problem has multiple answers, with the answers in order being {answer_text}. '
return full_answer_text

def make_prompt(sample, is_chinese, is_math, is_theorem_proving):
if is_chinese:
subject_content = '数学' if is_math else '物理'
if is_theorem_proving:
prompt = f'以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。证明过程中使用的变量和公式请使用LaTeX格式表示。'
else:
answer_type_text = get_answer_type_text(sample['answer_type'], is_chinese=True, multiple_answer=sample['is_multiple_answer'])
if sample['is_multiple_answer']:
multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
else:
multiple_answer_text = '\\boxed{答案}'
unit_text = ''
if sample['unit']:
multiple_answer_text += '(单位)'
unit_text = ',注意答案的单位不要放在\\boxed{}中'
prompt = f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以"所以最终答案是{multiple_answer_text}。"显式给出结果{unit_text}。'
else:
subject_content = 'Math' if is_math else 'Physics'
if is_theorem_proving:
prompt = f'The following is a theorem proving problem from an International {subject_content} competition. Please use logical reasoning and common theorems to prove the proposition in the problem according to the given requirements. Please use LaTeX format to represent the variables and formulas used in the proof.'
else:
if sample['is_multiple_answer']:
multiple_answer_text = '\\boxed{multiple answers connected with commas}'
else:
multiple_answer_text = '\\boxed{answer}'
unit_text = ''
if sample['unit']:
multiple_answer_text += '(unit)'
unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
answer_type_text = get_answer_type_text(sample['answer_type'], is_chinese=False, multiple_answer=sample['is_multiple_answer'])
prompt = f'The following is an open-ended problem from an International {subject_content} competition. {answer_type_text}Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with "So the final answer is {multiple_answer_text}." and give the result explicitly{unit_text}.'
return prompt

inference_inputs = []
for item in dataset:
is_chinese = item['language'] != 'English'
is_math = item['subject'] == 'Math'
is_theorem_proving = False # we won't test theorem proving in olympiadbench
if is_chinese:
subject = '数学' if is_math else '物理'
system_prompt = f"你是一个中文人工智能助手。请根据要求,完成下面的{subject}竞赛题目。"
else:
subject = 'Math' if is_math else 'Physics'
system_prompt = f'You are an AI assistant. Please answer the following {subject} competition problems as required.'

formatted_prompt = make_prompt(item, is_chinese, is_math, is_theorem_proving) + item['question']
# The original image placeholder is <image_1>; convert it to <image 1> for prompt_to_conversation
formatted_prompt = re.sub(r'<image_(\d+)>', r'<image \1>', formatted_prompt)

image_ids = self.get_image_indice(formatted_prompt)
images = [item[f'image_{id}'] for id in image_ids]
conversation = ImageManager.prompt_to_conversation(user_prompt=formatted_prompt, system_prompt=system_prompt, images=images)

inference_inputs.append(
InferenceInput(
task=self.task.name,
conversation=conversation,
ref_answer=str(item['final_answer'][0])
)
)

if len(inference_inputs) > 10:  # cap the number of samples loaded per task
break
2 changes: 1 addition & 1 deletion eval_anything/evaluate_tools/t2t_tools.py
@@ -149,7 +149,7 @@ def apply(self, data_1, data_2) -> bool:
break
return correct
else:
return gold_answer in data_1
return str(gold_answer) in data_1

def __call__(self, data_1, data_2) -> bool:
return self.apply(data_1, data_2)
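
For context, a minimal illustration (with made-up values) of why the str() cast matters when a benchmark passes a non-string gold answer, for example a numeric value:

    gold_answer = 42                                  # e.g. a numeric reference answer
    data_1 = "So the final answer is \\boxed{42}."
    # Without the cast, `gold_answer in data_1` raises
    # TypeError: 'in <string>' requires string as left operand, not int
    assert str(gold_answer) in data_1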
1 change: 1 addition & 0 deletions eval_anything/utils/utils.py
@@ -37,6 +37,7 @@
'beavertails': 'text_to_text',
'mmmu': 'text_image_to_text',
'mathvision': 'text_image_to_text',
'olympiadbench': 'text_image_to_text',
'mmau': 'text_audio_to_text',
'mmvu': 'text_video_to_text',
}