1 change: 1 addition & 0 deletions eval_anything/benchmarks/__init__.py
@@ -12,6 +12,7 @@
"benchmarks.text_to_text.CEval.eval",
"benchmarks.text_image_to_text.mmmu.eval",
"benchmarks.text_image_to_text.mathvision.eval",
"benchmarks.text_image_to_text.olympiadbench.eval",
"benchmarks.text_audio_to_text.mmau.eval",
"benchmarks.text_video_to_text.mmvu.eval",
]
@@ -0,0 +1,78 @@
# ============================================
# Evaluation config:
# - Dataset HF path
# - Dataset split
# - Dataset size
# - Dataset modality
# - Evaluation method (multiple choice / generation / ...)
# - Evaluation metrics (accuracy / ...)
# - Whether few-shot is supported
# - Option label format (ABCD or 1234 ...) and the markers used to judge correctness (true/false or 1/0 ...)
# ============================================
dataset:
name: olympiadbench
path: Hothan/OlympiadBench
split: train
size:
modality: text-image-to-text
fewshot_data_path: null
fewshot_data_name: null
fewshot_data_split: null
cot_fewshot_data_path: null
cot_fewshot_data_name: null
cot_fewshot_data_split: null
max_shot: 0
default_task_list: ["OE_MM_maths_en_COMP", "OE_MM_maths_zh_CEE", "OE_MM_maths_zh_COMP", "OE_MM_physics_en_COMP", "OE_MM_physics_zh_CEE", "OE_TO_maths_en_COMP", "OE_TO_maths_zh_CEE", "OE_TO_maths_zh_COMP", "OE_TO_physics_en_COMP", "OE_TO_physics_zh_CEE"]
task_defaults: &task_defaults
type: MultiChoice
question_key: question
answer_key: options
ground_truth_key: answer
candidate_labels: ["A", "B", "C", "D", "E"]
avalable_evaluate_tools: ["match_multi-choice_and_open-ended"]
task:
- name: OE_MM_maths_en_COMP
data_files: null
<<: *task_defaults
- name: OE_MM_maths_zh_CEE
data_files: null
<<: *task_defaults
- name: OE_MM_maths_zh_COMP
data_files: null
<<: *task_defaults
- name: OE_MM_physics_en_COMP
data_files: null
<<: *task_defaults
- name: OE_MM_physics_zh_CEE
data_files: null
<<: *task_defaults
- name: OE_TO_maths_en_COMP
data_files: null
<<: *task_defaults
- name: OE_TO_maths_zh_CEE
data_files: null
<<: *task_defaults
- name: OE_TO_maths_zh_COMP
data_files: null
<<: *task_defaults
- name: OE_TO_physics_en_COMP
data_files: null
<<: *task_defaults
- name: OE_TO_physics_zh_CEE
data_files: null
<<: *task_defaults
answer_extractor:
- name: match_multi-choice_and_open-ended
Contributor comment:
The name should match the function.
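A possible way to align them (a sketch only; it assumes the task-level avalable_evaluate_tools entries refer to extractors by their name field, so both spots would change together):

    avalable_evaluate_tools: ["regex_match_latex_math"]
    ...
    answer_extractor:
      - name: regex_match_latex_math
        function: regex_match_latex_math
        args:
          match_index: -1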

function: regex_match_latex_math
args:
match_index: -1 # index of the regex match to use (-1 = last match)
metrics:
- name: accuracy
function: accuracy
args:
overall_metrics:
- name: average
function: average_across_tasks
args:
null

59 changes: 59 additions & 0 deletions eval_anything/benchmarks/text_image_to_text/olympiadbench/eval.py
@@ -0,0 +1,59 @@
"""
Inherits from eval-anything.pipeline.mm_und_benchmark
"""
from eval_anything.pipeline.mm_und_benchmark import MMUndBenchmark
from collections import namedtuple
from eval_anything.utils.data_type import EvaluationResult
from eval_anything.models.base_model import BaseModel
from eval_anything.utils.logger import EvalLogger
from eval_anything.utils.register import BenchmarkRegistry
from eval_anything.utils.cache_manager import CacheManager

@BenchmarkRegistry.register('olympiadbench')
class olympiadbenchBenchmark(MMUndBenchmark):
def __init__(self,
model: BaseModel,
eval_cfgs: namedtuple,
model_cfgs: namedtuple,
infer_cfgs: namedtuple,
output_path: str,
cache_manager: CacheManager,
logger: EvalLogger):
super().__init__(model, eval_cfgs, model_cfgs, infer_cfgs, output_path, cache_manager, logger)
self.benchmark_name = "olympiadbench"
self.benchmark_cfgs = self.get_benchmark_cfgs(self.benchmark_name)

def run(self,
task_list: list[str],) -> tuple[dict[str, list[EvaluationResult]], dict[str, dict[str, float]], dict[str, dict[str, float]]]:
"""Run benchmark
Args:
task_list (list[str]): task list

Returns:
evaluation_details (dict[str, list[EvaluationResult]]): evaluation details
evaluation_results (dict[str, dict[str, float]]): per-task evaluation results
overall_result (dict[str, dict[str, float]]): overall metrics across tasks
"""

self.logger.log('info', f'Evaluating {self.benchmark_name}...')

dataloader = self.init_dataloader(self.eval_cfgs, self.benchmark_cfgs)
input_data = dataloader.load_dataset(task_list) # Input_data: list[InferenceInput]

inference_outputs = self.batch_inference(self.model, input_data)
ref_answers = {task: self.get_ref_answer(input_data[task], inference_outputs[task]) for task in task_list}
evaluation_details = {}
evaluation_results = {}

for task in task_list:
evaluation_details[task], evaluation_results[task] = self.calculate_metrics(self.benchmark_name, inference_outputs[task], ref_answers[task], self.benchmark_cfgs.answer_extractor, self.benchmark_cfgs.metrics, judge_method="judge_equal_list")

if len(task_list) > 1:
overall_result = self.calculate_overall_metrics(self.benchmark_cfgs.overall_metrics, result=evaluation_results)
else:
overall_result = {}

self.display_benchmark_results(self.benchmark_name, evaluation_results)
if overall_result != {}:
self.display_benchmark_results(self.benchmark_name, overall_result)
self.save_benchmark_details(self.output_path, self.benchmark_name, input_data, evaluation_details)
return evaluation_details, evaluation_results, overall_result
5 changes: 4 additions & 1 deletion eval_anything/configs/evaluate.yaml
@@ -19,6 +19,7 @@
"CEval": ['accountant', 'advanced_mathematics', 'art_studies', 'basic_medicine', 'business_administration', 'chinese_language_and_literature', 'civil_servant', 'clinical_medicine', 'college_chemistry', 'college_economics', 'college_physics', 'college_programming', 'computer_architecture', 'computer_network', 'discrete_mathematics', 'education_science', 'electrical_engineer', 'environmental_impact_assessment_engineer', 'fire_engineer', 'high_school_biology', 'high_school_chemistry', 'high_school_chinese', 'high_school_geography', 'high_school_history', 'high_school_mathematics', 'high_school_physics', 'high_school_politics', 'ideological_and_moral_cultivation', 'law', 'legal_professional', 'logic', 'mao_zedong_thought', 'marxism', 'metrology_engineer', 'middle_school_biology', 'middle_school_chemistry', 'middle_school_geography', 'middle_school_history', 'middle_school_mathematics', 'middle_school_physics', 'middle_school_politics', 'modern_chinese_history', 'operating_system', 'physician', 'plant_protection', 'probability_and_statistics', 'professional_tour_guide', 'sports_science', 'tax_accountant', 'teacher_qualification', 'urban_and_rural_planner', 'veterinary_medicine'],
"mmmu": ["Accounting"],
"mathvision": ["default"],
"olympiadbench": ["OE_MM_maths_en_COMP", "OE_MM_maths_zh_CEE", "OE_MM_maths_zh_COMP","OE_MM_physics_en_COMP", "OE_MM_physics_zh_CEE", "OE_TO_maths_en_COMP","OE_TO_maths_zh_CEE", "OE_TO_maths_zh_COMP", "OE_TO_physics_en_COMP","OE_TO_physics_zh_CEE"],
"mmau": ["MMAU-mini"],
"mmvu": ["default"],
}
@@ -35,6 +36,7 @@
"CEval": 5,
"mmmu": 5,
"mathvision": 0,
"olympiadbench": 0,
"mmau": 0,
"mmvu": 0,
}
@@ -51,6 +53,7 @@
"CEval": False,
"mmmu": False,
"mathvision": False,
"olympiadbench": False,
"mmau": False,
"mmvu": False,
}
@@ -68,7 +71,7 @@
# Model type ("LM" or "MM")
model_type: "MM"
# Chat template
chat_template: Llava
chat_template: Qwen2-VL
infer_cfgs:
# Inference backend
infer_backend: "vllm"
148 changes: 148 additions & 0 deletions eval_anything/dataloader/format_mm_dataset.py
@@ -275,6 +275,154 @@ def _to_InferenceInput(self, dataset: Dataset) -> List["InferenceInput"]:
metadata="video",
)
)
return inference_inputs

@MMDatasetRegistry.register("olympiadbench")
class olympiadbenchDataset(BaseMMDataset):
def __init__(self, bench_cfgs: namedtuple, task: namedtuple, enable_cot: bool, num_shot: int):
super().__init__(bench_cfgs, task, enable_cot, num_shot)

def set_few_shot_examples(self, few_shot_dataset: Dataset | None):
raise NotImplementedError("olympiadbench does not support few-shot learning.")

def get_image_indice(self, text: str)->List[int]:
pattern = r'<image (\d+)>'
matches = re.findall(pattern, text)
return [int(num) for num in matches]


# refer: https://github.com/mathllm/MATH-V/blob/main/models/Qwen-VL.py#L19
def _to_InferenceInput(self, dataset: Dataset) -> List["InferenceInput"]:
"""
Convert a dataset to a list of InferenceInput objects.

Args:
dataset: Dataset object containing questions, options, and images

Returns:
List of InferenceInput objects ready for model inference
"""

# refer: https://github.com/OpenBMB/OlympiadBench/blob/main/inference/code/evaluators/evaluator.py
chinese_answer_type_dict = {
'Numerical': '数值',
'Expression': '表达式',
'Equation': '方程',
'Interval': '区间'
}
english_answer_type_dict = {
'Numerical': 'a numerical value',
'Expression': 'an expression',
'Equation': 'an equation',
'Interval': 'an interval'
}

def get_single_answer_type_text(answer_type, is_chinese):
if '-' in answer_type: # No need now
answer_type = answer_type[:answer_type.find('-')]
for t in ['Numerical', 'Expression', 'Equation', 'Interval']:
if t in answer_type:
if is_chinese:
return chinese_answer_type_dict[t]
else:
return english_answer_type_dict[t]
raise ValueError(f'Error parsing answer type {answer_type}!')

def get_answer_type_text(answer_type, is_chinese, multiple_answer):
if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type):
full_answer_text = ''
else:
if not multiple_answer:
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',答案类型为{answer_text}'
else:
full_answer_text = f"The answer of The problem should be {answer_text}. "
else:
if ',' not in answer_type: # Same answer type for all answers
answer_text = get_single_answer_type_text(answer_type, is_chinese)
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
answer_types = answer_type.split(',')
answer_types = [get_single_answer_type_text(t, is_chinese) for t in answer_types]
if len(set(answer_types)) == 1:
answer_text = answer_types[0]
if is_chinese:
full_answer_text = f',题目有多个答案,答案类型均为{answer_text}'
else:
full_answer_text = f'The problem has multiple answers, each of them should be {answer_text}. '
else:
if is_chinese:
answer_text = '、'.join(answer_types)
full_answer_text = f',题目有多个答案,答案类型分别为{answer_text}'
else:
answer_text = ', '.join(answer_types)
full_answer_text = f'The problem has multiple answers, with the answers in order being {answer_text}. '
return full_answer_text

def make_prompt(sample, is_chinese, is_math, is_theorem_proving):
if is_chinese:
subject_content = '数学' if is_math else '物理'
if is_theorem_proving:
prompt = f'以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。证明过程中使用的变量和公式请使用LaTeX格式表示。'
else:
answer_type_text = get_answer_type_text(sample['answer_type'], is_chinese=True, multiple_answer=sample['is_multiple_answer'])
if sample['is_multiple_answer']:
multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
else:
multiple_answer_text = '\\boxed{答案}'
unit_text = ''
if sample['unit']:
multiple_answer_text += '(单位)'
unit_text = ',注意答案的单位不要放在\\boxed{}中'
prompt = f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以"所以最终答案是{multiple_answer_text}。"显式给出结果{unit_text}。'
else:
subject_content = 'Math' if is_math else 'Physics'
if is_theorem_proving:
prompt = f'The following is a theorem proving problem from an International {subject_content} competition. Please use logical reasoning and common theorems to prove the proposition in the problem according to the given requirements. Please use LaTeX format to represent the variables and formulas used in the proof.'
else:
if sample['is_multiple_answer']:
multiple_answer_text = '\\boxed{multiple answers connected with commas}'
else:
multiple_answer_text = '\\boxed{answer}'
unit_text = ''
if sample['unit']:
multiple_answer_text += '(unit)'
unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
answer_type_text = get_answer_type_text(sample['answer_type'], is_chinese=False, multiple_answer=sample['is_multiple_answer'])
prompt = f'The following is an open-ended problem from an International {subject_content} competition. {answer_type_text}Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with "So the final answer is {multiple_answer_text}." and give the result explicitly{unit_text}.'
return prompt

inference_inputs = []
for item in dataset:
is_chinese = item['language'] != 'English'
is_math = item['subject'] == 'Math'
is_theorem_proving = False # we won't test theorem proving in olympiadbench
if is_chinese:
subject = '数学' if is_math else '物理'
system_prompt = f"你是一个中文人工智能助手。请根据要求,完成下面的{subject}竞赛题目。"
else:
subject = 'Math' if is_math else 'Physics'
system_prompt = f'You are an AI assistant. Please answer the following {subject} competition problems as required.'

formatted_prompt = make_prompt(item, is_chinese, is_math, is_theorem_proving) + item['question']
# The original image placeholder is <image_1>; convert it to <image 1> for prompt_to_conversation
formatted_prompt = re.sub(r'<image_(\d+)>', r'<image \1>', formatted_prompt)

image_ids = self.get_image_indice(formatted_prompt)
images = [item[f'image_{id}'] for id in image_ids]
conversation = ImageManager.prompt_to_conversation(user_prompt=formatted_prompt, system_prompt=system_prompt, images=images)

inference_inputs.append(
InferenceInput(
task=self.task.name,
conversation=conversation,
ref_answer=str(item['final_answer'][0])
)
)

if len(inference_inputs) > 10:  # cap the number of samples loaded per task
break
2 changes: 1 addition & 1 deletion eval_anything/evaluate_tools/t2t_tools.py
@@ -149,7 +149,7 @@ def apply(self, data_1, data_2) -> bool:
break
return correct
else:
return gold_answer in data_1
return str(gold_answer) in data_1

def __call__(self, data_1, data_2) -> bool:
return self.apply(data_1, data_2)
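
For context, a minimal illustration (with made-up values) of why the str() cast matters when a benchmark passes a non-string gold answer, for example a numeric value:

    gold_answer = 42                                  # e.g. a numeric reference answer
    data_1 = "So the final answer is \\boxed{42}."
    # Without the cast, `gold_answer in data_1` raises
    # TypeError: 'in <string>' requires string as left operand, not int
    assert str(gold_answer) in data_1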
1 change: 1 addition & 0 deletions eval_anything/utils/utils.py
@@ -37,6 +37,7 @@
'beavertails': 'text_to_text',
'mmmu': 'text_image_to_text',
'mathvision': 'text_image_to_text',
'olympiadbench': 'text_image_to_text',
'mmau': 'text_audio_to_text',
'mmvu': 'text_video_to_text',
}