
Commit 46a3f0d

mmlu benchmark (#164)

Added the MMLU benchmark. The HFDataReader only downloads 32/57 categories; that needs to be fixed.

Authored by jyotianeja
Co-authored-by: jyotianeja <[email protected]>

1 parent f948f57 · commit 46a3f0d

File tree

3 files changed: +221 -0 lines changed

eureka_ml_insights/data_utils/mmlu_utils.py

Lines changed: 107 additions & 0 deletions
```python
from dataclasses import dataclass

import pandas as pd

from .transform import DFTransformBase

# The list of 57 tasks is taken from `https://github.com/hendrycks/test/blob/master/categories.py`

MMLUCategories = {
    "STEM": [
        "astronomy",
        "college_physics",
        "conceptual_physics",
        "high_school_physics",
        "college_chemistry",
        "high_school_chemistry",
        "college_biology",
        "high_school_biology",
        "college_computer_science",
        "computer_security",
        "high_school_computer_science",
        "machine_learning",
        "abstract_algebra",
        "college_mathematics",
        "elementary_mathematics",
        "high_school_mathematics",
        "high_school_statistics",
        "electrical_engineering",
    ],
    "Humanities": [
        "high_school_european_history",
        "high_school_us_history",
        "high_school_world_history",
        "prehistory",
        "formal_logic",
        "logical_fallacies",
        "moral_disputes",
        "moral_scenarios",
        "philosophy",
        "world_religions",
        "international_law",
        "jurisprudence",
        "professional_law",
    ],
    "Social Sciences": [
        "high_school_government_and_politics",
        "public_relations",
        "security_studies",
        "us_foreign_policy",
        "human_sexuality",
        "sociology",
        "high_school_macroeconomics",
        "high_school_microeconomics",
        "econometrics",
        "high_school_geography",
        "high_school_psychology",
        "professional_psychology",
    ],
    "Other (Business, Health, Misc.)": [
        "global_facts",
        "miscellaneous",
        "professional_accounting",
        "business_ethics",
        "management",
        "marketing",
        "anatomy",
        "clinical_knowledge",
        "college_medicine",
        "human_aging",
        "medical_genetics",
        "nutrition",
        "professional_medicine",
        "virology",
    ],
}

MMLUTaskToCategories = {task: cat for cat, tasks in MMLUCategories.items() for task in tasks}

MMLUAll = [task for cat in MMLUCategories.values() for task in cat]


@dataclass
class CreateMMLUPrompts(DFTransformBase):
    """Transform to create prompts for MMLU dataset."""

    def __init__(self):
        self.multi_option_example_format = "{}\n{}\nAnswer with the option's letter from the given options directly."

    def _create_prompt(self, sample):
        question = sample["question"]
        options = sample["choices"]
        example = ""
        start_chr = "A"
        index2ans = {}

        for option in options:
            example += f"({start_chr}) {option}\n"
            index2ans[start_chr] = option
            start_chr = chr(ord(start_chr) + 1)

        prompt = self.multi_option_example_format.format(question, example)

        return prompt

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        df["prompt"] = df.apply(self._create_prompt, axis=1)

        return df
```
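
To illustrate the prompt format this transform produces, here is a minimal usage sketch on a toy DataFrame; the question and choices are invented for illustration, but the column names match the MMLU schema the transform expects:

```python
import pandas as pd

from eureka_ml_insights.data_utils.mmlu_utils import CreateMMLUPrompts

# Toy rows with the same schema the transform expects: "question" and "choices".
df = pd.DataFrame(
    {
        "question": ["What is 2 + 2?"],
        "choices": [["3", "4", "5", "6"]],
    }
)

df = CreateMMLUPrompts().transform(df)
print(df.loc[0, "prompt"])
# What is 2 + 2?
# (A) 3
# (B) 4
# (C) 5
# (D) 6
#
# Answer with the option's letter from the given options directly.
```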

eureka_ml_insights/user_configs/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -62,6 +62,7 @@
 from .mathverse import MATHVERSE_PIPELINE
 from .mathvision import MATHVISION_PIPELINE
 from .mathvista import MATHVISTA_PIPELINE
+from .mmlu import MMLU_BASELINE_PIPELINE
 from .mmmu import MMMU_BASELINE_PIPELINE
 from .nocaps import NOCAPS_PIPELINE
 from .nondeterminism import (
@@ -132,6 +133,7 @@
     GPQA_PIPELINE_5Run,
     Drop_Experiment_Pipeline,
     GEOMETER_PIPELINE,
+    MMLU_BASELINE_PIPELINE,
     MMMU_BASELINE_PIPELINE,
     KITAB_ONE_BOOK_CONSTRAINT_PIPELINE,
     KITAB_ONE_BOOK_CONSTRAINT_PIPELINE_WITH_CONTEXT,
```
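
With these two lines, the new pipeline class is re-exported at the `user_configs` package level, which is how pipelines in this repo are looked up by name; the model config itself is supplied at run time (see the class docstring below). A minimal sketch of consuming the export, assuming the package is installed:

```python
# Minimal sketch: the new pipeline class is now importable from the package root.
from eureka_ml_insights.user_configs import MMLU_BASELINE_PIPELINE
```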
eureka_ml_insights/user_configs/mmlu.py

Lines changed: 112 additions & 0 deletions
```python
import os
from typing import Any

from eureka_ml_insights.configs.experiment_config import ExperimentConfig
from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
    ASTEvalTransform,
    ColumnRename,
    CopyColumn,
    DataReader,
    HFDataReader,
    MapStringsTransform,
    SequenceTransform,
    AddColumnAndData,
    SamplerTransform,
)
from eureka_ml_insights.data_utils.mmlu_utils import (
    CreateMMLUPrompts,
    MMLUAll,
    MMLUTaskToCategories,
)
from eureka_ml_insights.metrics import CountAggregator, MMMUMetric

from eureka_ml_insights.data_utils.data import DataLoader

from eureka_ml_insights.configs import (
    AggregatorConfig,
    DataSetConfig,
    EvalReportingConfig,
    InferenceConfig,
    MetricConfig,
    ModelConfig,
    PipelineConfig,
    PromptProcessingConfig,
)


class MMLU_BASELINE_PIPELINE(ExperimentConfig):
    """
    This defines an ExperimentConfig pipeline for the MMLU dataset.
    There is no model_config by default; the model config must be passed in via command line.
    """

    def configure_pipeline(self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]) -> PipelineConfig:

        self.data_processing_comp = PromptProcessingConfig(
            component_type=PromptProcessing,
            data_reader_config=DataSetConfig(
                HFDataReader,
                {
                    "path": "cais/mmlu",
                    "split": "test",
                    "tasks": ["abstract_algebra"],  # MMLUAll
                    "transform": SequenceTransform(
                        [
                            # ASTEvalTransform(columns=["choices"]),
                            CreateMMLUPrompts(),
                            ColumnRename(name_mapping={"answer": "ground_truth", "choices": "target_options"}),
                            AddColumnAndData("question_type", "multiple-choice"),
                            # SamplerTransform(sample_count=10, random_seed=42),
                        ]
                    ),
                },
            ),
            output_dir=os.path.join(self.log_dir, "data_processing_output"),
            ignore_failure=False,
        )

        # Configure the inference component
        self.inference_comp = InferenceConfig(
            component_type=Inference,
            model_config=model_config,
            data_loader_config=DataSetConfig(
                DataLoader,
                {"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
            ),
            output_dir=os.path.join(self.log_dir, "inference_result"),
            resume_from=resume_from,
        )

        # Configure the evaluation and reporting component.
        self.evalreporting_comp = EvalReportingConfig(
            component_type=EvalReporting,
            data_reader_config=DataSetConfig(
                DataReader,
                {
                    "path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
                    "format": ".jsonl",
                    "transform": SequenceTransform(
                        [
                            CopyColumn(column_name_src="__hf_task", column_name_dst="category"),
                            MapStringsTransform(
                                columns=["category"],
                                mapping=MMLUTaskToCategories,
                            ),
                        ]
                    ),
                },
            ),
            metric_config=MetricConfig(MMMUMetric),
            aggregator_configs=[
                AggregatorConfig(CountAggregator, {"column_names": ["MMMUMetric_result"], "normalize": True}),
                AggregatorConfig(
                    CountAggregator,
                    {"column_names": ["MMMUMetric_result"], "group_by": "category", "normalize": True},
                ),
            ],
            output_dir=os.path.join(self.log_dir, "eval_report"),
        )

        # Configure the pipeline
        return PipelineConfig([self.data_processing_comp, self.inference_comp, self.evalreporting_comp], self.log_dir)
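```

Since the eval component rolls per-task results up to category level by copying `__hf_task` into `category` and mapping it through `MMLUTaskToCategories`, a quick sanity-check sketch of that mapping (assuming `eureka_ml_insights` is importable) might look like:

```python
from eureka_ml_insights.data_utils.mmlu_utils import MMLUAll, MMLUTaskToCategories

# The flattened task list should cover all 57 MMLU subjects.
assert len(MMLUAll) == 57

# Each task maps to exactly one of the four top-level categories.
print(MMLUTaskToCategories["abstract_algebra"])  # STEM
print(MMLUTaskToCategories["econometrics"])      # Social Sciences
print(MMLUTaskToCategories["virology"])          # Other (Business, Health, Misc.)
```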
