huggingface · clefourrier · Jul 25, 2025 · Jul 25, 2025 · Aug 1, 2025 · Aug 1, 2025
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
@@ -1862,6 +1862,30 @@ def mmlu_helm(line, task_name: str = None):
     )
 
 
+def mmlu_redux_2(line, topic, task_name: str = None):
+    """Build the MMLU-Redux-2 multiple-choice `Doc` for one dataset row.
+
+    `topic` is the subset name; its underscores are shown as spaces in the instruction header.
+    `line` must provide "question", "choices" and "answer" (an index, cast to int if needed).
+    """
+    header = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n"
+    stem = line["question"] + "\n"
+    options = "".join(f"{letter}. {text}\n" for letter, text in zip(LETTER_INDICES, line["choices"]))
+    answer = line["answer"]
+    gold_ix = answer if isinstance(answer, int) else int(answer)
+    # Rows flagged "__few_shots" carry no few-shot sorting target.
+    sorting_target = None if line.get("__few_shots", False) else LETTER_INDICES[gold_ix]
+
+    return Doc(
+        task_name=task_name,
+        query=header + stem + options + "Answer: ",
+        choices=LETTER_INDICES[: len(line["choices"])],
+        gold_index=gold_ix,
+        instruction=header,
+        target_for_fewshot_sorting=sorting_target,
+    )
+
+
 def mmlu_qa_abstract_algebra(line, task_name: str = None):
     return mmlu_qa(line, "abstract_algebra", task_name)
 

diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
@@ -21905,3 +21905,99 @@
     trust_dataset=True,
     version=0,
 )
+
+# MMLU-Redux-2 subset names: each is used as `hf_subset` and yields one "mmlu_redux_2:<subset>" task.
+_MMLU_REDUX_2_SUBSETS = [
+    "abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
+    "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
+    "college_medicine", "college_physics", "computer_security", "conceptual_physics",
+    "econometrics", "electrical_engineering", "elementary_mathematics", "formal_logic",
+    "global_facts", "high_school_biology", "high_school_chemistry", "high_school_computer_science",
+    "high_school_european_history", "high_school_geography", "high_school_government_and_politics",
+    "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics",
+    "high_school_physics", "high_school_psychology", "high_school_statistics",
+    "high_school_us_history", "high_school_world_history", "human_aging", "human_sexuality",
+    "international_law", "jurisprudence", "logical_fallacies", "machine_learning",
+    "management", "marketing", "medical_genetics", "miscellaneous", "moral_disputes",
+    "moral_scenarios", "nutrition", "philosophy", "prehistory", "professional_accounting",
+    "professional_law", "professional_medicine", "professional_psychology", "public_relations",
+    "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"
+]
+
+_mmlu_redux_2_tasks = {  # maps subset name -> LightevalTaskConfig named "mmlu_redux_2:<subset>"
+    subset: LightevalTaskConfig(
+        name=f"mmlu_redux_2:{subset}",
+        suite=["lighteval"],
+        prompt_function=lambda line, task_name=None, s=subset: prompt.mmlu_redux_2(line, s, task_name),
+        hf_repo="edinburgh-dawg/mmlu-redux-2.0",
+        hf_subset=subset,
+        hf_avail_splits=["test"],  # only the "test" split is declared, and it is the one evaluated
+        evaluation_splits=["test"],
+        few_shots_split=None,  # no few-shot split or selection strategy is configured
+        few_shots_select=None,
+        generation_size=1,
+        metrics=[Metrics.loglikelihood_acc],
+        stop_sequence=["\n"],
+        trust_dataset=True,
+        version=0,
+    )
+    for subset in _MMLU_REDUX_2_SUBSETS  # the lambda's `s=subset` default pins each subset (no late binding)
+}
+
+mmlu_redux_2_abstract_algebra = _mmlu_redux_2_tasks["abstract_algebra"]  # module-level alias, one per subset
+mmlu_redux_2_anatomy = _mmlu_redux_2_tasks["anatomy"]  # NOTE(review): presumably looked up by name - confirm
+mmlu_redux_2_astronomy = _mmlu_redux_2_tasks["astronomy"]
+mmlu_redux_2_business_ethics = _mmlu_redux_2_tasks["business_ethics"]
+mmlu_redux_2_clinical_knowledge = _mmlu_redux_2_tasks["clinical_knowledge"]
+mmlu_redux_2_college_biology = _mmlu_redux_2_tasks["college_biology"]
+mmlu_redux_2_college_chemistry = _mmlu_redux_2_tasks["college_chemistry"]
+mmlu_redux_2_college_computer_science = _mmlu_redux_2_tasks["college_computer_science"]
+mmlu_redux_2_college_mathematics = _mmlu_redux_2_tasks["college_mathematics"]
+mmlu_redux_2_college_medicine = _mmlu_redux_2_tasks["college_medicine"]
+mmlu_redux_2_college_physics = _mmlu_redux_2_tasks["college_physics"]
+mmlu_redux_2_computer_security = _mmlu_redux_2_tasks["computer_security"]
+mmlu_redux_2_conceptual_physics = _mmlu_redux_2_tasks["conceptual_physics"]
+mmlu_redux_2_econometrics = _mmlu_redux_2_tasks["econometrics"]
+mmlu_redux_2_electrical_engineering = _mmlu_redux_2_tasks["electrical_engineering"]
+mmlu_redux_2_elementary_mathematics = _mmlu_redux_2_tasks["elementary_mathematics"]
+mmlu_redux_2_formal_logic = _mmlu_redux_2_tasks["formal_logic"]
+mmlu_redux_2_global_facts = _mmlu_redux_2_tasks["global_facts"]
+mmlu_redux_2_high_school_biology = _mmlu_redux_2_tasks["high_school_biology"]
+mmlu_redux_2_high_school_chemistry = _mmlu_redux_2_tasks["high_school_chemistry"]
+mmlu_redux_2_high_school_computer_science = _mmlu_redux_2_tasks["high_school_computer_science"]
+mmlu_redux_2_high_school_european_history = _mmlu_redux_2_tasks["high_school_european_history"]
+mmlu_redux_2_high_school_geography = _mmlu_redux_2_tasks["high_school_geography"]
+mmlu_redux_2_high_school_government_and_politics = _mmlu_redux_2_tasks["high_school_government_and_politics"]
+mmlu_redux_2_high_school_macroeconomics = _mmlu_redux_2_tasks["high_school_macroeconomics"]
+mmlu_redux_2_high_school_mathematics = _mmlu_redux_2_tasks["high_school_mathematics"]
+mmlu_redux_2_high_school_microeconomics = _mmlu_redux_2_tasks["high_school_microeconomics"]
+mmlu_redux_2_high_school_physics = _mmlu_redux_2_tasks["high_school_physics"]
+mmlu_redux_2_high_school_psychology = _mmlu_redux_2_tasks["high_school_psychology"]
+mmlu_redux_2_high_school_statistics = _mmlu_redux_2_tasks["high_school_statistics"]
+mmlu_redux_2_high_school_us_history = _mmlu_redux_2_tasks["high_school_us_history"]
+mmlu_redux_2_high_school_world_history = _mmlu_redux_2_tasks["high_school_world_history"]
+mmlu_redux_2_human_aging = _mmlu_redux_2_tasks["human_aging"]
+mmlu_redux_2_human_sexuality = _mmlu_redux_2_tasks["human_sexuality"]
+mmlu_redux_2_international_law = _mmlu_redux_2_tasks["international_law"]
+mmlu_redux_2_jurisprudence = _mmlu_redux_2_tasks["jurisprudence"]
+mmlu_redux_2_logical_fallacies = _mmlu_redux_2_tasks["logical_fallacies"]
+mmlu_redux_2_machine_learning = _mmlu_redux_2_tasks["machine_learning"]
+mmlu_redux_2_management = _mmlu_redux_2_tasks["management"]
+mmlu_redux_2_marketing = _mmlu_redux_2_tasks["marketing"]
+mmlu_redux_2_medical_genetics = _mmlu_redux_2_tasks["medical_genetics"]
+mmlu_redux_2_miscellaneous = _mmlu_redux_2_tasks["miscellaneous"]
+mmlu_redux_2_moral_disputes = _mmlu_redux_2_tasks["moral_disputes"]
+mmlu_redux_2_moral_scenarios = _mmlu_redux_2_tasks["moral_scenarios"]
+mmlu_redux_2_nutrition = _mmlu_redux_2_tasks["nutrition"]
+mmlu_redux_2_philosophy = _mmlu_redux_2_tasks["philosophy"]
+mmlu_redux_2_prehistory = _mmlu_redux_2_tasks["prehistory"]
+mmlu_redux_2_professional_accounting = _mmlu_redux_2_tasks["professional_accounting"]
+mmlu_redux_2_professional_law = _mmlu_redux_2_tasks["professional_law"]
+mmlu_redux_2_professional_medicine = _mmlu_redux_2_tasks["professional_medicine"]
+mmlu_redux_2_professional_psychology = _mmlu_redux_2_tasks["professional_psychology"]
+mmlu_redux_2_public_relations = _mmlu_redux_2_tasks["public_relations"]
+mmlu_redux_2_security_studies = _mmlu_redux_2_tasks["security_studies"]
+mmlu_redux_2_sociology = _mmlu_redux_2_tasks["sociology"]
+mmlu_redux_2_us_foreign_policy = _mmlu_redux_2_tasks["us_foreign_policy"]
+mmlu_redux_2_virology = _mmlu_redux_2_tasks["virology"]
+mmlu_redux_2_world_religions = _mmlu_redux_2_tasks["world_religions"]