diff --git a/src/lighteval/main_inspect.py b/src/lighteval/main_inspect.py
index e51869f41..0fdb67ead 100644
--- a/src/lighteval/main_inspect.py
+++ b/src/lighteval/main_inspect.py
@@ -473,5 +473,6 @@ def eval(
     task = "lighteval|ifeval|0"
     task = "lighteval|gpqa|0"
     task = "lighteval|ifbench_test|0"
+    task = "lighteval|mmlu_pro|0"
     model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
     eval(models=[model], tasks=task)
diff --git a/src/lighteval/tasks/tasks/mmlu_pro.py b/src/lighteval/tasks/tasks/mmlu_pro.py
new file mode 100644
index 000000000..5a536ffed
--- /dev/null
+++ b/src/lighteval/tasks/tasks/mmlu_pro.py
@@ -0,0 +1,81 @@
+"""
+name:
+MMLU Pro
+
+dataset:
+TIGER-Lab/MMLU-Pro
+
+abstract:
+MMLU-Pro is a more robust and challenging massive multi-task understanding
+dataset designed to more rigorously benchmark large language models'
+capabilities. It contains 12K complex questions across various
+disciplines.
+
+languages:
+english
+
+tags:
+general-knowledge, knowledge, multiple-choice
+
+paper:
+https://arxiv.org/abs/2406.01574
+"""
+
+from string import ascii_uppercase
+
+from inspect_ai.dataset import Sample
+from inspect_ai.scorer import choice
+from inspect_ai.solver import multiple_choice
+
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+
+
+TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of the option letters shown below. Think step by step before answering.
+
+{question}
+
+{choices}
+
+Answer:""".strip()
+
+
+def mmlu_pro_prompt_function(line, task_name: str = None):
+    choices = "\n".join([f"{letter}: {choice}" for letter, choice in zip(ascii_uppercase, line["options"])])
+
+    query = TEMPLATE.format(
+        question=line["question"],
+        choices=choices,
+    )
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=ascii_uppercase[: len(line["options"])],
+        gold_index=line["answer_index"],
+        instruction=query,
+    )
+
+
+def record_to_sample(record):
+    return Sample(input=record["question"], target=record["answer"], choices=record["options"])
+
+
+mmlu_pro = LightevalTaskConfig(
+    name="mmlu_pro",
+    prompt_function=mmlu_pro_prompt_function,
+    sample_fields=record_to_sample,
+    solver=[multiple_choice(cache=True)],
+    scorer=choice(),
+    suite=("lighteval",),
+    hf_repo="TIGER-Lab/MMLU-Pro",
+    hf_subset="default",
+    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
+    evaluation_splits=("test",),
+    few_shots_split="validation",
+    metrics=[Metrics.gpqa_instruct_metric],
+)
+
+TASKS_TABLE = [mmlu_pro]
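
For a quick smoke test of the new task, the same entry point can be driven from a short script. This is a minimal sketch, assuming `eval` in `lighteval.main_inspect` accepts the keyword arguments it is called with in the first hunk; the task and model strings are copied from the diff, and provider credentials for the chosen model are assumed to be configured.

```python
# Minimal sketch: run the newly registered mmlu_pro task through the
# inspect entry point, mirroring the call shown in the first hunk above.
# Assumes credentials for the chosen inference provider are configured.
from lighteval.main_inspect import eval as run_eval

task = "lighteval|mmlu_pro|0"
model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
run_eval(models=[model], tasks=task)
```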