"""
name:
MMLU Pro

dataset:
TIGER-Lab/MMLU-Pro

abstract:
MMLU-Pro is a more robust and challenging massive multi-task understanding
dataset designed to rigorously benchmark the capabilities of large language
models. It contains 12K complex questions across various disciplines.

languages:
english

tags:
general-knowledge, knowledge, multiple-choice

paper:
https://arxiv.org/abs/2406.01574
"""

from string import ascii_uppercase

from inspect_ai.dataset import Sample
from inspect_ai.scorer import choice
from inspect_ai.solver import multiple_choice

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.

{question}

{choices}

Answer:""".strip()


def mmlu_pro_prompt_function(line, task_name: str = None):
    # MMLU-Pro questions carry up to ten options, so the valid answer letters
    # run A..J rather than the fixed A..D of classic MMLU.
    letters = ascii_uppercase[: len(line["options"])]
    choices = "\n".join([f"{letter}: {choice}" for letter, choice in zip(letters, line["options"])])

    query = TEMPLATE.format(
        letters=letters,
        question=line["question"],
        choices=choices,
    )

    return Doc(
        task_name=task_name,
        query=query,
        # Slice by the number of options, not by len(choices): `choices` is the
        # joined prompt string, so its length is a character count.
        choices=list(letters),
        gold_index=line["answer_index"],
        instruction=query,
    )
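
# Sanity check of the mapping above with a made-up record (values are
# illustrative, not from the dataset):
#
#   doc = mmlu_pro_prompt_function(
#       {"question": "What is 2 + 2?", "options": ["3", "4", "5"], "answer_index": 1},
#       task_name="mmlu_pro",
#   )
#   assert doc.choices == ["A", "B", "C"]
#   assert doc.gold_index == 1
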
def record_to_sample(record):
    # Map a raw HF record to an inspect_ai Sample: `answer` is the gold letter
    # (e.g. "B") and `options` is the list of choice strings.
    return Sample(input=record["question"], target=record["answer"], choices=record["options"])

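# For a record like the hypothetical one above (with its gold letter in the
# `answer` field, e.g. "B"), this yields
#   Sample(input="What is 2 + 2?", target="B", choices=["3", "4", "5"])
# where `target` is the gold letter; inspect_ai's multiple_choice solver
# renders the lettered options itself.
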
mmlu_pro = LightevalTaskConfig(
    name="mmlu_pro",
    prompt_function=mmlu_pro_prompt_function,
    sample_fields=record_to_sample,
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    suite=("lighteval",),
    hf_repo="TIGER-Lab/MMLU-Pro",
    hf_subset="default",
    # Pin the dataset to a fixed revision so results stay reproducible.
    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
    evaluation_splits=("test",),
    few_shots_split="validation",
    metrics=[Metrics.gpqa_instruct_metric],
)

TASKS_TABLE = [mmlu_pro]
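
# Example invocation (a sketch: the exact CLI syntax varies across lighteval
# versions, and the task spec follows the "suite|task|num_fewshot|truncation"
# pattern used by the lighteval CLI):
#
#   lighteval accelerate "model_name=openai-community/gpt2" "lighteval|mmlu_pro|0|0"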