Commit fa4860f

adds mmlu-pro (#1031)
* adds mmlu-pro
* adds mmlu-pro
* add mmlu-pro with inspectai
1 parent 880bebe commit fa4860f

2 files changed: +82 lines, -0 lines

src/lighteval/main_inspect.py

Lines changed: 1 addition & 0 deletions
@@ -473,5 +473,6 @@ def eval(
     task = "lighteval|ifeval|0"
     task = "lighteval|gpqa|0"
     task = "lighteval|ifbench_test|0"
+    task = "lighteval|mmlu_pro|0"
     model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
     eval(models=[model], tasks=task)
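
For reference (not part of the diff): a minimal sketch of running the newly registered task through the same example entry point these lines exercise. Importing eval from lighteval.main_inspect and reading the trailing "0" as the few-shot count are assumptions; the task and model strings are copied from the diff.

# Sketch only, not part of the commit. Import path is an assumption.
from lighteval.main_inspect import eval

task = "lighteval|mmlu_pro|0"  # suite|task|num_few_shot (0 assumed to mean zero-shot)
model = "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct:nebius"
eval(models=[model], tasks=task)
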
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
"""
name:
MMLU Pro

dataset:
TIGER-Lab/MMLU-Pro

abstract:
MMLU-Pro dataset is a more robust and challenging massive multi-task
understanding dataset tailored to more rigorously benchmark large language
models' capabilities. This dataset contains 12K complex questions across various
disciplines.

languages:
english

tags:
general-knowledge, knowledge, multiple-choice

paper:
https://arxiv.org/abs/2406.01574
"""

from string import ascii_uppercase

from inspect_ai.dataset import Sample
from inspect_ai.scorer import choice
from inspect_ai.solver import multiple_choice

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{question}

{choices}

Answer:""".strip()


def mmlu_pro_prompt_function(line, task_name: str = None):
    choices = "\n".join([f"{letter}: {choice}" for letter, choice in zip(ascii_uppercase, line["options"])])

    query = TEMPLATE.format(
        question=line["question"],
        choices=choices,
    )

    return Doc(
        task_name=task_name,
        query=query,
        choices=ascii_uppercase[: len(line["options"])],  # one letter per answer option
        gold_index=line["answer_index"],
        instruction=query,
    )


def record_to_sample(record):
    return Sample(input=record["question"], target=record["answer"], choices=record["options"])


mmlu_pro = LightevalTaskConfig(
    name="mmlu_pro",
    prompt_function=mmlu_pro_prompt_function,
    sample_fields=record_to_sample,
    solver=[multiple_choice(cache=True)],
    scorer=choice(),
    suite=("lighteval",),
    hf_repo="TIGER-Lab/MMLU-Pro",
    hf_subset="default",
    hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea",
    evaluation_splits=("test",),
    few_shots_split="validation",
    metrics=[Metrics.gpqa_instruct_metric],
)

TASKS_TABLE = [mmlu_pro]
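
For illustration only (not part of the commit): minimal sketches of how the two mapping functions above behave on a toy record. The field names ("question", "options", "answer_index", "answer") come from the code above; the values are invented, and "answer" is assumed to hold the gold letter since it becomes Sample.target.

# Toy record, values invented for illustration.
record = {
    "question": "Which gas makes up most of Earth's atmosphere?",
    "options": ["Oxygen", "Carbon dioxide", "Nitrogen", "Argon"],
    "answer_index": 2,
    "answer": "C",
}

# Lighteval path: the prompt function fills TEMPLATE and builds a Doc.
doc = mmlu_pro_prompt_function(record, task_name="lighteval|mmlu_pro|0")
# doc.query ends with:
#   A: Oxygen
#   B: Carbon dioxide
#   C: Nitrogen
#   D: Argon
#
#   Answer:
# doc.gold_index == 2, i.e. the gold letter is "C".

# inspect_ai path: record_to_sample feeds the multiple_choice() solver and choice() scorer.
sample = record_to_sample(record)
# sample.input   -> the raw question text
# sample.target  -> "C"
# sample.choices -> the four option strings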
