
Commit 4c9af85

adds aime24, 25 and math500 (#586)
* commit
* Apply suggestions from code review
* commit
* add prompt to math 500
* add prompt to math 500
1 parent 066f84f commit 4c9af85

File tree: 6 files changed (+125, -16 lines)

Lines changed: 2 additions & 3 deletions

@@ -1,6 +1,6 @@
 model:
   base_params:
-    model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,revision=main,dtype=bfloat16" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
+    model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B-Instruct,revision=main,dtype=bfloat16"
   generation:
     temperature: 0.3
     repetition_penalty: 1.0
@@ -10,5 +10,4 @@ model:
     top_k: -1
     min_p: 0.0
     top_p: 0.9
-    max_new_tokens: 256
-    stop_tokens: ["<EOS>", "<PAD>"]
+    max_new_tokens: 2048

src/lighteval/main_endpoint.py

Lines changed: 2 additions & 2 deletions

@@ -468,8 +468,8 @@ def litellm(
     if model_args.endswith(".yaml"):
         model_config = LiteLLMModelConfig.from_path(model_args)
     else:
-        model_name = model_args.split(",")[0].strip()
-        model_config = LiteLLMModelConfig(model=model_name)
+        model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
+        model_config = LiteLLMModelConfig(**model_args_dict)

     pipeline_params = PipelineParameters(
         launcher_type=parallelism_manager,
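For reference, a minimal standalone sketch of what the new comma-separated parsing produces; the argument string below is a hypothetical example, not taken from this commit:

# Mirrors the dict comprehension added to litellm(): each "key=value" pair becomes a
# keyword argument for LiteLLMModelConfig, and a bare key without "=" becomes True.
model_args = "model=openai/gpt-4o,api_base=https://example.com/v1"  # hypothetical string
model_args_dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
print(model_args_dict)  # {'model': 'openai/gpt-4o', 'api_base': 'https://example.com/v1'}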

src/lighteval/metrics/metrics.py

Lines changed: 20 additions & 0 deletions

@@ -25,7 +25,9 @@
 from aenum import Enum

 from lighteval.metrics.dynamic_metrics import (
+    ExprExtractionConfig,
     IndicesExtractionConfig,
+    LatexExtractionConfig,
     multilingual_extractive_match_metric,
 )
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
@@ -178,6 +180,15 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    expr_gold_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        fallback_mode="first_match",
+        precision=5,
+        gold_extraction_target=(ExprExtractionConfig(),),
+        # Match boxed first before trying other regexes
+        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+        aggregation_function=max,
+    )
     extractiveness = SampleLevelMetricGrouping(
         metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
         sample_level_fn=Extractiveness(
@@ -238,6 +249,15 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
+    latex_gold_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        fallback_mode="first_match",
+        precision=5,
+        gold_extraction_target=(LatexExtractionConfig(),),
+        # Match boxed first before trying other regexes
+        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig(boxed_match_priority=0)),
+        aggregation_function=max,
+    )
     loglikelihood_acc = SampleLevelMetric(
         metric_name="acc",
         sample_level_fn=LoglikelihoodAcc(logprob_normalization=None).compute,
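The two new Metrics members differ only in how the gold answer is parsed: expr_gold_metric expects the gold to be a plain number or expression (the AIME answer format), while latex_gold_metric expects a LaTeX solution (the MATH-500 format). Both parse the model's prediction by looking for a \boxed{...} answer first and keep the best-scoring extraction (aggregation_function=max). A small sketch of how task definitions reference them, mirroring the task configs added later in this commit:

from lighteval.metrics.metrics import Metrics

# Gold answers are plain numbers or expressions (AIME-style):
aime_metrics = [Metrics.expr_gold_metric]
# Gold answers are full LaTeX solutions (MATH-500-style):
math500_metrics = [Metrics.latex_gold_metric]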

src/lighteval/models/vllm/vllm_model.py

Lines changed: 6 additions & 4 deletions

@@ -247,7 +247,11 @@ def greedy_until(
             # the case! Because of that we only use batch size of 1
             stop_tokens = dataset[0].stop_sequence

-            max_new_tokens = dataset[0].generation_size  # could be none
+            max_new_tokens = (
+                dataset[0].generation_size
+                if self.sampling_params.max_tokens is None
+                else self.sampling_params.max_tokens
+            )
             returns_logits = dataset[0].use_logits
             num_samples = dataset[0].num_samples

@@ -321,9 +325,7 @@ def _generate(
         sampling_params = self.sampling_params.clone() or SamplingParams()
         if generate:
             sampling_params.n = num_samples
-            sampling_params.max_tokens = (
-                max_new_tokens if sampling_params.max_tokens is None else sampling_params.max_tokens
-            )
+            sampling_params.max_tokens = max_new_tokens
             sampling_params.stop = stop_tokens
             sampling_params.logprobs = 1 if returns_logits else 0
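A standalone sketch of the precedence this change introduces, in plain Python rather than lighteval API: a max_tokens value set on the model's sampling parameters (for example via max_new_tokens in the YAML config above) now always wins, and the task's generation_size is only the fallback.

def resolve_max_new_tokens(task_generation_size, config_max_tokens):
    # Same condition as the new code in greedy_until(): the configured value,
    # when present, overrides whatever the task requests.
    return task_generation_size if config_max_tokens is None else config_max_tokens

assert resolve_max_new_tokens(32768, None) == 32768  # task generation_size used
assert resolve_max_new_tokens(32768, 2048) == 2048   # configured max_new_tokens wins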

src/lighteval/tasks/default_prompts.py

Lines changed: 53 additions & 7 deletions

@@ -43,6 +43,24 @@
 # fmt: on


+def aime_prompt_fn(line, task_name: str = None):
+    # Prompt template adapted from
+    # - simple-evals: https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17
+    # - Llama 3: https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details
+    # Note that it is important to have the final answer in a box for math-verify to work correctly
+    MATH_QUERY_TEMPLATE = """
+Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
+
+{Question}
+""".strip()
+    return Doc(
+        task_name=task_name,
+        query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]),
+        choices=[line["answer"]],
+        gold_index=0,
+    )
+
+
 def anli(line, task_name: str = None):
     return Doc(
         task_name=task_name,

@@ -710,22 +728,31 @@ def ethics_virtue(line, task_name: str = None):


 def gpqa(line, task_name: str = None):
+    # Prompt template from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14
+    GPQA_QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
+
+{Question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
     gold_index = random.randint(0, 3)
     choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
     choices.insert(gold_index, line["Correct Answer"])

-    instruction = "Select the correct answer to the following questions.\n\n"
-
-    query = f"Question: {line['Question']}\n"
-    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
-    query += "Answer: "
+    query = GPQA_QUERY_TEMPLATE.format(
+        A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"]
+    )

     return Doc(
         task_name=task_name,
-        query=f"{instruction}{query}",
+        query=query,
         choices=LETTER_INDICES[: len(choices)],
         gold_index=gold_index,
-        instruction=instruction,
+        instruction=query,
     )


@@ -1257,6 +1284,25 @@ def lsat_qa(line, task_name: str = None):
     )


+def math_500(line, task_name: str = None):
+    # Prompt template adapted from
+    # - simple-evals: https://github.com/openai/simple-evals/blob/6e84f4e2aed6b60f6a0c7b8f06bbbf4bfde72e58/math_eval.py#L17
+    # - Llama 3: https://huggingface.co/datasets/meta-llama/Llama-3.2-1B-Instruct-evals/viewer/Llama-3.2-1B-Instruct-evals__math__details?views%5B%5D=llama_32_1b_instruct_evals__math__details
+    # Note that it is important to have the final answer in a box for math-verify to work correctly
+    MATH_QUERY_TEMPLATE = """
+Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.
+
+{Question}
+""".strip()
+
+    return Doc(
+        task_name=task_name,
+        query=MATH_QUERY_TEMPLATE.format(Question=line["problem"]),
+        gold_index=0,
+        choices=[line["solution"]],
+    )
+
+
 def math(line, task_name: str = None):
     return Doc(
         task_name=task_name,
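For illustration, here is the new query template rendered on a made-up problem; line["problem"] would normally come from the AIME or MATH-500 dataset, and the question below is purely hypothetical:

MATH_QUERY_TEMPLATE = """
Solve the following math problem efficiently and clearly. The last line of your response should be of the following format: 'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' (without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering.

{Question}
""".strip()

# The doubled braces keep a literal {ANSWER} in the rendered prompt.
print(MATH_QUERY_TEMPLATE.format(Question="What is 2 + 2?"))  # stand-in for line["problem"]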

src/lighteval/tasks/default_tasks.py

Lines changed: 42 additions & 0 deletions

@@ -312,6 +312,34 @@
     trust_dataset=True,
     version=0,
 )
+aime24 = LightevalTaskConfig(
+    name="aime24",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,
+    hf_repo="HuggingFaceH4/aime_2024",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,
+    metric=[Metrics.expr_gold_metric],
+    version=1,
+)
+aime25 = LightevalTaskConfig(
+    name="aime25",
+    suite=["lighteval"],
+    prompt_function=prompt.aime_prompt_fn,
+    hf_repo="yentinglin/aime_2025",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=10000,
+    metric=[Metrics.expr_gold_metric],
+    version=1,
+)
 anachronisms_bigbench = LightevalTaskConfig(
     name="anachronisms",
     suite=["bigbench", "bigbench_json"],
@@ -9597,6 +9625,20 @@
     trust_dataset=True,
     version=0,
 )
+math_500 = LightevalTaskConfig(
+    name="math_500",
+    suite=["lighteval"],
+    prompt_function=prompt.math_500,
+    hf_repo="HuggingFaceH4/MATH-500",
+    hf_subset="default",
+    hf_avail_splits=["test"],
+    evaluation_splits=["test"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,
+    metric=[Metrics.latex_gold_metric],
+    version=1,
+)
 math_algebra_lighteval = LightevalTaskConfig(
     name="math:algebra",
     suite=["lighteval", "math"],
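The new tasks are plain LightevalTaskConfig entries in the "lighteval" suite and are selected with the usual suite|task|few_shot spec (for example lighteval|aime24|0|0); exact CLI invocation may differ between lighteval versions. A small sketch of inspecting the new configs programmatically, using only field names that appear in this diff:

from lighteval.tasks import default_tasks

# aime24, aime25 and math_500 are the module-level configs added by this commit.
for cfg in (default_tasks.aime24, default_tasks.aime25, default_tasks.math_500):
    print(cfg.name, cfg.hf_repo, cfg.generation_size, cfg.metric)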
