Skip to content

Commit 9619194

Browse files
NathanHB and Copilot authored
Adds GSM-PLUS (#780)
* commit
* Update src/lighteval/tasks/default_prompts.py

Co-authored-by: Copilot <[email protected]>

---------

Co-authored-by: Copilot <[email protected]>
1 parent 9dc2e53 commit 9619194

File tree

2 files changed

+32
-0
lines changed

2 files changed

+32
-0
lines changed

src/lighteval/tasks/default_prompts.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -911,6 +911,22 @@ def gpqa_instruct(line, task_name: str = None):
911911
)
912912

913913

914+
def gsm_plus(line, task_name: str = None):
    """Build the prompt document for one GSM-Plus row.

    GSM-Plus is GSM8K with 8 prompt variations per sample. Rows whose
    ``perturbation_type`` is "critical thinking" (around 1k/10k) are skipped,
    as they are a bit trickier to eval with regular text extraction.

    Args:
        line: Dataset row; reads the "perturbation_type", "question" and
            "answer" keys.
        task_name: Name forwarded unchanged to the resulting ``Doc``.

    Returns:
        A ``Doc`` whose single choice is the row's answer (gold index 0), or
        ``None`` when the row is skipped.
    """
    keep_row = line["perturbation_type"] != "critical thinking"
    if keep_row:
        return Doc(
            task_name=task_name,
            query=f"Question: {line['question']}\n\nAnswer:",
            choices=[line["answer"]],
            gold_index=0,
        )

    # Critical-thinking perturbations are dropped from the evaluation set.
    return None
928+
929+
914930
def gsm8k(line, task_name: str = None):
915931
# Has special analysis in metric for number decomposition
916932
return Doc(

src/lighteval/tasks/default_tasks.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7960,6 +7960,22 @@
79607960
trust_dataset=True,
79617961
version=0,
79627962
)
7963+
# Task configuration for GSM-Plus, wired to the `gsm_plus` prompt function.
gsm_plus = LightevalTaskConfig(
    # Task identity.
    name="gsm_plus",
    suite=["lighteval"],
    version=0,
    # Prompt construction.
    prompt_function=prompt.gsm_plus,
    # Dataset location: only "test" is evaluated, although "testmini" is also
    # declared as an available split.
    hf_repo="qintongli/GSM-Plus",
    hf_subset="default",
    hf_avail_splits=["test", "testmini"],
    evaluation_splits=["test"],
    trust_dataset=True,
    # No few-shot split or selection strategy is configured.
    few_shots_split=None,
    few_shots_select=None,
    # Generation settings are left unset (None).
    generation_size=None,
    stop_sequence=None,
    # Scoring.
    metric=[Metrics.expr_gold_metric],
)
79637979
gsm8k_leaderboard = LightevalTaskConfig(
79647980
name="gsm8k",
79657981
suite=["leaderboard"],

0 commit comments

Comments
 (0)