Skip to content

Commit 180daf1

Browse files
committed
adds olympiad bench
1 parent ca47099 commit 180daf1

File tree

3 files changed

+142
-26
lines changed

3 files changed

+142
-26
lines changed

src/lighteval/metrics/dynamic_metrics.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,8 @@ def sample_level_fn(golds: list[str], predictions: list[str], formatted_doc: Doc
246246

247247
# Assert on empty gold and warn on empty pred
248248
if any(len(g) == 0 for g in extracted_golds):
249-
raise ValueError(f"No gold targets found for at least one gold. Gold: {golds}, Pred: {predictions}")
249+
logger.warning(f"We did not manage to extract a gold in the correct format. Gold: {golds}")
250+
extracted_golds = [[gold] for gold in golds]
250251

251252
if all(len(p) == 0 for p in extracted_predictions):
252253
logger.warning(

src/lighteval/models/litellm_model.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,6 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se
141141

142142
response = litellm.completion(**kwargs)
143143

144-
print(response)
145-
print(kwargs)
146-
147144
# If response is empty, retry without caching (maybe the error is recoverable and solved with a retry)
148145
if response.choices[0].message.content is None:
149146
kwargs["caching"] = False

src/lighteval/tasks/extended/olympiade_bench/main.py

Lines changed: 140 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,104 @@
3131
from lighteval.utils.language import Language
3232

3333

34+
# Phrasings for each canonical answer type, used to build the prompt instruction.
chinese_answer_type_dict = {"Numerical": "数值", "Expression": "表达式", "Equation": "方程", "Interval": "区间"}
english_answer_type_dict = {
    "Numerical": "a numerical value",
    "Expression": "an expression",
    "Equation": "an equation",
    "Interval": "an interval",
}


def get_single_answer_type_text(answer_type, is_chinese):
    """Return the human-readable phrase for a single answer type.

    Args:
        answer_type: Raw answer-type string; may carry a "-" suffix
            (e.g. "Numerical-..."), which is stripped before matching.
        is_chinese: If True, return the Chinese phrasing, otherwise English.

    Returns:
        The phrase from the matching answer-type dictionary.

    Raises:
        ValueError: If the answer type matches none of the known categories.
    """
    if "-" in answer_type:  # No need now
        answer_type = answer_type[: answer_type.find("-")]
    for t in ["Numerical", "Expression", "Equation", "Interval"]:
        if t in answer_type:
            if is_chinese:
                return chinese_answer_type_dict[t]
            return english_answer_type_dict[t]
    # Raise instead of exit(): exit() raises SystemExit and would tear down the
    # whole evaluation process; a ValueError lets the caller handle bad data.
    raise ValueError(f"Error parsing answer type {answer_type}!")
53+
54+
55+
def get_answer_type_text(answer_type, is_chinese, multiple_answer):
    """Build the sentence describing the expected answer type(s) for the prompt.

    Args:
        answer_type: Raw answer-type string; may be a comma-separated list when
            the problem has several answers of differing types.
        is_chinese: If True, produce the Chinese phrasing, otherwise English.
        multiple_answer: Whether the problem expects more than one answer.

    Returns:
        The instruction fragment, or "" when no type hint should be emitted.
    """
    # 'Tuple' has various meanings in different context, such as position or
    # values of a series of variable, so it may lead to confusion to directly
    # use 'tuple' in the prompt.
    if "Need_human_evaluate" in answer_type or "Tuple" in answer_type:
        return ""

    if not multiple_answer:
        answer_text = get_single_answer_type_text(answer_type, is_chinese)
        if is_chinese:
            return f",答案类型为{answer_text}"
        return f"The answer of The problem should be {answer_text}. "

    if "," not in answer_type:  # Same answer type for all answers
        answer_text = get_single_answer_type_text(answer_type, is_chinese)
        if is_chinese:
            return f",题目有多个答案,答案类型均为{answer_text}"
        return f"The problem has multiple answers, each of them should be {answer_text}. "

    type_texts = [get_single_answer_type_text(t, is_chinese) for t in answer_type.split(",")]
    if len(set(type_texts)) == 1:
        # Several answers, but all of one type.
        answer_text = type_texts[0]
        if is_chinese:
            return f",题目有多个答案,答案类型均为{answer_text}"
        return f"The problem has multiple answers, each of them should be {answer_text}. "

    # Several answers with distinct types, listed in order.
    if is_chinese:
        answer_text = "、".join(type_texts)
        return f",题目有多个答案,答案类型分别为{answer_text}"
    answer_text = ", ".join(type_texts)
    return f"The problem has multiple answers, with the answers in order being {answer_text}. "
93+
94+
3495
# Very specific task where there are no precise outputs but instead we test if the format obeys rules
def olympiad_bench_prompt(line, task_name: str = None):
    """Turn one OlympiadBench row into a Doc, prepending the competition-style instruction.

    The instruction differs for theorem-proving ("TP") tasks, which request a
    LaTeX proof, versus open-ended tasks, which request a \\boxed{} answer.
    """
    subject = "Math" if "Math" in line["subject"] else "Physics"
    unit = line["unit"]
    is_multiple_answer = line["is_multiple_answer"]

    if "TP" in task_name:  # theorem-proving subset: proof requested, no boxed answer
        instruction = f"The following is a theorem proving problem from an International {subject} competition. Please use logical reasoning and common theorems to prove the proposition in the problem according to the given requirements. Please use LaTeX format to represent the variables and formulas used in the proof."
    else:
        boxed_text = (
            "\\boxed{multiple answers connected with commas}" if is_multiple_answer else "\\boxed{answer}"
        )
        unit_text = ""
        if unit:
            boxed_text += "(unit)"
            unit_text = ", note that the unit of the answer should not be included in \\boxed{}"

        answer_type_text = get_answer_type_text(
            line["answer_type"], is_chinese=False, multiple_answer=is_multiple_answer
        )

        instruction = f'The following is an open-ended problem from an International {subject} competition. {answer_type_text}Please calculate the answer according to the given requirements and the information provided. Please use LaTeX format to represent the variables and formulas used in the solution process and results. Please end your solution with "So the final answer is {boxed_text}." and give the result explicitly{unit_text}.'

    return Doc(
        task_name=task_name,
        query=instruction + "\n" + line["question"],
        choices=[line["final_answer"]],
        gold_index=0,
        instruction=instruction,
        specific={},
    )
44134

@@ -54,7 +144,7 @@ def olympiad_bench_prompt(line, task_name: str = None):
54144
# * COMP: Competition problems
55145
# * CEE: Chinese College Entrance Exam problems
56146

57-
question_type = ["OE", "TP"]
147+
question_type = ["OE"] # "TP"]
58148
multimodality = ["TO"] # MM
59149
subject = ["physics", "maths"]
60150
language = ["en"] # "zh"]
@@ -69,6 +159,29 @@ def olympiad_bench_prompt(line, task_name: str = None):
69159
for src in source:
70160
olympiad_bench_subsets.append(f"{qt}_{mm}_{sub}_{lang}_{src}")
71161

162+
available_subsets = [
163+
"OE_MM_maths_en_COMP",
164+
"OE_MM_maths_zh_CEE",
165+
"OE_MM_maths_zh_COMP",
166+
"OE_MM_physics_en_COMP",
167+
"OE_MM_physics_zh_CEE",
168+
"OE_TO_maths_en_COMP",
169+
"OE_TO_maths_zh_CEE",
170+
"OE_TO_maths_zh_COMP",
171+
"OE_TO_physics_en_COMP",
172+
"OE_TO_physics_zh_CEE",
173+
"TP_MM_maths_en_COMP",
174+
"TP_MM_maths_zh_CEE",
175+
"TP_MM_maths_zh_COMP",
176+
"TP_MM_physics_en_COMP",
177+
"TP_TO_maths_en_COMP",
178+
"TP_TO_maths_zh_CEE",
179+
"TP_TO_maths_zh_COMP",
180+
"TP_TO_physics_en_COMP",
181+
]
182+
183+
olympiad_bench_subsets = set(olympiad_bench_subsets).intersection(available_subsets)
184+
72185
extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]
73186

74187
metric = multilingual_extractive_match_metric(
@@ -77,23 +190,28 @@ def olympiad_bench_prompt(line, task_name: str = None):
77190
pred_extraction_target=extraction_targets,
78191
precision=6,
79192
)
80-
# We create the task config
81-
olympiad_bench = LightevalTaskConfig(
82-
name="olympiad_bench",
83-
prompt_function=olympiad_bench_prompt,
84-
suite=["extended"],
85-
hf_repo="Hothan/OlympiadBench",
86-
hf_subset=olympiad_bench_subsets[0],
87-
metric=[metric],
88-
hf_avail_splits=["train"],
89-
evaluation_splits=["train"],
90-
few_shots_split="train",
91-
few_shots_select="random_sampling",
92-
generation_size=2048,
93-
stop_sequence=[], # no stop sequence, will use eot token
94-
version="1.0",
95-
)
96193

97-
# print(olympiad_bench)
194+
task_configs = []
195+
196+
for subset in olympiad_bench_subsets:
197+
# We create the task config
198+
task_configs.append(
199+
LightevalTaskConfig(
200+
name="olympiad_bench:" + subset,
201+
prompt_function=olympiad_bench_prompt,
202+
suite=["extended"],
203+
hf_repo="Hothan/OlympiadBench",
204+
hf_subset=subset,
205+
metric=[metric],
206+
hf_avail_splits=["train"],
207+
evaluation_splits=["train"],
208+
few_shots_split="train",
209+
few_shots_select="random_sampling",
210+
generation_size=2048,
211+
stop_sequence=[], # no stop sequence, will use eot token
212+
version=1,
213+
)
214+
)
215+
98216

99-
TASKS_TABLE = [olympiad_bench]
217+
TASKS_TABLE = task_configs

0 commit comments

Comments
 (0)