Commit 5b3cd26: added instruct specific evals
1 parent 0ba8812

Lines changed: 346 additions & 0 deletions
@@ -0,0 +1,346 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from functools import partial

import numpy as np
from langcodes import standardize_tag

from lighteval.metrics.dynamic_metrics import (
    IndicesExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.metrics.metrics import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.metrics.metrics_sample import (
    PassAtK,
)
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language
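
# This module registers "instruct" variants of Belebele, Global-MMLU and MMLU-Pro.
# Each prompt asks the model to reason step by step and end with an
# "Answer: $LETTER" line; generations are scored with a pass@1 metric built on
# multilingual_extractive_match_metric, which extracts the answer letter from the
# free-form output and compares it with the gold letter.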

TASKS_TABLE = []

lang_to_literal = {
    "deu": Language.GERMAN,
    "fra": Language.FRENCH,
    "ita": Language.ITALIAN,
    "por": Language.PORTUGUESE,
    "spa": Language.SPANISH,
}


def belebele_prompt(line, task_name: str = None):
    lang_to_template = {
        "eng_Latn": "Given the following passage, query, and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. Think step by step before answering.\n\n###\nPassage:\n{Passage}\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
        "deu_Latn": "Gib basierend auf dem folgenden Textabschnitt, der Frage und den Antwortmöglichkeiten den Buchstaben aus, der der richtigen Antwort entspricht. Die letzte Zeile deiner Antwort sollte folgendes Format haben: 'Antwort: $BUCHSTABE' (ohne Anführungszeichen), wobei BUCHSTABE einer der folgenden ist: A, B, C oder D. Denke Schritt für Schritt, bevor du antwortest.\n\n###\nTextabschnitt:\n{Passage}\n###\nFrage:\n{Question}\n###\nAntwortmöglichkeiten:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
        "fra_Latn": "A partir du passage suivant, de la question et des choix de réponses, indiquez la lettre correspondant à la bonne réponse. La dernière ligne de votre réponse doit avoir le format suivant : 'Réponse: $LETTRE' (sans les guillemets) où LETTRE est l'une des lettres: A, B, C ou D. Réfléchissez étape par étape avant de répondre.\n\n###\nPassage:\n{Passage}\n###\nRequête:\n{Question}\n###\nChoix:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
        "ita_Latn": "Dato il seguente passaggio, un quesito e le diverse opzioni per una risposta, indicare la lettera corrispondente alla risposta corretta. L'ultima riga della risposta deve avere il seguente formato: 'Risposta: $LETTERA' (senza virgolette), e LETTERA è necessariamente una tra A, B, C, D. Prima di rispondere, è importante che si ragioni passo per passo.\n\n###\nPassaggio:\n{Passage}\n###\nQuesito:\n{Question}\n###\nOpzioni:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
        "por_Latn": "Tendo em conta a seguinte passagem, pergunta e opções de resposta, indique a letra correspondente à resposta correta. A última linha da sua resposta deve ter o seguinte formato: 'Resposta: $LETRA' (sem aspas) em que LETRA é uma de A, B, C ou D. Pense passo a passo antes de responder.\n\n###\nPassagem:\n{Passage}\n###\nPergunta:\n{Question}\n###\nOpções:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
        "spa_Latn": "Dado el siguiente contexto, pregunta y opciones para la respuesta, escriba la letra correspondiente a la respuesta correcta. La última línea de su respuesta debe seguir el siguiente formato: 'Respuesta: $LETTER' (sin comillas) donde LETTER es A, B, C o D. Piense paso a paso antes de responder.\n\n###\nContexto:\n{Passage}\n###\nPregunta:\n{Question}\n###\nOpciones:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
    }

    gold_index = int(line["correct_answer_num"]) - 1
    choices = [line["mc_answer1"], line["mc_answer2"], line["mc_answer3"], line["mc_answer4"]]
    # Fall back to the English template when the row's dialect is not covered above.
    query_template = lang_to_template.get(line["dialect"], lang_to_template["eng_Latn"])
    query = query_template.format(
        A=choices[0],
        B=choices[1],
        C=choices[2],
        D=choices[3],
        Passage=line["flores_passage"],
        Question=line["question"],
    )
    instruction = query_template.split("\n\n###")[0]

    return Doc(
        task_name=task_name,
        query=query,
        choices=LETTER_INDICES[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
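
# A minimal usage sketch (not part of the commit): the prompt function maps one
# Belebele row to a lighteval Doc. Field values below are placeholders; only the
# column names are taken from the code above.
#
#   row = {
#       "dialect": "deu_Latn",
#       "flores_passage": "<passage>",
#       "question": "<question>",
#       "mc_answer1": "<a>", "mc_answer2": "<b>", "mc_answer3": "<c>", "mc_answer4": "<d>",
#       "correct_answer_num": "2",
#   }
#   doc = belebele_prompt(row, task_name="belebele_instruct_deu_Latn")
#   # doc.choices == ["A", "B", "C", "D"], doc.gold_index == 1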

BELEBELE_TASKS = [
    LightevalTaskConfig(
        name=f"belebele_instruct_{lang}_Latn",
        prompt_function=belebele_prompt,
        suite=["extended"],
        hf_repo="facebook/belebele",
        hf_subset=f"{lang}_Latn",
        evaluation_splits=["test"],
        hf_avail_splits=["test"],
        few_shots_split=None,
        few_shots_select=None,
        generation_size=32768,  # needed for reasoning models like R1
        metric=[
            SampleLevelMetric(
                metric_name="pass@1:1_samples",
                sample_level_fn=PassAtK(
                    k=1,
                    n=1,
                    # `lang=lang` binds the loop variable at definition time so each
                    # task scores with its own language (avoids late binding in the
                    # comprehension).
                    sample_scoring_function=lambda pred, ref, doc, lang=lang: multilingual_extractive_match_metric(
                        language=lang_to_literal[lang],
                        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                        precision=6,
                    ).sample_level_fn([ref], [pred], doc),
                ).compute,
                category=MetricCategory.GENERATIVE_SAMPLING,
                use_case=MetricUseCase.REASONING,
                corpus_level_fn=np.mean,
                higher_is_better=True,
            )
        ],
        stop_sequence=[],  # no stop sequence, will use eos token
        trust_dataset=True,
        version=1,
    )
    for lang in [
        "deu",
        "fra",
        "ita",
        "por",
        "spa",
    ]
]

TASKS_TABLE.extend(BELEBELE_TASKS)
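
# Scoring note: with k=1 and n=1, PassAtK draws a single generation per document
# and scores it according to whether multilingual_extractive_match_metric extracts
# the same answer letter as the gold target; the corpus-level score is the mean of
# these per-sample values.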

MMLU_SUBSETS = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]


class GlobalMMLUPrompt:
    def __init__(self, lang):
        self.lang = lang
        self.lang_to_template = {
            "eng": "Given the following query and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. Think step by step before answering.\n\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
            "deu": "Gib basierend auf der folgenden Frage und den Antwortmöglichkeiten den Buchstaben aus, der der richtigen Antwort entspricht. Die letzte Zeile deiner Antwort sollte folgendes Format haben: 'Antwort: $BUCHSTABE' (ohne Anführungszeichen), wobei BUCHSTABE einer der folgenden ist: A, B, C oder D. Denke Schritt für Schritt, bevor du antwortest.\n\n###\nFrage:\n{Question}\n###\nAntwortmöglichkeiten:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
            "fra": "A partir de la question et des choix de réponses suivants, indiquez la lettre correspondant à la bonne réponse. La dernière ligne de votre réponse doit avoir le format suivant : 'Réponse: $LETTRE' (sans les guillemets) où LETTRE est l'une des lettres: A, B, C ou D. Réfléchissez étape par étape avant de répondre.\n\n###\nRequête:\n{Question}\n###\nChoix:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
            "ita": "Dato il seguente quesito e le diverse opzioni per una risposta, indicare la lettera corrispondente alla risposta corretta. L'ultima riga della risposta deve avere il seguente formato: 'Risposta: $LETTERA' (senza virgolette), e LETTERA è necessariamente una tra A, B, C, D. Prima di rispondere, è importante che si ragioni passo per passo.\n\n###\nQuesito:\n{Question}\n###\nOpzioni:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
            "por": "Tendo em conta a seguinte pergunta e opções de resposta, indique a letra correspondente à resposta correta. A última linha da sua resposta deve ter o seguinte formato: 'Resposta: $LETRA' (sem aspas) em que LETRA é uma de A, B, C ou D. Pense passo a passo antes de responder.\n\n###\nPergunta:\n{Question}\n###\nOpções:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
            "spa": "Dada la siguiente pregunta y opciones para la respuesta, escriba la letra correspondiente a la respuesta correcta. La última línea de su respuesta debe seguir el siguiente formato: 'Respuesta: $LETTER' (sin comillas) donde LETTER es A, B, C o D. Piense paso a paso antes de responder.\n\n###\nPregunta:\n{Question}\n###\nOpciones:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
        }

    def prompt(self, line, task_name: str = None):
        gold_index = LETTER_INDICES.index(line["answer"])
        choices = [line["option_a"], line["option_b"], line["option_c"], line["option_d"]]
        # `lang` is a Language enum; look the template up by its value and fall back
        # to the English template for languages without a dedicated template.
        query_template = self.lang_to_template.get(self.lang.value, self.lang_to_template["eng"])
        query = query_template.format(
            A=choices[0],
            B=choices[1],
            C=choices[2],
            D=choices[3],
            Question=line["question"],
        )
        instruction = query_template.split("\n\n###")[0]

        return Doc(
            task_name=task_name,
            query=query,
            choices=LETTER_INDICES[: len(choices)],
            gold_index=gold_index,
            instruction=instruction,
        )
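
# Illustrative sketch (not part of the commit): the class is instantiated once per
# language so the bound `prompt` method can be passed to LightevalTaskConfig as a
# plain prompt_function. Field values are placeholders; column names follow the
# Global-MMLU rows used above.
#
#   prompt_fn = GlobalMMLUPrompt(Language.GERMAN).prompt
#   doc = prompt_fn(
#       {"question": "<q>", "option_a": "<a>", "option_b": "<b>",
#        "option_c": "<c>", "option_d": "<d>", "answer": "B"},
#       task_name="global_mmlu_instruct_all_deu:anatomy",
#   )
#   # doc.gold_index == 1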

global_mmlu_tasks = [
    LightevalTaskConfig(
        name=f"global_mmlu_instruct_{sensitivity_label.lower()}_{language.value}:{subset}",
        prompt_function=GlobalMMLUPrompt(language).prompt,
        suite=("extended",),
        hf_repo="CohereForAI/Global-MMLU",
        hf_subset=standardize_tag(language.value),
        evaluation_splits=("test",),
        few_shots_split="dev",
        hf_filter=partial(
            lambda subset, sensitivity_label, x: x["subject"].lower() == subset
            and (
                sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
            )
            and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"),
            subset,
            sensitivity_label,
        ),
        metric=SampleLevelMetric(
            metric_name="pass@1:1_samples",
            sample_level_fn=PassAtK(
                k=1,
                n=1,
                # `language=language` binds the loop variable at definition time so
                # each task scores with its own language (avoids late binding in the
                # comprehension).
                sample_scoring_function=lambda pred, ref, doc, language=language: multilingual_extractive_match_metric(
                    language=language,
                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                    precision=6,
                ).sample_level_fn([ref], [pred], doc),
            ).compute,
            category=MetricCategory.GENERATIVE_SAMPLING,
            use_case=MetricUseCase.REASONING,
            corpus_level_fn=np.mean,
            higher_is_better=True,
        ),
        generation_size=32768,  # needed for reasoning models like R1
        stop_sequence=[],  # no stop sequence, will use eos token
    )
    for subset in MMLU_SUBSETS
    for language in [
        Language.GERMAN,
        Language.ENGLISH,
        Language.SPANISH,
        Language.FRENCH,
        Language.HEBREW,
        Language.HINDI,
        Language.INDONESIAN,
        Language.ITALIAN,
        Language.JAPANESE,
        Language.KOREAN,
        Language.MALAY,
        Language.DUTCH,
        Language.NORWEGIAN,
        Language.POLISH,
        Language.PORTUGUESE,
        Language.ROMANIAN,
        Language.RUSSIAN,
        Language.SERBIAN,
        Language.SWEDISH,
        Language.SWAHILI,
        Language.TAMIL,
        Language.TELUGU,
        Language.THAI,
        Language.TURKISH,
        Language.UKRAINIAN,
        Language.URDU,
        Language.VIETNAMESE,
        Language.YORUBA,
        Language.ZULU,
    ]
    for sensitivity_label in ["ALL", "CA", "CS", "UNK"]
]
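
# The comprehension above materialises one task config per (subset, language,
# sensitivity label) combination: 57 MMLU subsets x 29 languages x 4 labels
# (ALL, CA, CS, UNK) = 6,612 configs.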


def mmlu_pro(line, task_name: str = None):
    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {', '.join(LETTER_INDICES[: len(line['choices']) - 1])}, or {LETTER_INDICES[len(line['choices']) - 1]}. Think step by step before answering.\n\n"
    query = f"{instruction}###\nQuery:\n{line['question']}\n###\nChoices:\n"
    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["choices"])])

    return Doc(
        task_name=task_name,
        query=query,
        choices=LETTER_INDICES[: len(line["choices"])],
        gold_index=line["answer_index"],
        instruction=instruction,
    )
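
# Example (illustrative): for an MMLU-Pro row with 10 answer options, the
# instruction above lists "A, B, C, D, E, F, G, H, I, or J" as the admissible
# letters, and `choices` becomes LETTER_INDICES[:10].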

mmlu_pro = LightevalTaskConfig(
    name="mmlu_pro",
    suite=["lighteval"],
    prompt_function=mmlu_pro,
    hf_repo="TIGER-Lab/MMLU-Pro",
    hf_subset="default",
    hf_avail_splits=["validation", "test"],
    evaluation_splits=["test"],
    few_shots_split="validation",
    few_shots_select=None,
    generation_size=32768,  # needed for reasoning models like R1
    stop_sequence=[],  # no stop sequence, will use eos token
    metric=SampleLevelMetric(
        metric_name="pass@1:1_samples",
        sample_level_fn=PassAtK(
            k=1,
            n=1,
            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
                language=Language.ENGLISH,
                gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
                precision=6,
            ).sample_level_fn([ref], [pred], doc),
        ).compute,
        category=MetricCategory.GENERATIVE_SAMPLING,
        use_case=MetricUseCase.REASONING,
        corpus_level_fn=np.mean,
        higher_is_better=True,
    ),
    trust_dataset=True,
    version=0,
)
