47 changes: 47 additions & 0 deletions examples/tasks/instruct_multilingual.txt
@@ -0,0 +1,47 @@
extended|belebele_instruct_deu_Latn|0|0
extended|belebele_instruct_fra_Latn|0|0
extended|belebele_instruct_ita_Latn|0|0
extended|belebele_instruct_por_Latn|0|0
extended|belebele_instruct_spa_Latn|0|0
extended|global_mmlu_instruct_amh|0|0
extended|global_mmlu_instruct_ara|0|0
extended|global_mmlu_instruct_ben|0|0
extended|global_mmlu_instruct_ces|0|0
extended|global_mmlu_instruct_deu|0|0
extended|global_mmlu_instruct_ell|0|0
extended|global_mmlu_instruct_eng|0|0
extended|global_mmlu_instruct_spa|0|0
extended|global_mmlu_instruct_fas|0|0
extended|global_mmlu_instruct_fra|0|0
extended|global_mmlu_instruct_hau|0|0
extended|global_mmlu_instruct_heb|0|0
extended|global_mmlu_instruct_hin|0|0
extended|global_mmlu_instruct_ind|0|0
extended|global_mmlu_instruct_ibo|0|0
extended|global_mmlu_instruct_ita|0|0
extended|global_mmlu_instruct_jpn|0|0
extended|global_mmlu_instruct_kor|0|0
extended|global_mmlu_instruct_kir|0|0
extended|global_mmlu_instruct_lit|0|0
extended|global_mmlu_instruct_mlg|0|0
extended|global_mmlu_instruct_msa|0|0
extended|global_mmlu_instruct_nep|0|0
extended|global_mmlu_instruct_nld|0|0
extended|global_mmlu_instruct_nor|0|0
extended|global_mmlu_instruct_pol|0|0
extended|global_mmlu_instruct_por|0|0
extended|global_mmlu_instruct_ron|0|0
extended|global_mmlu_instruct_rus|0|0
extended|global_mmlu_instruct_sin|0|0
extended|global_mmlu_instruct_sna|0|0
extended|global_mmlu_instruct_som|0|0
extended|global_mmlu_instruct_srp|0|0
extended|global_mmlu_instruct_swe|0|0
extended|global_mmlu_instruct_swa|0|0
extended|global_mmlu_instruct_tel|0|0
extended|global_mmlu_instruct_tur|0|0
extended|global_mmlu_instruct_ukr|0|0
extended|global_mmlu_instruct_vie|0|0
extended|global_mmlu_instruct_yor|0|0
extended|global_mmlu_instruct_zho|0|0
extended|mmlu_pro|0|0
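
Each line above uses lighteval's pipe-separated task specification: suite, task name, number of few-shot examples, and a 0/1 flag for automatically truncating few-shot examples when the prompt gets too long. A minimal parsing sketch, assuming that field layout (plain Python, no lighteval dependency; the helper name is illustrative):

# Minimal sketch, assuming the fields are suite|task|num_fewshot|truncate_fewshots.
from pathlib import Path

def parse_task_list(path: str) -> list[dict]:
    tasks = []
    for raw in Path(path).read_text().splitlines():
        line = raw.strip()
        if not line:
            continue  # skip blank lines
        suite, task, num_fewshot, truncate = line.split("|")
        tasks.append({
            "suite": suite,
            "task": task,
            "num_fewshot": int(num_fewshot),
            "truncate_fewshots": bool(int(truncate)),
        })
    return tasks

tasks = parse_task_list("examples/tasks/instruct_multilingual.txt")
print(len(tasks))  # 47: 5 Belebele + 41 Global-MMLU + 1 MMLU-Pro, all in the extended suite, 0-shot
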
8 changes: 4 additions & 4 deletions src/lighteval/tasks/extended/__init__.py
@@ -20,9 +20,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import lighteval.tasks.extended.misc.instruct as instruct
from lighteval.utils.imports import can_load_extended_tasks


AVAILABLE_EXTENDED_TASKS_MODULES = [instruct]

if can_load_extended_tasks():
import lighteval.tasks.extended.hle.main as hle
import lighteval.tasks.extended.ifeval.main as ifeval
@@ -32,7 +35,4 @@
import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
-
-else:
-    AVAILABLE_EXTENDED_TASKS_MODULES = []
+    AVAILABLE_EXTENDED_TASKS_MODULES.extend([ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb])
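
The effect of this change is that the new instruct module is always registered, while the dependency-gated modules (ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb) are appended only when can_load_extended_tasks() reports their extra dependencies as importable. A rough sketch of how the list is typically consumed downstream (the collecting function below is illustrative, not part of this diff):

from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES

def collect_extended_task_configs() -> dict:
    # Illustrative only: gather every module's TASKS_TABLE into one name-indexed dict.
    configs = {}
    for module in AVAILABLE_EXTENDED_TASKS_MODULES:
        for task_config in module.TASKS_TABLE:
            configs[f"extended|{task_config.name}"] = task_config
    return configs

configs = collect_extended_task_configs()
# The 47 instruct tasks are always present; the other modules add more
# when their optional dependencies are installed.
print("extended|mmlu_pro" in configs)
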
293 changes: 293 additions & 0 deletions src/lighteval/tasks/extended/misc/instruct.py
@@ -0,0 +1,293 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import numpy as np

from lighteval.metrics.dynamic_metrics import (
IndicesExtractionConfig,
multilingual_extractive_match_metric,
)
from lighteval.metrics.metrics import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.metrics.metrics_sample import (
PassAtK,
)
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


TASKS_TABLE = []

lang_to_literal = {
"deu": Language.GERMAN,
"fra": Language.FRENCH,
"ita": Language.ITALIAN,
"por": Language.PORTUGUESE,
"spa": Language.SPANISH,
}


def belebele_prompt(line, task_name: str = None):
lang_to_template = {
"eng_Latn": "Given the following passage, query, and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. Think step by step before answering.\n\n###\nPassage:\n{Passage}\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"deu_Latn": "Gib basierend auf dem folgenden Textabschnitt, der Frage und den Antwortmöglichkeiten den Buchstaben aus, der der richtigen Antwort entspricht. Die letzte Zeile deiner Antwort sollte folgendes Format haben: 'Antwort: $BUCHSTABE' (ohne Anführungszeichen), wobei BUCHSTABE einer der folgenden ist: A, B, C oder D. Denke Schritt für Schritt, bevor du antwortest.\n\n###\nTextabschnitt:\n{Passage}\n###\nFrage:\n{Question}\n###\nAntwortmöglichkeiten:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"fra_Latn": "A partir du passage suivant, de la question et des choix de réponses, indiquez la lettre correspondant à la bonne réponse. La dernière ligne de votre réponse doit avoir le format suivant : 'Réponse: '$LETTRE' (sans les guillemets) où LETTRE est l'une des lettres: A, B, C ou D. Réfléchissez étape par étape avant de répondre.\n\n###\nPassage:\n{Passage}\n###\nRequête:\n{Question}\n###\nChoix:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"ita_Latn": "Dato il seguente passaggio, un quesito e le diverse opzioni per una risposta, indicare la lettera corrispondente alla risposta corretta. L'ultima riga della risposta deve avere il seguente formato: 'Risposta: $LETTERA' (senza virgolette), e LETTERA è necessariamente una tra A, B, C, D. Prima di rispondere, è importante che si ragioni passo per passo.\n\n###\nPassaggio:\n{Passage}\n###\nQuesito:\n{Question}\n###\nOpzioni:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"por_Latn": "Tendo em conta a seguinte passagem, pergunta e opções de resposta, indique a letra correspondente à resposta correta. A última linha da sua resposta deve ter o seguinte formato: 'Resposta: $LETRA' (sem aspas) em que LETRA é uma de A, B, C ou D. Pense passo a passo antes de responder.\n\n###\nPassagem:\n{Passage}\n###\nPergunta:\n{Question}\n###\nOpções:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"spa_Latn": "Dado el siguiente contexto, pregunta y opciones para la respuesta, escriba la letra correspondiente a la respuesta correcta. La última línea de su respuesta debe seguir el siguiente formato: 'Respuesta: $LETTER' (sin comillas) donde LETTER es A, B, C o D. Piense paso a paso antes de responder.\n\n###\nContexto:\n{Passage}\n###\nPregunta:\n{Question}\n###\nOpciones:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
}

gold_index = int(line["correct_answer_num"]) - 1
choices = [line["mc_answer1"], line["mc_answer2"], line["mc_answer3"], line["mc_answer4"]]
query_template = lang_to_template.get(line["dialect"], lang_to_template["eng_Latn"])  # fall back to the English template, not the key
query = query_template.format(
A=choices[0],
B=choices[1],
C=choices[2],
D=choices[3],
Passage=line["flores_passage"],
Question=line["question"],
)
instruction = query_template.split("\n\n###")[0]

return Doc(
task_name=task_name,
query=query,
choices=LETTER_INDICES[: len(choices)],
gold_index=gold_index,
instruction=instruction,
)
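
For reference, a minimal usage sketch of the prompt function above. The row is fabricated purely for illustration (real rows come from facebook/belebele and carry these same fields):

from lighteval.tasks.extended.misc.instruct import belebele_prompt

# Fabricated example row with the fields belebele_prompt reads; not real dataset content.
row = {
    "dialect": "fra_Latn",
    "flores_passage": "Le chat dort sur le canapé.",
    "question": "Où dort le chat ?",
    "mc_answer1": "Sur le canapé",
    "mc_answer2": "Dans le jardin",
    "mc_answer3": "Sous la table",
    "mc_answer4": "Dans la voiture",
    "correct_answer_num": "1",
}

doc = belebele_prompt(row, task_name="extended|belebele_instruct_fra_Latn")
print(doc.choices)      # ['A', 'B', 'C', 'D'] -- letter labels, not the answer strings
print(doc.gold_index)   # 0
print(doc.instruction)  # the French instruction block before the first "\n\n###"
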


BELEBELE_TASKS = [
LightevalTaskConfig(
name=f"belebele_instruct_{lang}_Latn",
Review comment (Member):
Maybe we should call this belebele_instruct_5_{lang}_Latn or belebele_instruct_smollm_{lang}_Latn to distinguish it from the general case with more languages?
The alternative would be to have a separate belebele_instruct_en_{lang}_{script} for the full set of languages, but with English instructions.

Reply (clefourrier, Member, Author, Jun 26, 2025):
Will add the latter, and have `belebele_native_inst_{lang}` vs `belebele_en_inst_{lang}` :)
prompt_function=belebele_prompt,
suite=["extended"],
hf_repo="facebook/belebele",
hf_subset=f"{lang}_Latn",
evaluation_splits=["test"],
hf_avail_splits=["test"],
few_shots_split=None,
few_shots_select=None,
generation_size=32768, # needed for reasoning models like R1
metric=[
SampleLevelMetric(
metric_name="pass@1:1_samples",
sample_level_fn=PassAtK(
k=1,
n=1,
# Bind lang with a default argument so each task scores in its own language; without it,
# every lambda in this comprehension would late-bind to the last loop value ("spa").
sample_scoring_function=lambda pred, ref, doc, lang=lang: multilingual_extractive_match_metric(
language=lang_to_literal[lang],
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
).sample_level_fn([ref], [pred], doc),
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)
],
stop_sequence=[], # no stop sequence, will use eos token
trust_dataset=True,
version=1,
)
for lang in [
"deu",
"fra",
"ita",
"por",
"spa",
]
]
TASKS_TABLE.extend(BELEBELE_TASKS)
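
A note on the metric: with k=1 and n=1, the pass@1 score of a single generation is simply 1 if the extracted letter matches the gold letter and 0 otherwise, so the corpus mean reduces to plain accuracy over one sampled completion. For the general case, pass@k is usually computed with the unbiased estimator over n generations of which c are correct (this is the standard formulation; whether PassAtK uses exactly this estimator is not shown in this diff):

\mathrm{pass@}k \;=\; \mathbb{E}_{\text{problems}}\!\left[\, 1 - \frac{\binom{n-c}{k}}{\binom{n}{k}} \,\right]

With n = k = 1 the bracketed term is simply c, i.e. 0 or 1 per problem.
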


class GlobalMMLUPrompt:
def __init__(self, lang):
self.lang = lang
self.lang_to_template = {
"eng": "Given the following query and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. Think step by step before answering.\n\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"deu": "Gib basierend auf der folgenden Frage und den Antwortmöglichkeiten den Buchstaben aus, der der richtigen Antwort entspricht. Die letzte Zeile deiner Antwort sollte folgendes Format haben: 'Antwort: $BUCHSTABE' (ohne Anführungszeichen), wobei BUCHSTABE einer der folgenden ist: A, B, C oder D. Denke Schritt für Schritt, bevor du antwortest.\n\n###\nFrage:\n{Question}\n###\nAntwortmöglichkeiten:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"fra": "A partir de la question et des choix de réponses suivants, indiquez la lettre correspondant à la bonne réponse. La dernière ligne de votre réponse doit avoir le format suivant : 'Réponse: '$LETTRE' (sans les guillemets) où LETTRE est l'une des lettres: A, B, C ou D. Réfléchissez étape par étape avant de répondre.\n\n###\nRequête:\n{Question}\n###\nChoix:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"ita": "Dato il seguente quesito e le diverse opzioni per una risposta, indicare la lettera corrispondente alla risposta corretta. L'ultima riga della risposta deve avere il seguente formato: 'Risposta: $LETTERA' (senza virgolette), e LETTERA è necessariamente una tra A, B, C, D. Prima di rispondere, è importante che si ragioni passo per passo.\n\n###\nQuesito:\n{Question}\n###\nOpzioni:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"por": "Tendo em conta a seguinte pergunta e opções de resposta, indique a letra correspondente à resposta correta. A última linha da sua resposta deve ter o seguinte formato: 'Resposta: $LETRA' (sem aspas) em que LETRA é uma de A, B, C ou D. Pense passo a passo antes de responder.\n\n###\nPergunta:\n{Question}\n###\nOpções:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
"spa": "Dado el siguiente pregunta y opciones para la respuesta, escriba la letra correspondiente a la respuesta correcta. La última línea de su respuesta debe seguir el siguiente formato: 'Respuesta: $LETTER' (sin comillas) donde LETTER es A, B, C o D. Piense paso a paso antes de responder.\n\\###\nPregunta:\n{Question}\n###\nOpciones:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
}

def prompt(self, line, task_name: str = None):
gold_index = LETTER_INDICES.index(line["answer"])
choices = [line["option_a"], line["option_b"], line["option_c"], line["option_d"]]
query_template = self.lang_to_template.get(self.lang, self.lang_to_template["eng"])  # fall back to the English template, not the key
query = query_template.format(
A=choices[0],
B=choices[1],
C=choices[2],
D=choices[3],
Question=line["question"],
)
instruction = query_template.split("\n\n###")[0]

return Doc(
task_name=task_name,
query=query,
choices=LETTER_INDICES[: len(choices)],
gold_index=gold_index,
instruction=instruction,
)
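
Unlike the Belebele block, the prompt here is bound through a small class: each task gets its own GlobalMMLUPrompt instance with the language fixed in __init__, so the prompt callable does not depend on the loop variable at call time. A minimal usage sketch with a fabricated row (real rows come from CohereForAI/Global-MMLU and carry these fields):

from lighteval.tasks.extended.misc.instruct import GlobalMMLUPrompt

# Fabricated example row with the fields GlobalMMLUPrompt.prompt reads.
row = {
    "question": "Quelle est la capitale de la France ?",
    "option_a": "Paris",
    "option_b": "Lyon",
    "option_c": "Marseille",
    "option_d": "Lille",
    "answer": "A",
}

prompt_fn = GlobalMMLUPrompt("fra").prompt  # one instance, and thus one template, per language
doc = prompt_fn(row, task_name="extended|global_mmlu_instruct_fra")
print(doc.gold_index)   # 0, from LETTER_INDICES.index("A")
print(doc.instruction)  # the French instruction block before the first "\n\n###"
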


GLOBAL_MMLU_TASKS = [
LightevalTaskConfig(
name=f"global_mmlu_instruct_{language.value}",
prompt_function=GlobalMMLUPrompt(language.value).prompt,
suite=["extended"],
hf_repo="CohereForAI/Global-MMLU",
hf_subset=lang,
evaluation_splits=("test",),
few_shots_split="dev",
metric=[
SampleLevelMetric(
metric_name="pass@1:1_samples",
sample_level_fn=PassAtK(
k=1,
n=1,
# Bind language with a default argument so each task scores in its own language; without
# it, every lambda in this comprehension would late-bind to the last loop value.
sample_scoring_function=lambda pred, ref, doc, language=language: multilingual_extractive_match_metric(
language=language,
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
).sample_level_fn([ref], [pred], doc),
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)
],
generation_size=32768, # needed for reasoning models like R1
stop_sequence=[], # no stop sequence, will use eos token
)
for lang, language in [
("am", Language.AMHARIC),
("ar", Language.ARABIC),
("bn", Language.BENGALI),
("cs", Language.CZECH),
("de", Language.GERMAN),
("el", Language.GREEK),
("en", Language.ENGLISH),
("es", Language.SPANISH),
("fa", Language.PERSIAN),
# ("fil", Language.FILIPINO),
("fr", Language.FRENCH),
("ha", Language.HAUSA),
("he", Language.HEBREW),
("hi", Language.HINDI),
("id", Language.INDONESIAN),
("ig", Language.IGBO),
("it", Language.ITALIAN),
("ja", Language.JAPANESE),
("ko", Language.KOREAN),
("ky", Language.KYRGYZ),
("lt", Language.LITHUANIAN),
("mg", Language.MALAGASY),
("ms", Language.MALAY),
("ne", Language.NEPALI),
("nl", Language.DUTCH),
("ny", Language.NORWEGIAN),
("pl", Language.POLISH),
("pt", Language.PORTUGUESE),
("ro", Language.ROMANIAN),
("ru", Language.RUSSIAN),
("si", Language.SINHALA),
("sn", Language.SHONA),
("so", Language.SOMALI),
("sr", Language.SERBIAN),
("sv", Language.SWEDISH),
("sw", Language.SWAHILI),
("te", Language.TELUGU),
("tr", Language.TURKISH),
("uk", Language.UKRAINIAN),
("vi", Language.VIETNAMESE),
("yo", Language.YORUBA),
("zh", Language.CHINESE),
]
]
TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)


def mmlu_pro(line, task_name: str = None):
num_choices = len(line["options"])
instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {' ,'.join(LETTER_INDICES[: num_choices - 1])}, or {LETTER_INDICES[num_choices]}. Think step by step before answering.\n\n"
query = f"{instruction}###\nQuery:\n{line['question']}\n###\nChoices:"
query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"])])

return Doc(
task_name=task_name,
query=query,
choices=LETTER_INDICES[:num_choices],
gold_index=line["answer_index"],
instruction=instruction,
)
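
MMLU-Pro rows can carry up to ten answer options, so the instruction enumerates the valid letters dynamically from LETTER_INDICES instead of hard-coding A-D. A usage sketch with a fabricated six-option row (this assumes the mmlu_pro prompt builder above is in scope; note that the task config below reuses the same name):

# Fabricated example row with the fields the prompt builder reads; not real dataset content.
row = {
    "category": "physics",
    "question": "Which quantity is conserved in a perfectly elastic collision?",
    "options": [
        "Charge only",
        "Momentum only",
        "Kinetic energy only",
        "Both momentum and kinetic energy",
        "Mass only",
        "Temperature",
    ],
    "answer_index": 3,
}

doc = mmlu_pro(row, task_name="extended|mmlu_pro")
print(doc.choices)     # ['A', 'B', 'C', 'D', 'E', 'F']
print(doc.gold_index)  # 3
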


mmlu_pro = LightevalTaskConfig(
name="mmlu_pro",
suite=["extended"],
prompt_function=mmlu_pro,
hf_repo="TIGER-Lab/MMLU-Pro",
hf_subset="default",
hf_avail_splits=["validation", "test"],
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select=None,
generation_size=32768, # needed for reasoning models like R1
stop_sequence=[], # no stop sequence, will use eos token
metric=[
SampleLevelMetric(
metric_name="pass@1:1_samples",
sample_level_fn=PassAtK(
k=1,
n=1,
sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
language=Language.ENGLISH,
gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
precision=6,
).sample_level_fn([ref], [pred], doc),
).compute,
category=MetricCategory.GENERATIVE_SAMPLING,
use_case=MetricUseCase.REASONING,
corpus_level_fn=np.mean,
higher_is_better=True,
)
],
trust_dataset=True,
version=0,
)

TASKS_TABLE.append(mmlu_pro)
if __name__ == "__main__":
    # Only dump the task table when this module is run directly, not on every import.
    print(TASKS_TABLE)