 )
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.multilingual.tasks import LangCodeLanguage, iso_639_3_ind_to_iso_639_3_macro
 from lighteval.tasks.requests import Doc
 from lighteval.utils.language import Language
 
 }
 
 
+def belebele_prompt_en_instruct(line, task_name: str = None):
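+    # Assign (not compare): force the English instruction template for every subset before delegating.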
+    line["dialect"] = "eng_Latn"
+    return belebele_prompt(line, task_name)
+
+
 def belebele_prompt(line, task_name: str = None):
     lang_to_template = {
         "eng_Latn": "Given the following passage, query, and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of A, B, C, or D. Think step by step before answering.\n\n###\nPassage:\n{Passage}\n###\nQuery:\n{Question}\n###\nChoices:\nA) {A}\nB) {B}\nC) {C}\nD) {D}",
@@ -60,7 +66,7 @@ def belebele_prompt(line, task_name: str = None):
 
     gold_index = int(line["correct_answer_num"]) - 1
     choices = [line["mc_answer1"], line["mc_answer2"], line["mc_answer3"], line["mc_answer4"]]
-    query_template = lang_to_template.get(line["dialect"], "eng_Latn")
+    query_template = lang_to_template[line["dialect"]]
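+    # Direct indexing fails loudly for unsupported dialects; the old .get() fallback returned the literal string "eng_Latn", not a template.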
     query = query_template.format(
         A=choices[0],
         B=choices[1],
@@ -80,9 +86,9 @@ def belebele_prompt(line, task_name: str = None):
     )
 
 
-BELEBELE_TASKS = [
+BELEBELE_TASKS_NATIVE_INSTRUCT = [
     LightevalTaskConfig(
-        name=f"belebele_instruct_{lang}_Latn",
+        name=f"belebele_native_instruct_{lang}_Latn",
         prompt_function=belebele_prompt,
         suite=["extended"],
         hf_repo="facebook/belebele",
@@ -123,7 +129,168 @@ def belebele_prompt(line, task_name: str = None):
         "spa",
     ]
 ]
-TASKS_TABLE.extend(BELEBELE_TASKS)
+
+BELEBELE_TASKS_EN_INSTRUCT = [
+    LightevalTaskConfig(
+        name=f"belebele_en_instruct_{lang}",
+        prompt_function=belebele_prompt_en_instruct,
+        suite=["extended"],
+        hf_repo="facebook/belebele",
+        hf_subset=lang,  # subset codes already carry the script, e.g. "acm_Arab"; appending "_Latn" would break non-Latin subsets
+        evaluation_splits=["test"],
+        hf_avail_splits=["test"],
+        few_shots_split=None,
+        few_shots_select=None,
+        generation_size=32768,  # needed for reasoning models like R1
+        metric=[
+            SampleLevelMetric(
+                metric_name="pass@1:1_samples",
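+                # pass@1 with a single generation per doc reduces to plain accuracy of the extracted letter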
+                sample_level_fn=PassAtK(
+                    k=1,
+                    n=1,
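+                    # lang=lang binds the comprehension variable early; a bare closure would evaluate lang
+                    # only at scoring time, so every task would see the last language in the list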
+                    sample_scoring_function=lambda pred, ref, doc, lang=lang: multilingual_extractive_match_metric(
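+                        # e.g. "acm_Arab" -> ISO 639-3 "acm" -> macrolanguage "ara", so the matcher knows the answer language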
+                        language=iso_639_3_ind_to_iso_639_3_macro[LangCodeLanguage.get(lang).to_alpha3()],
+                        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        precision=6,
+                    ).sample_level_fn([ref], [pred], doc),
+                ).compute,
+                category=MetricCategory.GENERATIVE_SAMPLING,
+                use_case=MetricUseCase.REASONING,
+                corpus_level_fn=np.mean,
+                higher_is_better=True,
+            )
+        ],
+        stop_sequence=[],  # no stop sequence, will use eos token
+        trust_dataset=True,
+        version=1,
+    )
+    for lang in [
+        "acm_Arab",
+        "arz_Arab",
+        "ceb_Latn",
+        "fin_Latn",
+        "hin_Deva",
+        "ita_Latn",
+        "khm_Khmr",
+        "lvs_Latn",
+        "npi_Deva",
+        "pol_Latn",
+        "slv_Latn",
+        "swe_Latn",
+        # "tso_Latn",
+        # "xho_Latn",
+        "afr_Latn",
+        "asm_Beng",
+        "ces_Latn",
+        "fra_Latn",
+        "hin_Latn",
+        "jav_Latn",
+        # "kin_Latn",
+        "mal_Mlym",
+        "npi_Latn",
+        "por_Latn",
+        # "sna_Latn",
+        "swh_Latn",
+        "tur_Latn",
+        "yor_Latn",
+        "als_Latn",
+        "azj_Latn",
+        "ckb_Arab",
+        # "fuv_Latn",
+        "hrv_Latn",
+        "jpn_Jpan",
+        "kir_Cyrl",
+        "mar_Deva",
+        # "nso_Latn",
+        "snd_Arab",
+        "tam_Taml",
+        "ukr_Cyrl",
+        "zho_Hans",
+        "amh_Ethi",
+        # "bam_Latn",
+        "dan_Latn",
+        # "gaz_Latn",
+        "hun_Latn",
+        # "kac_Latn",
+        "kor_Hang",
+        "mkd_Cyrl",
+        # "nya_Latn",
+        "ron_Latn",
+        "som_Latn",
+        "tel_Telu",
+        "urd_Arab",
+        "zho_Hant",
+        "apc_Arab",
+        "ben_Beng",
+        "deu_Latn",
+        # "grn_Latn",
+        "hye_Armn",
+        "kan_Knda",
+        "lao_Laoo",
+        "mlt_Latn",
+        "ory_Orya",
+        "rus_Cyrl",
+        # "sot_Latn",
+        "tgk_Cyrl",
+        "urd_Latn",
+        "zsm_Latn",
+        "arb_Arab",
+        "ben_Latn",
+        "ell_Grek",
+        "guj_Gujr",
+        # "ibo_Latn",
+        "kat_Geor",
+        # "lin_Latn",
+        # "mri_Latn",
+        "pan_Guru",
+        # "shn_Mymr",
+        "spa_Latn",
+        "tgl_Latn",
+        "uzn_Latn",
+        # "zul_Latn",
+        "arb_Latn",
+        # "bod_Tibt",
+        "eng_Latn",
+        # "hat_Latn",
+        # "ilo_Latn",
+        "kaz_Cyrl",
+        "lit_Latn",
+        "mya_Mymr",
+        "pbt_Arab",
+        "sin_Latn",
+        "srp_Cyrl",
+        "tha_Thai",
+        "vie_Latn",
+        "ars_Arab",
+        "bul_Cyrl",
+        "est_Latn",
+        # "hau_Latn",
+        "ind_Latn",
+        # "kea_Latn",
+        # "lug_Latn",
+        "nld_Latn",
+        "pes_Arab",
+        "sin_Sinh",
+        # "ssw_Latn",
+        # "tir_Ethi",
+        "war_Latn",
+        "ary_Arab",
+        "cat_Latn",
+        "eus_Latn",
+        "heb_Hebr",
+        "isl_Latn",
+        # "khk_Cyrl",
+        # "luo_Latn",
+        "nob_Latn",
+        "plt_Latn",
+        "slk_Latn",
+        # "sun_Latn",
+        # "tsn_Latn",
+        # "wol_Latn",
+    ]
+]
+TASKS_TABLE.extend(BELEBELE_TASKS_NATIVE_INSTRUCT + BELEBELE_TASKS_EN_INSTRUCT)
 
 
 class GlobalMMLUPrompt: