@@ -125,6 +125,7 @@ def belebele_prompt(line, task_name: str = None):
         "spa",
     ]
 ]
+TASKS_TABLE.extend(BELEBELE_TASKS)


 MMLU_SUBSETS = [
@@ -225,8 +226,8 @@ def prompt(self, line, task_name: str = None):
 GLOBAL_MMLU_TASKS = [
     LightevalTaskConfig(
         name=f"global_mmlu_instruct_{sensitivity_label.lower()}_{language.value}:{subset}",
-        prompt_function=GlobalMMLUPrompt(language).prompt,
-        suite=("extended"),
+        prompt_function=GlobalMMLUPrompt(language.value).prompt,
+        suite=["extended"],
         hf_repo="CohereForAI/Global-MMLU",
         hf_subset=standardize_tag(language.value),
         evaluation_splits=("test",),
@@ -240,23 +241,25 @@ def prompt(self, line, task_name: str = None):
             subset,
             sensitivity_label,
         ),
-        metric=SampleLevelMetric(
-            metric_name="pass@1:1_samples",
-            sample_level_fn=PassAtK(
-                k=1,
-                n=1,
-                sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
-                    language=language,
-                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                    precision=6,
-                ).sample_level_fn([ref], [pred], doc),
-            ).compute,
-            category=MetricCategory.GENERATIVE_SAMPLING,
-            use_case=MetricUseCase.REASONING,
-            corpus_level_fn=np.mean,
-            higher_is_better=True,
-        ),
+        metric=[
+            SampleLevelMetric(
+                metric_name="pass@1:1_samples",
+                sample_level_fn=PassAtK(
+                    k=1,
+                    n=1,
+                    sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                        language=language,
+                        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        precision=6,
+                    ).sample_level_fn([ref], [pred], doc),
+                ).compute,
+                category=MetricCategory.GENERATIVE_SAMPLING,
+                use_case=MetricUseCase.REASONING,
+                corpus_level_fn=np.mean,
+                higher_is_better=True,
+            )
+        ],
         generation_size=32768,  # needed for reasoning models like R1
         stop_sequence=[],  # no stop sequence, will use eos token
     )
@@ -266,45 +269,47 @@ def prompt(self, line, task_name: str = None):
         Language.ENGLISH,
         Language.SPANISH,
         Language.FRENCH,
-        Language.HEBREW,
+        # Language.HEBREW,
         Language.HINDI,
         Language.INDONESIAN,
         Language.ITALIAN,
         Language.JAPANESE,
-        Language.KOREAN,
-        Language.MALAY,
+        # Language.KOREAN,
+        # Language.MALAY,
         Language.DUTCH,
         Language.NORWEGIAN,
         Language.POLISH,
         Language.PORTUGUESE,
-        Language.ROMANIAN,
+        # Language.ROMANIAN,
         Language.RUSSIAN,
         Language.SERBIAN,
         Language.SWEDISH,
         Language.SWAHILI,
-        Language.TAMIL,
+        # Language.TAMIL,
         Language.TELUGU,
         Language.THAI,
         Language.TURKISH,
         Language.UKRAINIAN,
         Language.URDU,
         Language.VIETNAMESE,
-        Language.YORUBA,
-        Language.ZULU,
+        # Language.YORUBA,
+        # Language.ZULU, missing literals
     ]
     for sensitivity_label in ["ALL", "CA", "CS", "UNK"]
 ]
+TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)


 def mmlu_pro(line, task_name: str = None):
-    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {', '.join(LETTER_INDICES[: len(line['choices'] - 1)])}, or {LETTER_INDICES[len(line['choices'])]}. Think step by step before answering.\n\n"
+    num_choices = len(line["options"])
+    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {', '.join(LETTER_INDICES[: num_choices - 1])}, or {LETTER_INDICES[num_choices]}. Think step by step before answering.\n\n"
     query = f"{instruction}###\nQuery:\n{line['question']}\n###\nChoices:\n"
-    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["choices"])])
+    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"])])

     return Doc(
         task_name=task_name,
         query=query,
-        choices=LETTER_INDICES[: len(line["choices"])],
+        choices=LETTER_INDICES[:num_choices],
         gold_index=line["answer_index"],
         instruction=instruction,
     )
@@ -322,27 +327,27 @@ def mmlu_pro(line, task_name: str = None):
     few_shots_select=None,
     generation_size=32768,  # needed for reasoning models like R1
     stop_sequence=[],  # no stop sequence, will use eos token
-    metric=SampleLevelMetric(
-        metric_name="pass@1:1_samples",
-        sample_level_fn=PassAtK(
-            k=1,
-            n=1,
-            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
-                language=Language.ENGLISH,
-                gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                precision=6,
-            ).sample_level_fn([ref], [pred], doc),
-        ).compute,
-        category=MetricCategory.GENERATIVE_SAMPLING,
-        use_case=MetricUseCase.REASONING,
-        corpus_level_fn=np.mean,
-        higher_is_better=True,
-    ),
+    metric=[
+        SampleLevelMetric(
+            metric_name="pass@1:1_samples",
+            sample_level_fn=PassAtK(
+                k=1,
+                n=1,
+                sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                    language=Language.ENGLISH,
+                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                    precision=6,
+                ).sample_level_fn([ref], [pred], doc),
+            ).compute,
+            category=MetricCategory.GENERATIVE_SAMPLING,
+            use_case=MetricUseCase.REASONING,
+            corpus_level_fn=np.mean,
+            higher_is_better=True,
+        )
+    ],
     trust_dataset=True,
     version=0,
 )

-TASKS_TABLE.extend(BELEBELE_TASKS)
-TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)
 TASKS_TABLE.append(mmlu_pro)
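
Illustrative only, not part of the diff: a minimal sketch of what the rewritten mmlu_pro() prompt builds for a toy MMLU-Pro-style row. The field names "category", "options", "question", and "answer_index" come from the code above; the sample values and the LETTER_INDICES definition here are assumptions made for the sketch.

LETTER_INDICES = [chr(ord("A") + i) for i in range(26)]  # assumed to mirror the module-level constant

line = {
    "category": "physics",
    "question": "Which quantity is conserved in an elastic collision?",
    "options": ["Kinetic energy", "Temperature", "Entropy", "Charge density"],
    "answer_index": 0,
}

num_choices = len(line["options"])        # 4
choices = LETTER_INDICES[:num_choices]    # ["A", "B", "C", "D"] -> becomes Doc.choices
query_choices = "".join(f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"]))
# query_choices == "\nA) Kinetic energy\nB) Temperature\nC) Entropy\nD) Charge density"
# Doc.gold_index is taken directly from line["answer_index"], i.e. 0 -> "A".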