@@ -125,6 +125,7 @@ def belebele_prompt(line, task_name: str = None):
         "spa",
     ]
 ]
+TASKS_TABLE.extend(BELEBELE_TASKS)
 
 
 MMLU_SUBSETS = [
@@ -225,8 +226,8 @@ def prompt(self, line, task_name: str = None):
 GLOBAL_MMLU_TASKS = [
     LightevalTaskConfig(
         name=f"global_mmlu_instruct_{sensitivity_label.lower()}_{language.value}:{subset}",
-        prompt_function=GlobalMMLUPrompt(language).prompt,
-        suite=("extended"),
+        prompt_function=GlobalMMLUPrompt(language.value).prompt,
+        suite=["extended"],
         hf_repo="CohereForAI/Global-MMLU",
         hf_subset=standardize_tag(language.value),
         evaluation_splits=("test",),
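Two small corrections land in this hunk: `GlobalMMLUPrompt` is now built from `language.value` (presumably the plain language-code string rather than the `Language` enum member), and `suite` becomes an actual list. The old `("extended")` is only a parenthesized string, not a one-element tuple, so the config was silently receiving a `str` where a list of suite names is expected. A tiny, self-contained illustration of that pitfall:

# ("extended") is just a string in parentheses; only a trailing comma makes a tuple,
# and the change passes a list, which is what the task config is assumed to take.
print(type(("extended")))    # <class 'str'>
print(type(("extended",)))   # <class 'tuple'>
print(type(["extended"]))    # <class 'list'>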
@@ -240,23 +241,25 @@ def prompt(self, line, task_name: str = None):
             subset,
             sensitivity_label,
         ),
-        metric=SampleLevelMetric(
-            metric_name="pass@1:1_samples",
-            sample_level_fn=PassAtK(
-                k=1,
-                n=1,
-                sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
-                    language=language,
-                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                    precision=6,
-                ).sample_level_fn([ref], [pred], doc),
-            ).compute,
-            category=MetricCategory.GENERATIVE_SAMPLING,
-            use_case=MetricUseCase.REASONING,
-            corpus_level_fn=np.mean,
-            higher_is_better=True,
-        ),
+        metric=[
+            SampleLevelMetric(
+                metric_name="pass@1:1_samples",
+                sample_level_fn=PassAtK(
+                    k=1,
+                    n=1,
+                    sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                        language=language,
+                        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        precision=6,
+                    ).sample_level_fn([ref], [pred], doc),
+                ).compute,
+                category=MetricCategory.GENERATIVE_SAMPLING,
+                use_case=MetricUseCase.REASONING,
+                corpus_level_fn=np.mean,
+                higher_is_better=True,
+            )
+        ],
         generation_size=32768,  # needed for reasoning models like R1
         stop_sequence=[],  # no stop sequence, will use eos token
     )
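The `metric` argument is wrapped in a list here (and again for the MMLU-Pro config below), matching a config that carries a list of metrics per task rather than a single object; the pass@1 wiring itself is unchanged. As a rough, self-contained sketch of what that wiring computes (a toy stand-in, not lighteval's `PassAtK` or the extractive-match metric), with k=1 and n=1 it reduces to scoring the single sampled answer:

# Toy pass@k: score each of the first k sampled answers with a per-sample scorer
# and report whether any of them is correct. score_one stands in for the
# multilingual extractive-match metric used in the real config.
def pass_at_k(samples, gold, k, score_one):
    return float(any(score_one(pred, gold) for pred in samples[:k]))

score_letter = lambda pred, gold: pred.strip().upper() == gold

print(pass_at_k(["B"], "B", k=1, score_one=score_letter))  # 1.0 -> correct single sample
print(pass_at_k(["C"], "B", k=1, score_one=score_letter))  # 0.0 -> wrong single sample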
@@ -266,45 +269,47 @@ def prompt(self, line, task_name: str = None):
         Language.ENGLISH,
         Language.SPANISH,
         Language.FRENCH,
-        Language.HEBREW,
+        # Language.HEBREW,
         Language.HINDI,
         Language.INDONESIAN,
         Language.ITALIAN,
         Language.JAPANESE,
-        Language.KOREAN,
-        Language.MALAY,
+        # Language.KOREAN,
+        # Language.MALAY,
         Language.DUTCH,
         Language.NORWEGIAN,
         Language.POLISH,
         Language.PORTUGUESE,
-        Language.ROMANIAN,
+        # Language.ROMANIAN,
         Language.RUSSIAN,
         Language.SERBIAN,
         Language.SWEDISH,
         Language.SWAHILI,
-        Language.TAMIL,
+        # Language.TAMIL,
         Language.TELUGU,
         Language.THAI,
         Language.TURKISH,
         Language.UKRAINIAN,
         Language.URDU,
         Language.VIETNAMESE,
-        Language.YORUBA,
-        Language.ZULU,
+        # Language.YORUBA,
+        # Language.ZULU, missing literals
     ]
     for sensitivity_label in ["ALL", "CA", "CS", "UNK"]
 ]
+TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)
 
 
 def mmlu_pro(line, task_name: str = None):
-    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {', '.join(LETTER_INDICES[: len(line['choices'] - 1)])}, or {LETTER_INDICES[len(line['choices'])]}. Think step by step before answering.\n\n"
+    num_choices = len(line["options"])
+    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {', '.join(LETTER_INDICES[: num_choices - 1])}, or {LETTER_INDICES[num_choices]}. Think step by step before answering.\n\n"
     query = f"{instruction}###\nQuery:\n{line['question']}\n###\nChoices:\n"
-    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["choices"])])
+    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"])])
 
     return Doc(
         task_name=task_name,
         query=query,
-        choices=LETTER_INDICES[: len(line["choices"])],
+        choices=LETTER_INDICES[:num_choices],
         gold_index=line["answer_index"],
         instruction=instruction,
     )
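`mmlu_pro` is rewritten around the MMLU-Pro column names: answer options live under `options` with the gold position in `answer_index`, and the option count is computed once as `num_choices`. The old `len(line['choices'] - 1)` was broken in any case (it subtracts 1 from a list before calling `len`). A self-contained sketch of the rebuilt query on a made-up row, using only the field names that appear in the diff:

LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]

# Made-up MMLU-Pro-style row; only "category", "question", "options", "answer_index"
# are assumed, as used by the prompt function above.
line = {
    "category": "physics",
    "question": "What is the SI unit of force?",
    "options": ["Joule", "Newton", "Pascal", "Watt"],
    "answer_index": 1,
}

num_choices = len(line["options"])
query = f"###\nQuery:\n{line['question']}\n###\nChoices:\n"
query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"])])

print(query)                                  # lists "A) Joule" ... "D) Watt"
print(LETTER_INDICES[line["answer_index"]])   # "B" -> the gold letter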
@@ -322,27 +327,27 @@ def mmlu_pro(line, task_name: str = None):
     few_shots_select=None,
     generation_size=32768,  # needed for reasoning models like R1
     stop_sequence=[],  # no stop sequence, will use eos token
-    metric=SampleLevelMetric(
-        metric_name="pass@1:1_samples",
-        sample_level_fn=PassAtK(
-            k=1,
-            n=1,
-            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
-                language=Language.ENGLISH,
-                gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                precision=6,
-            ).sample_level_fn([ref], [pred], doc),
-        ).compute,
-        category=MetricCategory.GENERATIVE_SAMPLING,
-        use_case=MetricUseCase.REASONING,
-        corpus_level_fn=np.mean,
-        higher_is_better=True,
-    ),
+    metric=[
+        SampleLevelMetric(
+            metric_name="pass@1:1_samples",
+            sample_level_fn=PassAtK(
+                k=1,
+                n=1,
+                sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                    language=Language.ENGLISH,
+                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                    precision=6,
+                ).sample_level_fn([ref], [pred], doc),
+            ).compute,
+            category=MetricCategory.GENERATIVE_SAMPLING,
+            use_case=MetricUseCase.REASONING,
+            corpus_level_fn=np.mean,
+            higher_is_better=True,
+        )
+    ],
     trust_dataset=True,
     version=0,
 )
 
-TASKS_TABLE.extend(BELEBELE_TASKS)
-TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)
 TASKS_TABLE.append(mmlu_pro)
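Finally, registration moves next to each definition: `TASKS_TABLE.extend(BELEBELE_TASKS)` and `TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)` now sit directly after their task lists, so the block of `extend()` calls at the bottom is dropped and only the MMLU-Pro append remains. A minimal sketch of the resulting layout, with task bodies elided and assuming the usual lighteval convention that a custom-tasks module exposes its configs through a module-level `TASKS_TABLE` list (e.g. when the file is passed via `--custom-tasks`):

# Illustration of the registration pattern after this change, not the full file.
TASKS_TABLE = []

BELEBELE_TASKS = [...]                 # one LightevalTaskConfig per Belebele subset (elided)
TASKS_TABLE.extend(BELEBELE_TASKS)     # registered right where the list is defined

GLOBAL_MMLU_TASKS = [...]              # Global-MMLU comprehension (elided)
TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)

mmlu_pro = ...                         # placeholder for the MMLU-Pro task config
TASKS_TABLE.append(mmlu_pro)           # the only bottom-of-file registration left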