 from typing import Any, Dict, List, Optional, Union
 
 from lighteval.metrics.llm_as_judge import JudgeLM
-from lighteval.metrics.metrics import Metric, MetricCategory, Metrics
-from lighteval.metrics.utils.metric_utils import MetricUseCase
+from lighteval.metrics.metrics import Metric, Metrics
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
+from lighteval.tasks.requests import Doc, SamplingMethod
 
 
 # fmt: off
@@ -104,7 +103,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_pfn,
             hf_repo="MBZUAI/ArabicMMLU",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
@@ -166,7 +165,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_ht_pfn,
             hf_repo="MBZUAI/human_translated_arabic_mmlu",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=None,
@@ -231,7 +230,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=arabic_mmlu_mt_pfn,
             hf_repo="OALL/Arabic_MMLU",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split="dev",
@@ -287,7 +286,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=acva_pfn,
             hf_repo="OALL/ACVA",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -344,7 +343,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=aratrust_pfn,
             hf_repo="asas-ai/AraTrust-categorized",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["train"],
             evaluation_splits=["train"],
             few_shots_split=None,
@@ -393,7 +392,7 @@ def arabic_exams_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -444,7 +443,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=alghafa_pfn,
             hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test", "validation"],
             evaluation_splits=["test"],
             few_shots_split="validation",
@@ -471,7 +470,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -488,7 +487,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -505,7 +504,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -522,7 +521,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -539,7 +538,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -556,7 +555,7 @@ def __init__(
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -594,7 +593,7 @@ def boolq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -629,7 +628,7 @@ def copa_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -673,7 +672,7 @@ def hellaswag_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -710,7 +709,7 @@ def toxigen_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -761,7 +760,7 @@ def sciq_arabic_pfn(line, task_name: str = None):
     evaluation_splits=["test"],
     few_shots_split="validation",
     few_shots_select="sequential",
-    metric=[Metrics.loglikelihood_acc_norm],
+    metrics=[Metrics.loglikelihood_acc_norm],
     trust_dataset=True,
     version=0,
 )
@@ -819,7 +818,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=madinah_qa_pfn,
             hf_repo="MBZUAI/MadinahQA",
-            metric=[Metrics.loglikelihood_acc_norm],
+            metrics=[Metrics.loglikelihood_acc_norm],
             hf_avail_splits=["test"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
@@ -849,11 +848,10 @@ def __init__(self, judge: JudgeLM):
         """
         self.judge = judge
         self.metric_name = "llm_as_judge"
-        self.category = MetricCategory.LLM_AS_JUDGE
+        self.category = SamplingMethod.GENERATIVE
         self.corpus_level_fn = self.aggregate_scores
         self.sample_level_fn = self._sample_level_fn
         self.higher_is_better = True  # Fixed tuple syntax
-        self.use_case = MetricUseCase.NONE
 
     def compute(self, responses: list[str], formatted_docs: list[Doc], **kwargs) -> dict[str, float]:
         """
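Note on the hunk above: this lighteval version drops `MetricCategory` and `MetricUseCase`, so the judge wrapper now declares its category via `SamplingMethod.GENERATIVE` from `lighteval.tasks.requests`. A minimal sketch of the updated wrapper skeleton, built only from the attributes visible in the diff; the method bodies here are hypothetical stand-ins, not the file's real implementations:

```python
from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.tasks.requests import SamplingMethod


class JudgeMetricWrapperSketch:
    """Sketch of the wrapper as updated above; not the full class."""

    def __init__(self, judge: JudgeLM):
        self.judge = judge
        self.metric_name = "llm_as_judge"
        # MetricCategory.LLM_AS_JUDGE no longer exists in this lighteval
        # version; judge-based metrics now identify as generative sampling.
        self.category = SamplingMethod.GENERATIVE
        self.corpus_level_fn = self.aggregate_scores
        self.sample_level_fn = self._sample_level_fn
        self.higher_is_better = True

    def aggregate_scores(self, scores: list[float]) -> float:
        # Hypothetical aggregation: mean of per-sample judge scores.
        return sum(scores) / len(scores) if scores else 0.0

    def _sample_level_fn(self, *args, **kwargs) -> float:
        # Placeholder; per-sample scoring is handled by compute() in the file.
        raise NotImplementedError
```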
@@ -1039,7 +1037,7 @@ def process_judge_response(response) -> float:
     hf_subset=None,
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
-    metric=[wrapped_judge],
+    metrics=[wrapped_judge],
     trust_dataset=True,
     generation_size=200,
     stop_sequence=[],
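Context for the recurring change in this commit: `LightevalTaskConfig` renamed its `metric` keyword to `metrics`. A minimal sketch of a task definition under the new keyword; the task name, suite, subset, prompt function, and the `Doc` field names below are hypothetical illustrations under that assumption, not definitions taken from this file:

```python
from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def example_pfn(line, task_name: str = None) -> Doc:
    # Hypothetical prompt function; the real ones (arabic_mmlu_pfn,
    # acva_pfn, ...) are defined in the file this diff modifies.
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=line["choices"],
        gold_index=line["answer"],
    )


example_task = LightevalTaskConfig(
    name="example_task",  # hypothetical name
    suite=["community"],  # hypothetical suite
    prompt_function=example_pfn,
    hf_repo="MBZUAI/ArabicMMLU",  # repo taken from a hunk above
    hf_subset="example_subset",  # hypothetical subset
    metrics=[Metrics.loglikelihood_acc_norm],  # was: metric=[...]
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
)
```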