@@ -490,12 +490,21 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
490490 hlog_warn (f"{ task_name } , { metric_name } got an OVERFLOW ERROR when computing stderr." )
491491
492492 # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them
493+ # and compute an average of all metrics
493494 grouped_tasks = collections .defaultdict (list )
494- for k in self .metric_aggregated .keys ():
495+ suite_average = {}
496+ suite_nb = {}
497+
498+ # Build aggregation
499+ for k , metrics in self .metric_aggregated .items ():
495500 if "|" in k :
496501 suite , task , fewshot = k .split ("|" )
497502 grouped_tasks [f"{ suite } |{ task .split (':' )[0 ]} :_average|{ fewshot } " ].append (k )
503+ for metric , value in metrics .items ():
504+ suite_average [metric ] = suite_average .get (metric , 0 ) + value
505+ suite_nb [metric ] = suite_nb .get (metric , 0 ) + 1
498506
507+ # Compute average for sub groups
499508 for average_task , list_of_subtasks in grouped_tasks .items ():
500509 if len (list_of_subtasks ) > 1 :
501510 metrics = list (self .metric_aggregated [list_of_subtasks [0 ]].keys ())
@@ -504,6 +513,12 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
504513 for metric in metrics
505514 }
506515
516+ # Compute average for all
517+ for metric , value in suite_average .items ():
518+ suite_average [metric ] = value / suite_nb [metric ]
519+
520+ self .metric_aggregated ["all" ] = suite_average
521+
507522
508523class VersionsLogger :
509524 """Logger of the tasks versions.
0 commit comments