Skip to content

Commit b1aa626

Browse files
Fix push details to hub (#98)
---------

Co-authored-by: Clémentine Fourrier <[email protected]>
Co-authored-by: [email protected] <[email protected]>
1 parent 1187400 commit b1aa626

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

src/lighteval/logging/evaluation_tracker.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change

@@ -462,8 +462,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None:
 462 462          last_results_file_path = hf_hub_url(repo_id=repo_id, filename=last_results_file, repo_type="dataset")
 463 463          f = load_dataset("json", data_files=last_results_file_path, split="train")
 464 464          results_dict = f["results"][0]
 465     -        value = results_dict.pop("all")
 466     -        new_dictionary = {"all": value}
     465 +        new_dictionary = {"all": results_dict}
 467 466          new_dictionary.update(results_dict)
 468 467          results_string = json.dumps(new_dictionary, indent=4)
 469 468

src/lighteval/logging/info_loggers.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change

@@ -490,12 +490,21 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
 490 490          hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.")
 491 491
 492 492          # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them
     493 +        # and compute an average of all metrics
 493 494          grouped_tasks = collections.defaultdict(list)
 494     -        for k in self.metric_aggregated.keys():
     495 +        suite_average = {}
     496 +        suite_nb = {}
     497 +
     498 +        # Build aggregation
     499 +        for k, metrics in self.metric_aggregated.items():
 495 500              if "|" in k:
 496 501                  suite, task, fewshot = k.split("|")
 497 502                  grouped_tasks[f"{suite}|{task.split(':')[0]}:_average|{fewshot}"].append(k)
     503 +                for metric, value in metrics.items():
     504 +                    suite_average[metric] = suite_average.get(metric, 0) + value
     505 +                    suite_nb[metric] = suite_nb.get(metric, 0) + 1
 498 506
     507 +        # Compute average for sub groups
 499 508          for average_task, list_of_subtasks in grouped_tasks.items():
 500 509              if len(list_of_subtasks) > 1:
 501 510                  metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())

@@ -504,6 +513,12 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
 504 513                      for metric in metrics
 505 514                  }
 506 515
     516 +        # Compute average for all
     517 +        for metric, value in suite_average.items():
     518 +            suite_average[metric] = value / suite_nb[metric]
     519 +
     520 +        self.metric_aggregated["all"] = suite_average
     521 +
 507 522
 508 523  class VersionsLogger:
 509 524      """Logger of the tasks versions.

0 commit comments

Comments
 (0)