Skip to content

Commit d2064d2

Browse files
committed
Create calculate_num_samples method in evaluation_tracker to count number of samples per task
1 parent 88bd36a commit d2064d2

File tree

1 file changed

+27
-0
lines changed

1 file changed

+27
-0
lines changed

src/lighteval/logging/evaluation_tracker.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
# SOFTWARE.
2222

23+
import collections
2324
import json
2425
import logging
2526
import os
@@ -724,3 +725,29 @@ def push_to_tensorboard( # noqa: C901
724725
f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard"
725726
f" at global_step {global_step}"
726727
)
728+
729+
def calculate_num_samples(self) -> dict[str, int]:
    """Count the number of evaluation samples per task, including grouped tasks.

    Mirrors the grouping logic of ``MetricsLogger.aggregate()`` so the
    synthetic sub-group keys (``"<suite>|<task>:_average|<fewshot>"``)
    match up between sample counts and aggregated metrics.

    Returns:
        dict[str, int]: mapping from task name to sample count, plus one
        ``:_average`` entry per multi-subtask group and an ``"all"`` total
        summed over every other entry (individual tasks and group averages).
    """
    # Count samples of individual tasks.
    num_samples = {task: len(samples) for task, samples in self.details_logger.details.items()}

    # Group subtasks (e.g. "suite|mmlu:abstract_algebra|0") under a shared
    # "suite|mmlu:_average|0" key. We must append the *full* original task
    # name — not the rebound middle component from the tuple unpack — so
    # that the num_samples lookup below can find it (the original code
    # shadowed the loop variable and raised KeyError for multi-subtask groups).
    grouped_tasks = collections.defaultdict(list)
    for task_name in num_samples:
        if "|" in task_name:
            suite, task, fewshot = task_name.split("|")
            grouped_tasks[f"{suite}|{task.split(':')[0]}:_average|{fewshot}"].append(task_name)

    # Only materialize an average entry when the group has more than one subtask.
    for average_task, list_of_subtasks in grouped_tasks.items():
        if len(list_of_subtasks) > 1:
            num_samples[average_task] = sum(num_samples[k] for k in list_of_subtasks)

    # Add sample count for all (sums individual tasks and group averages;
    # excludes a pre-existing "all" key to stay idempotent across calls).
    num_samples["all"] = sum(count for task, count in num_samples.items() if task != "all")

    return num_samples

0 commit comments

Comments
 (0)