refactor: create a separate structure for storing the model summary across all tasks and repeats

MagdalenaKotynia · MagdalenaKotynia · commit b4124d0d32b5 · 2025-08-14T10:56:14.000+02:00
diff --git a/src/rai_bench/rai_bench/base_benchmark.py b/src/rai_bench/rai_bench/base_benchmark.py
@@ -35,6 +35,44 @@ class RunSummary(BaseModel):
     total_tasks: int = Field(..., description="Total number of executed tasks.")
 
 
+class ModelSummary(BaseModel):  # Per-model aggregate of the per-repeat summaries; populated by merge_model_repeats_summary in test_models.py.
+    model_name: str = Field(..., description="Name of the LLM.")
+    avg_success_rate: float = Field(  # np.mean of the per-repeat success_rate values, rounded to 2 decimals by the merger
+        ...,
+        description="Percentage of successfully completed tasks across all repeats.",
+    )
+    avg_total_tasks: float = Field(  # np.mean of the per-repeat total_tasks values (hence float, not int), rounded to 3 decimals
+        ..., description="Average number of tasks executed through all repeats."
+    )
+    avg_time: float = Field(  # np.mean of the per-repeat avg_time values, rounded to 3 decimals by the merger
+        ..., description="Average time taken across all tasks and repeats."
+    )
+
+    repeats: int = Field(  # the merger sets this to len(summaries), i.e. the number of per-repeat summaries merged
+        ..., description="Total number of repeats for the model for each task."
+    )
+
+
+class TasksSummary(BaseModel):  # Mean/std statistics for one task across repeats. NOTE(review): no field here identifies the task, and nothing in this diff constructs the class - confirm with callers.
+    model_name: str = Field(..., description="Name of the LLM.")
+    avg_success_rate: float = Field(
+        ..., description="Average result for task across all repeats."
+    )
+    std_success_rate: float = Field(  # NOTE(review): population vs. sample std is not specified here - confirm where it is computed
+        ..., description="Standard deviation of the success rate across all repeats."
+    )
+    avg_time: float = Field(
+        ..., description="Average time taken across all repeats for one task."
+    )
+    std_time: float = Field(
+        ...,
+        description="Standard deviation of the time taken across all repeats for one task.",
+    )
+    total_tasks: int = Field(  # an int count, unlike ModelSummary.avg_total_tasks, which is a float mean
+        ..., description="Total number of executed tasks across all repeats per task."
+    )
+
+
 class TimeoutException(Exception):
     pass
 
diff --git a/src/rai_bench/rai_bench/test_models.py b/src/rai_bench/rai_bench/test_models.py
@@ -26,7 +26,7 @@
 import rai_bench.manipulation_o3de as manipulation_o3de
 import rai_bench.tool_calling_agent as tool_calling_agent
 import rai_bench.vlm_benchmark as vlm_benchmark
-from rai_bench.base_benchmark import RunSummary
+from rai_bench.base_benchmark import ModelSummary, RunSummary
 from rai_bench.results_processing.data_loading import SUMMARY_FILE_NAME
 from rai_bench.utils import (
     define_benchmark_logger,
@@ -138,15 +138,15 @@ def merge_model_repeats_summary(
 
     avg_success_rate = np.mean([s.success_rate for s in summaries])
     avg_time = np.mean([s.avg_time for s in summaries])
-    total_tasks = np.min(
-        [s.total_tasks for s in summaries]
-    )  # NOTE (mkotynia) get the minimum total tasks across repeats. If benchmark breaks for some repeat, it will be noticed in such case
 
-    merged_summary = RunSummary(
+    total_tasks = np.mean([s.total_tasks for s in summaries])
+
+    merged_summary = ModelSummary(
         model_name=model_name,
-        success_rate=round(float(avg_success_rate), 2),
+        avg_success_rate=round(float(avg_success_rate), 2),
         avg_time=round(float(avg_time), 3),
-        total_tasks=total_tasks,
+        avg_total_tasks=round(float(total_tasks), 3),
+        repeats=len(summaries),
     )
 
     merged_file = model_dir / REPEATS_SUMMARY_FILE_NAME