
Commit bb1fcc8

Authored and committed by vladimir-kivi-ds, GlockPL, Konrad Czarnota, akotyla, and jakubduda-dsai
feat: Optional parallel batches execution in ragbits.evaluate.Evaluator (#769)
Co-authored-by: GlockPL <[email protected]>
Co-authored-by: Konrad Czarnota <[email protected]>
Co-authored-by: GlockPL <[email protected]>
Co-authored-by: akotyla <[email protected]>
Co-authored-by: jakubduda-dsai <[email protected]>
Co-authored-by: ds-sebastianchwilczynski <[email protected]>
Co-authored-by: dazy-ds <[email protected]>
1 parent e15893e · commit bb1fcc8
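As a rough usage sketch of the new option (the pipeline, dataloader, and metric-set objects are placeholders; the `parallelize_batches` argument and the `compute` keyword names follow the code and tests in this commit):

```python
from ragbits.evaluate.evaluator import Evaluator


async def evaluate_dataset(pipeline, dataloader, metricset):
    # Placeholder arguments: any compatible EvaluationPipeline, DataLoader and MetricSet.
    # parallelize_batches=True runs the samples of each batch concurrently via
    # asyncio.gather; the default (False) keeps the previous sequential behaviour.
    evaluator = Evaluator(batch_size=8, parallelize_batches=True)
    return await evaluator.compute(
        pipeline=pipeline,
        dataloader=dataloader,
        metricset=metricset,
    )
```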

3 files changed, +106 -13 lines changed

packages/ragbits-evaluate/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,8 @@
 
 ## Unreleased
 
+- Optional parallel batches execution in ragbits.evaluate.Evaluator (#769)
+
 ## 1.2.2 (2025-08-08)
 
 ### Changed
@@ -142,6 +144,7 @@
 - ragbits-core updated to version v0.10.1
 
 ## 0.10.0 (2025-03-17)
+
 ### Changed
 
 - ragbits-core updated to version v0.10.0

packages/ragbits-evaluate/src/ragbits/evaluate/evaluator.py

Lines changed: 26 additions & 6 deletions
@@ -1,7 +1,7 @@
 import asyncio
 import random
 import time
-from collections.abc import Awaitable, Callable, Iterable
+from collections.abc import Awaitable, Callable, Iterable, Sized
 from dataclasses import dataclass
 from typing import Generic, ParamSpec, TypeVar
 
@@ -71,6 +71,7 @@ def __init__(
         num_retries: int = 3,
         backoff_multiplier: int = 1,
         backoff_max: int = 60,
+        parallelize_batches: bool = False,
     ) -> None:
         """
         Initialize the Evaluator instance.
@@ -80,11 +81,13 @@ def __init__(
             num_retries: The number of retries per evaluation pipeline inference error.
             backoff_multiplier: The base delay multiplier for exponential backoff (in seconds).
             backoff_max: The maximum allowed delay (in seconds) between retries.
+            parallelize_batches: Whether to process samples within each batch in parallel (asyncio.gather).
         """
         self.batch_size = batch_size
         self.num_retries = num_retries
         self.backoff_multiplier = backoff_multiplier
         self.backoff_max = backoff_max
+        self.parallelize_batches = parallelize_batches
 
     @classmethod
     async def run_from_config(cls, config: dict) -> EvaluatorResult:
@@ -156,16 +159,33 @@ async def _call_pipeline(
             The evaluation results and performance metrics.
         """
         start_time = time.perf_counter()
-        outputs = [
-            await self._call_with_error_handling(pipeline, data)
-            for data in tqdm(batched(dataset, self.batch_size), desc="Evaluation")
-        ]
+
+        total_samples = len(dataset) if isinstance(dataset, Sized) else None
+        batches = batched(dataset, self.batch_size)
+        outputs: list[Iterable[EvaluationResultT] | Exception] = []
+
+        with tqdm(total=total_samples, desc="Evaluation", unit="sample") as progress_bar:
+            for batch in batches:
+                batch_list = list(batch)
+
+                if self.parallelize_batches:
+                    tasks = [self._call_with_error_handling(pipeline, [sample]) for sample in batch_list]
+                    batch_results = await asyncio.gather(*tasks)
+
+                    for result in batch_results:
+                        outputs.append(result)
+                        progress_bar.update(1)
+                else:
+                    result = await self._call_with_error_handling(pipeline, batch_list)
+                    outputs.append(result)
+                    progress_bar.update(len(batch_list))
+
         end_time = time.perf_counter()
 
         errors = [output for output in outputs if isinstance(output, Exception)]
         results = [item for output in outputs if not isinstance(output, Exception) for item in output]
 
-        return results, errors, self._compute_time_perf(start_time, end_time, len(outputs))
+        return results, errors, self._compute_time_perf(start_time, end_time, len(results))
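To see the control flow of `_call_pipeline` in isolation, here is a self-contained sketch of the same pattern outside ragbits (the `evaluate` coroutine and the numbers are invented; `itertools.batched` needs Python 3.12+): with parallelization on, each sample in a batch becomes its own task awaited together with `asyncio.gather`; with it off, the whole batch is handled in one sequential call.

```python
import asyncio
from itertools import batched  # Python 3.12+


async def evaluate(sample: int) -> int:
    """Stand-in for a single-sample pipeline call."""
    await asyncio.sleep(0.1)
    return sample * 2


async def run(dataset: list[int], batch_size: int, parallelize_batches: bool) -> list[int]:
    outputs: list[int] = []
    for batch in batched(dataset, batch_size):
        if parallelize_batches:
            # One task per sample, awaited together: the batch finishes in roughly
            # the time of its slowest sample.
            outputs.extend(await asyncio.gather(*(evaluate(s) for s in batch)))
        else:
            # Samples processed one after another within the batch.
            for sample in batch:
                outputs.append(await evaluate(sample))
    return outputs


print(asyncio.run(run(list(range(8)), batch_size=4, parallelize_batches=True)))
```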

packages/ragbits-evaluate/tests/unit/test_evaluator.py

Lines changed: 77 additions & 7 deletions
@@ -1,3 +1,5 @@
+import asyncio
+import time
 from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Any, cast
@@ -31,15 +33,23 @@ def __init__(self, model_name: str = "default") -> None:
 
 
 class MockEvaluationPipeline(EvaluationPipeline[MockEvaluationTarget, MockEvaluationData, MockEvaluationResult]):
+    def __init__(self, evaluation_target: MockEvaluationTarget, slow: bool = False):
+        super().__init__(evaluation_target)
+        self._slow = slow
+
     async def __call__(self, data: Iterable[MockEvaluationData]) -> Iterable[MockEvaluationResult]:
-        return [
-            MockEvaluationResult(
-                input_data=row.input_data,
-                processed_output=f"{self.evaluation_target.model_name}_{row.input_data}",
-                is_correct=row.input_data % 2 == 0,
+        results = []
+        for row in data:
+            if self._slow:
+                await asyncio.sleep(0.5)
+            results.append(
+                MockEvaluationResult(
+                    input_data=row.input_data,
+                    processed_output=f"{self.evaluation_target.model_name}_{row.input_data}",
+                    is_correct=row.input_data % 2 == 0,
+                )
             )
-            for row in data
-        ]
+        return results
 
     @classmethod
     def from_config(cls, config: dict) -> "MockEvaluationPipeline":
@@ -102,6 +112,66 @@ async def test_run_evaluation(
     assert all("test_model_" in r.processed_output for r in results.results)
 
 
+@pytest.mark.parametrize(
+    ("parallelize_batches", "expected_results", "expected_accuracy"),
+    [(False, 4, 0.5), (True, 4, 0.5)],
+)
+async def test_run_evaluation_with_parallel_batches(
+    parallelize_batches: bool,
+    expected_results: int,
+    expected_accuracy: float,
+) -> None:
+    target = MockEvaluationTarget(model_name="parallel_test_model")
+    pipeline = MockEvaluationPipeline(target)
+    dataloader = MockDataLoader()
+    metrics = MetricSet(*[MockMetric()])
+    evaluator = Evaluator(batch_size=2, parallelize_batches=parallelize_batches)
+
+    results = await evaluator.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+
+    assert len(results.results) == expected_results
+    assert len(results.errors) == 0
+    assert results.metrics["accuracy"] == expected_accuracy
+    assert all("parallel_test_model_" in r.processed_output for r in results.results)
+
+
+async def test_parallel_batches_performance() -> None:
+    """Test that parallel processing is faster than sequential processing."""
+    target = MockEvaluationTarget(model_name="timing_test_model")
+    pipeline = MockEvaluationPipeline(target, slow=True)
+    dataloader = MockDataLoader(dataset_size=4)
+    metrics = MetricSet(*[MockMetric()])
+
+    # Test sequential processing
+    evaluator_sequential = Evaluator(batch_size=2, parallelize_batches=False)
+    start_time = time.perf_counter()
+    results_sequential = await evaluator_sequential.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    sequential_time = time.perf_counter() - start_time
+
+    evaluator_parallel = Evaluator(batch_size=2, parallelize_batches=True)
+    start_time = time.perf_counter()
+    results_parallel = await evaluator_parallel.compute(
+        pipeline=pipeline,
+        dataloader=dataloader,
+        metricset=metrics,
+    )
+    parallel_time = time.perf_counter() - start_time
+
+    assert len(results_sequential.results) == len(results_parallel.results)
+    assert results_sequential.metrics == results_parallel.metrics
+
+    # Parallel processing should be roughly 2x faster, but we add some margin
+    assert parallel_time < sequential_time * 0.7
+
+
 async def test_run_from_config() -> None:
     config = {
         "evaluation": {
