@@ -1,6 +1,5 @@
 import json
 import logging
-from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, Generic, List, Optional
 
@@ -27,6 +26,9 @@
 
 class SingleEvaluation(BaseModel, Generic[DatasetEvaluationType]):
     evaluation: DatasetEvaluationType
+    experiment: str
+
+    # The prediction provider cannot be determined if the dataset comes from outside docling-eval.
     prediction_provider_type: Optional[PredictionProviderType] = None
 
 
@@ -77,18 +79,14 @@ def validate_modality(
 
 
 def read_prediction_provider_type(
-    pred_path: Path,
+    pred_dir: Path,
+    split: str,
 ) -> Optional[PredictionProviderType]:
+    r"""
+    Open the evaluation dataset and read the prediction provider column
+    """
     try:
-        # Discover the split
-        split = None
-        for split_path in pred_path.iterdir():
-            split = split_path.name
-            break
-        if not split:
-            return None
-
-        parquet_files = str(pred_path / split / "*.parquet")
+        parquet_files = str(pred_dir / split / "*.parquet")
         ds: IterableDataset = load_dataset(
             "parquet",
             data_files={split: parquet_files},
@@ -121,8 +119,12 @@ class MultiEvaluator(Generic[DatasetEvaluationType]):
     """
 
     # Leaf dirs for GT, predictions, evaluations
-    GT_LEAF_DIR = "_GT_"
-    PRED_LEAF_DIR = "predictions"
+    # GT_LEAF_DIR = "_GT_"
+    # PRED_LEAF_DIR = "predictions"
+
+    GT_LEAF_DIR = "gt_dataset"
+    PRED_LEAF_DIR = "eval_dataset"
+    EVALUATIONS_DIR = "evaluations"
 
     def __init__(
         self,
@@ -141,42 +143,48 @@ def __init__(
 
     def __call__(
         self,
-        prediction_provider_types: List[PredictionProviderType],
+        experiment_names: List[str],
         benchmarks: List[BenchMarkNames],
         modalities: List[EvaluationModality],
         dataset_sources: Optional[Dict[BenchMarkNames, Path]] = None,
         dataset_splits: Optional[Dict[BenchMarkNames, str]] = None,
     ) -> MultiEvaluation:
         r""" """
         # Build any missing dataset
-        benchmark_preds = self._build_datasets(
-            prediction_provider_types,
+        benchmark_experiments = self._build_datasets(
+            experiment_names,
             benchmarks,
             dataset_sources,
             dataset_splits,
         )
 
         # Perform the evaluations
-        multi_evaluation = self._run_evaluations(modalities, benchmark_preds)
+        multi_evaluation = self._run_evaluations(
+            modalities, benchmark_experiments, dataset_splits
+        )
         return multi_evaluation
 
     def _build_datasets(
         self,
-        prediction_provider_types: List[PredictionProviderType],
+        experiment_names: List[str],
         benchmarks: List[BenchMarkNames],
         dataset_sources: Optional[Dict[BenchMarkNames, Path]] = None,
         dataset_splits: Optional[Dict[BenchMarkNames, str]] = None,
-    ) -> Dict[BenchMarkNames, Dict[PredictionProviderType, Path]]:
+    ) -> Dict[BenchMarkNames, Dict[str, Path]]:
         r"""
         1. Get the predicted datasets
         2. If a predicted dataset is missing, check if the GT for this dataset exists.
         3. If both pred and GT datasets exist, build the GT dataset and the pred dataset.
         4. If GT is present, build the pred dataset.
 
-        Return the paths of the prediction datasets
+        Note: If the prediction dataset does not exist, the experiment name MUST match a
+        provider name.
+
+        Return the paths of the prediction datasets:
+            benchmark_name -> experiment_name -> Path
         """
         # Dict with benchmark predictions
-        benchmark_preds: Dict[BenchMarkNames, Dict[PredictionProviderType, Path]] = {}
+        benchmark_experiments: Dict[BenchMarkNames, Dict[str, Path]] = {}
 
         # Set the benchmark_preds
         for benchmark in benchmarks:
@@ -189,17 +197,19 @@ def _build_datasets(
                 else self._default_split
             )
 
-            if benchmark not in benchmark_preds:
-                benchmark_preds[benchmark] = {}
-            for provider_type in prediction_provider_types:
+            if benchmark not in benchmark_experiments:
+                benchmark_experiments[benchmark] = {}
+            for experiment_name in experiment_names:
                 benchmark_pred_dir = (
                     self._root_dir
                     / benchmark.value
-                    / provider_type.value
+                    / experiment_name
                     / MultiEvaluator.PRED_LEAF_DIR
                 )
                 if dataset_exists(benchmark_pred_dir, split):
-                    benchmark_preds[benchmark][provider_type] = benchmark_pred_dir
+                    benchmark_experiments[benchmark][
+                        experiment_name
+                    ] = benchmark_pred_dir
                     continue
 
                 # Create the GT dataset if needed
@@ -211,9 +221,19 @@ def _build_datasets(
                     _log.info("Creating GT for: %s", benchmark.value)
                     self._create_gt(benchmark, benchmark_gt_dir, split, dataset_source)
 
-                # Create the pred dataset
+                # Map the experiment_name to a PredictionProviderType
+                try:
+                    provider_type = PredictionProviderType(experiment_name)
+                except ValueError as ex:
+                    _log.error(
+                        "Prediction dataset is missing and experiment %s does NOT match any provider name",
+                        experiment_name,
+                    )
+                    raise ex
+
+                # Create the prediction dataset
                 _log.info(
-                    "Creating predictions for: %s / %s / %s",
+                    "Creating predictions for: %s / %s",
                     benchmark.value,
                     provider_type.value,
                 )
@@ -225,56 +245,69 @@ def _build_datasets(
                     benchmark_pred_dir,
                 )
 
-                benchmark_preds[benchmark][provider_type] = benchmark_pred_dir
+                benchmark_experiments[benchmark][experiment_name] = benchmark_pred_dir
 
-        return benchmark_preds
+        return benchmark_experiments
 
     def _run_evaluations(
         self,
         modalities: List[EvaluationModality],
-        benchmark_preds: Dict[BenchMarkNames, Dict[PredictionProviderType, Path]],
+        benchmark_experiments: Dict[BenchMarkNames, Dict[str, Path]],
         dataset_splits: Optional[Dict[BenchMarkNames, str]] = None,
     ) -> MultiEvaluation:
         evaluations: Dict[
             BenchMarkNames,
             Dict[str, Dict[EvaluationModality, SingleEvaluation]],
         ] = {}
-        for benchmark, prov_mod_paths in benchmark_preds.items():
+        for benchmark, exp_mod_paths in benchmark_experiments.items():
             split = (
                 dataset_splits.get(benchmark, self._default_split)
                 if dataset_splits
                 else self._default_split
             )
             if benchmark not in evaluations:
                 evaluations[benchmark] = {}
-            for provider_type, pred_dir in prov_mod_paths.items():
-                experiment = provider_type.value
+            for experiment, pred_dir in exp_mod_paths.items():
                 if experiment not in evaluations[benchmark]:
                     evaluations[benchmark][experiment] = {}
 
+                # Try to get the prediction provider
+                provider_type = read_prediction_provider_type(pred_dir, split)
+
                 for modality in modalities:
-                    # Check if the provider supports the asked modality
-                    if not validate_modality(provider_type, modality):
+                    # If the provider is available, check if it supports the requested modality
+                    if provider_type and not validate_modality(provider_type, modality):
                         _log.error(
-                            "Provider %s does not support modality: %s",
-                            provider_type,
+                            "Prediction dataset comes from provider '%s', which does not support modality '%s'",
+                            provider_type.value,
                             modality,
                         )
                         continue
 
                     eval_dir = (
-                        self._root_dir / benchmark.value / experiment / modality.value
+                        self._root_dir
+                        / benchmark.value
+                        / experiment
+                        / MultiEvaluator.EVALUATIONS_DIR
+                        / modality.value
                     )
                     # Check if the evaluations are already present
                     evaluation = load_evaluation(benchmark, modality, eval_dir)
                     if not evaluation:
+                        _log.info(
+                            "Evaluating benchmark: %s, experiment: %s, modality: %s",
+                            benchmark.value,
+                            experiment,
+                            modality,
+                        )
                         evaluation = evaluate(
                             modality, benchmark, pred_dir, eval_dir, split
                         )
                     if evaluation:
                         assert evaluation
                         evaluations[benchmark][experiment][modality] = SingleEvaluation(
                             evaluation=evaluation,
+                            experiment=experiment,
                             prediction_provider_type=provider_type,
                         )
 
@@ -315,7 +348,7 @@ def _create_gt(
     def _create_eval(
         self,
         benchmark: BenchMarkNames,
-        prediction_provider: PredictionProviderType,
+        provider_type: PredictionProviderType,
         gt_dir: Path,
         split: str,
         pred_dir: Path,
@@ -330,7 +363,7 @@ def _create_eval(
         try:
             # Create the appropriate prediction provider
             provider = get_prediction_provider(
-                provider_type=prediction_provider,
+                provider_type=provider_type,
                 do_visualization=False,
             )
 
@@ -359,26 +392,24 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
         Dict[Path, Dict[EvaluationModality, DatasetEvaluationType]],
     ] = {}
 
+    # Get the benchmark
     for benchmark_path in multi_evaluation_path.iterdir():
         try:
             benchmark = BenchMarkNames(benchmark_path.name)
         except ValueError:
             continue
+        # Get the experiment
         for experiment_path in benchmark_path.iterdir():
             if not experiment_path.is_dir():
                 continue
 
             experiment = experiment_path.name
-            if experiment == "_GT_":
+            if experiment == MultiEvaluator.GT_LEAF_DIR:
                 continue
 
-            # Get the provider_type from the prediction
-            pred_provider_type = read_prediction_provider_type(
-                experiment_path / MultiEvaluator.PRED_LEAF_DIR
-            )
-
-            # Get the experiment
-            for modality_path in experiment_path.iterdir():
+            # Load the evaluations for each modality
+            evaluations_path = experiment_path / MultiEvaluator.EVALUATIONS_DIR
+            for modality_path in evaluations_path.iterdir():
                 try:
                     modality = EvaluationModality(modality_path.name)
                 except ValueError:
@@ -395,8 +426,8 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
                     evaluations[benchmark][experiment] = {}
                 evaluations[benchmark][experiment][modality] = SingleEvaluation(
                     evaluation=evaluation,
-                    prediction_provider_type=pred_provider_type,
+                    experiment=experiment,
                 )
 
-    multi_evalution: MultiEvaluation = MultiEvaluation(evaluations=evaluations)
-    return multi_evalution
+    multi_evaluation: MultiEvaluation = MultiEvaluation(evaluations=evaluations)
+    return multi_evaluation
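
For orientation (not part of the commit): a minimal usage sketch of the refactored MultiEvaluator, driving it by experiment names instead of provider types. The constructor arguments, the import path, and the enum members below are assumptions for illustration, since __init__ is truncated in this diff; the layout comments follow the new gt_dataset / eval_dataset / evaluations leaf directories introduced above.

# Usage sketch. Assumptions: MultiEvaluator(root_dir) constructor, the
# docling_eval.datamodels.types import path, and the specific enum members shown.
#
# Expected on-disk layout per the leaf-dir constants above:
#   <root_dir>/<benchmark>/gt_dataset/<split>/*.parquet
#   <root_dir>/<benchmark>/<experiment>/eval_dataset/<split>/*.parquet
#   <root_dir>/<benchmark>/<experiment>/evaluations/<modality>/
from pathlib import Path

from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality

root_dir = Path("./scratch/multi_eval")
evaluator = MultiEvaluator(root_dir)  # assumed signature; __init__ is not shown in full here

multi_evaluation = evaluator(
    # If a prediction dataset is missing, the experiment name must match a PredictionProviderType value.
    experiment_names=["docling"],
    benchmarks=[BenchMarkNames.DPBENCH],
    modalities=[EvaluationModality.LAYOUT],
)

# Later, reload all persisted evaluations from the same root directory.
multi_evaluation = load_multi_evaluation(root_dir)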