Merged
16 commits
1dfd4f6  feat: Pinpoint docling to cau/layout-model-spec. Introduce the doclin… (nikos-livathinos, Jul 8, 2025)
e0e1b25  chore: Update docling to the latest status of cau/layout-model-spec (nikos-livathinos, Jul 8, 2025)
3525b83  chore: Update docling to the latest status of cau/layout-model-spec (nikos-livathinos, Jul 9, 2025)
e3bac64  feat: Refactor the CLI arguments of 'evaluate' to accept optional inp… (nikos-livathinos, Jul 9, 2025)
dba40be  feat: Refactor the MultiEvaluator:load_multi_evaluation() to search f… (nikos-livathinos, Jul 9, 2025)
c8b524e  chore: Clean up code in MultiEvaluator (nikos-livathinos, Jul 10, 2025)
e8035d0  chore: Pin docling from its main branch. Add code comments. (nikos-livathinos, Jul 10, 2025)
be6052b  fix: In the Consolidator, add the single value mAP score for the layo… (nikos-livathinos, Jul 14, 2025)
5cc1986  feat: Extend the CLI to introduce the --docling-layout-keep-empty-clu… (nikos-livathinos, Jul 14, 2025)
452cd39  chore: Remove pinpoint to docling branch as the docling PR has been m… (nikos-livathinos, Jul 15, 2025)
a08400e  fix: Have the Consolidator sorting the produced excel by Benchmark an… (nikos-livathinos, Jul 15, 2025)
be7bae4  fix: Set the --docling-layout-keep-empty-clusters CLI parameter by de… (nikos-livathinos, Jul 15, 2025)
9ba956c  feat: Extend the CLI to accept optional input_dir and output_dir in t… (nikos-livathinos, Jul 23, 2025)
64bccb5  chore: Remove docling pinning. Use the latest docling release. (nikos-livathinos, Jul 28, 2025)
675b9f0  Merge branch 'main' into nli/docling_layout (nikos-livathinos, Jul 28, 2025)
6386f7e  fix: Fix in CLI for create-eval `--docling-layout-keep-empty-clusters` (nikos-livathinos, Jul 28, 2025)
9 changes: 5 additions & 4 deletions docling_eval/aggregations/consolidator.py
@@ -198,17 +198,18 @@ def _build_dataframes(
dfs: Dict[EvaluationModality, DataFrame] = {}
for modality, m_data in df_data.items():
df = DataFrame(m_data)
df = df.sort_values(by=["Benchmark"], ascending=[True])
df = df.sort_values(by=["Benchmark", "Experiment"], ascending=[True, True])
dfs[modality] = df

return dfs

def _layout_metrics(self, evaluation: DatasetLayoutEvaluation) -> Dict[str, str]:
r"""Get the metrics for the LayoutEvaluation"""
metrics = {
"mAP": export_value(evaluation.map_stats),
"mAP_50": export_value(evaluation.map_50_stats),
"mAP_75": export_value(evaluation.map_75_stats),
"mAP": export_value(evaluation.mAP),
"stat_mAP": export_value(evaluation.map_stats),
"stat_mAP_50": export_value(evaluation.map_50_stats),
"stat_mAP_75": export_value(evaluation.map_75_stats),
"weighted_mAP_50": export_value(evaluation.weighted_map_50_stats),
"weighted_mAP_75": export_value(evaluation.weighted_map_75_stats),
"weighted_mAP_90": export_value(evaluation.weighted_map_90_stats),
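For illustration, a minimal sketch of what the two changes above mean for the consolidated output: rows are now sorted by both Benchmark and Experiment, and the layout metrics gain a single-value "mAP" entry alongside the renamed stat_* aggregates. The sample rows and values below are invented; only the column names follow the diff.

from pandas import DataFrame

# Invented result rows; the column names mirror the keys written by
# _build_dataframes() / _layout_metrics() above.
rows = [
    {"Benchmark": "DPBench", "Experiment": "exp_b", "mAP": "0.71", "stat_mAP": "0.71 ± 0.02"},
    {"Benchmark": "DPBench", "Experiment": "exp_a", "mAP": "0.74", "stat_mAP": "0.74 ± 0.03"},
]
df = DataFrame(rows)
# Sorting on both keys groups the experiments of each benchmark together
# in the exported excel instead of leaving them in insertion order.
df = df.sort_values(by=["Benchmark", "Experiment"], ascending=[True, True])
print(df.to_string(index=False))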
80 changes: 58 additions & 22 deletions docling_eval/aggregations/multi_evalutor.py
@@ -385,11 +385,39 @@ def _create_eval(

@staticmethod
def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
r"""Load MultiEvaluation from disk files"""
# benchmark -> provider -> modality -> DatasetEvaluation
r"""
Load MultiEvaluation from disk files
"""

def _get_modalities_evaluations(
evaluations_root: Path,
benchmark: BenchMarkNames,
) -> Dict[EvaluationModality, SingleEvaluation]:
r"""
Scan the evaluations_root and load the evaluations for each modality
"""
modalities_evaluations: Dict[EvaluationModality, SingleEvaluation] = {}
for modality_path in evaluations_root.iterdir():
try:
modality = EvaluationModality(modality_path.name)
except ValueError:
continue

# Load the evaluation
evaluation = load_evaluation(benchmark, modality, modality_path)
if not evaluation:
continue

modalities_evaluations[modality] = SingleEvaluation(
evaluation=evaluation,
experiment=experiment,
)
return modalities_evaluations

# benchmark -> experiment_and_subexperiment -> modality-> SingleEvaluation
evaluations: Dict[
BenchMarkNames,
Dict[Path, Dict[EvaluationModality, DatasetEvaluationType]],
Dict[str, Dict[EvaluationModality, SingleEvaluation]],
] = {}

# Get the benchmark
@@ -398,6 +426,9 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
benchmark = BenchMarkNames(benchmark_path.name)
except ValueError:
continue
if benchmark not in evaluations:
evaluations[benchmark] = {}

# Get the experiment
for experiment_path in benchmark_path.iterdir():
if not experiment_path.is_dir():
@@ -407,30 +438,35 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
if experiment == MultiEvaluator.GT_LEAF_DIR:
continue

# Load the evaluations for each modality
evaluations_path = experiment_path / MultiEvaluator.EVALUATIONS_DIR
if not evaluations_path.is_dir():
continue
for modality_path in evaluations_path.iterdir():
try:
modality = EvaluationModality(modality_path.name)
except ValueError:
# Check if a sub-experiment is present
for exp_child_path in experiment_path.iterdir():
if not exp_child_path.is_dir():
continue

# Load the evaluation
evaluation = load_evaluation(benchmark, modality, modality_path)
if not evaluation:
subexp_candidate = exp_child_path.name
if subexp_candidate == MultiEvaluator.PRED_LEAF_DIR:
continue

if benchmark not in evaluations:
evaluations[benchmark] = {}
if experiment not in evaluations[benchmark]:
evaluations[benchmark][experiment] = {}
modalities_evaluations: Dict[EvaluationModality, SingleEvaluation]
if subexp_candidate == MultiEvaluator.EVALUATIONS_DIR:
modalities_evaluations = _get_modalities_evaluations(
exp_child_path, benchmark
)

evaluations[benchmark][experiment][modality] = SingleEvaluation(
evaluation=evaluation,
experiment=experiment,
)
exp_and_subexp = experiment
evaluations[benchmark][exp_and_subexp] = modalities_evaluations
else:
subexp_candidate_evaluations = (
exp_child_path / MultiEvaluator.EVALUATIONS_DIR
)
if not subexp_candidate_evaluations.is_dir():
continue
modalities_evaluations = _get_modalities_evaluations(
subexp_candidate_evaluations, benchmark
)

exp_and_subexp = f"{experiment}_{subexp_candidate}"
evaluations[benchmark][exp_and_subexp] = modalities_evaluations

multi_evaluation: MultiEvaluation = MultiEvaluation(evaluations=evaluations)
return multi_evaluation
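To make the new search behaviour concrete, here is a hedged sketch of a directory tree the refactored loader can consume, and a call that loads it. The benchmark, experiment and sub-experiment names are placeholders; the "evaluations" leaf name stands in for MultiEvaluator.EVALUATIONS_DIR, whose actual value is not shown in this diff.

from pathlib import Path

from docling_eval.aggregations.multi_evalutor import MultiEvaluator

# Hypothetical layout (names invented, structure follows the code above):
#
#   multi_eval_root/
#     DPBench/                      # parsed as a BenchMarkNames value
#       exp_layout/                 # experiment
#         evaluations/layout/...    # -> keyed as "exp_layout"
#         heron_run/                # sub-experiment (not the predictions leaf dir)
#           evaluations/layout/...  # -> keyed as "exp_layout_heron_run"
#
multi_evaluation = MultiEvaluator.load_multi_evaluation(Path("multi_eval_root"))
# Assuming MultiEvaluation exposes the loaded mapping as an `evaluations` attribute,
# as suggested by the constructor call above.
for benchmark, experiments in multi_evaluation.evaluations.items():
    for exp_key, modalities in experiments.items():
        print(benchmark.value, exp_key, sorted(m.value for m in modalities))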
163 changes: 149 additions & 14 deletions docling_eval/cli/main.py
@@ -5,18 +5,33 @@
import os
import sys
from pathlib import Path
from typing import Annotated, Dict, Optional, Tuple

# --- DoclingLayoutOptionsManager definition moved here ---
from typing import Annotated, Dict, List, Optional, Tuple

import typer
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_EGRET_LARGE,
DOCLING_LAYOUT_EGRET_MEDIUM,
DOCLING_LAYOUT_EGRET_XLARGE,
DOCLING_LAYOUT_HERON,
DOCLING_LAYOUT_HERON_101,
DOCLING_LAYOUT_V2,
LayoutModelConfig,
)
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
LayoutOptions,
PaginatedPipelineOptions,
PdfPipelineOptions,
VlmPipelineOptions,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.vlm_model_specs import (
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.vlm_model_specs import (
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
)
from docling.document_converter import FormatOption, PdfFormatOption
from docling.models.factories import get_ocr_factory
@@ -103,6 +118,26 @@
TableFormerPredictionProvider,
)


class DoclingLayoutOptionsManager:
layout_model_configs = {
"docling_layout_v2": DOCLING_LAYOUT_V2,
"docling_layout_heron": DOCLING_LAYOUT_HERON,
"docling_layout_heron_101": DOCLING_LAYOUT_HERON_101,
"docling_layout_egret_medium": DOCLING_LAYOUT_EGRET_MEDIUM,
"docling_layout_egret_large": DOCLING_LAYOUT_EGRET_LARGE,
"docling_layout_egret_xlarge": DOCLING_LAYOUT_EGRET_XLARGE,
}

@staticmethod
def get_layout_model_config(model_spec: str) -> LayoutModelConfig:
return DoclingLayoutOptionsManager.layout_model_configs[model_spec]

@staticmethod
def get_layout_model_config_names() -> List[str]:
return list(DoclingLayoutOptionsManager.layout_model_configs.keys())
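A short usage sketch for the helper class above, as it is used later in create_eval: the CLI passes one of the listed spec names and the manager resolves it to the corresponding docling LayoutModelConfig. The import assumes the class lives in docling_eval.cli.main, as this diff introduces it there; the chosen spec name is the CLI default declared further down.

from docling_eval.cli.main import DoclingLayoutOptionsManager

spec_name = "docling_layout_heron"  # CLI default for --docling-layout-model-spec
if spec_name not in DoclingLayoutOptionsManager.get_layout_model_config_names():
    raise ValueError(f"Unsupported layout model spec: {spec_name}")
# Resolves to DOCLING_LAYOUT_HERON, which is later set on LayoutOptions.model_spec
layout_config = DoclingLayoutOptionsManager.get_layout_model_config(spec_name)
print(type(layout_config).__name__)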


# Configure logging
logging_level = logging.WARNING
# logging_level = logging.DEBUG
@@ -125,6 +160,32 @@
)


def derive_input_output_dirs(
benchmark: BenchMarkNames,
modality: EvaluationModality,
input_dir: Optional[Path],
output_dir: Optional[Path],
) -> Tuple[Path, Path]:
r"""
One of the input or output dirs must be non None.
In case one of them is None, it can be derived from the other one.
"""
if input_dir and output_dir:
return input_dir, output_dir
if not input_dir and not output_dir:
raise ValueError("Either input_dir or output_dir must be provided")

if not input_dir and output_dir:
# Derive input and output paths based on the directory structure in test_dataset_builder.py
input_dir = output_dir / "eval_dataset" / benchmark.value / modality.value

if not output_dir and input_dir:
output_dir = input_dir.parent
assert input_dir is not None
assert output_dir is not None
return input_dir, output_dir
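A usage sketch for the helper above: when only the output directory is supplied, the input directory is derived following the eval_dataset layout, and when only the input directory is supplied, the output directory falls back to its parent. The enum member names are assumptions, and the enums are imported through the CLI module only because their home module is not part of this diff.

from pathlib import Path

from docling_eval.cli.main import (
    BenchMarkNames,
    EvaluationModality,
    derive_input_output_dirs,
)

# Only output_dir is given, so input_dir is derived from it.
in_dir, out_dir = derive_input_output_dirs(
    benchmark=BenchMarkNames.DPBENCH,    # assumed member name, for illustration
    modality=EvaluationModality.LAYOUT,  # assumed member name, for illustration
    input_dir=None,
    output_dir=Path("./results"),
)
# in_dir  -> ./results/eval_dataset/<benchmark value>/<modality value>
# out_dir -> ./results
print(in_dir, out_dir)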


def log_and_save_stats(
odir: Path,
benchmark: BenchMarkNames,
@@ -259,6 +320,9 @@ def get_prediction_provider(
do_table_structure: bool = True,
artifacts_path: Optional[Path] = None,
image_scale_factor: Optional[float] = None,
docling_layout_model_spec: Optional[LayoutModelConfig] = None,
docling_layout_create_orphan_clusters: Optional[bool] = None,
docling_layout_keep_empty_clusters: Optional[bool] = None,
):
pipeline_options: PaginatedPipelineOptions
"""Get the appropriate prediction provider with default settings."""
@@ -289,8 +353,17 @@ def get_prediction_provider(
pipeline_options.generate_parsed_pages = True
pipeline_options.accelerator_options = accelerator_options

pipeline_options.layout_options.create_orphan_clusters = False
pipeline_options.layout_options.keep_empty_clusters = True
# Layout options
layout_options: LayoutOptions = LayoutOptions()
if docling_layout_model_spec is not None:
layout_options.model_spec = docling_layout_model_spec
if docling_layout_create_orphan_clusters is not None:
layout_options.create_orphan_clusters = (
docling_layout_create_orphan_clusters
)
if docling_layout_keep_empty_clusters is not None:
layout_options.keep_empty_clusters = docling_layout_keep_empty_clusters
pipeline_options.layout_options = layout_options

if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
@@ -1038,6 +1111,24 @@ def create_eval(
help="Directory for local model artifacts. Will only be passed to providers supporting this."
),
] = None,
docling_layout_model_spec: Annotated[
Optional[str],
typer.Option(
help="Layout model spec for Docling. Supported values: {}".format(
DoclingLayoutOptionsManager.get_layout_model_config_names()
)
),
] = "docling_layout_heron",
docling_layout_create_orphan_clusters: Annotated[
Optional[bool],
typer.Option(
help="Enable orphan clusters creation in Docling layout post-processing"
),
] = True,
docling_layout_keep_empty_clusters: Annotated[
Optional[bool],
typer.Option(help="Keep the empty clusters in Docling layout post-processing"),
] = False,
do_visualization: Annotated[
bool, typer.Option(help="visualize the predictions")
] = True,
@@ -1070,6 +1161,14 @@ def create_eval(
)

# Create the appropriate prediction provider
docling_layout_model_spec_obj = (
DoclingLayoutOptionsManager.get_layout_model_config(
docling_layout_model_spec
)
if docling_layout_model_spec
else None
)

provider = get_prediction_provider(
provider_type=prediction_provider,
file_source_path=file_source_path,
@@ -1080,6 +1179,9 @@ def create_eval(
do_visualization=do_visualization,
image_scale_factor=image_scale_factor,
do_table_structure=do_table_structure,
docling_layout_model_spec=docling_layout_model_spec_obj,
docling_layout_create_orphan_clusters=docling_layout_create_orphan_clusters,
docling_layout_keep_empty_clusters=docling_layout_keep_empty_clusters,
)

# Get the dataset name from the benchmark
@@ -1173,13 +1275,32 @@ def create(
@app.command(name="evaluate")
def evaluate_cmd(
modality: Annotated[EvaluationModality, typer.Option(help="Evaluation modality")],
benchmark: Annotated[BenchMarkNames, typer.Option(help="Benchmark name")],
output_dir: Annotated[Path, typer.Option(help="Base output directory")],
benchmark: Annotated[
BenchMarkNames,
typer.Option(
help="Benchmark name. It is used only to set the filename of the evaluation json file."
),
],
input_dir: Annotated[
Optional[Path],
typer.Option(
help="Directory with evaluation dataset. If not provided, the input directory will be derived from the output directory."
),
] = None,
output_dir: Annotated[
Optional[Path],
typer.Option(
help="Base output directory. If not provided, the output directory will be derived from the input directory."
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
):
"""Evaluate predictions against ground truth."""
# Derive input and output paths based on the directory structure in test_dataset_builder.py
input_dir = output_dir / "eval_dataset"
input_dir, output_dir = derive_input_output_dirs(
benchmark, modality, input_dir, output_dir
)
assert input_dir is not None
assert output_dir is not None
eval_output_dir = output_dir / "evaluations" / modality.value

# Create output directory
Expand All @@ -1201,16 +1322,30 @@ def visualize_cmd(
EvaluationModality, typer.Option(help="Visualization modality")
],
benchmark: Annotated[BenchMarkNames, typer.Option(help="Benchmark name")],
output_dir: Annotated[Path, typer.Option(help="Base output directory")],
input_dir: Annotated[
Optional[Path],
typer.Option(
help="Directory with evaluation dataset. If not provided, the input directory will be derived from the output directory."
),
] = None,
output_dir: Annotated[
Optional[Path],
typer.Option(
help="Base output directory. If not provided, the output directory will be derived from the input directory."
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0,
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
):
"""Visualize evaluation results."""
# Derive input and output paths based on the directory structure in test_dataset_builder.py
input_dir = output_dir / "eval_dataset"
input_dir, output_dir = derive_input_output_dirs(
benchmark, modality, input_dir, output_dir
)
assert input_dir is not None
assert output_dir is not None
eval_output_dir = output_dir / "evaluations" / modality.value

# Create output directory
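Finally, a hedged end-to-end sketch of the relaxed evaluate command: after the refactor above, only one of --input-dir / --output-dir has to be given and the other is derived. The example drives the typer app through its test runner; the modality and benchmark strings are assumed enum values, and the option spellings rely on typer's default underscore-to-dash conversion.

from typer.testing import CliRunner

from docling_eval.cli.main import app

runner = CliRunner()
# Only the input dir is passed; evaluate derives the output dir from it via
# derive_input_output_dirs() and writes to <output_dir>/evaluations/<modality>.
result = runner.invoke(
    app,
    [
        "evaluate",
        "--modality", "layout",      # assumed EvaluationModality value
        "--benchmark", "DPBench",    # assumed BenchMarkNames value
        "--input-dir", "results/eval_dataset/DPBench/layout",
    ],
)
print(result.exit_code)

The same runner could also exercise the new create-eval layout flags (--docling-layout-model-spec, --docling-layout-create-orphan-clusters, --docling-layout-keep-empty-clusters), again assuming typer's default flag naming for the parameters declared above.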
5 changes: 5 additions & 0 deletions docling_eval/utils/coco_exporter.py
@@ -308,6 +308,11 @@ def main(args):
log_format = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)

_log.info("Export eval-dataset in COCO-tools format")
_log.info("COCO dataset: %s", str(coco_path))
_log.info("eval-dataset: %s", str(docling_eval_path))
_log.info("Save path: %s", str(save_path))

# Create the COCO exporter
exporter = DoclingEvalCOCOExporter(docling_eval_path)
exporter.export_predictions_wrt_original_COCO(