Merged
16 commits
1dfd4f6  feat: Pinpoint docling to cau/layout-model-spec. Introduce the doclin… (nikos-livathinos, Jul 8, 2025)
e0e1b25  chore: Update docling to the latest status of cau/layout-model-spec (nikos-livathinos, Jul 8, 2025)
3525b83  chore: Update docling to the latest status of cau/layout-model-spec (nikos-livathinos, Jul 9, 2025)
e3bac64  feat: Refactor the CLI arguments of 'evaluate' to accept optional inp… (nikos-livathinos, Jul 9, 2025)
dba40be  feat: Refactor the MultiEvaluator:load_multi_evaluation() to search f… (nikos-livathinos, Jul 9, 2025)
c8b524e  chore: Clean up code in MultiEvaluator (nikos-livathinos, Jul 10, 2025)
e8035d0  chore: Pin docling from its main branch. Add code comments. (nikos-livathinos, Jul 10, 2025)
be6052b  fix: In the Consolidator, add the single value mAP score for the layo… (nikos-livathinos, Jul 14, 2025)
5cc1986  feat: Extend the CLI to introduce the --docling-layout-keep-empty-clu… (nikos-livathinos, Jul 14, 2025)
452cd39  chore: Remove pinpoint to docling branch as the docling PR has been m… (nikos-livathinos, Jul 15, 2025)
a08400e  fix: Have the Consolidator sorting the produced excel by Benchmark an… (nikos-livathinos, Jul 15, 2025)
be7bae4  fix: Set the --docling-layout-keep-empty-clusters CLI parameter by de… (nikos-livathinos, Jul 15, 2025)
9ba956c  feat: Extend the CLI to accept optional input_dir and output_dir in t… (nikos-livathinos, Jul 23, 2025)
64bccb5  chore: Remove docling pinning. Use the latest docling release. (nikos-livathinos, Jul 28, 2025)
675b9f0  Merge branch 'main' into nli/docling_layout (nikos-livathinos, Jul 28, 2025)
6386f7e  fix: Fix in CLI for create-eval `--docling-layout-keep-empty-clusters` (nikos-livathinos, Jul 28, 2025)
9 changes: 5 additions & 4 deletions docling_eval/aggregations/consolidator.py
@@ -198,17 +198,18 @@ def _build_dataframes(
dfs: Dict[EvaluationModality, DataFrame] = {}
for modality, m_data in df_data.items():
df = DataFrame(m_data)
df = df.sort_values(by=["Benchmark"], ascending=[True])
df = df.sort_values(by=["Benchmark", "Experiment"], ascending=[True, True])
dfs[modality] = df

return dfs

def _layout_metrics(self, evaluation: DatasetLayoutEvaluation) -> Dict[str, str]:
r"""Get the metrics for the LayoutEvaluation"""
metrics = {
"mAP": export_value(evaluation.map_stats),
"mAP_50": export_value(evaluation.map_50_stats),
"mAP_75": export_value(evaluation.map_75_stats),
"mAP": export_value(evaluation.mAP),
"stat_mAP": export_value(evaluation.map_stats),
"stat_mAP_50": export_value(evaluation.map_50_stats),
"stat_mAP_75": export_value(evaluation.map_75_stats),
"weighted_mAP_50": export_value(evaluation.weighted_map_50_stats),
"weighted_mAP_75": export_value(evaluation.weighted_map_75_stats),
"weighted_mAP_90": export_value(evaluation.weighted_map_90_stats),
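For illustration, a minimal sketch of what the two changes above mean for the consolidated output: rows are now sorted by both Benchmark and Experiment, and the layout metrics gain a single-value "mAP" entry alongside the renamed stat_* aggregates. The sample rows and values below are invented; only the column names follow the diff.

from pandas import DataFrame

# Invented result rows; the column names mirror the keys written by
# _build_dataframes() / _layout_metrics() above.
rows = [
    {"Benchmark": "DPBench", "Experiment": "exp_b", "mAP": "0.71", "stat_mAP": "0.71 ± 0.02"},
    {"Benchmark": "DPBench", "Experiment": "exp_a", "mAP": "0.74", "stat_mAP": "0.74 ± 0.03"},
]
df = DataFrame(rows)
# Sorting on both keys groups the experiments of each benchmark together
# in the exported excel instead of leaving them in insertion order.
df = df.sort_values(by=["Benchmark", "Experiment"], ascending=[True, True])
print(df.to_string(index=False))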
80 changes: 58 additions & 22 deletions docling_eval/aggregations/multi_evalutor.py
@@ -385,11 +385,39 @@ def _create_eval(

@staticmethod
def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
r"""Load MultiEvaluation from disk files"""
# benchmark -> provider -> modality -> DatasetEvaluation
r"""
Load MultiEvaluation from disk files
"""

def _get_modalities_evaluations(
evaluations_root: Path,
benchmark: BenchMarkNames,
) -> Dict[EvaluationModality, SingleEvaluation]:
r"""
Scan the evaluations_root and load the evaluations for each modality
"""
modalities_evaluations: Dict[EvaluationModality, SingleEvaluation] = {}
for modality_path in evaluations_root.iterdir():
try:
modality = EvaluationModality(modality_path.name)
except ValueError:
continue

# Load the evaluation
evaluation = load_evaluation(benchmark, modality, modality_path)
if not evaluation:
continue

modalities_evaluations[modality] = SingleEvaluation(
evaluation=evaluation,
experiment=experiment,
)
return modalities_evaluations

# benchmark -> experiment_and_subexperiment -> modality-> SingleEvaluation
evaluations: Dict[
BenchMarkNames,
Dict[Path, Dict[EvaluationModality, DatasetEvaluationType]],
Dict[str, Dict[EvaluationModality, SingleEvaluation]],
] = {}

# Get the benchmark
@@ -398,6 +426,9 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
benchmark = BenchMarkNames(benchmark_path.name)
except ValueError:
continue
if benchmark not in evaluations:
evaluations[benchmark] = {}

# Get the experiment
for experiment_path in benchmark_path.iterdir():
if not experiment_path.is_dir():
@@ -407,30 +438,35 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
if experiment == MultiEvaluator.GT_LEAF_DIR:
continue

# Load the evaluations for each modality
evaluations_path = experiment_path / MultiEvaluator.EVALUATIONS_DIR
if not evaluations_path.is_dir():
continue
for modality_path in evaluations_path.iterdir():
try:
modality = EvaluationModality(modality_path.name)
except ValueError:
# Check if a sub-experiment is present
for exp_child_path in experiment_path.iterdir():
if not exp_child_path.is_dir():
continue

# Load the evaluation
evaluation = load_evaluation(benchmark, modality, modality_path)
if not evaluation:
subexp_candidate = exp_child_path.name
if subexp_candidate == MultiEvaluator.PRED_LEAF_DIR:
continue

if benchmark not in evaluations:
evaluations[benchmark] = {}
if experiment not in evaluations[benchmark]:
evaluations[benchmark][experiment] = {}
modalities_evaluations: Dict[EvaluationModality, SingleEvaluation]
if subexp_candidate == MultiEvaluator.EVALUATIONS_DIR:
modalities_evaluations = _get_modalities_evaluations(
exp_child_path, benchmark
)

evaluations[benchmark][experiment][modality] = SingleEvaluation(
evaluation=evaluation,
experiment=experiment,
)
exp_and_subexp = experiment
evaluations[benchmark][exp_and_subexp] = modalities_evaluations
else:
subexp_candidate_evaluations = (
exp_child_path / MultiEvaluator.EVALUATIONS_DIR
)
if not subexp_candidate_evaluations.is_dir():
continue
modalities_evaluations = _get_modalities_evaluations(
subexp_candidate_evaluations, benchmark
)

exp_and_subexp = f"{experiment}_{subexp_candidate}"
evaluations[benchmark][exp_and_subexp] = modalities_evaluations

multi_evaluation: MultiEvaluation = MultiEvaluation(evaluations=evaluations)
return multi_evaluation
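To make the new search behaviour concrete, here is a hedged sketch of a directory tree the refactored loader can consume, and a call that loads it. The benchmark, experiment and sub-experiment names are placeholders; the "evaluations" leaf name stands in for MultiEvaluator.EVALUATIONS_DIR, whose actual value is not shown in this diff.

from pathlib import Path

from docling_eval.aggregations.multi_evalutor import MultiEvaluator

# Hypothetical layout (names invented, structure follows the code above):
#
#   multi_eval_root/
#     DPBench/                      # parsed as a BenchMarkNames value
#       exp_layout/                 # experiment
#         evaluations/layout/...    # -> keyed as "exp_layout"
#         heron_run/                # sub-experiment (not the predictions leaf dir)
#           evaluations/layout/...  # -> keyed as "exp_layout_heron_run"
#
multi_evaluation = MultiEvaluator.load_multi_evaluation(Path("multi_eval_root"))
# Assuming MultiEvaluation exposes the loaded mapping as an `evaluations` attribute,
# as suggested by the constructor call above.
for benchmark, experiments in multi_evaluation.evaluations.items():
    for exp_key, modalities in experiments.items():
        print(benchmark.value, exp_key, sorted(m.value for m in modalities))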
163 changes: 149 additions & 14 deletions docling_eval/cli/main.py
@@ -5,18 +5,33 @@
import os
import sys
from pathlib import Path
from typing import Annotated, Dict, Optional, Tuple

# --- DoclingLayoutOptionsManager definition moved here ---
from typing import Annotated, Dict, List, Optional, Tuple

import typer
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_EGRET_LARGE,
DOCLING_LAYOUT_EGRET_MEDIUM,
DOCLING_LAYOUT_EGRET_XLARGE,
DOCLING_LAYOUT_HERON,
DOCLING_LAYOUT_HERON_101,
DOCLING_LAYOUT_V2,
LayoutModelConfig,
)
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
LayoutOptions,
PaginatedPipelineOptions,
PdfPipelineOptions,
VlmPipelineOptions,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.vlm_model_specs import (
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.vlm_model_specs import (
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
)
from docling.document_converter import FormatOption, PdfFormatOption
from docling.models.factories import get_ocr_factory
@@ -103,6 +118,26 @@
TableFormerPredictionProvider,
)


class DoclingLayoutOptionsManager:
layout_model_configs = {
"docling_layout_v2": DOCLING_LAYOUT_V2,
"docling_layout_heron": DOCLING_LAYOUT_HERON,
"docling_layout_heron_101": DOCLING_LAYOUT_HERON_101,
"docling_layout_egret_medium": DOCLING_LAYOUT_EGRET_MEDIUM,
"docling_layout_egret_large": DOCLING_LAYOUT_EGRET_LARGE,
"docling_layout_egret_xlarge": DOCLING_LAYOUT_EGRET_XLARGE,
}

@staticmethod
def get_layout_model_config(model_spec: str) -> LayoutModelConfig:
return DoclingLayoutOptionsManager.layout_model_configs[model_spec]

@staticmethod
def get_layout_model_config_names() -> List[str]:
return list(DoclingLayoutOptionsManager.layout_model_configs.keys())
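A short usage sketch for the helper class above, as it is used later in create_eval: the CLI passes one of the listed spec names and the manager resolves it to the corresponding docling LayoutModelConfig. The import assumes the class lives in docling_eval.cli.main, as this diff introduces it there; the chosen spec name is the CLI default declared further down.

from docling_eval.cli.main import DoclingLayoutOptionsManager

spec_name = "docling_layout_heron"  # CLI default for --docling-layout-model-spec
if spec_name not in DoclingLayoutOptionsManager.get_layout_model_config_names():
    raise ValueError(f"Unsupported layout model spec: {spec_name}")
# Resolves to DOCLING_LAYOUT_HERON, which is later set on LayoutOptions.model_spec
layout_config = DoclingLayoutOptionsManager.get_layout_model_config(spec_name)
print(type(layout_config).__name__)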


# Configure logging
logging_level = logging.WARNING
# logging_level = logging.DEBUG
@@ -125,6 +160,32 @@
)


def derive_input_output_dirs(
benchmark: BenchMarkNames,
modality: EvaluationModality,
input_dir: Optional[Path],
output_dir: Optional[Path],
) -> Tuple[Path, Path]:
r"""
One of the input or output dirs must be non None.
In case one of them is None, it can be derived from the other one.
"""
if input_dir and output_dir:
return input_dir, output_dir
if not input_dir and not output_dir:
raise ValueError("Either input_dir or output_dir must be provided")

if not input_dir and output_dir:
# Derive input and output paths based on the directory structure in test_dataset_builder.py
input_dir = output_dir / "eval_dataset" / benchmark.value / modality.value

if not output_dir and input_dir:
output_dir = input_dir.parent
assert input_dir is not None
assert output_dir is not None
return input_dir, output_dir
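A usage sketch for the helper above: when only the output directory is supplied, the input directory is derived following the eval_dataset layout, and when only the input directory is supplied, the output directory falls back to its parent. The enum member names are assumptions, and the enums are imported through the CLI module only because their home module is not part of this diff.

from pathlib import Path

from docling_eval.cli.main import (
    BenchMarkNames,
    EvaluationModality,
    derive_input_output_dirs,
)

# Only output_dir is given, so input_dir is derived from it.
in_dir, out_dir = derive_input_output_dirs(
    benchmark=BenchMarkNames.DPBENCH,    # assumed member name, for illustration
    modality=EvaluationModality.LAYOUT,  # assumed member name, for illustration
    input_dir=None,
    output_dir=Path("./results"),
)
# in_dir  -> ./results/eval_dataset/<benchmark value>/<modality value>
# out_dir -> ./results
print(in_dir, out_dir)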


def log_and_save_stats(
odir: Path,
benchmark: BenchMarkNames,
@@ -259,6 +320,9 @@ def get_prediction_provider(
do_table_structure: bool = True,
artifacts_path: Optional[Path] = None,
image_scale_factor: Optional[float] = None,
docling_layout_model_spec: Optional[LayoutModelConfig] = None,
docling_layout_create_orphan_clusters: Optional[bool] = None,
docling_layout_keep_empty_clusters: Optional[bool] = None,
):
pipeline_options: PaginatedPipelineOptions
"""Get the appropriate prediction provider with default settings."""
@@ -289,8 +353,17 @@ def get_prediction_provider(
pipeline_options.generate_parsed_pages = True
pipeline_options.accelerator_options = accelerator_options

pipeline_options.layout_options.create_orphan_clusters = False
pipeline_options.layout_options.keep_empty_clusters = True
# Layout options
layout_options: LayoutOptions = LayoutOptions()
if docling_layout_model_spec is not None:
layout_options.model_spec = docling_layout_model_spec
if docling_layout_create_orphan_clusters is not None:
layout_options.create_orphan_clusters = (
docling_layout_create_orphan_clusters
)
if docling_layout_keep_empty_clusters is not None:
layout_options.keep_empty_clusters = docling_layout_keep_empty_clusters
pipeline_options.layout_options = layout_options

if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
@@ -1038,6 +1111,24 @@ def create_eval(
help="Directory for local model artifacts. Will only be passed to providers supporting this."
),
] = None,
docling_layout_model_spec: Annotated[
Optional[str],
typer.Option(
help="Layout model spec for Docling. Supported values: {}".format(
DoclingLayoutOptionsManager.get_layout_model_config_names()
)
),
] = "docling_layout_heron",
docling_layout_create_orphan_clusters: Annotated[
Optional[bool],
typer.Option(
help="Enable orphan clusters creation in Docling layout post-processing"
),
] = True,
docling_layout_keep_empty_clusters: Annotated[
Optional[bool],
typer.Option(help="Keep the empty clusters in Docling layout post-processing"),
] = False,
do_visualization: Annotated[
bool, typer.Option(help="visualize the predictions")
] = True,
@@ -1070,6 +1161,14 @@ def create_eval(
)

# Create the appropriate prediction provider
docling_layout_model_spec_obj = (
DoclingLayoutOptionsManager.get_layout_model_config(
docling_layout_model_spec
)
if docling_layout_model_spec
else None
)

provider = get_prediction_provider(
provider_type=prediction_provider,
file_source_path=file_source_path,
@@ -1080,6 +1179,9 @@ def create_eval(
do_visualization=do_visualization,
image_scale_factor=image_scale_factor,
do_table_structure=do_table_structure,
docling_layout_model_spec=docling_layout_model_spec_obj,
docling_layout_create_orphan_clusters=docling_layout_create_orphan_clusters,
docling_layout_keep_empty_clusters=docling_layout_keep_empty_clusters,
)

# Get the dataset name from the benchmark
@@ -1173,13 +1275,32 @@ def create(
@app.command(name="evaluate")
def evaluate_cmd(
modality: Annotated[EvaluationModality, typer.Option(help="Evaluation modality")],
benchmark: Annotated[BenchMarkNames, typer.Option(help="Benchmark name")],
output_dir: Annotated[Path, typer.Option(help="Base output directory")],
benchmark: Annotated[
BenchMarkNames,
typer.Option(
help="Benchmark name. It is used only to set the filename of the evaluation json file."
),
],
input_dir: Annotated[
Optional[Path],
typer.Option(
help="Directory with evaluation dataset. If not provided, the input directory will be derived from the output directory."
),
] = None,
output_dir: Annotated[
Optional[Path],
typer.Option(
help="Base output directory. If not provided, the output directory will be derived from the input directory."
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
):
"""Evaluate predictions against ground truth."""
# Derive input and output paths based on the directory structure in test_dataset_builder.py
input_dir = output_dir / "eval_dataset"
input_dir, output_dir = derive_input_output_dirs(
benchmark, modality, input_dir, output_dir
)
assert input_dir is not None
assert output_dir is not None
eval_output_dir = output_dir / "evaluations" / modality.value

# Create output directory
Expand All @@ -1201,16 +1322,30 @@ def visualize_cmd(
EvaluationModality, typer.Option(help="Visualization modality")
],
benchmark: Annotated[BenchMarkNames, typer.Option(help="Benchmark name")],
output_dir: Annotated[Path, typer.Option(help="Base output directory")],
input_dir: Annotated[
Optional[Path],
typer.Option(
help="Directory with evaluation dataset. If not provided, the input directory will be derived from the output directory."
),
] = None,
output_dir: Annotated[
Optional[Path],
typer.Option(
help="Base output directory. If not provided, the output directory will be derived from the input directory."
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0,
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
):
"""Visualize evaluation results."""
# Derive input and output paths based on the directory structure in test_dataset_builder.py
input_dir = output_dir / "eval_dataset"
input_dir, output_dir = derive_input_output_dirs(
benchmark, modality, input_dir, output_dir
)
assert input_dir is not None
assert output_dir is not None
eval_output_dir = output_dir / "evaluations" / modality.value

# Create output directory
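Finally, a hedged end-to-end sketch of the relaxed evaluate command: after the refactor above, only one of --input-dir / --output-dir has to be given and the other is derived. The example drives the typer app through its test runner; the modality and benchmark strings are assumed enum values, and the option spellings rely on typer's default underscore-to-dash conversion.

from typer.testing import CliRunner

from docling_eval.cli.main import app

runner = CliRunner()
# Only the input dir is passed; evaluate derives the output dir from it via
# derive_input_output_dirs() and writes to <output_dir>/evaluations/<modality>.
result = runner.invoke(
    app,
    [
        "evaluate",
        "--modality", "layout",      # assumed EvaluationModality value
        "--benchmark", "DPBench",    # assumed BenchMarkNames value
        "--input-dir", "results/eval_dataset/DPBench/layout",
    ],
)
print(result.exit_code)

The same runner could also exercise the new create-eval layout flags (--docling-layout-model-spec, --docling-layout-create-orphan-clusters, --docling-layout-keep-empty-clusters), again assuming typer's default flag naming for the parameters declared above.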
5 changes: 5 additions & 0 deletions docling_eval/utils/coco_exporter.py
@@ -308,6 +308,11 @@ def main(args):
log_format = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)

_log.info("Export eval-dataset in COCO-tools format")
_log.info("COCO dataset: %s", str(coco_path))
_log.info("eval-dataset: %s", str(docling_eval_path))
_log.info("Save path: %s", str(save_path))

# Create the COCO exporter
exporter = DoclingEvalCOCOExporter(docling_eval_path)
exporter.export_predictions_wrt_original_COCO(