From 63317231a1a7dea3ce84325295dfd83057026b75 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:34:56 +0100 Subject: [PATCH 01/22] chore: Move the teds.py inside the subdir evaluators/table Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/{ => table}/teds.py | 0 docling_eval/evaluators/table_evaluator.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename docling_eval/evaluators/{ => table}/teds.py (100%) diff --git a/docling_eval/evaluators/teds.py b/docling_eval/evaluators/table/teds.py similarity index 100% rename from docling_eval/evaluators/teds.py rename to docling_eval/evaluators/table/teds.py diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index bd28e84e..283eea68 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -22,7 +22,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.evaluators.teds import TEDScorer +from docling_eval.evaluators.table.teds import TEDScorer _log = logging.getLogger(__name__) From 85890fb7a8c4633390c0a6778f3c22d5d7cf2b18 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:37:31 +0100 Subject: [PATCH 02/22] feat: Introduce the external_predictions_path in BaseEvaluator and dummy entries in all evaluators. 
Extend the CLI to support the --external-predictions-path Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 20 ++++++++++++++++++- docling_eval/evaluators/base_evaluator.py | 1 + .../evaluators/bbox_text_evaluator.py | 5 ++++- .../evaluators/doc_structure_evaluator.py | 1 + docling_eval/evaluators/keyvalue_evaluator.py | 7 ++++++- docling_eval/evaluators/layout_evaluator.py | 1 + .../evaluators/markdown_text_evaluator.py | 1 + docling_eval/evaluators/ocr_evaluator.py | 1 + .../evaluators/pixel_layout_evaluator.py | 1 + .../evaluators/readingorder_evaluator.py | 1 + docling_eval/evaluators/table_evaluator.py | 1 + docling_eval/evaluators/timings_evaluator.py | 1 + 12 files changed, 38 insertions(+), 3 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index ade289da..08652a5b 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -631,6 +631,7 @@ def evaluate( odir: Path, split: str = "test", cvat_overview_path: Optional[Path] = None, + external_predictions_path: Optional[Path] = None, ) -> Optional[DatasetEvaluationType]: """Evaluate predictions against ground truth.""" if not os.path.exists(idir): @@ -665,6 +666,7 @@ def evaluate( evaluation = layout_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -673,7 +675,9 @@ def evaluate( # Evaluate with the pixel-wise layout evaluation pixel_layout_evaluator = PixelLayoutEvaluator() pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator( - idir, split=split + idir, + split=split, + external_predictions_path=external_predictions_path, ) pixel_save_root: Path = save_fn.parent pixel_layout_evaluator.save_evaluations( @@ -687,6 +691,7 @@ def evaluate( evaluation = table_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -699,6 +704,7 @@ def evaluate( evaluation = 
doc_struct_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -719,6 +725,7 @@ def evaluate( evaluation = ocr_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -729,6 +736,7 @@ def evaluate( evaluation = readingorder_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -745,6 +753,7 @@ def evaluate( evaluation = md_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -761,6 +770,7 @@ def evaluate( evaluation = bbox_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: json.dump( @@ -776,6 +786,7 @@ def evaluate( evaluation = keyvalue_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: json.dump( @@ -1479,6 +1490,12 @@ def evaluate_cmd( ), ] = None, split: Annotated[str, typer.Option(help="Dataset split")] = "test", + external_predictions_path: Annotated[ + Optional[Path], + typer.Option( + help="Path to load existing DoclingDocument predictions. 
The filename must follow the pattern [doc_id].[json|dt|yaml|yml]", + ), + ] = None, ): """Evaluate predictions against ground truth.""" input_dir, output_dir = derive_input_output_dirs( @@ -1498,6 +1515,7 @@ def evaluate_cmd( idir=input_dir, odir=eval_output_dir, split=split, + external_predictions_path=external_predictions_path, ) diff --git a/docling_eval/evaluators/base_evaluator.py b/docling_eval/evaluators/base_evaluator.py index 940f6bc5..4198f084 100644 --- a/docling_eval/evaluators/base_evaluator.py +++ b/docling_eval/evaluators/base_evaluator.py @@ -100,6 +100,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetEvaluationType: r""" Perform the evaluation diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index 09301693..156da241 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -94,7 +94,10 @@ def __init_( nltk.download("popular", quiet=True) def __call__( - self, ds_path: Path, split: str = "test" + self, + ds_path: Path, + split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetBoxesTextEvaluation: parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) diff --git a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index c572a6f3..c3ce7623 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -71,6 +71,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetDocStructureEvaluation: r""" Parameters diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 2ac58666..06c45911 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py 
+++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -415,7 +415,12 @@ def __init__( # --------------------------------------------------------------------- # # Public API # --------------------------------------------------------------------- # - def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluation: + def __call__( + self, + ds_path: Path, + split: str = "test", + external_predictions_path: Optional[Path] = None, + ) -> DatasetKeyValueEvaluation: split_glob = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: split_glob}) _log.info("Loaded split '%s' – %d samples", split, len(ds[split])) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 8b737738..5663dddf 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -190,6 +190,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetLayoutEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index 18a107ff..a68c1884 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -108,6 +108,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetMarkdownEvaluation: r""" Parameters diff --git a/docling_eval/evaluators/ocr_evaluator.py b/docling_eval/evaluators/ocr_evaluator.py index b04c01cc..10b136cd 100644 --- a/docling_eval/evaluators/ocr_evaluator.py +++ b/docling_eval/evaluators/ocr_evaluator.py @@ -62,6 +62,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> OcrDatasetEvaluationResult: dataset_path = ds_path data_split_name = split diff --git 
a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index cddbb11b..ecd7f569 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -171,6 +171,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetPixelLayoutEvaluation: _log.info("Loading the split '%s' from: '%s'", split, ds_path) diff --git a/docling_eval/evaluators/readingorder_evaluator.py b/docling_eval/evaluators/readingorder_evaluator.py index 0ff6037c..00e211eb 100644 --- a/docling_eval/evaluators/readingorder_evaluator.py +++ b/docling_eval/evaluators/readingorder_evaluator.py @@ -80,6 +80,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetReadingOrderEvaluation: parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 283eea68..4355903d 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -132,6 +132,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetTableEvaluation: r""" Load a dataset in HF format. 
Expected columns with DoclingDocuments diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index 9e8896b0..a0d2fa08 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -50,6 +50,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetTimingsEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) From 5f9a279d4940079d1688d60a749a7230d27fedf2 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:39:56 +0100 Subject: [PATCH 03/22] feat: Extend test_dataset_builder.py to save document predictions in various formats Signed-off-by: Nikos Livathinos --- tests/test_dataset_builder.py | 48 +++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 75029ecf..1222cd31 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -2,8 +2,11 @@ from pathlib import Path import pytest +from datasets import load_dataset +from docling_core.types.doc.document import DoclingDocument from docling_eval.cli.main import evaluate, get_prediction_provider, visualize +from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction from docling_eval.datamodels.types import ( BenchMarkNames, EvaluationModality, @@ -34,6 +37,42 @@ IS_CI = bool(os.getenv("CI")) +def export_predictions( + ds_path: Path, + save_path: Path, + split: str = "test", +): + r"""Export the predicted document in the save path in various formats""" + parquet_files = str(ds_path / split / "*.parquet") + ds = load_dataset("parquet", data_files={split: parquet_files}) + + for data in ds[split]: + data_record = DatasetRecordWithPrediction.model_validate(data) + doc_id = data_record.doc_id + pred_doc: DoclingDocument = data_record.predicted_doc + + if pred_doc is None: + continue + + 
# Save as JSON + json_dir = save_path / "json" + json_dir.mkdir(parents=True, exist_ok=True) + json_fn = json_dir / f"{doc_id}.json" + pred_doc.save_as_json(json_fn) + + # Save as doctags (.doctags) + doctags_dir = save_path / "doctag" + doctags_dir.mkdir(parents=True, exist_ok=True) + doctags_fn = doctags_dir / f"{doc_id}.doctags" + pred_doc.save_as_doctags(doctags_fn) + + # Save as YAML + yaml_dir = save_path / "yaml" + yaml_dir.mkdir(parents=True, exist_ok=True) + yaml_fn = yaml_dir / f"{doc_id}.yaml" + pred_doc.save_as_yaml(yaml_fn) + + @pytest.mark.dependency() def test_run_dpbench_e2e(): target_path = Path(f"./scratch/{BenchMarkNames.DPBENCH.value}/") @@ -54,6 +93,11 @@ def test_run_dpbench_e2e(): target_dataset_dir=target_path / "eval_dataset_e2e", ) + # Export predictions + pred_path = target_path / "eval_dataset_e2e" + save_path = target_path / "predicted_documents" + export_predictions(pred_path, save_path) + ## Evaluate Layout evaluate( modality=EvaluationModality.LAYOUT, @@ -602,3 +646,7 @@ def test_file_dataset_builder(): ) dataset_builder.save_to_disk(do_visualization=True) + + +if __name__ == "__main__": + test_run_dpbench_e2e() From e6e84096d05a5bf885da806809a2f45fab9e4f58 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:41:16 +0100 Subject: [PATCH 04/22] feat: Extend MarkDownTextEvaluator to support external_predictions_path. 
Add unit test Signed-off-by: Nikos Livathinos --- .../evaluators/markdown_text_evaluator.py | 34 ++++++++++++++----- .../utils/external_docling_doc_loader.py | 28 +++++++++++++++ tests/test_markdown_text_evaluator.py | 15 ++++++-- 3 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 docling_eval/utils/external_docling_doc_loader.py diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index a68c1884..96222c68 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -26,6 +26,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -116,6 +117,11 @@ def __call__( ds_path: Path to load the parquet files of the dataset split: Split of the dataset to load """ + if external_predictions_path is not None: + external_docling_doc_loader = ExternalDoclingDocLoader( + external_predictions_path + ) + parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) _log.info(f"Overview of the dataset: {ds}") @@ -146,16 +152,28 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: - _log.error( - "Skipping record without successfull conversion status: %s", doc_id - ) - rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 - continue - true_doc = data_record.ground_truth_doc true_md = self._docling_document_to_md(true_doc) - pred_md = self._get_pred_md(data_record) + + # Get the predicted markdown from the external predictions path + if external_predictions_path is not None: + pred_doc = external_docling_doc_loader(doc_id) + if pred_doc is None: + _log.error("No external prediction found for 
doc_id=%s", doc_id) + rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 + continue + pred_md = self._docling_document_to_md(pred_doc) + else: + if data_record.status not in self._accepted_status: + _log.error( + "Skipping record without successfull conversion status: %s", + doc_id, + ) + rejected_samples[ + EvaluationRejectionType.INVALID_CONVERSION_STATUS + ] += 1 + continue + pred_md = self._get_pred_md(data_record) # type: ignore if not pred_md: _log.error("There is no markdown prediction for doc_id=%s", doc_id) diff --git a/docling_eval/utils/external_docling_doc_loader.py b/docling_eval/utils/external_docling_doc_loader.py new file mode 100644 index 00000000..c132b6ef --- /dev/null +++ b/docling_eval/utils/external_docling_doc_loader.py @@ -0,0 +1,28 @@ +from pathlib import Path +from typing import Optional + +from docling_core.types.doc.document import DoclingDocument + + +class ExternalDoclingDocLoader: + def __init__(self, external_predictions_dir: Path): + self._external_predictions_dir = external_predictions_dir + + def __call__(self, doc_id: str) -> Optional[DoclingDocument]: + r""" + Load the DoclingDocument from the external predictions path + """ + json_path = self._external_predictions_dir / f"{doc_id}.json" + dt_path = self._external_predictions_dir / f"{doc_id}.dt" + yaml_path = self._external_predictions_dir / f"{doc_id}.yaml" + yml_path = self._external_predictions_dir / f"{doc_id}.yml" + + if json_path.is_file(): + return DoclingDocument.load_from_json(json_path) + if dt_path.is_file(): + return DoclingDocument.load_from_doctags(dt_path) + if yaml_path.is_file(): + return DoclingDocument.load_from_yaml(yaml_path) + if yml_path.is_file(): + return DoclingDocument.load_from_yaml(yml_path) + return None diff --git a/tests/test_markdown_text_evaluator.py b/tests/test_markdown_text_evaluator.py index 6eba1acb..8fbd4073 100644 --- a/tests/test_markdown_text_evaluator.py +++ b/tests/test_markdown_text_evaluator.py @@ -34,5 +34,16 @@ def 
test_markdown_text_evaluator(): assert is_exception -# if __name__ == "__main__": -# test_markdown_text_evaluator() +def test_markdown_text_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = MarkdownTextEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + # test_markdown_text_evaluator() + test_markdown_text_evaluator_external_predictions() From 5624e6195da06bc4fc0a77a20b83544bde536973 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:12:43 +0100 Subject: [PATCH 05/22] feat: Extend LayoutEvaluator to support external_predictions_path. Add unit test. Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/layout_evaluator.py | 23 +++++++++++++++++---- tests/test_layout_evaluator.py | 17 ++++++++++++--- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 5663dddf..5f43fb03 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -30,6 +30,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader from docling_eval.utils.utils import tensor_to_float _log = logging.getLogger(__name__) @@ -194,6 +195,10 @@ def __call__( ) -> DatasetLayoutEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) @@ -209,7 +214,7 @@ def 
__call__( pred_labels, intersection_labels, union_labels, - ) = self._find_intersecting_labels(ds_selection) + ) = self._find_intersecting_labels(ds_selection, ext_docdoc_loader) true_labels_str = ", ".join(sorted(true_labels)) logging.info(f"True labels: {true_labels_str}") @@ -282,7 +287,9 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + + # Get the predicted document + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) if not pred_doc: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -585,12 +592,19 @@ def __call__( return dataset_layout_evaluation def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return pred_doc + for prediction_format in self._prediction_sources: if prediction_format == PredictionFormats.DOCLING_DOCUMENT: pred_doc = data_record.predicted_doc @@ -802,6 +816,7 @@ def _compute_average_iou_with_labels_across_iou( def _find_intersecting_labels( self, ds: Dataset, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel], list[DocItemLabel]]: r""" Compute counters per labels for the groundtruth, prediciton and their intersections @@ -821,7 +836,7 @@ def _find_intersecting_labels( ): data_record = DatasetRecordWithPrediction.model_validate(data) true_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) for item, level in true_doc.iterate_items( included_content_layers={ diff --git a/tests/test_layout_evaluator.py 
b/tests/test_layout_evaluator.py index d72f23d1..ef9539e2 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -54,6 +54,17 @@ def test_failed_conversions(): assert len(v1.evaluations_per_image) == 0 -# if __name__ == "__main__": -# # test_layout_evaluator() -# test_failed_conversions() +def test_layout_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = LayoutEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + # # test_layout_evaluator() + # test_failed_conversions() + test_layout_evaluator_external_predictions() From 171ad7455bbed78c723141d11ddc6d670ebcb582 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:36:23 +0100 Subject: [PATCH 06/22] fix: Add missing pytest dependencies in tests Signed-off-by: Nikos Livathinos --- tests/test_layout_evaluator.py | 4 ++++ tests/test_markdown_text_evaluator.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/test_layout_evaluator.py b/tests/test_layout_evaluator.py index ef9539e2..33c79a64 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -54,6 +54,10 @@ def test_failed_conversions(): assert len(v1.evaluations_per_image) == 0 +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) def test_layout_evaluator_external_predictions(): r"""Testing the evaluator with external predictions""" eval = LayoutEvaluator() diff --git a/tests/test_markdown_text_evaluator.py b/tests/test_markdown_text_evaluator.py index 8fbd4073..8d3eb203 100644 --- a/tests/test_markdown_text_evaluator.py +++ b/tests/test_markdown_text_evaluator.py @@ -34,6 +34,10 @@ def test_markdown_text_evaluator(): assert is_exception +@pytest.mark.dependency( + 
depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) def test_markdown_text_evaluator_external_predictions(): r"""Testing the evaluator with external predictions""" eval = MarkdownTextEvaluator() From 0f0cfb5f4b39945e183ba532f3b4ad94e6214e26 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:55:38 +0100 Subject: [PATCH 07/22] fix: Fix loading the external predictions in LayoutEvaluator Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/layout_evaluator.py | 6 +++++- tests/test_layout_evaluator.py | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 5f43fb03..ee439610 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -279,7 +279,11 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) diff --git a/tests/test_layout_evaluator.py b/tests/test_layout_evaluator.py index 33c79a64..2de0752a 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -5,7 +5,6 @@ from docling_eval.datamodels.types import PredictionFormats from docling_eval.evaluators.layout_evaluator import LayoutEvaluator -from docling_eval.evaluators.markdown_text_evaluator import MarkdownTextEvaluator @pytest.mark.dependency( From 8069571c2e09292ba055bd3826e9753e69cd211a Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:56:31 +0100 Subject: [PATCH 08/22] feat: Introduce external predictions in DocStructureEvaluator. Add unit test. 
Signed-off-by: Nikos Livathinos --- .../evaluators/doc_structure_evaluator.py | 15 ++++++++++-- tests/test_doc_structure_evaluator.py | 23 +++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 tests/test_doc_structure_evaluator.py diff --git a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index c3ce7623..42609e49 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -18,6 +18,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -79,6 +80,10 @@ def __call__( ds_path: Path to load the parquet files of the dataset split: Split of the dataset to load """ + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) _log.info(f"Overview of the dataset: {ds}") @@ -107,7 +112,10 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) @@ -115,7 +123,10 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = data_record.predicted_doc + if ext_docdoc_loader: + pred_doc = ext_docdoc_loader(doc_id) + else: + pred_doc = data_record.predicted_doc if pred_doc is None: _log.error("There is no prediction for doc_id=%s", doc_id) diff --git a/tests/test_doc_structure_evaluator.py b/tests/test_doc_structure_evaluator.py new 
file mode 100644 index 00000000..a1440d26 --- /dev/null +++ b/tests/test_doc_structure_evaluator.py @@ -0,0 +1,23 @@ +from pathlib import Path + +import pytest + +from docling_eval.evaluators.doc_structure_evaluator import DocStructureEvaluator + + +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_doc_structure_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = DocStructureEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_doc_structure_evaluator_external_predictions() From 8ba6b453cc5eb45d89e653c241ba0365fff791ff Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 18:07:31 +0100 Subject: [PATCH 09/22] feat: Extend the TableEvaluator to support external predictions. 
Add unit test Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/table_evaluator.py | 16 ++++++++++++++-- tests/test_table_evaluator.py | 18 ++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 4355903d..2872a10a 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -23,6 +23,7 @@ ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.evaluators.table.teds import TEDScorer +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -141,6 +142,10 @@ def __call__( """ logging.info("Loading the split '%s' from: '%s'", split, ds_path) + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) @@ -167,7 +172,7 @@ def __call__( data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id gt_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) if not pred_doc: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -310,12 +315,19 @@ def _evaluate_tables_in_documents( return table_evaluations def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return 
pred_doc + for prediction_format in self._prediction_sources: if prediction_format == PredictionFormats.DOCLING_DOCUMENT: pred_doc = data_record.predicted_doc diff --git a/tests/test_table_evaluator.py b/tests/test_table_evaluator.py index a2adce76..312ab9d7 100644 --- a/tests/test_table_evaluator.py +++ b/tests/test_table_evaluator.py @@ -320,5 +320,19 @@ def test_table_evaluator(): assert is_exception -# if __name__ == "__main__": -# test_table_evaluator() +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_table_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = TableEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_table_evaluator_external_predictions() From 949d6ccdce9a55824a254d7bf150c27f38d9ce0c Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 11:45:41 +0100 Subject: [PATCH 10/22] feat: Extend the KeyValueEvaluator to support external predictions. Add unit test. 
Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/keyvalue_evaluator.py | 18 +++++++++++++++--- tests/test_keyvalue_evaluator.py | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 06c45911..20899d33 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py +++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -21,6 +21,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -421,6 +422,11 @@ def __call__( split: str = "test", external_predictions_path: Optional[Path] = None, ) -> DatasetKeyValueEvaluation: + r""" """ + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + split_glob = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: split_glob}) _log.info("Loaded split '%s' – %d samples", split, len(ds[split])) @@ -466,13 +472,13 @@ def __call__( doc_id = record.doc_id # ----- sanity checks -------------------------------------------------- - if record.status not in self._accepted_status: + if ext_docdoc_loader is None and record.status not in self._accepted_status: rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 _log.error("Skipping %s – conversion failed", doc_id) continue gt_doc = record.ground_truth_doc - pred_doc = self._get_pred_doc(record) + pred_doc = self._get_pred_doc(record, ext_docdoc_loader) if pred_doc is None: rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 _log.error("Skipping %s – missing prediction", doc_id) @@ -640,10 +646,16 @@ def __call__( # Helpers # --------------------------------------------------------------------- # 
def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: """Fetch the prediction in the first available format declared by `prediction_sources`.""" pred_doc: Optional[DoclingDocument] = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return pred_doc for fmt in self._prediction_sources: if fmt == PredictionFormats.DOCLING_DOCUMENT: diff --git a/tests/test_keyvalue_evaluator.py b/tests/test_keyvalue_evaluator.py index 712a3ca4..d47fd3eb 100644 --- a/tests/test_keyvalue_evaluator.py +++ b/tests/test_keyvalue_evaluator.py @@ -52,3 +52,21 @@ def test_failed_conversions(): v1 = evaluator(test_dataset_dir) assert v1 is not None assert len(v1.evaluations) == 0 + + +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_keyvalue_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = KeyValueEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_keyvalue_evaluator_external_predictions() From 13badc5522edb87c9780a476d1c9ef5a9fac839d Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 11:53:14 +0100 Subject: [PATCH 11/22] feat: Extend the PixelLayoutEvaluator to support external predictions. 
Add unit test Signed-off-by: Nikos Livathinos --- .../evaluators/pixel_layout_evaluator.py | 21 ++++++++++++++++--- tests/test_pixel_layout_evaluator.py | 20 +++++++++++++++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 230831e7..688ec82c 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -39,6 +39,7 @@ PagePixelLayoutEvaluation, ) from docling_eval.evaluators.stats import compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) @@ -175,6 +176,10 @@ def __call__( ) -> DatasetPixelLayoutEvaluation: _log.info("Loading the split '%s' from: '%s'", split, ds_path) + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) @@ -229,7 +234,10 @@ def __call__( ) doc_id: str = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) @@ -237,7 +245,7 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) if not pred_doc: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -541,12 +549,19 @@ def _collect_items_by_page( return pages_to_objects def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: 
DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return pred_doc + for prediction_format in self._prediction_sources: if prediction_format == PredictionFormats.DOCLING_DOCUMENT: pred_doc = data_record.predicted_doc diff --git a/tests/test_pixel_layout_evaluator.py b/tests/test_pixel_layout_evaluator.py index 99d60f9d..9beaecb8 100644 --- a/tests/test_pixel_layout_evaluator.py +++ b/tests/test_pixel_layout_evaluator.py @@ -19,7 +19,7 @@ depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], scope="session", ) -def test_layout_evaluator(): +def test_pixel_layout_evaluator(): r""" """ test_dataset_dir = Path("scratch/DPBench/eval_dataset_e2e") @@ -87,5 +87,19 @@ def test_layout_evaluator(): ), "Wrong label mapping in _matrix_id_to_name" -# if __name__ == "__main__": -# test_layout_evaluator() +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_pixel_layout_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = PixelLayoutEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_pixel_layout_evaluator_external_predictions() From 8c2a0654c2b5982b6390bf2681f5b649c9d9df0e Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 14:24:46 +0100 Subject: [PATCH 12/22] feat: Extend the BboxTextEvaluator to support external predictions. 
Add unit test Signed-off-by: Nikos Livathinos --- .../evaluators/bbox_text_evaluator.py | 30 ++++++++++++++++--- tests/test_bboxtext_evaluator.py | 19 ++++++++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index 156da241..c129b01c 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -4,7 +4,7 @@ import nltk from datasets import load_dataset -from docling_core.types.doc.base import BoundingBox +from docling_core.types.doc.base import BoundingBox, CoordOrigin from docling_core.types.doc.document import DoclingDocument, TextItem from nltk import edit_distance, word_tokenize from nltk.metrics import f_measure, precision, recall @@ -25,6 +25,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -99,6 +100,11 @@ def __call__( split: str = "test", external_predictions_path: Optional[Path] = None, ) -> DatasetBoxesTextEvaluation: + r""" """ + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) _log.info(f"oveview of dataset: {ds}") @@ -128,7 +134,10 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) @@ -136,7 +145,12 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = 
data_record.predicted_doc + + # Load the pred_doc + if ext_docdoc_loader is not None: + pred_doc = ext_docdoc_loader(doc_id) + else: + pred_doc = data_record.predicted_doc if pred_doc is None: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -215,7 +229,15 @@ def _match_bboxes( continue assert len(doc_item.prov) == 1 prov = doc_item.prov[0] - bboxes[doc_key].append(prov.bbox) + + # Ensure bbox is in top-left origin + bbox = prov.bbox + if bbox.coord_origin != CoordOrigin.TOPLEFT: + page_no = prov.page_no + page_size = doc.pages[page_no].size + bbox = bbox.to_top_left_origin(page_size.height) + + bboxes[doc_key].append(bbox) texts[doc_key].append(doc_item.text) # Decide which document is the pivot diff --git a/tests/test_bboxtext_evaluator.py b/tests/test_bboxtext_evaluator.py index 7917c23c..cd9112d4 100644 --- a/tests/test_bboxtext_evaluator.py +++ b/tests/test_bboxtext_evaluator.py @@ -29,5 +29,20 @@ def test_bboxtext_evaluator(): assert is_exception -# if __name__ == "__main__": -# test_bboxtext_evaluator() +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_bboxtext_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = BboxTextEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + # test_bboxtext_evaluator() + test_bboxtext_evaluator_external_predictions() From 08391b36a9d05bf47daa8d3b5bcfd18bbf1db255 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 14:48:35 +0100 Subject: [PATCH 13/22] feat: Disable the OCREvaluator when using the external predictions Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 39 ++++++++++++++++++++++----------------- 1 file 
changed, 22 insertions(+), 17 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 08652a5b..7b233802 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -711,25 +711,30 @@ def evaluate( json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) elif modality == EvaluationModality.OCR: - if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]: - text_unit = TextCellUnit.LINE - else: - text_unit = TextCellUnit.WORD - - logging.info(f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})") - logging.info(f"Text unit set to {text_unit}") + if not external_predictions_path: + if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]: + text_unit = TextCellUnit.LINE + else: + text_unit = TextCellUnit.WORD + + logging.info( + f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})" + ) + logging.info(f"Text unit set to {text_unit}") - ocr_evaluator = OCREvaluator( - intermediate_evaluations_path=odir, text_unit=text_unit - ) - evaluation = ocr_evaluator( # type: ignore - idir, - split=split, - external_predictions_path=external_predictions_path, - ) + ocr_evaluator = OCREvaluator( + intermediate_evaluations_path=odir, text_unit=text_unit + ) + evaluation = ocr_evaluator( # type: ignore + idir, + split=split, + external_predictions_path=external_predictions_path, + ) - with open(save_fn, "w") as fd: - json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + with open(save_fn, "w") as fd: + json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + else: + logging.error("External predictions are not supported for OCR evaluations") elif modality == EvaluationModality.READING_ORDER: readingorder_evaluator = ReadingOrderEvaluator() From 595ba6c19857601796e2b486e25cf9e51846da03 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 15:23:05 +0100 Subject: [PATCH 14/22] fix: Fixing guard for external predictions in TimingsEvaluator, 
ReadingOrderEvaluator. Fix main Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 1 + docling_eval/evaluators/readingorder_evaluator.py | 5 ++++- docling_eval/evaluators/timings_evaluator.py | 5 ++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 7b233802..d65bd634 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -652,6 +652,7 @@ def evaluate( evaluation = timings_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: diff --git a/docling_eval/evaluators/readingorder_evaluator.py b/docling_eval/evaluators/readingorder_evaluator.py index 00e211eb..cb8f09fb 100644 --- a/docling_eval/evaluators/readingorder_evaluator.py +++ b/docling_eval/evaluators/readingorder_evaluator.py @@ -101,7 +101,10 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + external_predictions_path is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index a0d2fa08..4c0c018b 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -80,7 +80,10 @@ def __call__( data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + external_predictions_path is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) From 406b122f425a70a1b34959c6a5125b569de96ad6 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 17:28:12 +0100 Subject: [PATCH 15/22] fix: 
Export the doctag files with the correct file extension Signed-off-by: Nikos Livathinos --- tests/test_dataset_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 1222cd31..22a028d5 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -63,7 +63,7 @@ def export_predictions( # Save as doctags (.doctags) doctags_dir = save_path / "doctag" doctags_dir.mkdir(parents=True, exist_ok=True) - doctags_fn = doctags_dir / f"{doc_id}.doctags" + doctags_fn = doctags_dir / f"{doc_id}.dt" pred_doc.save_as_doctags(doctags_fn) # Save as YAML From ebe70b00dd14c855e77ebf28f84029541bcd2349 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 18:09:05 +0100 Subject: [PATCH 16/22] feat: Refactor the ExternalDoclingDocumentLoader to properly load a DoclingDocument from doctags and the GT image. - Introduce the staticmethod load_doctags() which covers all cases on page image loading. - Refactor the FilePredictionProvider to use the load_doctags() from ExternalDoclingDocumentLoader. - Refactor all evaluators to use the new ExternalDoclingDocumentLoader. 
Signed-off-by: Nikos Livathinos --- .../evaluators/bbox_text_evaluator.py | 8 +- .../evaluators/doc_structure_evaluator.py | 8 +- docling_eval/evaluators/keyvalue_evaluator.py | 11 +- docling_eval/evaluators/layout_evaluator.py | 13 +- .../evaluators/markdown_text_evaluator.py | 6 +- .../evaluators/pixel_layout_evaluator.py | 11 +- docling_eval/evaluators/table_evaluator.py | 11 +- .../prediction_providers/file_provider.py | 73 +++------ .../utils/external_docling_doc_loader.py | 139 +++++++++++++++--- tests/test_layout_evaluator.py | 11 +- 10 files changed, 179 insertions(+), 112 deletions(-) diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index c129b01c..a93f1461 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -25,7 +25,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -101,9 +101,9 @@ def __call__( external_predictions_path: Optional[Path] = None, ) -> DatasetBoxesTextEvaluation: r""" """ - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) @@ -148,7 +148,7 @@ def __call__( # Load the pred_doc if ext_docdoc_loader is not None: - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) else: pred_doc = data_record.predicted_doc if pred_doc is None: diff --git 
a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index 42609e49..a2f59c30 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -18,7 +18,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -80,9 +80,9 @@ def __call__( ds_path: Path to load the parquet files of the dataset split: Split of the dataset to load """ - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) @@ -124,7 +124,7 @@ def __call__( true_doc = data_record.ground_truth_doc if ext_docdoc_loader: - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) else: pred_doc = data_record.predicted_doc diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 20899d33..9a5d3c72 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py +++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -21,7 +21,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -423,9 +423,9 @@ def __call__( 
external_predictions_path: Optional[Path] = None, ) -> DatasetKeyValueEvaluation: r""" """ - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) split_glob = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: split_glob}) @@ -648,13 +648,12 @@ def __call__( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: """Fetch the prediction in the first available format declared by `prediction_sources`.""" pred_doc: Optional[DoclingDocument] = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for fmt in self._prediction_sources: diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index ee439610..a906d707 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -30,7 +30,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader from docling_eval.utils.utils import tensor_to_float _log = logging.getLogger(__name__) @@ -195,9 +195,9 @@ def __call__( ) -> DatasetLayoutEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: 
Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) # Load the dataset split_path = str(ds_path / split / "*.parquet") @@ -598,15 +598,14 @@ def __call__( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for prediction_format in self._prediction_sources: @@ -820,7 +819,7 @@ def _compute_average_iou_with_labels_across_iou( def _find_intersecting_labels( self, ds: Dataset, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel], list[DocItemLabel]]: r""" Compute counters per labels for the groundtruth, prediciton and their intersections diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index 96222c68..aa112d64 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -26,7 +26,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -118,7 +118,7 @@ def __call__( split: Split of the dataset to load """ if external_predictions_path is not None: - 
external_docling_doc_loader = ExternalDoclingDocLoader( + external_docling_doc_loader = ExternalDoclingDocumentLoader( external_predictions_path ) @@ -157,7 +157,7 @@ def __call__( # Get the predicted markdown from the external predictions path if external_predictions_path is not None: - pred_doc = external_docling_doc_loader(doc_id) + pred_doc = external_docling_doc_loader(data_record) if pred_doc is None: _log.error("No external prediction found for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 688ec82c..b3df7396 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -39,7 +39,7 @@ PagePixelLayoutEvaluation, ) from docling_eval.evaluators.stats import compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) @@ -176,9 +176,9 @@ def __call__( ) -> DatasetPixelLayoutEvaluation: _log.info("Loading the split '%s' from: '%s'", split, ds_path) - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) # Load the dataset split_path = str(ds_path / split / "*.parquet") @@ -551,15 +551,14 @@ def _collect_items_by_page( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted 
DoclingDocument """ pred_doc = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for prediction_format in self._prediction_sources: diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 2872a10a..05fec46b 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -23,7 +23,7 @@ ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.evaluators.table.teds import TEDScorer -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -142,9 +142,9 @@ def __call__( """ logging.info("Loading the split '%s' from: '%s'", split, ds_path) - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) # Load the dataset split_path = str(ds_path / split / "*.parquet") @@ -317,15 +317,14 @@ def _evaluate_tables_in_documents( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for prediction_format in self._prediction_sources: diff --git a/docling_eval/prediction_providers/file_provider.py 
b/docling_eval/prediction_providers/file_provider.py index 97244c99..17b9bad3 100644 --- a/docling_eval/prediction_providers/file_provider.py +++ b/docling_eval/prediction_providers/file_provider.py @@ -4,11 +4,7 @@ from docling.datamodel.base_models import ConversionStatus from docling_core.types.doc import DocItemLabel -from docling_core.types.doc.document import ( - DoclingDocument, - DocTagsDocument, - DocTagsPage, -) +from docling_core.types.doc.document import DoclingDocument from PIL import Image from docling_eval.datamodels.dataset_record import ( @@ -23,6 +19,7 @@ from docling_eval.prediction_providers.base_prediction_provider import ( BasePredictionProvider, ) +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -164,58 +161,22 @@ def prediction_format(self) -> PredictionFormats: return self._prediction_format def _load_doctags_doc(self, record: DatasetRecord) -> Optional[DoclingDocument]: + r""" + Load the DoclingDocument from doctags + image. + Check ExternalDoclingDocLoader for details on the loading alrogithm. """ - Load doctags file into DoclingDocument. - - Args: - record: Groundtruth dataset record - - Returns: - DoclingDocument or None if file not found - """ - # Read the doctags file - doctags_fn = self._prediction_source_path / f"{record.doc_id}.dt" - if self._ignore_missing_files and not doctags_fn.is_file(): - return None - - try: - with open(doctags_fn, "r") as fd: - doctags = fd.read() - - page_image: Optional[Image.Image] = None - - # Try to get an image file for the predictions: - # 1. Check the pred_images_path. - # 2. Use the GT page image if the corresponding flag is set. - # 3. Look inside the same dir as the doctag files. 
- if self._prediction_images_path: - page_image_fn = self._prediction_images_path / f"{record.doc_id}.png" - if page_image_fn.is_file(): - page_image = Image.open(page_image_fn) - else: - _log.warning("Failed to load pred image: %s", page_image_fn) - elif self._use_ground_truth_page_images: - page_image = record.ground_truth_page_images[0] - else: - page_image_fn = self._prediction_source_path / f"{record.doc_id}.png" - if page_image_fn.is_file(): - page_image = Image.open(page_image_fn) - else: - _log.warning("Failed to load pred image: %s", page_image_fn) - - # Build DoclingDocument - doctags_page = DocTagsPage(tokens=doctags, image=page_image) - doctags_doc = DocTagsDocument(pages=[doctags_page]) - doc = DoclingDocument.load_from_doctags( - doctags_doc, document_name=record.doc_id - ) - - return doc - except Exception as e: - _log.error(f"Error loading doctags document {record.doc_id}: {str(e)}") - if not self._ignore_missing_files: - raise - return None + doc_id = record.doc_id + gt_page_images = record.ground_truth_page_images + gt_page_image = gt_page_images[0] if len(gt_page_images) > 0 else None + doc = ExternalDoclingDocumentLoader.load_doctags( + doc_id, + self._prediction_source_path, + page_images_root=self._prediction_images_path, + gt_page_image=gt_page_image, + ) + if not self._ignore_missing_files: + raise ValueError(f"Missing missing document {doc_id}") + return doc def _load_json_doc(self, record: DatasetRecord) -> Optional[DoclingDocument]: """ diff --git a/docling_eval/utils/external_docling_doc_loader.py b/docling_eval/utils/external_docling_doc_loader.py index c132b6ef..c3efd495 100644 --- a/docling_eval/utils/external_docling_doc_loader.py +++ b/docling_eval/utils/external_docling_doc_loader.py @@ -1,28 +1,133 @@ +import logging from pathlib import Path from typing import Optional -from docling_core.types.doc.document import DoclingDocument +from docling_core.types.doc.document import ( + DoclingDocument, + DocTagsDocument, + DocTagsPage, +) 
+from PIL import Image +from docling_eval.datamodels.dataset_record import DatasetRecord -class ExternalDoclingDocLoader: - def __init__(self, external_predictions_dir: Path): +_log = logging.getLogger(__name__) + + +class ExternalDoclingDocumentLoader: + r""" """ + + def __init__( + self, + external_predictions_dir: Path, + ): + r""" """ self._external_predictions_dir = external_predictions_dir - def __call__(self, doc_id: str) -> Optional[DoclingDocument]: + def __call__(self, record: DatasetRecord) -> Optional[DoclingDocument]: r""" Load the DoclingDocument from the external predictions path + + The following fields are used from the `record` parameter: + - record.doc_id + - record.ground_truth_page_images[0] """ - json_path = self._external_predictions_dir / f"{doc_id}.json" - dt_path = self._external_predictions_dir / f"{doc_id}.dt" - yaml_path = self._external_predictions_dir / f"{doc_id}.yaml" - yml_path = self._external_predictions_dir / f"{doc_id}.yml" - - if json_path.is_file(): - return DoclingDocument.load_from_json(json_path) - if dt_path.is_file(): - return DoclingDocument.load_from_doctags(dt_path) - if yaml_path.is_file(): - return DoclingDocument.load_from_yaml(yaml_path) - if yml_path.is_file(): - return DoclingDocument.load_from_yaml(yml_path) + doc_id = record.doc_id + + json_fn = self._external_predictions_dir / f"{doc_id}.json" + doctags_fn = ExternalDoclingDocumentLoader.build_doctags_path( + self._external_predictions_dir, doc_id + ) + yaml_fn = self._external_predictions_dir / f"{doc_id}.yaml" + yml_fn = self._external_predictions_dir / f"{doc_id}.yml" + + if json_fn.is_file(): + return DoclingDocument.load_from_json(json_fn) + if doctags_fn.is_file(): + gt_page_images = record.ground_truth_page_images + gt_page_image = gt_page_images[0] if len(gt_page_images) > 0 else None + + return ExternalDoclingDocumentLoader.load_doctags( + doc_id, + self._external_predictions_dir, + gt_page_image=gt_page_image, + ) + if yaml_fn.is_file(): + return 
DoclingDocument.load_from_yaml(yaml_fn)
+        if yml_fn.is_file():
+            return DoclingDocument.load_from_yaml(yml_fn)
 
         return None
+
+    @staticmethod
+    def build_doctags_path(doctags_root: Path, doc_id: str) -> Path:
+        r"""Get the full path of the doctags file"""
+        dt_path = doctags_root / f"{doc_id}.dt"
+        return dt_path
+
+    @staticmethod
+    def load_doctags(
+        doc_id: str,
+        doctags_root: Path,
+        page_images_root: Optional[Path] = None,
+        gt_page_image: Optional[Image.Image] = None,
+        image_filename_extension: str = "png",
+    ) -> Optional[DoclingDocument]:
+        r"""
+        Load a single page DoclingDocument object from a doctags file and a page image.
+
+        The page image is supplied from these sources in the specific order:
+        1. The page_images_root: An image with filename <doc_id>.<image_filename_extension> is used
+        2. gt_page_image: An explicit Image object is used.
+        3. Search for the image with filename <doc_id>.<image_filename_extension> in the doctags root
+
+        Parameters
+        ----------
+        doctags_root: Root path to load doctags as files with name <doc_id>.dt
+        doc_id: The document id of the file to be loaded
+        page_images_root: If provided, search for the page images here first.
+        gt_page_image: If provided, use that object for the page image.
+        image_filename_extension: The file extension for the page image.
+ + Returns + ------- + DoclingDocument object or None if the document cannot be reconstructed + """ + # Read the doctags file + doctags_fn = ExternalDoclingDocumentLoader.build_doctags_path( + doctags_root, doc_id + ) + + try: + with open(doctags_fn, "r") as fd: + doctags = fd.read() + + page_image: Optional[Image.Image] = None + + if page_images_root: + page_image_fn = ( + page_images_root / f"{doc_id}.{image_filename_extension}" + ) + if page_image_fn.is_file(): + page_image = Image.open(page_image_fn) + else: + _log.warning("Failed to load page image: %s", page_image_fn) + elif gt_page_image is not None: + page_image = gt_page_image + else: + page_image_fn = doctags_root / f"{doc_id}.{image_filename_extension}" + if page_image_fn.is_file(): + page_image = Image.open(page_image_fn) + else: + _log.warning( + "Missing page image file: %s. Reconstruct doctags without page image", + page_image_fn, + ) + + # Build DoclingDocument + doctags_page = DocTagsPage(tokens=doctags, image=page_image) + doctags_doc = DocTagsDocument(pages=[doctags_page]) + doc = DoclingDocument.load_from_doctags(doctags_doc, document_name=doc_id) + return doc + except Exception as e: + _log.error(f"Error loading doctags document {doc_id}: {str(e)}") + return None diff --git a/tests/test_layout_evaluator.py b/tests/test_layout_evaluator.py index 2de0752a..eafaef87 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -61,10 +61,15 @@ def test_layout_evaluator_external_predictions(): r"""Testing the evaluator with external predictions""" eval = LayoutEvaluator() gt_path = Path("scratch/DPBench/gt_dataset") - preds_path = Path("scratch/DPBench/predicted_documents/json") - v = eval(gt_path, external_predictions_path=preds_path) - assert v is not None + preds_path = [ + Path("scratch/DPBench/predicted_documents/json"), + Path("scratch/DPBench/predicted_documents/doctag"), + Path("scratch/DPBench/predicted_documents/yaml"), + ] + for pred_path in preds_path: + v = 
eval(gt_path, external_predictions_path=pred_path) + assert v is not None if __name__ == "__main__": From 33511c922cad0bacb9ccfee0c57ae528b5127f54 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 18:14:04 +0100 Subject: [PATCH 17/22] chore: Rename code file as external_docling_document_loader.py Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/bbox_text_evaluator.py | 4 +++- docling_eval/evaluators/doc_structure_evaluator.py | 4 +++- docling_eval/evaluators/keyvalue_evaluator.py | 4 +++- docling_eval/evaluators/layout_evaluator.py | 4 +++- docling_eval/evaluators/markdown_text_evaluator.py | 4 +++- docling_eval/evaluators/pixel_layout_evaluator.py | 4 +++- docling_eval/evaluators/table_evaluator.py | 4 +++- docling_eval/prediction_providers/file_provider.py | 4 +++- ...ling_doc_loader.py => external_docling_document_loader.py} | 0 9 files changed, 24 insertions(+), 8 deletions(-) rename docling_eval/utils/{external_docling_doc_loader.py => external_docling_document_loader.py} (100%) diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index a93f1461..42109dee 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -25,7 +25,9 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index a2f59c30..c0842015 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -18,7 +18,9 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from 
docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 9a5d3c72..baa78697 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py +++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -21,7 +21,9 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index a906d707..0d11394d 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -30,7 +30,9 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) from docling_eval.utils.utils import tensor_to_float _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index aa112d64..afbc68ee 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -26,7 +26,9 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + 
ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index b3df7396..1913f1b7 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -39,7 +39,9 @@ PagePixelLayoutEvaluation, ) from docling_eval.evaluators.stats import compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 05fec46b..fb2f6aac 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -23,7 +23,9 @@ ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.evaluators.table.teds import TEDScorer -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/prediction_providers/file_provider.py b/docling_eval/prediction_providers/file_provider.py index 17b9bad3..5c93e7b9 100644 --- a/docling_eval/prediction_providers/file_provider.py +++ b/docling_eval/prediction_providers/file_provider.py @@ -19,7 +19,9 @@ from docling_eval.prediction_providers.base_prediction_provider import ( BasePredictionProvider, ) -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/utils/external_docling_doc_loader.py 
b/docling_eval/utils/external_docling_document_loader.py similarity index 100% rename from docling_eval/utils/external_docling_doc_loader.py rename to docling_eval/utils/external_docling_document_loader.py From 94b39385cbd78f5b25d8548307bf6a0968764006 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Mon, 8 Dec 2025 13:31:28 +0100 Subject: [PATCH 18/22] fix: Fix typo Signed-off-by: Nikos Livathinos --- docs/examples/matrix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/matrix.py b/docs/examples/matrix.py index a6e04cb2..6d7625c0 100644 --- a/docs/examples/matrix.py +++ b/docs/examples/matrix.py @@ -85,7 +85,7 @@ def main(args): if __name__ == "__main__": - desription = """ + description = """ Running multi-evaluation and consolidation inside a working directory and generate matrix reports The working directory must have the structure: @@ -108,7 +108,7 @@ def main(args): └── evaluation__.json """ parser = argparse.ArgumentParser( - description=desription, formatter_class=argparse.RawTextHelpFormatter + description=description, formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument( "-t", From ae10646fdd9ed91638bcaa9b18b99e4d71c5090e Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Mon, 8 Dec 2025 13:54:25 +0100 Subject: [PATCH 19/22] feat: Introduce examples how to evaluate using external predictions using the API and the CLI. 
Signed-off-by: Nikos Livathinos --- ...valuate_dpbench_on_external_predictions.sh | 72 ++++++++++++++++ .../examples/evaluate_external_predictions.py | 85 +++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100755 docs/examples/evaluate_dpbench_on_external_predictions.sh create mode 100644 docs/examples/evaluate_external_predictions.py diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh new file mode 100755 index 00000000..413b12de --- /dev/null +++ b/docs/examples/evaluate_dpbench_on_external_predictions.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +########################################################################################### +# Invariants +# + +readonly GT_DIR=scratch/DPBench/gt_dataset + +readonly MODALITIES=( +layout +table_structure +document_structure +reading_order +markdown_text +bboxes_text +key_value +timings +) + + +########################################################################################### +# Functions +# + +evaluate() { + local pred_dir save_dir modality + pred_dir="$1" + save_dir="$2" + + # Check if the GT/preds dirs exist + if [ ! -d "${GT_DIR}" ]; then + echo "Missing GT dir: ${GT_DIR}" + exit 1 + fi + if [ ! 
-d "${pred_dir}" ]; then + echo "Missing predictions dir: ${pred_dir}" + exit 2 + fi + + for modality in "${MODALITIES[@]}"; do + echo "Evaluation modality: ${modality}, predictions: ${pred_dir}" + uv run docling-eval evaluate \ + --benchmark DPBench \ + --modality "${modality}" \ + --input-dir "${GT_DIR}" \ + --external-predictions-path "${pred_dir}" \ + --output-dir "${save_dir}" + done +} + + +########################################################################################### +# Main +# + +# json predictions +evaluate \ + scratch/DPBench/predicted_documents/json \ + scratch/DPBench/external_evaluations_jsons + + +# doctags predictions +evaluate \ + scratch/DPBench/predicted_documents/doctag \ + scratch/DPBench/external_evaluations_doctags + + +# yaml predictions +evaluate \ + scratch/DPBench/predicted_documents/yaml \ + scratch/DPBench/external_evaluations_yaml + diff --git a/docs/examples/evaluate_external_predictions.py b/docs/examples/evaluate_external_predictions.py new file mode 100644 index 00000000..9e7f9dd9 --- /dev/null +++ b/docs/examples/evaluate_external_predictions.py @@ -0,0 +1,85 @@ +import argparse +import logging +from pathlib import Path + +from docling_eval.cli.main import evaluate +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality + +_log = logging.getLogger(__name__) + + +def evaluate_external_predictions( + benchmark: BenchMarkNames, + modality: EvaluationModality, + gt_path: Path, + predictions_dir: Path, + save_dir: Path, +): + r""" """ + evaluate( + modality, + benchmark, + gt_path, + save_dir, + external_predictions_path=predictions_dir, + ) + + +def main(): + r""" """ + parser = argparse.ArgumentParser( + description="Example how to use GT from parquet and predictions from externally provided prediction files", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "-b", + "--benchmark", + required=True, + type=BenchMarkNames, + help="Evaluation modality", + ) + 
parser.add_argument( + "-m", + "--modality", + required=True, + type=EvaluationModality, + help="Evaluation modality", + ) + parser.add_argument( + "-g", + "--gt_parquet_dir", + required=True, + type=Path, + help="Path to the parquet GT dataset", + ) + parser.add_argument( + "-p", + "--predictions_dir", + required=True, + type=Path, + help="Dir with the external prediction files (json, dt, yaml)", + ) + parser.add_argument( + "-s", + "--save_dir", + required=False, + type=Path, + help="Path to save the produced evaluation files", + ) + args = parser.parse_args() + + # Configure logger + log_format = "%(asctime)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + evaluate_external_predictions( + args.benchmark, + args.modality, + args.gt_parquet_dir, + args.predictions_dir, + args.save_dir, + ) + + +if __name__ == "__main__": + main() From 8c52e36b48defb2c1372af583940d89fec0a423f Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 8 Dec 2025 15:16:19 +0100 Subject: [PATCH 20/22] feat: Prediction vizualizer Signed-off-by: Christoph Auer --- docling_eval/cli/main.py | 66 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index d65bd634..2de5afe9 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -126,6 +126,7 @@ from docling_eval.prediction_providers.tableformer_provider import ( TableFormerPredictionProvider, ) +from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer class DoclingLayoutOptionsManager: @@ -362,7 +363,7 @@ def get_prediction_provider( docling_layout_keep_empty_clusters: Optional[bool] = None, # Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING) docling_programmatic_add_orphan_text_cells: Optional[bool] = None, - docling_force_full_page_ocr: Optional[bool] = None, + docling_force_full_page_ocr: bool = False, 
granite_docling_vlm_options: Optional[InlineVlmOptions] = None, max_new_tokens: Optional[int] = None, ): @@ -376,7 +377,7 @@ def get_prediction_provider( ocr_factory = get_ocr_factory() ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore - kind="easyocr", + kind="rapidocr", force_full_page_ocr=docling_force_full_page_ocr, ) # Use all CPU cores @@ -1570,6 +1571,67 @@ def visualize_cmd( ) +@app.command(name="create_viz") +def create_viz( + dataset_dir: Annotated[ + Path, + typer.Option( + help=( + "Dataset directory (GT parquet or eval_dataset parquet with predictions) " + "containing the split folder with parquet shards." + ) + ), + ], + split: Annotated[str, typer.Option(help="Dataset split to visualize")] = "test", + external_predictions_path: Annotated[ + Optional[Path], + typer.Option( + help=( + "Directory with DoclingDocument predictions named as .[json|dt|yaml|yml]. " + "If omitted, predictions are taken from the dataset parquet." + ) + ), + ] = None, + output_dir: Annotated[ + Optional[Path], + typer.Option( + help=( + "Directory where HTML visualizations are written. Defaults to " + "/visualizations when omitted." + ) + ), + ] = None, + begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0, + end_index: Annotated[ + int, typer.Option(help="End index (exclusive), -1 for all") + ] = -1, + ignore_missing_predictions: Annotated[ + bool, + typer.Option( + help="Skip documents without a matching prediction instead of failing" + ), + ] = False, +): + """ + Create paired GT vs. prediction HTML visualizations without generating parquet output. 
+ """ + visualizations_dir = ( + output_dir if output_dir is not None else dataset_dir / "visualizations" + ) + + visualizer = PredictionsVisualizer( + visualizations_dir=visualizations_dir, + external_predictions_dir=external_predictions_path, + ignore_missing_predictions=ignore_missing_predictions, + ) + visualizer.create_visualizations( + dataset_dir=dataset_dir, + split=split, + begin_index=begin_index, + end_index=end_index, + ) + + @app.callback() def main(): """Docling Evaluation CLI for benchmarking document processing tasks.""" From 6f7331c71050afb27a9749f9a750427a125b1a7d Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Tue, 9 Dec 2025 08:20:33 +0100 Subject: [PATCH 21/22] Update docling_eval/utils/external_predictions_visualizer.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> --- .../utils/external_predictions_visualizer.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/docling_eval/utils/external_predictions_visualizer.py b/docling_eval/utils/external_predictions_visualizer.py index 2eed6946..35342c98 100644 --- a/docling_eval/utils/external_predictions_visualizer.py +++ b/docling_eval/utils/external_predictions_visualizer.py @@ -150,9 +150,14 @@ def _save_visualization(self, record: DatasetRecordWithPrediction) -> None: record.predicted_page_images, ) - save_comparison_html_with_clusters( - filename=self._visualizations_dir / f"{record.doc_id}.html", - true_doc=gt_doc, - pred_doc=pred_doc, - draw_reading_order=True, - ) + try: + save_comparison_html_with_clusters( + filename=self._visualizations_dir / f"{record.doc_id}.html", + true_doc=gt_doc, + pred_doc=pred_doc, + draw_reading_order=True, + ) + except (IndexError, ValueError) as e: + _LOGGER.warning( + f"Failed to save visualization for doc_id {record.doc_id}: {e}" + ) From 21eae304e5bbedf300318ff17d6fd6dc916e8d2c Mon 
Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Tue, 9 Dec 2025 13:35:19 +0100 Subject: [PATCH 22/22] feat: Update examples bash script to demonstrate visualisations on external predictions Signed-off-by: Nikos Livathinos --- ...valuate_dpbench_on_external_predictions.sh | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh index 413b12de..6189e90b 100755 --- a/docs/examples/evaluate_dpbench_on_external_predictions.sh +++ b/docs/examples/evaluate_dpbench_on_external_predictions.sh @@ -38,7 +38,7 @@ evaluate() { fi for modality in "${MODALITIES[@]}"; do - echo "Evaluation modality: ${modality}, predictions: ${pred_dir}" + echo "Evaluate: modality: ${modality}: predictions: ${pred_dir}" uv run docling-eval evaluate \ --benchmark DPBench \ --modality "${modality}" \ @@ -49,24 +49,54 @@ evaluate() { } +visualize() { + local pred_dir save_dir modality + pred_dir="$1" + save_dir="$2" + + # Check if the GT/preds dirs exist + if [ ! -d "${GT_DIR}" ]; then + echo "Missing GT dir: ${GT_DIR}" + exit 1 + fi + if [ ! 
-d "${pred_dir}" ]; then + echo "Missing predictions dir: ${pred_dir}" + exit 2 + fi + + echo "Visualize predictions: ${pred_dir}" + uv run docling-eval create_viz \ + --dataset-dir "${GT_DIR}" \ + --external-predictions-path "${pred_dir}" \ + --output-dir "${save_dir}" +} + ########################################################################################### # Main # +# Predictions + # json predictions evaluate \ scratch/DPBench/predicted_documents/json \ - scratch/DPBench/external_evaluations_jsons + scratch/DPBench/external_predictions_jsons # doctags predictions evaluate \ scratch/DPBench/predicted_documents/doctag \ - scratch/DPBench/external_evaluations_doctags + scratch/DPBench/external_predictions_doctags # yaml predictions evaluate \ scratch/DPBench/predicted_documents/yaml \ - scratch/DPBench/external_evaluations_yaml + scratch/DPBench/external_predictions_yaml + + +# Visualisations +visualize \ + scratch/DPBench/predicted_documents/json \ + scratch/DPBench/external_predictions_visualisations