From 63317231a1a7dea3ce84325295dfd83057026b75 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:34:56 +0100 Subject: [PATCH 01/22] chore: Move the teds.py inside the subdir evaluators/table Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/{ => table}/teds.py | 0 docling_eval/evaluators/table_evaluator.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename docling_eval/evaluators/{ => table}/teds.py (100%) diff --git a/docling_eval/evaluators/teds.py b/docling_eval/evaluators/table/teds.py similarity index 100% rename from docling_eval/evaluators/teds.py rename to docling_eval/evaluators/table/teds.py diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index bd28e84e..283eea68 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -22,7 +22,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.evaluators.teds import TEDScorer +from docling_eval.evaluators.table.teds import TEDScorer _log = logging.getLogger(__name__) From 85890fb7a8c4633390c0a6778f3c22d5d7cf2b18 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:37:31 +0100 Subject: [PATCH 02/22] feat: Introduce the external_predictions_path in BaseEvaluator and dummy entries in all evaluators. 
Extend the CLI to support the --external-predictions-path Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 20 ++++++++++++++++++- docling_eval/evaluators/base_evaluator.py | 1 + .../evaluators/bbox_text_evaluator.py | 5 ++++- .../evaluators/doc_structure_evaluator.py | 1 + docling_eval/evaluators/keyvalue_evaluator.py | 7 ++++++- docling_eval/evaluators/layout_evaluator.py | 1 + .../evaluators/markdown_text_evaluator.py | 1 + docling_eval/evaluators/ocr_evaluator.py | 1 + .../evaluators/pixel_layout_evaluator.py | 1 + .../evaluators/readingorder_evaluator.py | 1 + docling_eval/evaluators/table_evaluator.py | 1 + docling_eval/evaluators/timings_evaluator.py | 1 + 12 files changed, 38 insertions(+), 3 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index ade289da..08652a5b 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -631,6 +631,7 @@ def evaluate( odir: Path, split: str = "test", cvat_overview_path: Optional[Path] = None, + external_predictions_path: Optional[Path] = None, ) -> Optional[DatasetEvaluationType]: """Evaluate predictions against ground truth.""" if not os.path.exists(idir): @@ -665,6 +666,7 @@ def evaluate( evaluation = layout_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -673,7 +675,9 @@ def evaluate( # Evaluate with the pixel-wise layout evaluation pixel_layout_evaluator = PixelLayoutEvaluator() pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator( - idir, split=split + idir, + split=split, + external_predictions_path=external_predictions_path, ) pixel_save_root: Path = save_fn.parent pixel_layout_evaluator.save_evaluations( @@ -687,6 +691,7 @@ def evaluate( evaluation = table_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -699,6 +704,7 @@ def evaluate( evaluation = 
doc_struct_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -719,6 +725,7 @@ def evaluate( evaluation = ocr_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -729,6 +736,7 @@ def evaluate( evaluation = readingorder_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -745,6 +753,7 @@ def evaluate( evaluation = md_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: @@ -761,6 +770,7 @@ def evaluate( evaluation = bbox_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: json.dump( @@ -776,6 +786,7 @@ def evaluate( evaluation = keyvalue_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: json.dump( @@ -1479,6 +1490,12 @@ def evaluate_cmd( ), ] = None, split: Annotated[str, typer.Option(help="Dataset split")] = "test", + external_predictions_path: Annotated[ + Optional[Path], + typer.Option( + help="Path to load existing DoclingDocument predictions. 
The filename must follow the pattern [doc_id].[json|dt|yaml|yml]", + ), + ] = None, ): """Evaluate predictions against ground truth.""" input_dir, output_dir = derive_input_output_dirs( @@ -1498,6 +1515,7 @@ def evaluate_cmd( idir=input_dir, odir=eval_output_dir, split=split, + external_predictions_path=external_predictions_path, ) diff --git a/docling_eval/evaluators/base_evaluator.py b/docling_eval/evaluators/base_evaluator.py index 940f6bc5..4198f084 100644 --- a/docling_eval/evaluators/base_evaluator.py +++ b/docling_eval/evaluators/base_evaluator.py @@ -100,6 +100,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetEvaluationType: r""" Perform the evaluation diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index 09301693..156da241 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -94,7 +94,10 @@ def __init_( nltk.download("popular", quiet=True) def __call__( - self, ds_path: Path, split: str = "test" + self, + ds_path: Path, + split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetBoxesTextEvaluation: parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) diff --git a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index c572a6f3..c3ce7623 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -71,6 +71,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetDocStructureEvaluation: r""" Parameters diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 2ac58666..06c45911 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py 
+++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -415,7 +415,12 @@ def __init__( # --------------------------------------------------------------------- # # Public API # --------------------------------------------------------------------- # - def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluation: + def __call__( + self, + ds_path: Path, + split: str = "test", + external_predictions_path: Optional[Path] = None, + ) -> DatasetKeyValueEvaluation: split_glob = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: split_glob}) _log.info("Loaded split '%s' – %d samples", split, len(ds[split])) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 8b737738..5663dddf 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -190,6 +190,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetLayoutEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index 18a107ff..a68c1884 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -108,6 +108,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetMarkdownEvaluation: r""" Parameters diff --git a/docling_eval/evaluators/ocr_evaluator.py b/docling_eval/evaluators/ocr_evaluator.py index b04c01cc..10b136cd 100644 --- a/docling_eval/evaluators/ocr_evaluator.py +++ b/docling_eval/evaluators/ocr_evaluator.py @@ -62,6 +62,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> OcrDatasetEvaluationResult: dataset_path = ds_path data_split_name = split diff --git 
a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index cddbb11b..ecd7f569 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -171,6 +171,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetPixelLayoutEvaluation: _log.info("Loading the split '%s' from: '%s'", split, ds_path) diff --git a/docling_eval/evaluators/readingorder_evaluator.py b/docling_eval/evaluators/readingorder_evaluator.py index 0ff6037c..00e211eb 100644 --- a/docling_eval/evaluators/readingorder_evaluator.py +++ b/docling_eval/evaluators/readingorder_evaluator.py @@ -80,6 +80,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetReadingOrderEvaluation: parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 283eea68..4355903d 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -132,6 +132,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetTableEvaluation: r""" Load a dataset in HF format. 
Expected columns with DoclingDocuments diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index 9e8896b0..a0d2fa08 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -50,6 +50,7 @@ def __call__( self, ds_path: Path, split: str = "test", + external_predictions_path: Optional[Path] = None, ) -> DatasetTimingsEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) From 5f9a279d4940079d1688d60a749a7230d27fedf2 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:39:56 +0100 Subject: [PATCH 03/22] feat: Extend test_dataset_builder.py to save document predictions in various formats Signed-off-by: Nikos Livathinos --- tests/test_dataset_builder.py | 48 +++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 75029ecf..1222cd31 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -2,8 +2,11 @@ from pathlib import Path import pytest +from datasets import load_dataset +from docling_core.types.doc.document import DoclingDocument from docling_eval.cli.main import evaluate, get_prediction_provider, visualize +from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction from docling_eval.datamodels.types import ( BenchMarkNames, EvaluationModality, @@ -34,6 +37,42 @@ IS_CI = bool(os.getenv("CI")) +def export_predictions( + ds_path: Path, + save_path: Path, + split: str = "test", +): + r"""Export the predicted document in the save path in various formats""" + parquet_files = str(ds_path / split / "*.parquet") + ds = load_dataset("parquet", data_files={split: parquet_files}) + + for data in ds[split]: + data_record = DatasetRecordWithPrediction.model_validate(data) + doc_id = data_record.doc_id + pred_doc: DoclingDocument = data_record.predicted_doc + + if pred_doc is None: + continue + + 
# Save as JSON + json_dir = save_path / "json" + json_dir.mkdir(parents=True, exist_ok=True) + json_fn = json_dir / f"{doc_id}.json" + pred_doc.save_as_json(json_fn) + + # Save as doctags (.doctags) + doctags_dir = save_path / "doctag" + doctags_dir.mkdir(parents=True, exist_ok=True) + doctags_fn = doctags_dir / f"{doc_id}.doctags" + pred_doc.save_as_doctags(doctags_fn) + + # Save as YAML + yaml_dir = save_path / "yaml" + yaml_dir.mkdir(parents=True, exist_ok=True) + yaml_fn = yaml_dir / f"{doc_id}.yaml" + pred_doc.save_as_yaml(yaml_fn) + + @pytest.mark.dependency() def test_run_dpbench_e2e(): target_path = Path(f"./scratch/{BenchMarkNames.DPBENCH.value}/") @@ -54,6 +93,11 @@ def test_run_dpbench_e2e(): target_dataset_dir=target_path / "eval_dataset_e2e", ) + # Export predictions + pred_path = target_path / "eval_dataset_e2e" + save_path = target_path / "predicted_documents" + export_predictions(pred_path, save_path) + ## Evaluate Layout evaluate( modality=EvaluationModality.LAYOUT, @@ -602,3 +646,7 @@ def test_file_dataset_builder(): ) dataset_builder.save_to_disk(do_visualization=True) + + +if __name__ == "__main__": + test_run_dpbench_e2e() From e6e84096d05a5bf885da806809a2f45fab9e4f58 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 16:41:16 +0100 Subject: [PATCH 04/22] feat: Extend MarkDownTextEvaluator to support external_predictions_path. 
Add unit test Signed-off-by: Nikos Livathinos --- .../evaluators/markdown_text_evaluator.py | 34 ++++++++++++++----- .../utils/external_docling_doc_loader.py | 28 +++++++++++++++ tests/test_markdown_text_evaluator.py | 15 ++++++-- 3 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 docling_eval/utils/external_docling_doc_loader.py diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index a68c1884..96222c68 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -26,6 +26,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -116,6 +117,11 @@ def __call__( ds_path: Path to load the parquet files of the dataset split: Split of the dataset to load """ + if external_predictions_path is not None: + external_docling_doc_loader = ExternalDoclingDocLoader( + external_predictions_path + ) + parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) _log.info(f"Overview of the dataset: {ds}") @@ -146,16 +152,28 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: - _log.error( - "Skipping record without successfull conversion status: %s", doc_id - ) - rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 - continue - true_doc = data_record.ground_truth_doc true_md = self._docling_document_to_md(true_doc) - pred_md = self._get_pred_md(data_record) + + # Get the predicted markdown from the external predictions path + if external_predictions_path is not None: + pred_doc = external_docling_doc_loader(doc_id) + if pred_doc is None: + _log.error("No external prediction found for 
doc_id=%s", doc_id) + rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 + continue + pred_md = self._docling_document_to_md(pred_doc) + else: + if data_record.status not in self._accepted_status: + _log.error( + "Skipping record without successfull conversion status: %s", + doc_id, + ) + rejected_samples[ + EvaluationRejectionType.INVALID_CONVERSION_STATUS + ] += 1 + continue + pred_md = self._get_pred_md(data_record) # type: ignore if not pred_md: _log.error("There is no markdown prediction for doc_id=%s", doc_id) diff --git a/docling_eval/utils/external_docling_doc_loader.py b/docling_eval/utils/external_docling_doc_loader.py new file mode 100644 index 00000000..c132b6ef --- /dev/null +++ b/docling_eval/utils/external_docling_doc_loader.py @@ -0,0 +1,28 @@ +from pathlib import Path +from typing import Optional + +from docling_core.types.doc.document import DoclingDocument + + +class ExternalDoclingDocLoader: + def __init__(self, external_predictions_dir: Path): + self._external_predictions_dir = external_predictions_dir + + def __call__(self, doc_id: str) -> Optional[DoclingDocument]: + r""" + Load the DoclingDocument from the external predictions path + """ + json_path = self._external_predictions_dir / f"{doc_id}.json" + dt_path = self._external_predictions_dir / f"{doc_id}.dt" + yaml_path = self._external_predictions_dir / f"{doc_id}.yaml" + yml_path = self._external_predictions_dir / f"{doc_id}.yml" + + if json_path.is_file(): + return DoclingDocument.load_from_json(json_path) + if dt_path.is_file(): + return DoclingDocument.load_from_doctags(dt_path) + if yaml_path.is_file(): + return DoclingDocument.load_from_yaml(yaml_path) + if yml_path.is_file(): + return DoclingDocument.load_from_yaml(yml_path) + return None diff --git a/tests/test_markdown_text_evaluator.py b/tests/test_markdown_text_evaluator.py index 6eba1acb..8fbd4073 100644 --- a/tests/test_markdown_text_evaluator.py +++ b/tests/test_markdown_text_evaluator.py @@ -34,5 +34,16 @@ def 
test_markdown_text_evaluator(): assert is_exception -# if __name__ == "__main__": -# test_markdown_text_evaluator() +def test_markdown_text_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = MarkdownTextEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + # test_markdown_text_evaluator() + test_markdown_text_evaluator_external_predictions() From 5624e6195da06bc4fc0a77a20b83544bde536973 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:12:43 +0100 Subject: [PATCH 05/22] feat: Extend LayoutEvaluator to support external_predictions_path. Add unit test. Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/layout_evaluator.py | 23 +++++++++++++++++---- tests/test_layout_evaluator.py | 17 ++++++++++++--- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 5663dddf..5f43fb03 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -30,6 +30,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader from docling_eval.utils.utils import tensor_to_float _log = logging.getLogger(__name__) @@ -194,6 +195,10 @@ def __call__( ) -> DatasetLayoutEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) @@ -209,7 +214,7 @@ def 
__call__( pred_labels, intersection_labels, union_labels, - ) = self._find_intersecting_labels(ds_selection) + ) = self._find_intersecting_labels(ds_selection, ext_docdoc_loader) true_labels_str = ", ".join(sorted(true_labels)) logging.info(f"True labels: {true_labels_str}") @@ -282,7 +287,9 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + + # Get the predicted document + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) if not pred_doc: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -585,12 +592,19 @@ def __call__( return dataset_layout_evaluation def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return pred_doc + for prediction_format in self._prediction_sources: if prediction_format == PredictionFormats.DOCLING_DOCUMENT: pred_doc = data_record.predicted_doc @@ -802,6 +816,7 @@ def _compute_average_iou_with_labels_across_iou( def _find_intersecting_labels( self, ds: Dataset, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel], list[DocItemLabel]]: r""" Compute counters per labels for the groundtruth, prediciton and their intersections @@ -821,7 +836,7 @@ def _find_intersecting_labels( ): data_record = DatasetRecordWithPrediction.model_validate(data) true_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) for item, level in true_doc.iterate_items( included_content_layers={ diff --git a/tests/test_layout_evaluator.py 
b/tests/test_layout_evaluator.py index d72f23d1..ef9539e2 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -54,6 +54,17 @@ def test_failed_conversions(): assert len(v1.evaluations_per_image) == 0 -# if __name__ == "__main__": -# # test_layout_evaluator() -# test_failed_conversions() +def test_layout_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = LayoutEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + # # test_layout_evaluator() + # test_failed_conversions() + test_layout_evaluator_external_predictions() From 171ad7455bbed78c723141d11ddc6d670ebcb582 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:36:23 +0100 Subject: [PATCH 06/22] fix: Add missing pytest dependencies in tests Signed-off-by: Nikos Livathinos --- tests/test_layout_evaluator.py | 4 ++++ tests/test_markdown_text_evaluator.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/test_layout_evaluator.py b/tests/test_layout_evaluator.py index ef9539e2..33c79a64 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -54,6 +54,10 @@ def test_failed_conversions(): assert len(v1.evaluations_per_image) == 0 +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) def test_layout_evaluator_external_predictions(): r"""Testing the evaluator with external predictions""" eval = LayoutEvaluator() diff --git a/tests/test_markdown_text_evaluator.py b/tests/test_markdown_text_evaluator.py index 8fbd4073..8d3eb203 100644 --- a/tests/test_markdown_text_evaluator.py +++ b/tests/test_markdown_text_evaluator.py @@ -34,6 +34,10 @@ def test_markdown_text_evaluator(): assert is_exception +@pytest.mark.dependency( + 
depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) def test_markdown_text_evaluator_external_predictions(): r"""Testing the evaluator with external predictions""" eval = MarkdownTextEvaluator() From 0f0cfb5f4b39945e183ba532f3b4ad94e6214e26 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:55:38 +0100 Subject: [PATCH 07/22] fix: Fix loading the external predictions in LayoutEvaluator Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/layout_evaluator.py | 6 +++++- tests/test_layout_evaluator.py | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 5f43fb03..ee439610 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -279,7 +279,11 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) diff --git a/tests/test_layout_evaluator.py b/tests/test_layout_evaluator.py index 33c79a64..2de0752a 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -5,7 +5,6 @@ from docling_eval.datamodels.types import PredictionFormats from docling_eval.evaluators.layout_evaluator import LayoutEvaluator -from docling_eval.evaluators.markdown_text_evaluator import MarkdownTextEvaluator @pytest.mark.dependency( From 8069571c2e09292ba055bd3826e9753e69cd211a Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 17:56:31 +0100 Subject: [PATCH 08/22] feat: Introduce external predictions in DocStructureEvaluator. Add unit test. 
Signed-off-by: Nikos Livathinos --- .../evaluators/doc_structure_evaluator.py | 15 ++++++++++-- tests/test_doc_structure_evaluator.py | 23 +++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 tests/test_doc_structure_evaluator.py diff --git a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index c3ce7623..42609e49 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -18,6 +18,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -79,6 +80,10 @@ def __call__( ds_path: Path to load the parquet files of the dataset split: Split of the dataset to load """ + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) _log.info(f"Overview of the dataset: {ds}") @@ -107,7 +112,10 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) @@ -115,7 +123,10 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = data_record.predicted_doc + if ext_docdoc_loader: + pred_doc = ext_docdoc_loader(doc_id) + else: + pred_doc = data_record.predicted_doc if pred_doc is None: _log.error("There is no prediction for doc_id=%s", doc_id) diff --git a/tests/test_doc_structure_evaluator.py b/tests/test_doc_structure_evaluator.py new 
file mode 100644 index 00000000..a1440d26 --- /dev/null +++ b/tests/test_doc_structure_evaluator.py @@ -0,0 +1,23 @@ +from pathlib import Path + +import pytest + +from docling_eval.evaluators.doc_structure_evaluator import DocStructureEvaluator + + +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_doc_structure_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = DocStructureEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_doc_structure_evaluator_external_predictions() From 8ba6b453cc5eb45d89e653c241ba0365fff791ff Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Thu, 4 Dec 2025 18:07:31 +0100 Subject: [PATCH 09/22] feat: Extend the TableEvaluator to support external predictions. 
Add unit test Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/table_evaluator.py | 16 ++++++++++++++-- tests/test_table_evaluator.py | 18 ++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 4355903d..2872a10a 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -23,6 +23,7 @@ ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.evaluators.table.teds import TEDScorer +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -141,6 +142,10 @@ def __call__( """ logging.info("Loading the split '%s' from: '%s'", split, ds_path) + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) @@ -167,7 +172,7 @@ def __call__( data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id gt_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) if not pred_doc: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -310,12 +315,19 @@ def _evaluate_tables_in_documents( return table_evaluations def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return 
pred_doc + for prediction_format in self._prediction_sources: if prediction_format == PredictionFormats.DOCLING_DOCUMENT: pred_doc = data_record.predicted_doc diff --git a/tests/test_table_evaluator.py b/tests/test_table_evaluator.py index a2adce76..312ab9d7 100644 --- a/tests/test_table_evaluator.py +++ b/tests/test_table_evaluator.py @@ -320,5 +320,19 @@ def test_table_evaluator(): assert is_exception -# if __name__ == "__main__": -# test_table_evaluator() +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_table_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = TableEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_table_evaluator_external_predictions() From 949d6ccdce9a55824a254d7bf150c27f38d9ce0c Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 11:45:41 +0100 Subject: [PATCH 10/22] feat: Extend the KeyValueEvaluator to support external predictions. Add unit test. 
Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/keyvalue_evaluator.py | 18 +++++++++++++++--- tests/test_keyvalue_evaluator.py | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 06c45911..20899d33 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py +++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -21,6 +21,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -421,6 +422,11 @@ def __call__( split: str = "test", external_predictions_path: Optional[Path] = None, ) -> DatasetKeyValueEvaluation: + r""" """ + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + split_glob = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: split_glob}) _log.info("Loaded split '%s' – %d samples", split, len(ds[split])) @@ -466,13 +472,13 @@ def __call__( doc_id = record.doc_id # ----- sanity checks -------------------------------------------------- - if record.status not in self._accepted_status: + if ext_docdoc_loader is None and record.status not in self._accepted_status: rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1 _log.error("Skipping %s – conversion failed", doc_id) continue gt_doc = record.ground_truth_doc - pred_doc = self._get_pred_doc(record) + pred_doc = self._get_pred_doc(record, ext_docdoc_loader) if pred_doc is None: rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 _log.error("Skipping %s – missing prediction", doc_id) @@ -640,10 +646,16 @@ def __call__( # Helpers # --------------------------------------------------------------------- # 
def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: """Fetch the prediction in the first available format declared by `prediction_sources`.""" pred_doc: Optional[DoclingDocument] = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return pred_doc for fmt in self._prediction_sources: if fmt == PredictionFormats.DOCLING_DOCUMENT: diff --git a/tests/test_keyvalue_evaluator.py b/tests/test_keyvalue_evaluator.py index 712a3ca4..d47fd3eb 100644 --- a/tests/test_keyvalue_evaluator.py +++ b/tests/test_keyvalue_evaluator.py @@ -52,3 +52,21 @@ def test_failed_conversions(): v1 = evaluator(test_dataset_dir) assert v1 is not None assert len(v1.evaluations) == 0 + + +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_keyvalue_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = KeyValueEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_keyvalue_evaluator_external_predictions() From 13badc5522edb87c9780a476d1c9ef5a9fac839d Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 11:53:14 +0100 Subject: [PATCH 11/22] feat: Extend the PixelLayoutEvaluator to support external predictions. 
Add unit test Signed-off-by: Nikos Livathinos --- .../evaluators/pixel_layout_evaluator.py | 21 ++++++++++++++++--- tests/test_pixel_layout_evaluator.py | 20 +++++++++++++++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 230831e7..688ec82c 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -39,6 +39,7 @@ PagePixelLayoutEvaluation, ) from docling_eval.evaluators.stats import compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) @@ -175,6 +176,10 @@ def __call__( ) -> DatasetPixelLayoutEvaluation: _log.info("Loading the split '%s' from: '%s'", split, ds_path) + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) @@ -229,7 +234,10 @@ def __call__( ) doc_id: str = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) @@ -237,7 +245,7 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = self._get_pred_doc(data_record) + pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader) if not pred_doc: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -541,12 +549,19 @@ def _collect_items_by_page( return pages_to_objects def _get_pred_doc( - self, data_record: DatasetRecordWithPrediction + self, + data_record: 
DatasetRecordWithPrediction, + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None + if ext_docdoc_loader is not None: + doc_id = data_record.doc_id + pred_doc = ext_docdoc_loader(doc_id) + return pred_doc + for prediction_format in self._prediction_sources: if prediction_format == PredictionFormats.DOCLING_DOCUMENT: pred_doc = data_record.predicted_doc diff --git a/tests/test_pixel_layout_evaluator.py b/tests/test_pixel_layout_evaluator.py index 99d60f9d..9beaecb8 100644 --- a/tests/test_pixel_layout_evaluator.py +++ b/tests/test_pixel_layout_evaluator.py @@ -19,7 +19,7 @@ depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], scope="session", ) -def test_layout_evaluator(): +def test_pixel_layout_evaluator(): r""" """ test_dataset_dir = Path("scratch/DPBench/eval_dataset_e2e") @@ -87,5 +87,19 @@ def test_layout_evaluator(): ), "Wrong label mapping in _matrix_id_to_name" -# if __name__ == "__main__": -# test_layout_evaluator() +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_pixel_layout_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = PixelLayoutEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + test_pixel_layout_evaluator_external_predictions() From 8c2a0654c2b5982b6390bf2681f5b649c9d9df0e Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 14:24:46 +0100 Subject: [PATCH 12/22] feat: Extend the BboxTextEvaluator to support external predictions. 
Add unit test Signed-off-by: Nikos Livathinos --- .../evaluators/bbox_text_evaluator.py | 30 ++++++++++++++++--- tests/test_bboxtext_evaluator.py | 19 ++++++++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index 156da241..c129b01c 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -4,7 +4,7 @@ import nltk from datasets import load_dataset -from docling_core.types.doc.base import BoundingBox +from docling_core.types.doc.base import BoundingBox, CoordOrigin from docling_core.types.doc.document import DoclingDocument, TextItem from nltk import edit_distance, word_tokenize from nltk.metrics import f_measure, precision, recall @@ -25,6 +25,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader _log = logging.getLogger(__name__) @@ -99,6 +100,11 @@ def __call__( split: str = "test", external_predictions_path: Optional[Path] = None, ) -> DatasetBoxesTextEvaluation: + r""" """ + ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + if external_predictions_path is not None: + ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) _log.info(f"oveview of dataset: {ds}") @@ -128,7 +134,10 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + ext_docdoc_loader is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) @@ -136,7 +145,12 @@ def __call__( continue true_doc = data_record.ground_truth_doc - pred_doc = 
data_record.predicted_doc + + # Load the pred_doc + if ext_docdoc_loader is not None: + pred_doc = ext_docdoc_loader(doc_id) + else: + pred_doc = data_record.predicted_doc if pred_doc is None: _log.error("There is no prediction for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 @@ -215,7 +229,15 @@ def _match_bboxes( continue assert len(doc_item.prov) == 1 prov = doc_item.prov[0] - bboxes[doc_key].append(prov.bbox) + + # Ensure bbox is in top-left origin + bbox = prov.bbox + if bbox.coord_origin != CoordOrigin.TOPLEFT: + page_no = prov.page_no + page_size = doc.pages[page_no].size + bbox = bbox.to_top_left_origin(page_size.height) + + bboxes[doc_key].append(bbox) texts[doc_key].append(doc_item.text) # Decide which document is the pivot diff --git a/tests/test_bboxtext_evaluator.py b/tests/test_bboxtext_evaluator.py index 7917c23c..cd9112d4 100644 --- a/tests/test_bboxtext_evaluator.py +++ b/tests/test_bboxtext_evaluator.py @@ -29,5 +29,20 @@ def test_bboxtext_evaluator(): assert is_exception -# if __name__ == "__main__": -# test_bboxtext_evaluator() +@pytest.mark.dependency( + depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"], + scope="session", +) +def test_bboxtext_evaluator_external_predictions(): + r"""Testing the evaluator with external predictions""" + eval = BboxTextEvaluator() + gt_path = Path("scratch/DPBench/gt_dataset") + preds_path = Path("scratch/DPBench/predicted_documents/json") + + v = eval(gt_path, external_predictions_path=preds_path) + assert v is not None + + +if __name__ == "__main__": + # test_bboxtext_evaluator() + test_bboxtext_evaluator_external_predictions() From 08391b36a9d05bf47daa8d3b5bcfd18bbf1db255 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 14:48:35 +0100 Subject: [PATCH 13/22] feat: Disable the OCREvaluator when using the external predictions Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 39 ++++++++++++++++++++++----------------- 1 file 
changed, 22 insertions(+), 17 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 08652a5b..7b233802 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -711,25 +711,30 @@ def evaluate( json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) elif modality == EvaluationModality.OCR: - if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]: - text_unit = TextCellUnit.LINE - else: - text_unit = TextCellUnit.WORD - - logging.info(f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})") - logging.info(f"Text unit set to {text_unit}") + if not external_predictions_path: + if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]: + text_unit = TextCellUnit.LINE + else: + text_unit = TextCellUnit.WORD + + logging.info( + f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})" + ) + logging.info(f"Text unit set to {text_unit}") - ocr_evaluator = OCREvaluator( - intermediate_evaluations_path=odir, text_unit=text_unit - ) - evaluation = ocr_evaluator( # type: ignore - idir, - split=split, - external_predictions_path=external_predictions_path, - ) + ocr_evaluator = OCREvaluator( + intermediate_evaluations_path=odir, text_unit=text_unit + ) + evaluation = ocr_evaluator( # type: ignore + idir, + split=split, + external_predictions_path=external_predictions_path, + ) - with open(save_fn, "w") as fd: - json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + with open(save_fn, "w") as fd: + json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True) + else: + logging.error("External predictions are not supported for OCR evaluations") elif modality == EvaluationModality.READING_ORDER: readingorder_evaluator = ReadingOrderEvaluator() From 595ba6c19857601796e2b486e25cf9e51846da03 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 15:23:05 +0100 Subject: [PATCH 14/22] fix: Fixing guard for external predictions in TimingsEvaluator, 
ReadingOrderEvaluator. Fix main Signed-off-by: Nikos Livathinos --- docling_eval/cli/main.py | 1 + docling_eval/evaluators/readingorder_evaluator.py | 5 ++++- docling_eval/evaluators/timings_evaluator.py | 5 ++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index 7b233802..d65bd634 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -652,6 +652,7 @@ def evaluate( evaluation = timings_evaluator( # type: ignore idir, split=split, + external_predictions_path=external_predictions_path, ) with open(save_fn, "w") as fd: diff --git a/docling_eval/evaluators/readingorder_evaluator.py b/docling_eval/evaluators/readingorder_evaluator.py index 00e211eb..cb8f09fb 100644 --- a/docling_eval/evaluators/readingorder_evaluator.py +++ b/docling_eval/evaluators/readingorder_evaluator.py @@ -101,7 +101,10 @@ def __call__( ): data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + external_predictions_path is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) diff --git a/docling_eval/evaluators/timings_evaluator.py b/docling_eval/evaluators/timings_evaluator.py index a0d2fa08..4c0c018b 100644 --- a/docling_eval/evaluators/timings_evaluator.py +++ b/docling_eval/evaluators/timings_evaluator.py @@ -80,7 +80,10 @@ def __call__( data_record = DatasetRecordWithPrediction.model_validate(data) doc_id = data_record.doc_id - if data_record.status not in self._accepted_status: + if ( + external_predictions_path is None + and data_record.status not in self._accepted_status + ): _log.error( "Skipping record without successfull conversion status: %s", doc_id ) From 406b122f425a70a1b34959c6a5125b569de96ad6 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 17:28:12 +0100 Subject: [PATCH 15/22] fix: 
Export the doctag files with the correct file extension Signed-off-by: Nikos Livathinos --- tests/test_dataset_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py index 1222cd31..22a028d5 100644 --- a/tests/test_dataset_builder.py +++ b/tests/test_dataset_builder.py @@ -63,7 +63,7 @@ def export_predictions( # Save as doctags (.doctags) doctags_dir = save_path / "doctag" doctags_dir.mkdir(parents=True, exist_ok=True) - doctags_fn = doctags_dir / f"{doc_id}.doctags" + doctags_fn = doctags_dir / f"{doc_id}.dt" pred_doc.save_as_doctags(doctags_fn) # Save as YAML From ebe70b00dd14c855e77ebf28f84029541bcd2349 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 18:09:05 +0100 Subject: [PATCH 16/22] feat: Refactor the ExternalDoclingDocumentLoader to properly load a DoclingDocument from doctags and the GT image. - Introduce the staticmethod load_doctags() which covers all cases on page image loading. - Refactor the FilePredictionProvider to use the load_doctags() from ExternalDoclingDocumentLoader. - Refactor all evaluators to use the new ExternalDoclingDocumentLoader. 
Signed-off-by: Nikos Livathinos --- .../evaluators/bbox_text_evaluator.py | 8 +- .../evaluators/doc_structure_evaluator.py | 8 +- docling_eval/evaluators/keyvalue_evaluator.py | 11 +- docling_eval/evaluators/layout_evaluator.py | 13 +- .../evaluators/markdown_text_evaluator.py | 6 +- .../evaluators/pixel_layout_evaluator.py | 11 +- docling_eval/evaluators/table_evaluator.py | 11 +- .../prediction_providers/file_provider.py | 73 +++------ .../utils/external_docling_doc_loader.py | 139 +++++++++++++++--- tests/test_layout_evaluator.py | 11 +- 10 files changed, 179 insertions(+), 112 deletions(-) diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index c129b01c..a93f1461 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -25,7 +25,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -101,9 +101,9 @@ def __call__( external_predictions_path: Optional[Path] = None, ) -> DatasetBoxesTextEvaluation: r""" """ - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) @@ -148,7 +148,7 @@ def __call__( # Load the pred_doc if ext_docdoc_loader is not None: - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) else: pred_doc = data_record.predicted_doc if pred_doc is None: diff --git 
a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index 42609e49..a2f59c30 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -18,7 +18,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -80,9 +80,9 @@ def __call__( ds_path: Path to load the parquet files of the dataset split: Split of the dataset to load """ - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) parquet_files = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: parquet_files}) @@ -124,7 +124,7 @@ def __call__( true_doc = data_record.ground_truth_doc if ext_docdoc_loader: - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) else: pred_doc = data_record.predicted_doc diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 20899d33..9a5d3c72 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py +++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -21,7 +21,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -423,9 +423,9 @@ def __call__( 
external_predictions_path: Optional[Path] = None, ) -> DatasetKeyValueEvaluation: r""" """ - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) split_glob = str(ds_path / split / "*.parquet") ds = load_dataset("parquet", data_files={split: split_glob}) @@ -648,13 +648,12 @@ def __call__( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: """Fetch the prediction in the first available format declared by `prediction_sources`.""" pred_doc: Optional[DoclingDocument] = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for fmt in self._prediction_sources: diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index ee439610..a906d707 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -30,7 +30,7 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader from docling_eval.utils.utils import tensor_to_float _log = logging.getLogger(__name__) @@ -195,9 +195,9 @@ def __call__( ) -> DatasetLayoutEvaluation: logging.info("Loading the split '%s' from: '%s'", split, ds_path) - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: 
Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) # Load the dataset split_path = str(ds_path / split / "*.parquet") @@ -598,15 +598,14 @@ def __call__( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for prediction_format in self._prediction_sources: @@ -820,7 +819,7 @@ def _compute_average_iou_with_labels_across_iou( def _find_intersecting_labels( self, ds: Dataset, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> tuple[dict[str, int], dict[str, int], list[DocItemLabel], list[DocItemLabel]]: r""" Compute counters per labels for the groundtruth, prediciton and their intersections diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index 96222c68..aa112d64 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -26,7 +26,7 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -118,7 +118,7 @@ def __call__( split: Split of the dataset to load """ if external_predictions_path is not None: - 
external_docling_doc_loader = ExternalDoclingDocLoader( + external_docling_doc_loader = ExternalDoclingDocumentLoader( external_predictions_path ) @@ -157,7 +157,7 @@ def __call__( # Get the predicted markdown from the external predictions path if external_predictions_path is not None: - pred_doc = external_docling_doc_loader(doc_id) + pred_doc = external_docling_doc_loader(data_record) if pred_doc is None: _log.error("No external prediction found for doc_id=%s", doc_id) rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1 diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index 688ec82c..b3df7396 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -39,7 +39,7 @@ PagePixelLayoutEvaluation, ) from docling_eval.evaluators.stats import compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) @@ -176,9 +176,9 @@ def __call__( ) -> DatasetPixelLayoutEvaluation: _log.info("Loading the split '%s' from: '%s'", split, ds_path) - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) # Load the dataset split_path = str(ds_path / split / "*.parquet") @@ -551,15 +551,14 @@ def _collect_items_by_page( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted 
DoclingDocument """ pred_doc = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for prediction_format in self._prediction_sources: diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 2872a10a..05fec46b 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -23,7 +23,7 @@ ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.evaluators.table.teds import TEDScorer -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocLoader +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -142,9 +142,9 @@ def __call__( """ logging.info("Loading the split '%s' from: '%s'", split, ds_path) - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None if external_predictions_path is not None: - ext_docdoc_loader = ExternalDoclingDocLoader(external_predictions_path) + ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path) # Load the dataset split_path = str(ds_path / split / "*.parquet") @@ -317,15 +317,14 @@ def _evaluate_tables_in_documents( def _get_pred_doc( self, data_record: DatasetRecordWithPrediction, - ext_docdoc_loader: Optional[ExternalDoclingDocLoader] = None, + ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None, ) -> Optional[DoclingDocument]: r""" Get the predicted DoclingDocument """ pred_doc = None if ext_docdoc_loader is not None: - doc_id = data_record.doc_id - pred_doc = ext_docdoc_loader(doc_id) + pred_doc = ext_docdoc_loader(data_record) return pred_doc for prediction_format in self._prediction_sources: diff --git a/docling_eval/prediction_providers/file_provider.py 
b/docling_eval/prediction_providers/file_provider.py index 97244c99..17b9bad3 100644 --- a/docling_eval/prediction_providers/file_provider.py +++ b/docling_eval/prediction_providers/file_provider.py @@ -4,11 +4,7 @@ from docling.datamodel.base_models import ConversionStatus from docling_core.types.doc import DocItemLabel -from docling_core.types.doc.document import ( - DoclingDocument, - DocTagsDocument, - DocTagsPage, -) +from docling_core.types.doc.document import DoclingDocument from PIL import Image from docling_eval.datamodels.dataset_record import ( @@ -23,6 +19,7 @@ from docling_eval.prediction_providers.base_prediction_provider import ( BasePredictionProvider, ) +from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader _log = logging.getLogger(__name__) @@ -164,58 +161,22 @@ def prediction_format(self) -> PredictionFormats: return self._prediction_format def _load_doctags_doc(self, record: DatasetRecord) -> Optional[DoclingDocument]: + r""" + Load the DoclingDocument from doctags + image. + Check ExternalDoclingDocLoader for details on the loading alrogithm. """ - Load doctags file into DoclingDocument. - - Args: - record: Groundtruth dataset record - - Returns: - DoclingDocument or None if file not found - """ - # Read the doctags file - doctags_fn = self._prediction_source_path / f"{record.doc_id}.dt" - if self._ignore_missing_files and not doctags_fn.is_file(): - return None - - try: - with open(doctags_fn, "r") as fd: - doctags = fd.read() - - page_image: Optional[Image.Image] = None - - # Try to get an image file for the predictions: - # 1. Check the pred_images_path. - # 2. Use the GT page image if the corresponding flag is set. - # 3. Look inside the same dir as the doctag files. 
- if self._prediction_images_path: - page_image_fn = self._prediction_images_path / f"{record.doc_id}.png" - if page_image_fn.is_file(): - page_image = Image.open(page_image_fn) - else: - _log.warning("Failed to load pred image: %s", page_image_fn) - elif self._use_ground_truth_page_images: - page_image = record.ground_truth_page_images[0] - else: - page_image_fn = self._prediction_source_path / f"{record.doc_id}.png" - if page_image_fn.is_file(): - page_image = Image.open(page_image_fn) - else: - _log.warning("Failed to load pred image: %s", page_image_fn) - - # Build DoclingDocument - doctags_page = DocTagsPage(tokens=doctags, image=page_image) - doctags_doc = DocTagsDocument(pages=[doctags_page]) - doc = DoclingDocument.load_from_doctags( - doctags_doc, document_name=record.doc_id - ) - - return doc - except Exception as e: - _log.error(f"Error loading doctags document {record.doc_id}: {str(e)}") - if not self._ignore_missing_files: - raise - return None + doc_id = record.doc_id + gt_page_images = record.ground_truth_page_images + gt_page_image = gt_page_images[0] if len(gt_page_images) > 0 else None + doc = ExternalDoclingDocumentLoader.load_doctags( + doc_id, + self._prediction_source_path, + page_images_root=self._prediction_images_path, + gt_page_image=gt_page_image, + ) + if not self._ignore_missing_files: + raise ValueError(f"Missing missing document {doc_id}") + return doc def _load_json_doc(self, record: DatasetRecord) -> Optional[DoclingDocument]: """ diff --git a/docling_eval/utils/external_docling_doc_loader.py b/docling_eval/utils/external_docling_doc_loader.py index c132b6ef..c3efd495 100644 --- a/docling_eval/utils/external_docling_doc_loader.py +++ b/docling_eval/utils/external_docling_doc_loader.py @@ -1,28 +1,133 @@ +import logging from pathlib import Path from typing import Optional -from docling_core.types.doc.document import DoclingDocument +from docling_core.types.doc.document import ( + DoclingDocument, + DocTagsDocument, + DocTagsPage, +) 
+from PIL import Image +from docling_eval.datamodels.dataset_record import DatasetRecord -class ExternalDoclingDocLoader: - def __init__(self, external_predictions_dir: Path): +_log = logging.getLogger(__name__) + + +class ExternalDoclingDocumentLoader: + r""" """ + + def __init__( + self, + external_predictions_dir: Path, + ): + r""" """ self._external_predictions_dir = external_predictions_dir - def __call__(self, doc_id: str) -> Optional[DoclingDocument]: + def __call__(self, record: DatasetRecord) -> Optional[DoclingDocument]: r""" Load the DoclingDocument from the external predictions path + + The following fields are used from the `record` parameter: + - record.doc_id + - record.ground_truth_page_images[0] """ - json_path = self._external_predictions_dir / f"{doc_id}.json" - dt_path = self._external_predictions_dir / f"{doc_id}.dt" - yaml_path = self._external_predictions_dir / f"{doc_id}.yaml" - yml_path = self._external_predictions_dir / f"{doc_id}.yml" - - if json_path.is_file(): - return DoclingDocument.load_from_json(json_path) - if dt_path.is_file(): - return DoclingDocument.load_from_doctags(dt_path) - if yaml_path.is_file(): - return DoclingDocument.load_from_yaml(yaml_path) - if yml_path.is_file(): - return DoclingDocument.load_from_yaml(yml_path) + doc_id = record.doc_id + + json_fn = self._external_predictions_dir / f"{doc_id}.json" + doctags_fn = ExternalDoclingDocumentLoader.build_doctags_path( + self._external_predictions_dir, doc_id + ) + yaml_fn = self._external_predictions_dir / f"{doc_id}.yaml" + yml_fn = self._external_predictions_dir / f"{doc_id}.yml" + + if json_fn.is_file(): + return DoclingDocument.load_from_json(json_fn) + if doctags_fn.is_file(): + gt_page_images = record.ground_truth_page_images + gt_page_image = gt_page_images[0] if len(gt_page_images) > 0 else None + + return ExternalDoclingDocumentLoader.load_doctags( + doc_id, + self._external_predictions_dir, + gt_page_image=gt_page_image, + ) + if yaml_fn.is_file(): + return 
DoclingDocument.load_from_yaml(yaml_fn)
+        if yml_fn.is_file():
+            return DoclingDocument.load_from_yaml(yml_fn)
 
         return None
+
+    @staticmethod
+    def build_doctags_path(doctags_root: Path, doc_id: str) -> Path:
+        r"""Get the full path of the doctags file"""
+        dt_path = doctags_root / f"{doc_id}.dt"
+        return dt_path
+
+    @staticmethod
+    def load_doctags(
+        doc_id: str,
+        doctags_root: Path,
+        page_images_root: Optional[Path] = None,
+        gt_page_image: Optional[Image.Image] = None,
+        image_filename_extension: str = "png",
+    ) -> Optional[DoclingDocument]:
+        r"""
+        Load a single page DoclingDocument object from a doctags file and a page image.
+
+        The page image is supplied from these sources in the specific order:
+        1. The page_images_root: An image with filename <doc_id>.<image_filename_extension> is used
+        2. gt_page_image: An explicit Image object is used.
+        3. Search for the image with filename <doc_id>.<image_filename_extension> in the doctags root
+
+        Parameters
+        ----------
+        doctags_root: Root path to load doctags as files with name <doc_id>.dt
+        doc_id: The document id of the file to be loaded
+        page_images_root: If provided, search for the page images here first.
+        gt_page_image: If provided, use that object for the page image.
+        image_filename_extension: The file extension for the page image.
+ + Returns + ------- + DoclingDocument object or None if the document cannot be reconstructed + """ + # Read the doctags file + doctags_fn = ExternalDoclingDocumentLoader.build_doctags_path( + doctags_root, doc_id + ) + + try: + with open(doctags_fn, "r") as fd: + doctags = fd.read() + + page_image: Optional[Image.Image] = None + + if page_images_root: + page_image_fn = ( + page_images_root / f"{doc_id}.{image_filename_extension}" + ) + if page_image_fn.is_file(): + page_image = Image.open(page_image_fn) + else: + _log.warning("Failed to load page image: %s", page_image_fn) + elif gt_page_image is not None: + page_image = gt_page_image + else: + page_image_fn = doctags_root / f"{doc_id}.{image_filename_extension}" + if page_image_fn.is_file(): + page_image = Image.open(page_image_fn) + else: + _log.warning( + "Missing page image file: %s. Reconstruct doctags without page image", + page_image_fn, + ) + + # Build DoclingDocument + doctags_page = DocTagsPage(tokens=doctags, image=page_image) + doctags_doc = DocTagsDocument(pages=[doctags_page]) + doc = DoclingDocument.load_from_doctags(doctags_doc, document_name=doc_id) + return doc + except Exception as e: + _log.error(f"Error loading doctags document {doc_id}: {str(e)}") + return None diff --git a/tests/test_layout_evaluator.py b/tests/test_layout_evaluator.py index 2de0752a..eafaef87 100644 --- a/tests/test_layout_evaluator.py +++ b/tests/test_layout_evaluator.py @@ -61,10 +61,15 @@ def test_layout_evaluator_external_predictions(): r"""Testing the evaluator with external predictions""" eval = LayoutEvaluator() gt_path = Path("scratch/DPBench/gt_dataset") - preds_path = Path("scratch/DPBench/predicted_documents/json") - v = eval(gt_path, external_predictions_path=preds_path) - assert v is not None + preds_path = [ + Path("scratch/DPBench/predicted_documents/json"), + Path("scratch/DPBench/predicted_documents/doctag"), + Path("scratch/DPBench/predicted_documents/yaml"), + ] + for pred_path in preds_path: + v = 
eval(gt_path, external_predictions_path=pred_path) + assert v is not None if __name__ == "__main__": From 33511c922cad0bacb9ccfee0c57ae528b5127f54 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Fri, 5 Dec 2025 18:14:04 +0100 Subject: [PATCH 17/22] chore: Rename code file as external_docling_document_loader.py Signed-off-by: Nikos Livathinos --- docling_eval/evaluators/bbox_text_evaluator.py | 4 +++- docling_eval/evaluators/doc_structure_evaluator.py | 4 +++- docling_eval/evaluators/keyvalue_evaluator.py | 4 +++- docling_eval/evaluators/layout_evaluator.py | 4 +++- docling_eval/evaluators/markdown_text_evaluator.py | 4 +++- docling_eval/evaluators/pixel_layout_evaluator.py | 4 +++- docling_eval/evaluators/table_evaluator.py | 4 +++- docling_eval/prediction_providers/file_provider.py | 4 +++- ...ling_doc_loader.py => external_docling_document_loader.py} | 0 9 files changed, 24 insertions(+), 8 deletions(-) rename docling_eval/utils/{external_docling_doc_loader.py => external_docling_document_loader.py} (100%) diff --git a/docling_eval/evaluators/bbox_text_evaluator.py b/docling_eval/evaluators/bbox_text_evaluator.py index a93f1461..42109dee 100644 --- a/docling_eval/evaluators/bbox_text_evaluator.py +++ b/docling_eval/evaluators/bbox_text_evaluator.py @@ -25,7 +25,9 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/doc_structure_evaluator.py b/docling_eval/evaluators/doc_structure_evaluator.py index a2f59c30..c0842015 100644 --- a/docling_eval/evaluators/doc_structure_evaluator.py +++ b/docling_eval/evaluators/doc_structure_evaluator.py @@ -18,7 +18,9 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from 
docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/keyvalue_evaluator.py b/docling_eval/evaluators/keyvalue_evaluator.py index 9a5d3c72..baa78697 100644 --- a/docling_eval/evaluators/keyvalue_evaluator.py +++ b/docling_eval/evaluators/keyvalue_evaluator.py @@ -21,7 +21,9 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index a906d707..0d11394d 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -30,7 +30,9 @@ docling_document_from_doctags, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) from docling_eval.utils.utils import tensor_to_float _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/markdown_text_evaluator.py b/docling_eval/evaluators/markdown_text_evaluator.py index aa112d64..afbc68ee 100644 --- a/docling_eval/evaluators/markdown_text_evaluator.py +++ b/docling_eval/evaluators/markdown_text_evaluator.py @@ -26,7 +26,9 @@ UnitEvaluation, ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + 
ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/pixel_layout_evaluator.py b/docling_eval/evaluators/pixel_layout_evaluator.py index b3df7396..1913f1b7 100644 --- a/docling_eval/evaluators/pixel_layout_evaluator.py +++ b/docling_eval/evaluators/pixel_layout_evaluator.py @@ -39,7 +39,9 @@ PagePixelLayoutEvaluation, ) from docling_eval.evaluators.stats import compute_stats -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) from docling_eval.utils.utils import dict_get _log = logging.getLogger(__name__) diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index 05fec46b..fb2f6aac 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -23,7 +23,9 @@ ) from docling_eval.evaluators.stats import DatasetStatistics, compute_stats from docling_eval.evaluators.table.teds import TEDScorer -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/prediction_providers/file_provider.py b/docling_eval/prediction_providers/file_provider.py index 17b9bad3..5c93e7b9 100644 --- a/docling_eval/prediction_providers/file_provider.py +++ b/docling_eval/prediction_providers/file_provider.py @@ -19,7 +19,9 @@ from docling_eval.prediction_providers.base_prediction_provider import ( BasePredictionProvider, ) -from docling_eval.utils.external_docling_doc_loader import ExternalDoclingDocumentLoader +from docling_eval.utils.external_docling_document_loader import ( + ExternalDoclingDocumentLoader, +) _log = logging.getLogger(__name__) diff --git a/docling_eval/utils/external_docling_doc_loader.py 
b/docling_eval/utils/external_docling_document_loader.py similarity index 100% rename from docling_eval/utils/external_docling_doc_loader.py rename to docling_eval/utils/external_docling_document_loader.py From 94b39385cbd78f5b25d8548307bf6a0968764006 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Mon, 8 Dec 2025 13:31:28 +0100 Subject: [PATCH 18/22] fix: Fix typo Signed-off-by: Nikos Livathinos --- docs/examples/matrix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/examples/matrix.py b/docs/examples/matrix.py index a6e04cb2..6d7625c0 100644 --- a/docs/examples/matrix.py +++ b/docs/examples/matrix.py @@ -85,7 +85,7 @@ def main(args): if __name__ == "__main__": - desription = """ + description = """ Running multi-evaluation and consolidation inside a working directory and generate matrix reports The working directory must have the structure: @@ -108,7 +108,7 @@ def main(args): └── evaluation__.json """ parser = argparse.ArgumentParser( - description=desription, formatter_class=argparse.RawTextHelpFormatter + description=description, formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument( "-t", From ae10646fdd9ed91638bcaa9b18b99e4d71c5090e Mon Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Mon, 8 Dec 2025 13:54:25 +0100 Subject: [PATCH 19/22] feat: Introduce examples how to evaluate using external predictions using the API and the CLI. 
Signed-off-by: Nikos Livathinos --- ...valuate_dpbench_on_external_predictions.sh | 72 ++++++++++++++++ .../examples/evaluate_external_predictions.py | 85 +++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100755 docs/examples/evaluate_dpbench_on_external_predictions.sh create mode 100644 docs/examples/evaluate_external_predictions.py diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh new file mode 100755 index 00000000..413b12de --- /dev/null +++ b/docs/examples/evaluate_dpbench_on_external_predictions.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +########################################################################################### +# Invariants +# + +readonly GT_DIR=scratch/DPBench/gt_dataset + +readonly MODALITIES=( +layout +table_structure +document_structure +reading_order +markdown_text +bboxes_text +key_value +timings +) + + +########################################################################################### +# Functions +# + +evaluate() { + local pred_dir save_dir modality + pred_dir="$1" + save_dir="$2" + + # Check if the GT/preds dirs exist + if [ ! -d "${GT_DIR}" ]; then + echo "Missing GT dir: ${GT_DIR}" + exit 1 + fi + if [ ! 
-d "${pred_dir}" ]; then + echo "Missing predictions dir: ${pred_dir}" + exit 2 + fi + + for modality in "${MODALITIES[@]}"; do + echo "Evaluation modality: ${modality}, predictions: ${pred_dir}" + uv run docling-eval evaluate \ + --benchmark DPBench \ + --modality "${modality}" \ + --input-dir "${GT_DIR}" \ + --external-predictions-path "${pred_dir}" \ + --output-dir "${save_dir}" + done +} + + +########################################################################################### +# Main +# + +# json predictions +evaluate \ + scratch/DPBench/predicted_documents/json \ + scratch/DPBench/external_evaluations_jsons + + +# doctags predictions +evaluate \ + scratch/DPBench/predicted_documents/doctag \ + scratch/DPBench/external_evaluations_doctags + + +# yaml predictions +evaluate \ + scratch/DPBench/predicted_documents/yaml \ + scratch/DPBench/external_evaluations_yaml + diff --git a/docs/examples/evaluate_external_predictions.py b/docs/examples/evaluate_external_predictions.py new file mode 100644 index 00000000..9e7f9dd9 --- /dev/null +++ b/docs/examples/evaluate_external_predictions.py @@ -0,0 +1,85 @@ +import argparse +import logging +from pathlib import Path + +from docling_eval.cli.main import evaluate +from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality + +_log = logging.getLogger(__name__) + + +def evaluate_external_predictions( + benchmark: BenchMarkNames, + modality: EvaluationModality, + gt_path: Path, + predictions_dir: Path, + save_dir: Path, +): + r""" """ + evaluate( + modality, + benchmark, + gt_path, + save_dir, + external_predictions_path=predictions_dir, + ) + + +def main(): + r""" """ + parser = argparse.ArgumentParser( + description="Example how to use GT from parquet and predictions from externally provided prediction files", + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "-b", + "--benchmark", + required=True, + type=BenchMarkNames, + help="Evaluation modality", + ) + 
parser.add_argument( + "-m", + "--modality", + required=True, + type=EvaluationModality, + help="Evaluation modality", + ) + parser.add_argument( + "-g", + "--gt_parquet_dir", + required=True, + type=Path, + help="Path to the parquet GT dataset", + ) + parser.add_argument( + "-p", + "--predictions_dir", + required=True, + type=Path, + help="Dir with the external prediction files (json, dt, yaml)", + ) + parser.add_argument( + "-s", + "--save_dir", + required=False, + type=Path, + help="Path to save the produced evaluation files", + ) + args = parser.parse_args() + + # Configure logger + log_format = "%(asctime)s - %(levelname)s - %(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + evaluate_external_predictions( + args.benchmark, + args.modality, + args.gt_parquet_dir, + args.predictions_dir, + args.save_dir, + ) + + +if __name__ == "__main__": + main() From 8c52e36b48defb2c1372af583940d89fec0a423f Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 8 Dec 2025 15:16:19 +0100 Subject: [PATCH 20/22] feat: Prediction vizualizer Signed-off-by: Christoph Auer --- docling_eval/cli/main.py | 66 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index d65bd634..2de5afe9 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -126,6 +126,7 @@ from docling_eval.prediction_providers.tableformer_provider import ( TableFormerPredictionProvider, ) +from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer class DoclingLayoutOptionsManager: @@ -362,7 +363,7 @@ def get_prediction_provider( docling_layout_keep_empty_clusters: Optional[bool] = None, # Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING) docling_programmatic_add_orphan_text_cells: Optional[bool] = None, - docling_force_full_page_ocr: Optional[bool] = None, + docling_force_full_page_ocr: bool = False, 
granite_docling_vlm_options: Optional[InlineVlmOptions] = None, max_new_tokens: Optional[int] = None, ): @@ -376,7 +377,7 @@ def get_prediction_provider( ocr_factory = get_ocr_factory() ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore - kind="easyocr", + kind="rapidocr", force_full_page_ocr=docling_force_full_page_ocr, ) # Use all CPU cores @@ -1570,6 +1571,67 @@ def visualize_cmd( ) +@app.command(name="create_viz") +def create_viz( + dataset_dir: Annotated[ + Path, + typer.Option( + help=( + "Dataset directory (GT parquet or eval_dataset parquet with predictions) " + "containing the split folder with parquet shards." + ) + ), + ], + split: Annotated[str, typer.Option(help="Dataset split to visualize")] = "test", + external_predictions_path: Annotated[ + Optional[Path], + typer.Option( + help=( + "Directory with DoclingDocument predictions named as .[json|dt|yaml|yml]. " + "If omitted, predictions are taken from the dataset parquet." + ) + ), + ] = None, + output_dir: Annotated[ + Optional[Path], + typer.Option( + help=( + "Directory where HTML visualizations are written. Defaults to " + "/visualizations when omitted." + ) + ), + ] = None, + begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0, + end_index: Annotated[ + int, typer.Option(help="End index (exclusive), -1 for all") + ] = -1, + ignore_missing_predictions: Annotated[ + bool, + typer.Option( + help="Skip documents without a matching prediction instead of failing" + ), + ] = False, +): + """ + Create paired GT vs. prediction HTML visualizations without generating parquet output. 
+ """ + visualizations_dir = ( + output_dir if output_dir is not None else dataset_dir / "visualizations" + ) + + visualizer = PredictionsVisualizer( + visualizations_dir=visualizations_dir, + external_predictions_dir=external_predictions_path, + ignore_missing_predictions=ignore_missing_predictions, + ) + visualizer.create_visualizations( + dataset_dir=dataset_dir, + split=split, + begin_index=begin_index, + end_index=end_index, + ) + + @app.callback() def main(): """Docling Evaluation CLI for benchmarking document processing tasks.""" From 6f7331c71050afb27a9749f9a750427a125b1a7d Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Tue, 9 Dec 2025 08:20:33 +0100 Subject: [PATCH 21/22] Update docling_eval/utils/external_predictions_visualizer.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> --- .../utils/external_predictions_visualizer.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/docling_eval/utils/external_predictions_visualizer.py b/docling_eval/utils/external_predictions_visualizer.py index 2eed6946..35342c98 100644 --- a/docling_eval/utils/external_predictions_visualizer.py +++ b/docling_eval/utils/external_predictions_visualizer.py @@ -150,9 +150,14 @@ def _save_visualization(self, record: DatasetRecordWithPrediction) -> None: record.predicted_page_images, ) - save_comparison_html_with_clusters( - filename=self._visualizations_dir / f"{record.doc_id}.html", - true_doc=gt_doc, - pred_doc=pred_doc, - draw_reading_order=True, - ) + try: + save_comparison_html_with_clusters( + filename=self._visualizations_dir / f"{record.doc_id}.html", + true_doc=gt_doc, + pred_doc=pred_doc, + draw_reading_order=True, + ) + except (IndexError, ValueError) as e: + _LOGGER.warning( + f"Failed to save visualization for doc_id {record.doc_id}: {e}" + ) From 21eae304e5bbedf300318ff17d6fd6dc916e8d2c Mon 
Sep 17 00:00:00 2001 From: Nikos Livathinos Date: Tue, 9 Dec 2025 13:35:19 +0100 Subject: [PATCH 22/22] feat: Update examples bash script to demonstrate visualisations on external predictions Signed-off-by: Nikos Livathinos --- ...valuate_dpbench_on_external_predictions.sh | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh index 413b12de..6189e90b 100755 --- a/docs/examples/evaluate_dpbench_on_external_predictions.sh +++ b/docs/examples/evaluate_dpbench_on_external_predictions.sh @@ -38,7 +38,7 @@ evaluate() { fi for modality in "${MODALITIES[@]}"; do - echo "Evaluation modality: ${modality}, predictions: ${pred_dir}" + echo "Evaluate: modality: ${modality}: predictions: ${pred_dir}" uv run docling-eval evaluate \ --benchmark DPBench \ --modality "${modality}" \ @@ -49,24 +49,54 @@ evaluate() { } +visualize() { + local pred_dir save_dir modality + pred_dir="$1" + save_dir="$2" + + # Check if the GT/preds dirs exist + if [ ! -d "${GT_DIR}" ]; then + echo "Missing GT dir: ${GT_DIR}" + exit 1 + fi + if [ ! 
-d "${pred_dir}" ]; then + echo "Missing predictions dir: ${pred_dir}" + exit 2 + fi + + echo "Visualize predictions: ${pred_dir}" + uv run docling-eval create_viz \ + --dataset-dir "${GT_DIR}" \ + --external-predictions-path "${pred_dir}" \ + --output-dir "${save_dir}" +} + ########################################################################################### # Main # +# Predictions + # json predictions evaluate \ scratch/DPBench/predicted_documents/json \ - scratch/DPBench/external_evaluations_jsons + scratch/DPBench/external_predictions_jsons # doctags predictions evaluate \ scratch/DPBench/predicted_documents/doctag \ - scratch/DPBench/external_evaluations_doctags + scratch/DPBench/external_predictions_doctags # yaml predictions evaluate \ scratch/DPBench/predicted_documents/yaml \ - scratch/DPBench/external_evaluations_yaml + scratch/DPBench/external_predictions_yaml + + +# Visualisations +visualize \ + scratch/DPBench/predicted_documents/json \ + scratch/DPBench/external_predictions_visualisations