Merged
Changes from all commits (21 commits)
6331723  chore: Move the teds.py inside the subdir evaluators/table (nikos-livathinos, Dec 4, 2025)
85890fb  feat: Introduce the external_predictions_path in BaseEvaluator and du… (nikos-livathinos, Dec 4, 2025)
5f9a279  feat: Extend test_dataset_builder.py to save document predictions in … (nikos-livathinos, Dec 4, 2025)
e6e8409  feat: Extend MarkDownTextEvaluator to support external_predictions_pa… (nikos-livathinos, Dec 4, 2025)
5624e61  feat: Extend LayoutEvaluator to support external_predictions_path. Ad… (nikos-livathinos, Dec 4, 2025)
426b6d1  Merge branch 'main' into nli/external_predictions (nikos-livathinos, Dec 4, 2025)
171ad74  fix: Add missing pytest dependencies in tests (nikos-livathinos, Dec 4, 2025)
0f0cfb5  fix: Fix loading the external predictions in LayoutEvaluator (nikos-livathinos, Dec 4, 2025)
8069571  feat: Introduce external predictions in DocStructureEvaluator. Add un… (nikos-livathinos, Dec 4, 2025)
8ba6b45  feat: Extend the TableEvaluator to support external predictions. Add … (nikos-livathinos, Dec 4, 2025)
949d6cc  feat: Extend the KeyValueEvaluator to support external predictions. A… (nikos-livathinos, Dec 5, 2025)
13badc5  feat: Extend the PixelLayoutEvaluator to support external predictions… (nikos-livathinos, Dec 5, 2025)
8c2a065  feat: Extend the BboxTextEvaluator to support external predictions. A… (nikos-livathinos, Dec 5, 2025)
08391b3  feat: Disable the OCREvaluator when using the external predictions (nikos-livathinos, Dec 5, 2025)
595ba6c  fix: Fixing guard for external predictions in TimingsEvaluator, Readi… (nikos-livathinos, Dec 5, 2025)
406b122  fix: Export the doctag files with the correct file extension (nikos-livathinos, Dec 5, 2025)
ebe70b0  feat: Refactor the ExternalDoclingDocumentLoader to properly load a D… (nikos-livathinos, Dec 5, 2025)
33511c9  chore: Rename code file as external_docling_document_loader.py (nikos-livathinos, Dec 5, 2025)
b1525b6  Merge branch 'main' into nli/external_predictions (nikos-livathinos, Dec 8, 2025)
94b3938  fix: Fix typo (nikos-livathinos, Dec 8, 2025)
ae10646  feat: Introduce examples how to evaluate using external predictions u… (nikos-livathinos, Dec 8, 2025)
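
Taken together, these commits let every evaluator score predictions produced outside the pipeline, instead of the `predicted_doc` stored in the parquet dataset. Below is a minimal sketch of driving an evaluator this way, based on the `__call__` signature introduced in `base_evaluator.py` and the CLI help text in this diff; the `LayoutEvaluator` import path and its no-argument constructor are assumptions and are not shown in this PR.

```python
from pathlib import Path

from docling_eval.evaluators.layout_evaluator import LayoutEvaluator  # import path assumed

# The dataset root is expected to contain <split>/*.parquet files; the external
# predictions directory holds one DoclingDocument per sample, named
# [doc_id].[json|dt|yaml|yml] (pattern taken from the new CLI help text).
evaluator = LayoutEvaluator()
evaluation = evaluator(
    Path("benchmarks/my_dataset"),                      # ds_path (illustrative)
    split="test",
    external_predictions_path=Path("external_preds"),   # new in this PR
)
print(evaluation.model_dump())
```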
58 changes: 41 additions & 17 deletions docling_eval/cli/main.py
@@ -639,6 +639,7 @@ def evaluate(
odir: Path,
split: str = "test",
cvat_overview_path: Optional[Path] = None,
external_predictions_path: Optional[Path] = None,
) -> Optional[DatasetEvaluationType]:
"""Evaluate predictions against ground truth."""
if not os.path.exists(idir):
@@ -659,6 +660,7 @@
evaluation = timings_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -673,6 +675,7 @@
evaluation = layout_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -681,7 +684,9 @@
# Evaluate with the pixel-wise layout evaluation
pixel_layout_evaluator = PixelLayoutEvaluator()
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
idir, split=split
idir,
split=split,
external_predictions_path=external_predictions_path,
)
pixel_save_root: Path = save_fn.parent
pixel_layout_evaluator.save_evaluations(
@@ -695,6 +700,7 @@
evaluation = table_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -707,36 +713,44 @@
evaluation = doc_struct_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)

elif modality == EvaluationModality.OCR:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})")
logging.info(f"Text unit set to {text_unit}")
if not external_predictions_path:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(
f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})"
)
logging.info(f"Text unit set to {text_unit}")

ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
)
ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
else:
logging.error("External predictions are not supported for OCR evaluations")

elif modality == EvaluationModality.READING_ORDER:
readingorder_evaluator = ReadingOrderEvaluator()
evaluation = readingorder_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -753,6 +767,7 @@
evaluation = md_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -769,6 +784,7 @@
evaluation = bbox_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -784,6 +800,7 @@
evaluation = keyvalue_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -1487,6 +1504,12 @@ def evaluate_cmd(
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
external_predictions_path: Annotated[
Optional[Path],
typer.Option(
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
),
] = None,
):
"""Evaluate predictions against ground truth."""
input_dir, output_dir = derive_input_output_dirs(
@@ -1506,6 +1529,7 @@
idir=input_dir,
odir=eval_output_dir,
split=split,
external_predictions_path=external_predictions_path,
)


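The same option is threaded through the `evaluate()` helper and exposed on the `evaluate` CLI command (Typer would typically surface it as `--external-predictions-path`, though the exact flag spelling is not shown here). A sketch of calling the helper programmatically follows; the keyword names `modality` and `benchmark` and the `docling_eval.datamodels.types` import path are assumptions, since those parts of the signature sit outside the hunks above, while `idir`, `odir`, `split`, and `external_predictions_path` are taken from the diff.

```python
from pathlib import Path

from docling_eval.cli.main import evaluate
from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality  # import path assumed

evaluate(
    modality=EvaluationModality.READING_ORDER,    # keyword name assumed; modality shown in the diff
    benchmark=BenchMarkNames.XFUND,               # keyword name assumed; XFUND appears in the diff
    idir=Path("benchmarks/xfund/eval_dataset"),   # illustrative paths
    odir=Path("benchmarks/xfund/evaluations"),
    split="test",
    external_predictions_path=Path("external_preds"),
)
```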
1 change: 1 addition & 0 deletions docling_eval/evaluators/base_evaluator.py
@@ -100,6 +100,7 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetEvaluationType:
r"""
Perform the evaluation
37 changes: 32 additions & 5 deletions docling_eval/evaluators/bbox_text_evaluator.py
@@ -4,7 +4,7 @@

import nltk
from datasets import load_dataset
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.base import BoundingBox, CoordOrigin
from docling_core.types.doc.document import DoclingDocument, TextItem
from nltk import edit_distance, word_tokenize
from nltk.metrics import f_measure, precision, recall
@@ -25,6 +25,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

@@ -94,8 +97,16 @@ def __init_(
nltk.download("popular", quiet=True)

def __call__(
self, ds_path: Path, split: str = "test"
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetBoxesTextEvaluation:
r""" """
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"oveview of dataset: {ds}")
@@ -125,15 +136,23 @@ def __call__(
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc

# Load the pred_doc
if ext_docdoc_loader is not None:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc
if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
@@ -212,7 +231,15 @@ def _match_bboxes(
continue
assert len(doc_item.prov) == 1
prov = doc_item.prov[0]
bboxes[doc_key].append(prov.bbox)

# Ensure bbox is in top-left origin
bbox = prov.bbox
if bbox.coord_origin != CoordOrigin.TOPLEFT:
page_no = prov.page_no
page_size = doc.pages[page_no].size
bbox = bbox.to_top_left_origin(page_size.height)

bboxes[doc_key].append(bbox)
texts[doc_key].append(doc_item.text)

# Decide which document is the pivot
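The `_match_bboxes` change above normalizes provenance boxes to a top-left coordinate origin before pairing them with ground truth. A small illustration of that conversion, using the `docling_core` types already imported in this file (the numeric values are made up):

```python
from docling_core.types.doc.base import BoundingBox, CoordOrigin

# A box stored with a bottom-left origin on a page that is 792 points tall:
# t and b are measured upward from the bottom edge.
bbox = BoundingBox(l=72.0, t=720.0, r=300.0, b=700.0, coord_origin=CoordOrigin.BOTTOMLEFT)

# Flip it so t and b are measured downward from the top edge, as the
# evaluator expects when comparing against ground-truth boxes.
top_left_bbox = bbox.to_top_left_origin(792.0)
print(top_left_bbox.coord_origin)  # CoordOrigin.TOPLEFT
```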
18 changes: 16 additions & 2 deletions docling_eval/evaluators/doc_structure_evaluator.py
@@ -18,6 +18,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -71,13 +74,18 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetDocStructureEvaluation:
r"""
Parameters
----------
ds_path: Path to load the parquet files of the dataset
split: Split of the dataset to load
"""
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"Overview of the dataset: {ds}")
@@ -106,15 +114,21 @@ def __call__(
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc
if ext_docdoc_loader:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc

if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
26 changes: 22 additions & 4 deletions docling_eval/evaluators/keyvalue_evaluator.py
@@ -21,6 +21,9 @@
docling_document_from_doctags,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -415,7 +418,17 @@ def __init__(
# --------------------------------------------------------------------- #
# Public API
# --------------------------------------------------------------------- #
def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluation:
def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetKeyValueEvaluation:
r""" """
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

split_glob = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: split_glob})
_log.info("Loaded split '%s' – %d samples", split, len(ds[split]))
@@ -461,13 +474,13 @@ def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluat
doc_id = record.doc_id

# ----- sanity checks --------------------------------------------------
if record.status not in self._accepted_status:
if ext_docdoc_loader is None and record.status not in self._accepted_status:
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
_log.error("Skipping %s – conversion failed", doc_id)
continue

gt_doc = record.ground_truth_doc
pred_doc = self._get_pred_doc(record)
pred_doc = self._get_pred_doc(record, ext_docdoc_loader)
if pred_doc is None:
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
_log.error("Skipping %s – missing prediction", doc_id)
@@ -635,10 +648,15 @@ def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluat
# Helpers
# --------------------------------------------------------------------- #
def _get_pred_doc(
self, data_record: DatasetRecordWithPrediction
self,
data_record: DatasetRecordWithPrediction,
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None,
) -> Optional[DoclingDocument]:
"""Fetch the prediction in the first available format declared by `prediction_sources`."""
pred_doc: Optional[DoclingDocument] = None
if ext_docdoc_loader is not None:
pred_doc = ext_docdoc_loader(data_record)
return pred_doc

for fmt in self._prediction_sources:
if fmt == PredictionFormats.DOCLING_DOCUMENT:
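Every evaluator above delegates the actual file lookup to `ExternalDoclingDocumentLoader`, whose implementation is not part of this section. A rough sketch of the contract the callers rely on, covering only the JSON case; the file-resolution logic is an assumption based on the documented `[doc_id].[json|dt|yaml|yml]` naming pattern.

```python
from pathlib import Path
from typing import Optional

from docling_core.types.doc.document import DoclingDocument


class ExternalDoclingDocumentLoaderSketch:
    """Illustrative stand-in for ExternalDoclingDocumentLoader (JSON files only)."""

    def __init__(self, predictions_path: Path) -> None:
        self._predictions_path = predictions_path

    def __call__(self, data_record) -> Optional[DoclingDocument]:
        # Callers pass a DatasetRecordWithPrediction; only its doc_id is needed here.
        candidate = self._predictions_path / f"{data_record.doc_id}.json"
        if not candidate.exists():
            # The real loader also handles .dt / .yaml / .yml; evaluators treat None
            # as a missing prediction and count the sample as rejected.
            return None
        return DoclingDocument.model_validate_json(candidate.read_text(encoding="utf-8"))
```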