Merged
Changes from 23 commits

Commits (26)
6331723
chore: Move the teds.py inside the subdir evaluators/table
nikos-livathinos Dec 4, 2025
85890fb
feat: Introduce the external_predictions_path in BaseEvaluator and du…
nikos-livathinos Dec 4, 2025
5f9a279
feat: Extend test_dataset_builder.py to save document predictions in …
nikos-livathinos Dec 4, 2025
e6e8409
feat: Extend MarkDownTextEvaluator to support external_predictions_pa…
nikos-livathinos Dec 4, 2025
5624e61
feat: Extend LayoutEvaluator to support external_predictions_path. Ad…
nikos-livathinos Dec 4, 2025
426b6d1
Merge branch 'main' into nli/external_predictions
nikos-livathinos Dec 4, 2025
171ad74
fix: Add missing pytest dependencies in tests
nikos-livathinos Dec 4, 2025
0f0cfb5
fix: Fix loading the external predictions in LayoutEvaluator
nikos-livathinos Dec 4, 2025
8069571
feat: Introduce external predictions in DocStructureEvaluator. Add un…
nikos-livathinos Dec 4, 2025
8ba6b45
feat: Extend the TableEvaluator to support external predictions. Add …
nikos-livathinos Dec 4, 2025
949d6cc
feat: Extend the KeyValueEvaluator to support external predictions. A…
nikos-livathinos Dec 5, 2025
13badc5
feat: Extend the PixelLayoutEvaluator to support external predictions…
nikos-livathinos Dec 5, 2025
8c2a065
feat: Extend the BboxTextEvaluator to support external predictions. A…
nikos-livathinos Dec 5, 2025
08391b3
feat: Disable the OCREvaluator when using the external predictions
nikos-livathinos Dec 5, 2025
595ba6c
fix: Fixing guard for external predictions in TimingsEvaluator, Readi…
nikos-livathinos Dec 5, 2025
406b122
fix: Export the doctag files with the correct file extension
nikos-livathinos Dec 5, 2025
ebe70b0
feat: Refactor the ExternalDoclingDocumentLoader to properly load a D…
nikos-livathinos Dec 5, 2025
33511c9
chore: Rename code file as external_docling_document_loader.py
nikos-livathinos Dec 5, 2025
b1525b6
Merge branch 'main' into nli/external_predictions
nikos-livathinos Dec 8, 2025
94b3938
fix: Fix typo
nikos-livathinos Dec 8, 2025
ae10646
feat: Introduce examples how to evaluate using external predictions u…
nikos-livathinos Dec 8, 2025
8c52e36
feat: Prediction vizualizer
cau-git Dec 8, 2025
71f5e17
feat: Prediction vizualizer
cau-git Dec 8, 2025
6f7331c
Update docling_eval/utils/external_predictions_visualizer.py
cau-git Dec 9, 2025
57bd131
Merge branch 'main' into cau/add-external-vis-tool
nikos-livathinos Dec 9, 2025
21eae30
feat: Update examples bash script to demonstrate visualisations on ex…
nikos-livathinos Dec 9, 2025
124 changes: 105 additions & 19 deletions docling_eval/cli/main.py
@@ -126,6 +126,7 @@
from docling_eval.prediction_providers.tableformer_provider import (
TableFormerPredictionProvider,
)
from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer


class DoclingLayoutOptionsManager:
@@ -362,7 +363,7 @@ def get_prediction_provider(
docling_layout_keep_empty_clusters: Optional[bool] = None,
# Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING)
docling_programmatic_add_orphan_text_cells: Optional[bool] = None,
docling_force_full_page_ocr: Optional[bool] = None,
docling_force_full_page_ocr: bool = False,
granite_docling_vlm_options: Optional[InlineVlmOptions] = None,
max_new_tokens: Optional[int] = None,
):
@@ -376,7 +377,7 @@ def get_prediction_provider(
ocr_factory = get_ocr_factory()

ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
kind="easyocr",
kind="rapidocr",
force_full_page_ocr=docling_force_full_page_ocr,
)
# Use all CPU cores
@@ -639,6 +640,7 @@ def evaluate(
odir: Path,
split: str = "test",
cvat_overview_path: Optional[Path] = None,
external_predictions_path: Optional[Path] = None,
) -> Optional[DatasetEvaluationType]:
"""Evaluate predictions against ground truth."""
if not os.path.exists(idir):
@@ -659,6 +661,7 @@
evaluation = timings_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -673,6 +676,7 @@
evaluation = layout_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -681,7 +685,9 @@
# Evaluate with the pixel-wise layout evaluation
pixel_layout_evaluator = PixelLayoutEvaluator()
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
idir, split=split
idir,
split=split,
external_predictions_path=external_predictions_path,
)
pixel_save_root: Path = save_fn.parent
pixel_layout_evaluator.save_evaluations(
@@ -695,6 +701,7 @@
evaluation = table_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -707,36 +714,44 @@
evaluation = doc_struct_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)

elif modality == EvaluationModality.OCR:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})")
logging.info(f"Text unit set to {text_unit}")
if not external_predictions_path:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(
f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})"
)
logging.info(f"Text unit set to {text_unit}")

ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
)
ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
else:
logging.error("External predictions are not supported for OCR evaluations")

elif modality == EvaluationModality.READING_ORDER:
readingorder_evaluator = ReadingOrderEvaluator()
evaluation = readingorder_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -753,6 +768,7 @@
evaluation = md_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -769,6 +785,7 @@
evaluation = bbox_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -784,6 +801,7 @@
evaluation = keyvalue_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -1487,6 +1505,12 @@ def evaluate_cmd(
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
external_predictions_path: Annotated[
Optional[Path],
typer.Option(
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
),
] = None,
):
"""Evaluate predictions against ground truth."""
input_dir, output_dir = derive_input_output_dirs(
@@ -1506,6 +1530,7 @@
idir=input_dir,
odir=eval_output_dir,
split=split,
external_predictions_path=external_predictions_path,
)


@@ -1554,6 +1579,67 @@ def visualize_cmd(
)


@app.command(name="create_viz")
def create_viz(
dataset_dir: Annotated[
Path,
typer.Option(
help=(
"Dataset directory (GT parquet or eval_dataset parquet with predictions) "
"containing the split folder with parquet shards."
)
),
],
split: Annotated[str, typer.Option(help="Dataset split to visualize")] = "test",
external_predictions_path: Annotated[
Optional[Path],
typer.Option(
help=(
"Directory with DoclingDocument predictions named as <doc_id>.[json|dt|yaml|yml]. "
"If omitted, predictions are taken from the dataset parquet."
)
),
] = None,
output_dir: Annotated[
Optional[Path],
typer.Option(
help=(
"Directory where HTML visualizations are written. Defaults to "
"<dataset_dir>/visualizations when omitted."
)
),
] = None,
begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0,
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
ignore_missing_predictions: Annotated[
bool,
typer.Option(
help="Skip documents without a matching prediction instead of failing"
),
] = False,
):
"""
Create paired GT vs. prediction HTML visualizations without generating parquet output.
"""
visualizations_dir = (
output_dir if output_dir is not None else dataset_dir / "visualizations"
)

visualizer = PredictionsVisualizer(
visualizations_dir=visualizations_dir,
external_predictions_dir=external_predictions_path,
ignore_missing_predictions=ignore_missing_predictions,
)
visualizer.create_visualizations(
dataset_dir=dataset_dir,
split=split,
begin_index=begin_index,
end_index=end_index,
)


@app.callback()
def main():
"""Docling Evaluation CLI for benchmarking document processing tasks."""
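For anyone trying the new `create_viz` command, here is a minimal sketch of the equivalent programmatic call, mirroring the Typer command body above. The directory paths are placeholders; `PredictionsVisualizer`, its constructor arguments, and `create_visualizations` are taken from this diff.

```python
from pathlib import Path

from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer

# Placeholder paths: point these at a real eval dataset and prediction directory.
dataset_dir = Path("benchmarks/my_dataset")
predictions_dir = Path("predictions")  # files named <doc_id>.[json|dt|yaml|yml]

visualizer = PredictionsVisualizer(
    visualizations_dir=dataset_dir / "visualizations",  # the CLI default
    external_predictions_dir=predictions_dir,
    ignore_missing_predictions=True,  # skip docs without a matching prediction
)
visualizer.create_visualizations(
    dataset_dir=dataset_dir,
    split="test",
    begin_index=0,
    end_index=-1,  # -1 visualizes the whole split
)
```
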
1 change: 1 addition & 0 deletions docling_eval/evaluators/base_evaluator.py
@@ -100,6 +100,7 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetEvaluationType:
r"""
Perform the evaluation
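Since `BaseEvaluator.__call__` now accepts `external_predictions_path`, every evaluator can be pointed at a directory of externally produced DoclingDocument files. A minimal sketch of the calling convention follows; the paths are placeholders and the `LayoutEvaluator` import path is an assumption (the import is not shown in this hunk):

```python
from pathlib import Path

# Assumed module path; the evaluate() function above uses this class.
from docling_eval.evaluators.layout_evaluator import LayoutEvaluator

# Placeholder paths; prediction files must be named <doc_id>.[json|dt|yaml|yml].
ds_path = Path("benchmarks/my_dataset")
predictions_dir = Path("predictions")

evaluator = LayoutEvaluator()
evaluation = evaluator(
    ds_path,
    split="test",
    external_predictions_path=predictions_dir,
)
```
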
37 changes: 32 additions & 5 deletions docling_eval/evaluators/bbox_text_evaluator.py
@@ -4,7 +4,7 @@

import nltk
from datasets import load_dataset
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.base import BoundingBox, CoordOrigin
from docling_core.types.doc.document import DoclingDocument, TextItem
from nltk import edit_distance, word_tokenize
from nltk.metrics import f_measure, precision, recall
@@ -25,6 +25,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

@@ -94,8 +97,16 @@ def __init_(
nltk.download("popular", quiet=True)

def __call__(
self, ds_path: Path, split: str = "test"
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetBoxesTextEvaluation:
r""" """
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"oveview of dataset: {ds}")
@@ -125,15 +136,23 @@ def __call__(
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc

# Load the pred_doc
if ext_docdoc_loader is not None:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc
if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
@@ -212,7 +231,15 @@ def _match_bboxes(
continue
assert len(doc_item.prov) == 1
prov = doc_item.prov[0]
bboxes[doc_key].append(prov.bbox)

# Ensure bbox is in top-left origin
bbox = prov.bbox
if bbox.coord_origin != CoordOrigin.TOPLEFT:
page_no = prov.page_no
page_size = doc.pages[page_no].size
bbox = bbox.to_top_left_origin(page_size.height)

bboxes[doc_key].append(bbox)
texts[doc_key].append(doc_item.text)

# Decide which document is the pivot
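The `CoordOrigin` guard added in `_match_bboxes` is needed because provenance boxes may still be stored bottom-left (the PDF convention). A standalone sketch of the same normalization, using the docling-core types imported above; the numeric values are illustrative only:

```python
from docling_core.types.doc.base import BoundingBox, CoordOrigin

page_height = 792.0  # illustrative page height in points

# A bottom-left-origin box: t and b are measured up from the page bottom.
bbox = BoundingBox(
    l=100.0, t=700.0, r=200.0, b=650.0, coord_origin=CoordOrigin.BOTTOMLEFT
)

# Same guard as in _match_bboxes: convert only when needed.
if bbox.coord_origin != CoordOrigin.TOPLEFT:
    bbox = bbox.to_top_left_origin(page_height)

# Now TOPLEFT: t = 792 - 700 = 92.0, b = 792 - 650 = 142.0
print(bbox)
```
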
18 changes: 16 additions & 2 deletions docling_eval/evaluators/doc_structure_evaluator.py
@@ -18,6 +18,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

@@ -71,13 +74,18 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetDocStructureEvaluation:
r"""
Parameters
----------
ds_path: Path to load the parquet files of the dataset
split: Split of the dataset to load
"""
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"Overview of the dataset: {ds}")
@@ -106,15 +114,21 @@
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc
if ext_docdoc_loader:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc

if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
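`ExternalDoclingDocumentLoader` itself (docling_eval/utils/external_docling_document_loader.py) is not part of the hunks shown here. The following is therefore only a hypothetical sketch of the documented contract, resolving `<doc_id>.[json|dt|yaml|yml]` to a DoclingDocument, not the real implementation:

```python
from pathlib import Path
from typing import Optional

from docling_core.types.doc.document import DoclingDocument


class ExternalPredictionLoaderSketch:
    """Hypothetical stand-in for ExternalDoclingDocumentLoader (not in this diff)."""

    SUFFIXES = (".json", ".dt", ".yaml", ".yml")

    def __init__(self, predictions_dir: Path) -> None:
        self._dir = predictions_dir

    def load(self, doc_id: str) -> Optional[DoclingDocument]:
        # Resolve <doc_id>.<suffix> against the documented filename pattern.
        for suffix in self.SUFFIXES:
            candidate = self._dir / f"{doc_id}{suffix}"
            if candidate.is_file():
                if suffix == ".json":
                    return DoclingDocument.load_from_json(candidate)
                # DocTags (.dt) and YAML files need their own deserializers;
                # this sketch covers JSON only.
                raise NotImplementedError(f"sketch does not handle {suffix}")
        return None  # callers count this as MISSING_PREDICTION
```

The evaluator hunks above then consult the loader per record and fall back to `data_record.predicted_doc` when no external predictions directory is given.
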