Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
6331723
chore: Move the teds.py inside the subdir evaluators/table
nikos-livathinos Dec 4, 2025
85890fb
feat: Introduce the external_predictions_path in BaseEvaluator and du…
nikos-livathinos Dec 4, 2025
5f9a279
feat: Extend test_dataset_builder.py to save document predictions in …
nikos-livathinos Dec 4, 2025
e6e8409
feat: Extend MarkDownTextEvaluator to support external_predictions_pa…
nikos-livathinos Dec 4, 2025
5624e61
feat: Extend LayoutEvaluator to support external_predictions_path. Ad…
nikos-livathinos Dec 4, 2025
426b6d1
Merge branch 'main' into nli/external_predictions
nikos-livathinos Dec 4, 2025
171ad74
fix: Add missing pytest dependencies in tests
nikos-livathinos Dec 4, 2025
0f0cfb5
fix: Fix loading the external predictions in LayoutEvaluator
nikos-livathinos Dec 4, 2025
8069571
feat: Introduce external predictions in DocStructureEvaluator. Add un…
nikos-livathinos Dec 4, 2025
8ba6b45
feat: Extend the TableEvaluator to support external predictions. Add …
nikos-livathinos Dec 4, 2025
949d6cc
feat: Extend the KeyValueEvaluator to support external predictions. A…
nikos-livathinos Dec 5, 2025
13badc5
feat: Extend the PixelLayoutEvaluator to support external predictions…
nikos-livathinos Dec 5, 2025
8c2a065
feat: Extend the BboxTextEvaluator to support external predictions. A…
nikos-livathinos Dec 5, 2025
08391b3
feat: Disable the OCREvaluator when using the external predictions
nikos-livathinos Dec 5, 2025
595ba6c
fix: Fixing guard for external predictions in TimingsEvaluator, Readi…
nikos-livathinos Dec 5, 2025
406b122
fix: Export the doctag files with the correct file extension
nikos-livathinos Dec 5, 2025
ebe70b0
feat: Refactor the ExternalDoclingDocumentLoader to properly load a D…
nikos-livathinos Dec 5, 2025
33511c9
chore: Rename code file as external_docling_document_loader.py
nikos-livathinos Dec 5, 2025
b1525b6
Merge branch 'main' into nli/external_predictions
nikos-livathinos Dec 8, 2025
94b3938
fix: Fix typo
nikos-livathinos Dec 8, 2025
ae10646
feat: Introduce examples how to evaluate using external predictions u…
nikos-livathinos Dec 8, 2025
8c52e36
feat: Prediction visualizer
cau-git Dec 8, 2025
71f5e17
feat: Prediction visualizer
cau-git Dec 8, 2025
6f7331c
Update docling_eval/utils/external_predictions_visualizer.py
cau-git Dec 9, 2025
57bd131
Merge branch 'main' into cau/add-external-vis-tool
nikos-livathinos Dec 9, 2025
21eae30
feat: Update examples bash script to demonstrate visualisations on ex…
nikos-livathinos Dec 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 64 additions & 2 deletions docling_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
from docling_eval.prediction_providers.tableformer_provider import (
TableFormerPredictionProvider,
)
from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer


class DoclingLayoutOptionsManager:
Expand Down Expand Up @@ -362,7 +363,7 @@ def get_prediction_provider(
docling_layout_keep_empty_clusters: Optional[bool] = None,
# Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING)
docling_programmatic_add_orphan_text_cells: Optional[bool] = None,
docling_force_full_page_ocr: Optional[bool] = None,
docling_force_full_page_ocr: bool = False,
granite_docling_vlm_options: Optional[InlineVlmOptions] = None,
max_new_tokens: Optional[int] = None,
):
Expand All @@ -376,7 +377,7 @@ def get_prediction_provider(
ocr_factory = get_ocr_factory()

ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
kind="easyocr",
kind="rapidocr",
force_full_page_ocr=docling_force_full_page_ocr,
)
# Use all CPU cores
Expand Down Expand Up @@ -1578,6 +1579,67 @@ def visualize_cmd(
)


@app.command(name="create_viz")
def create_viz(
    dataset_dir: Annotated[
        Path,
        typer.Option(
            help=(
                "Dataset directory (GT parquet or eval_dataset parquet with predictions) "
                "containing the split folder with parquet shards."
            )
        ),
    ],
    split: Annotated[str, typer.Option(help="Dataset split to visualize")] = "test",
    external_predictions_path: Annotated[
        Optional[Path],
        typer.Option(
            help=(
                "Directory with DoclingDocument predictions named as <doc_id>.[json|dt|yaml|yml]. "
                "If omitted, predictions are taken from the dataset parquet."
            )
        ),
    ] = None,
    output_dir: Annotated[
        Optional[Path],
        typer.Option(
            help=(
                "Directory where HTML visualizations are written. Defaults to "
                "<dataset_dir>/visualizations when omitted."
            )
        ),
    ] = None,
    begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0,
    end_index: Annotated[
        int, typer.Option(help="End index (exclusive), -1 for all")
    ] = -1,
    ignore_missing_predictions: Annotated[
        bool,
        typer.Option(
            help="Skip documents without a matching prediction instead of failing"
        ),
    ] = False,
):
    """
    Create paired GT vs. prediction HTML visualizations without generating parquet output.
    """
    # Fall back to a "visualizations" folder inside the dataset when no
    # explicit output location was provided.
    if output_dir is not None:
        target_dir = output_dir
    else:
        target_dir = dataset_dir / "visualizations"

    PredictionsVisualizer(
        visualizations_dir=target_dir,
        external_predictions_dir=external_predictions_path,
        ignore_missing_predictions=ignore_missing_predictions,
    ).create_visualizations(
        dataset_dir=dataset_dir,
        split=split,
        begin_index=begin_index,
        end_index=end_index,
    )


@app.callback()
def main():
"""Docling Evaluation CLI for benchmarking document processing tasks."""
Expand Down
163 changes: 163 additions & 0 deletions docling_eval/utils/external_predictions_visualizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import logging
from pathlib import Path
from typing import List, Optional, Tuple

from datasets import Dataset, load_dataset
from docling.datamodel.base_models import ConversionStatus
from docling_core.types.doc.document import DoclingDocument
from PIL import Image
from tqdm import tqdm # type: ignore

from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
from docling_eval.datamodels.types import BenchMarkColumns, PredictionFormats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)
from docling_eval.utils.utils import extract_images, insert_images_from_pil
from docling_eval.visualisation.visualisations import save_comparison_html_with_clusters

_LOGGER = logging.getLogger(__name__)


class PredictionsVisualizer:
    """
    Produce side-by-side GT vs. prediction HTML visualizations for a dataset.

    Two input modes are supported:
    - parquet shards that already embed predictions (DatasetRecordWithPrediction)
    - ground-truth-only parquet paired with an external predictions directory
      containing DoclingDocument files named <doc_id>.[json|dt|yaml|yml]
    """

    def __init__(
        self,
        visualizations_dir: Path,
        *,
        external_predictions_dir: Optional[Path] = None,
        ignore_missing_predictions: bool = False,
    ):
        # A loader is only constructed when external predictions are requested;
        # otherwise predictions are read from the dataset rows themselves.
        self._loader: Optional[ExternalDoclingDocumentLoader] = None
        if external_predictions_dir is not None:
            self._loader = ExternalDoclingDocumentLoader(external_predictions_dir)
        self._visualizations_dir = visualizations_dir
        self._ignore_missing_predictions = ignore_missing_predictions

    def create_visualizations(
        self,
        dataset_dir: Path,
        split: str = "test",
        begin_index: int = 0,
        end_index: int = -1,
    ) -> None:
        """
        Generate paired HTML visualizations between ground truth and predictions.
        """
        rows = self._slice_dataset(
            self._load_split(dataset_dir, split), begin_index, end_index
        )
        self._visualizations_dir.mkdir(parents=True, exist_ok=True)

        for row in tqdm(
            rows,
            desc="Rendering visualizations",
            total=len(rows),
            ncols=120,
        ):
            record = DatasetRecordWithPrediction.model_validate(row)
            pred_doc = self._resolve_prediction_document(record)
            if pred_doc is None:
                message = f"Missing prediction for document {record.doc_id}"
                if not self._ignore_missing_predictions:
                    raise FileNotFoundError(message)
                _LOGGER.warning(message)
                continue

            pred_doc, pred_pictures, pred_page_images = self._prepare_prediction_assets(
                record, pred_doc
            )

            # Work on a deep copy so the dataset row is never mutated.
            viz_record = record.model_copy(deep=True)
            viz_record.predicted_doc = pred_doc
            viz_record.predicted_pictures = pred_pictures
            viz_record.predicted_page_images = pred_page_images
            viz_record.prediction_format = PredictionFormats.DOCLING_DOCUMENT
            viz_record.status = ConversionStatus.SUCCESS

            self._save_visualization(viz_record)

    def _resolve_prediction_document(
        self, record: DatasetRecordWithPrediction
    ) -> Optional[DoclingDocument]:
        # An external loader, when configured, takes precedence over any
        # prediction embedded in the parquet row.
        if self._loader is None:
            return record.predicted_doc
        return self._loader(record)

    def _prepare_prediction_assets(
        self, record: DatasetRecordWithPrediction, pred_doc: DoclingDocument
    ) -> Tuple[DoclingDocument, List[Image.Image], List[Image.Image]]:
        has_embedded_assets = bool(
            record.predicted_pictures or record.predicted_page_images
        )
        if self._loader is None and has_embedded_assets:
            # Reuse the image assets stored alongside the embedded prediction.
            return (
                pred_doc.model_copy(deep=True),
                list(record.predicted_pictures),
                list(record.predicted_page_images),
            )

        # Externally loaded (or asset-less) documents: pull the images out of
        # the document itself.
        return extract_images(
            document=pred_doc.model_copy(deep=True),
            pictures_column=BenchMarkColumns.PREDICTION_PICTURES.value,
            page_images_column=BenchMarkColumns.PREDICTION_PAGE_IMAGES.value,
        )

    def _load_split(self, dataset_dir: Path, split: str) -> Dataset:
        split_dir = dataset_dir / split
        shards = sorted(split_dir.glob("*.parquet"))
        if not shards:
            raise FileNotFoundError(f"No parquet files found under {split_dir}")
        loaded = load_dataset(
            "parquet", data_files={split: [str(shard) for shard in shards]}
        )
        return loaded[split]

    def _slice_dataset(
        self, dataset: Dataset, begin_index: int, end_index: int
    ) -> Dataset:
        size = len(dataset)
        start = begin_index if begin_index > 0 else 0
        stop = size if end_index < 0 else min(end_index, size)

        if start >= stop:
            return dataset.select([])
        if start == 0 and stop == size:
            # Whole dataset requested: avoid an unnecessary select().
            return dataset
        return dataset.select(range(start, stop))

    def _save_visualization(self, record: DatasetRecordWithPrediction) -> None:
        if record.predicted_doc is None:
            return

        true_doc = insert_images_from_pil(
            record.ground_truth_doc.model_copy(deep=True),
            record.ground_truth_pictures,
            record.ground_truth_page_images,
        )
        predicted = insert_images_from_pil(
            record.predicted_doc.model_copy(deep=True),
            record.predicted_pictures,
            record.predicted_page_images,
        )

        try:
            save_comparison_html_with_clusters(
                filename=self._visualizations_dir / f"{record.doc_id}.html",
                true_doc=true_doc,
                pred_doc=predicted,
                draw_reading_order=True,
            )
        except (IndexError, ValueError) as e:
            # Rendering issues for a single document should not abort the run.
            _LOGGER.warning(
                f"Failed to save visualization for doc_id {record.doc_id}: {e}"
            )
38 changes: 34 additions & 4 deletions docs/examples/evaluate_dpbench_on_external_predictions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ evaluate() {
fi

for modality in "${MODALITIES[@]}"; do
echo "Evaluation modality: ${modality}, predictions: ${pred_dir}"
echo "Evaluate: modality: ${modality}: predictions: ${pred_dir}"
uv run docling-eval evaluate \
--benchmark DPBench \
--modality "${modality}" \
Expand All @@ -49,24 +49,54 @@ evaluate() {
}


visualize() {
    # Render GT vs. prediction HTML visualizations for one predictions dir.
    #   $1: directory with external DoclingDocument predictions
    #   $2: directory where the HTML visualizations are written
    # Fix: dropped the unused `modality` local (copy-paste leftover from evaluate()).
    local pred_dir save_dir
    pred_dir="$1"
    save_dir="$2"

    # Check if the GT/preds dirs exist
    if [ ! -d "${GT_DIR}" ]; then
        echo "Missing GT dir: ${GT_DIR}"
        exit 1
    fi
    if [ ! -d "${pred_dir}" ]; then
        echo "Missing predictions dir: ${pred_dir}"
        exit 2
    fi

    echo "Visualize predictions: ${pred_dir}"
    uv run docling-eval create_viz \
        --dataset-dir "${GT_DIR}" \
        --external-predictions-path "${pred_dir}" \
        --output-dir "${save_dir}"
}

###########################################################################################
# Main
#

# Predictions

# json predictions
evaluate \
scratch/DPBench/predicted_documents/json \
scratch/DPBench/external_evaluations_jsons
scratch/DPBench/external_predictions_jsons


# doctags predictions
evaluate \
scratch/DPBench/predicted_documents/doctag \
scratch/DPBench/external_evaluations_doctags
scratch/DPBench/external_predictions_doctags


# yaml predictions
evaluate \
scratch/DPBench/predicted_documents/yaml \
scratch/DPBench/external_evaluations_yaml
scratch/DPBench/external_predictions_yaml


# Visualisations
visualize \
scratch/DPBench/predicted_documents/json \
scratch/DPBench/external_predictions_visualisations

64 changes: 64 additions & 0 deletions tests/test_predictions_visualizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from pathlib import Path

import pytest
from datasets import load_dataset

from docling_eval.datamodels.dataset_record import DatasetRecordWithPrediction
from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer


def _first_doc_id(parquet_root: Path) -> str:
    """Return the doc_id of the first record in the 'test' split under *parquet_root*."""
    shards = sorted((parquet_root / "test").glob("*.parquet"))
    loaded = load_dataset(
        "parquet", data_files={"test": [str(shard) for shard in shards]}
    )
    first = DatasetRecordWithPrediction.model_validate(loaded["test"][0])
    return first.doc_id


@pytest.mark.dependency(
    depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"],
    scope="session",
)
def test_predictions_visualizer_with_embedded_predictions() -> None:
    """Visualize the first record of a dataset that embeds its own predictions."""
    dataset_dir = Path("scratch/DPBench/eval_dataset_e2e")
    output_dir = Path("scratch/DPBench/visualizer_tests/embedded")
    output_dir.mkdir(parents=True, exist_ok=True)

    PredictionsVisualizer(visualizations_dir=output_dir).create_visualizations(
        dataset_dir=dataset_dir,
        split="test",
        begin_index=0,
        end_index=1,
    )

    # One layout HTML page is expected per visualized document.
    expected = output_dir / f"{_first_doc_id(dataset_dir)}_layout.html"
    assert expected.is_file()


@pytest.mark.dependency(
    depends=["tests/test_dataset_builder.py::test_run_dpbench_e2e"],
    scope="session",
)
def test_predictions_visualizer_with_external_predictions() -> None:
    """Visualize a GT-only dataset paired with external JSON predictions."""
    gt_dir = Path("scratch/DPBench/gt_dataset")
    external_predictions_dir = Path("scratch/DPBench/predicted_documents/json")
    output_dir = Path("scratch/DPBench/visualizer_tests/external")
    output_dir.mkdir(parents=True, exist_ok=True)

    PredictionsVisualizer(
        visualizations_dir=output_dir,
        external_predictions_dir=external_predictions_dir,
    ).create_visualizations(
        dataset_dir=gt_dir,
        split="test",
        begin_index=0,
        end_index=1,
    )

    # One layout HTML page is expected per visualized document.
    expected = output_dir / f"{_first_doc_id(gt_dir)}_layout.html"
    assert expected.is_file()
Loading