Merged
Changes from 23 commits

Commits (26)
6331723
chore: Move the teds.py inside the subdir evaluators/table
nikos-livathinos Dec 4, 2025
85890fb
feat: Introduce the external_predictions_path in BaseEvaluator and du…
nikos-livathinos Dec 4, 2025
5f9a279
feat: Extend test_dataset_builder.py to save document predictions in …
nikos-livathinos Dec 4, 2025
e6e8409
feat: Extend MarkDownTextEvaluator to support external_predictions_pa…
nikos-livathinos Dec 4, 2025
5624e61
feat: Extend LayoutEvaluator to support external_predictions_path. Ad…
nikos-livathinos Dec 4, 2025
426b6d1
Merge branch 'main' into nli/external_predictions
nikos-livathinos Dec 4, 2025
171ad74
fix: Add missing pytest dependencies in tests
nikos-livathinos Dec 4, 2025
0f0cfb5
fix: Fix loading the external predictions in LayoutEvaluator
nikos-livathinos Dec 4, 2025
8069571
feat: Introduce external predictions in DocStructureEvaluator. Add un…
nikos-livathinos Dec 4, 2025
8ba6b45
feat: Extend the TableEvaluator to support external predictions. Add …
nikos-livathinos Dec 4, 2025
949d6cc
feat: Extend the KeyValueEvaluator to support external predictions. A…
nikos-livathinos Dec 5, 2025
13badc5
feat: Extend the PixelLayoutEvaluator to support external predictions…
nikos-livathinos Dec 5, 2025
8c2a065
feat: Extend the BboxTextEvaluator to support external predictions. A…
nikos-livathinos Dec 5, 2025
08391b3
feat: Disable the OCREvaluator when using the external predictions
nikos-livathinos Dec 5, 2025
595ba6c
fix: Fixing guard for external predictions in TimingsEvaluator, Readi…
nikos-livathinos Dec 5, 2025
406b122
fix: Export the doctag files with the correct file extension
nikos-livathinos Dec 5, 2025
ebe70b0
feat: Refactor the ExternalDoclingDocumentLoader to properly load a D…
nikos-livathinos Dec 5, 2025
33511c9
chore: Rename code file as external_docling_document_loader.py
nikos-livathinos Dec 5, 2025
b1525b6
Merge branch 'main' into nli/external_predictions
nikos-livathinos Dec 8, 2025
94b3938
fix: Fix typo
nikos-livathinos Dec 8, 2025
ae10646
feat: Introduce examples how to evaluate using external predictions u…
nikos-livathinos Dec 8, 2025
8c52e36
feat: Prediction vizualizer
cau-git Dec 8, 2025
71f5e17
feat: Prediction vizualizer
cau-git Dec 8, 2025
6f7331c
Update docling_eval/utils/external_predictions_visualizer.py
cau-git Dec 9, 2025
57bd131
Merge branch 'main' into cau/add-external-vis-tool
nikos-livathinos Dec 9, 2025
21eae30
feat: Update examples bash script to demonstrate visualisations on ex…
nikos-livathinos Dec 9, 2025
124 changes: 105 additions & 19 deletions docling_eval/cli/main.py
@@ -126,6 +126,7 @@
from docling_eval.prediction_providers.tableformer_provider import (
TableFormerPredictionProvider,
)
from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer


class DoclingLayoutOptionsManager:
@@ -362,7 +363,7 @@ def get_prediction_provider(
docling_layout_keep_empty_clusters: Optional[bool] = None,
# Controls orphan text cells only for the programmatic Docling pipeline (PDF_DOCLING)
docling_programmatic_add_orphan_text_cells: Optional[bool] = None,
docling_force_full_page_ocr: Optional[bool] = None,
docling_force_full_page_ocr: bool = False,
granite_docling_vlm_options: Optional[InlineVlmOptions] = None,
max_new_tokens: Optional[int] = None,
):
@@ -376,7 +377,7 @@ def get_prediction_provider(
ocr_factory = get_ocr_factory()

ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
kind="easyocr",
kind="rapidocr",
force_full_page_ocr=docling_force_full_page_ocr,
)
# Use all CPU cores
@@ -639,6 +640,7 @@ def evaluate(
odir: Path,
split: str = "test",
cvat_overview_path: Optional[Path] = None,
external_predictions_path: Optional[Path] = None,
) -> Optional[DatasetEvaluationType]:
"""Evaluate predictions against ground truth."""
if not os.path.exists(idir):
@@ -659,6 +661,7 @@
evaluation = timings_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -673,6 +676,7 @@
evaluation = layout_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -681,7 +685,9 @@
# Evaluate with the pixel-wise layout evaluation
pixel_layout_evaluator = PixelLayoutEvaluator()
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
idir, split=split
idir,
split=split,
external_predictions_path=external_predictions_path,
)
pixel_save_root: Path = save_fn.parent
pixel_layout_evaluator.save_evaluations(
@@ -695,6 +701,7 @@
evaluation = table_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -707,36 +714,44 @@
evaluation = doc_struct_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)

elif modality == EvaluationModality.OCR:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})")
logging.info(f"Text unit set to {text_unit}")
if not external_predictions_path:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(
f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})"
)
logging.info(f"Text unit set to {text_unit}")

ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
)
ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
else:
logging.error("External predictions are not supported for OCR evaluations")

elif modality == EvaluationModality.READING_ORDER:
readingorder_evaluator = ReadingOrderEvaluator()
evaluation = readingorder_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -753,6 +768,7 @@
evaluation = md_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -769,6 +785,7 @@
evaluation = bbox_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -784,6 +801,7 @@
evaluation = keyvalue_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -1487,6 +1505,12 @@ def evaluate_cmd(
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
external_predictions_path: Annotated[
Optional[Path],
typer.Option(
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
),
] = None,
):
"""Evaluate predictions against ground truth."""
input_dir, output_dir = derive_input_output_dirs(
@@ -1506,6 +1530,7 @@
idir=input_dir,
odir=eval_output_dir,
split=split,
external_predictions_path=external_predictions_path,
)


@@ -1554,6 +1579,67 @@ def visualize_cmd(
)


@app.command(name="create_viz")
def create_viz(
dataset_dir: Annotated[
Path,
typer.Option(
help=(
"Dataset directory (GT parquet or eval_dataset parquet with predictions) "
"containing the split folder with parquet shards."
)
),
],
split: Annotated[str, typer.Option(help="Dataset split to visualize")] = "test",
external_predictions_path: Annotated[
Optional[Path],
typer.Option(
help=(
"Directory with DoclingDocument predictions named as <doc_id>.[json|dt|yaml|yml]. "
"If omitted, predictions are taken from the dataset parquet."
)
),
] = None,
output_dir: Annotated[
Optional[Path],
typer.Option(
help=(
"Directory where HTML visualizations are written. Defaults to "
"<dataset_dir>/visualizations when omitted."
)
),
] = None,
begin_index: Annotated[int, typer.Option(help="Begin index (inclusive)")] = 0,
end_index: Annotated[
int, typer.Option(help="End index (exclusive), -1 for all")
] = -1,
ignore_missing_predictions: Annotated[
bool,
typer.Option(
help="Skip documents without a matching prediction instead of failing"
),
] = False,
):
"""
Create paired GT vs. prediction HTML visualizations without generating parquet output.
"""
visualizations_dir = (
output_dir if output_dir is not None else dataset_dir / "visualizations"
)

visualizer = PredictionsVisualizer(
visualizations_dir=visualizations_dir,
external_predictions_dir=external_predictions_path,
ignore_missing_predictions=ignore_missing_predictions,
)
visualizer.create_visualizations(
dataset_dir=dataset_dir,
split=split,
begin_index=begin_index,
end_index=end_index,
)


@app.callback()
def main():
"""Docling Evaluation CLI for benchmarking document processing tasks."""
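For anyone trying the new `create_viz` command, here is a minimal sketch of the equivalent programmatic call, mirroring the Typer command body above. The directory paths are placeholders; `PredictionsVisualizer`, its constructor arguments, and `create_visualizations` are taken from this diff.

```python
from pathlib import Path

from docling_eval.utils.external_predictions_visualizer import PredictionsVisualizer

# Placeholder paths: point these at a real eval dataset and prediction directory.
dataset_dir = Path("benchmarks/my_dataset")
predictions_dir = Path("predictions")  # files named <doc_id>.[json|dt|yaml|yml]

visualizer = PredictionsVisualizer(
    visualizations_dir=dataset_dir / "visualizations",  # the CLI default
    external_predictions_dir=predictions_dir,
    ignore_missing_predictions=True,  # skip docs without a matching prediction
)
visualizer.create_visualizations(
    dataset_dir=dataset_dir,
    split="test",
    begin_index=0,
    end_index=-1,  # -1 visualizes the whole split
)
```
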
1 change: 1 addition & 0 deletions docling_eval/evaluators/base_evaluator.py
@@ -100,6 +100,7 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetEvaluationType:
r"""
Perform the evaluation
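Since `BaseEvaluator.__call__` now accepts `external_predictions_path`, every evaluator can be pointed at a directory of externally produced DoclingDocument files. A minimal sketch of the calling convention follows; the paths are placeholders and the `LayoutEvaluator` import path is an assumption (the import is not shown in this hunk):

```python
from pathlib import Path

# Assumed module path; the evaluate() function above uses this class.
from docling_eval.evaluators.layout_evaluator import LayoutEvaluator

# Placeholder paths; prediction files must be named <doc_id>.[json|dt|yaml|yml].
ds_path = Path("benchmarks/my_dataset")
predictions_dir = Path("predictions")

evaluator = LayoutEvaluator()
evaluation = evaluator(
    ds_path,
    split="test",
    external_predictions_path=predictions_dir,
)
```
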
37 changes: 32 additions & 5 deletions docling_eval/evaluators/bbox_text_evaluator.py
@@ -4,7 +4,7 @@

import nltk
from datasets import load_dataset
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.base import BoundingBox, CoordOrigin
from docling_core.types.doc.document import DoclingDocument, TextItem
from nltk import edit_distance, word_tokenize
from nltk.metrics import f_measure, precision, recall
@@ -25,6 +25,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

@@ -94,8 +97,16 @@ def __init_(
nltk.download("popular", quiet=True)

def __call__(
self, ds_path: Path, split: str = "test"
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetBoxesTextEvaluation:
r""" """
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"oveview of dataset: {ds}")
@@ -125,15 +136,23 @@ def __call__(
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc

# Load the pred_doc
if ext_docdoc_loader is not None:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc
if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
@@ -212,7 +231,15 @@ def _match_bboxes(
continue
assert len(doc_item.prov) == 1
prov = doc_item.prov[0]
bboxes[doc_key].append(prov.bbox)

# Ensure bbox is in top-left origin
bbox = prov.bbox
if bbox.coord_origin != CoordOrigin.TOPLEFT:
page_no = prov.page_no
page_size = doc.pages[page_no].size
bbox = bbox.to_top_left_origin(page_size.height)

bboxes[doc_key].append(bbox)
texts[doc_key].append(doc_item.text)

# Decide which document is the pivot
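The `CoordOrigin` guard added in `_match_bboxes` is needed because provenance boxes may still be stored bottom-left (the PDF convention). A standalone sketch of the same normalization, using the docling-core types imported above; the numeric values are illustrative only:

```python
from docling_core.types.doc.base import BoundingBox, CoordOrigin

page_height = 792.0  # illustrative page height in points

# A bottom-left-origin box: t and b are measured up from the page bottom.
bbox = BoundingBox(
    l=100.0, t=700.0, r=200.0, b=650.0, coord_origin=CoordOrigin.BOTTOMLEFT
)

# Same guard as in _match_bboxes: convert only when needed.
if bbox.coord_origin != CoordOrigin.TOPLEFT:
    bbox = bbox.to_top_left_origin(page_height)

# Now TOPLEFT: t = 792 - 700 = 92.0, b = 792 - 650 = 142.0
print(bbox)
```
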
18 changes: 16 additions & 2 deletions docling_eval/evaluators/doc_structure_evaluator.py
@@ -18,6 +18,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

@@ -71,13 +74,18 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetDocStructureEvaluation:
r"""
Parameters
----------
ds_path: Path to load the parquet files of the dataset
split: Split of the dataset to load
"""
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"Overview of the dataset: {ds}")
@@ -106,15 +114,21 @@
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc
if ext_docdoc_loader:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc

if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
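`ExternalDoclingDocumentLoader` itself (docling_eval/utils/external_docling_document_loader.py) is not part of the hunks shown here. The following is therefore only a hypothetical sketch of the documented contract, resolving `<doc_id>.[json|dt|yaml|yml]` to a DoclingDocument, not the real implementation:

```python
from pathlib import Path
from typing import Optional

from docling_core.types.doc.document import DoclingDocument


class ExternalPredictionLoaderSketch:
    """Hypothetical stand-in for ExternalDoclingDocumentLoader (not in this diff)."""

    SUFFIXES = (".json", ".dt", ".yaml", ".yml")

    def __init__(self, predictions_dir: Path) -> None:
        self._dir = predictions_dir

    def load(self, doc_id: str) -> Optional[DoclingDocument]:
        # Resolve <doc_id>.<suffix> against the documented filename pattern.
        for suffix in self.SUFFIXES:
            candidate = self._dir / f"{doc_id}{suffix}"
            if candidate.is_file():
                if suffix == ".json":
                    return DoclingDocument.load_from_json(candidate)
                # DocTags (.dt) and YAML files need their own deserializers;
                # this sketch covers JSON only.
                raise NotImplementedError(f"sketch does not handle {suffix}")
        return None  # callers count this as MISSING_PREDICTION
```

The evaluator hunks above then consult the loader per record and fall back to `data_record.predicted_doc` when no external predictions directory is given.
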