Merged
Changes from all commits (21 commits)
6331723  chore: Move the teds.py inside the subdir evaluators/table (nikos-livathinos, Dec 4, 2025)
85890fb  feat: Introduce the external_predictions_path in BaseEvaluator and du… (nikos-livathinos, Dec 4, 2025)
5f9a279  feat: Extend test_dataset_builder.py to save document predictions in … (nikos-livathinos, Dec 4, 2025)
e6e8409  feat: Extend MarkDownTextEvaluator to support external_predictions_pa… (nikos-livathinos, Dec 4, 2025)
5624e61  feat: Extend LayoutEvaluator to support external_predictions_path. Ad… (nikos-livathinos, Dec 4, 2025)
426b6d1  Merge branch 'main' into nli/external_predictions (nikos-livathinos, Dec 4, 2025)
171ad74  fix: Add missing pytest dependencies in tests (nikos-livathinos, Dec 4, 2025)
0f0cfb5  fix: Fix loading the external predictions in LayoutEvaluator (nikos-livathinos, Dec 4, 2025)
8069571  feat: Introduce external predictions in DocStructureEvaluator. Add un… (nikos-livathinos, Dec 4, 2025)
8ba6b45  feat: Extend the TableEvaluator to support external predictions. Add … (nikos-livathinos, Dec 4, 2025)
949d6cc  feat: Extend the KeyValueEvaluator to support external predictions. A… (nikos-livathinos, Dec 5, 2025)
13badc5  feat: Extend the PixelLayoutEvaluator to support external predictions… (nikos-livathinos, Dec 5, 2025)
8c2a065  feat: Extend the BboxTextEvaluator to support external predictions. A… (nikos-livathinos, Dec 5, 2025)
08391b3  feat: Disable the OCREvaluator when using the external predictions (nikos-livathinos, Dec 5, 2025)
595ba6c  fix: Fixing guard for external predictions in TimingsEvaluator, Readi… (nikos-livathinos, Dec 5, 2025)
406b122  fix: Export the doctag files with the correct file extension (nikos-livathinos, Dec 5, 2025)
ebe70b0  feat: Refactor the ExternalDoclingDocumentLoader to properly load a D… (nikos-livathinos, Dec 5, 2025)
33511c9  chore: Rename code file as external_docling_document_loader.py (nikos-livathinos, Dec 5, 2025)
b1525b6  Merge branch 'main' into nli/external_predictions (nikos-livathinos, Dec 8, 2025)
94b3938  fix: Fix typo (nikos-livathinos, Dec 8, 2025)
ae10646  feat: Introduce examples how to evaluate using external predictions u… (nikos-livathinos, Dec 8, 2025)
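
Taken together, these commits let every evaluator score predictions produced outside the pipeline, instead of the `predicted_doc` stored in the parquet dataset. Below is a minimal sketch of driving an evaluator this way, based on the `__call__` signature introduced in `base_evaluator.py` and the CLI help text in this diff; the `LayoutEvaluator` import path and its no-argument constructor are assumptions and are not shown in this PR.

```python
from pathlib import Path

from docling_eval.evaluators.layout_evaluator import LayoutEvaluator  # import path assumed

# The dataset root is expected to contain <split>/*.parquet files; the external
# predictions directory holds one DoclingDocument per sample, named
# [doc_id].[json|dt|yaml|yml] (pattern taken from the new CLI help text).
evaluator = LayoutEvaluator()
evaluation = evaluator(
    Path("benchmarks/my_dataset"),                      # ds_path (illustrative)
    split="test",
    external_predictions_path=Path("external_preds"),   # new in this PR
)
print(evaluation.model_dump())
```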
58 changes: 41 additions & 17 deletions docling_eval/cli/main.py
@@ -639,6 +639,7 @@ def evaluate(
odir: Path,
split: str = "test",
cvat_overview_path: Optional[Path] = None,
external_predictions_path: Optional[Path] = None,
) -> Optional[DatasetEvaluationType]:
"""Evaluate predictions against ground truth."""
if not os.path.exists(idir):
@@ -659,6 +660,7 @@
evaluation = timings_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -673,6 +675,7 @@
evaluation = layout_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -681,7 +684,9 @@
# Evaluate with the pixel-wise layout evaluation
pixel_layout_evaluator = PixelLayoutEvaluator()
pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
idir, split=split
idir,
split=split,
external_predictions_path=external_predictions_path,
)
pixel_save_root: Path = save_fn.parent
pixel_layout_evaluator.save_evaluations(
@@ -695,6 +700,7 @@
evaluation = table_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -707,36 +713,44 @@
evaluation = doc_struct_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)

elif modality == EvaluationModality.OCR:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})")
logging.info(f"Text unit set to {text_unit}")
if not external_predictions_path:
if benchmark in [BenchMarkNames.XFUND, BenchMarkNames.PIXPARSEIDL]:
text_unit = TextCellUnit.LINE
else:
text_unit = TextCellUnit.WORD

logging.info(
f"Benchmark received in evaluate: {benchmark} ({type(benchmark)})"
)
logging.info(f"Text unit set to {text_unit}")

ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
)
ocr_evaluator = OCREvaluator(
intermediate_evaluations_path=odir, text_unit=text_unit
)
evaluation = ocr_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
with open(save_fn, "w") as fd:
json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
else:
logging.error("External predictions are not supported for OCR evaluations")

elif modality == EvaluationModality.READING_ORDER:
readingorder_evaluator = ReadingOrderEvaluator()
evaluation = readingorder_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -753,6 +767,7 @@
evaluation = md_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)

with open(save_fn, "w") as fd:
@@ -769,6 +784,7 @@
evaluation = bbox_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -784,6 +800,7 @@
evaluation = keyvalue_evaluator( # type: ignore
idir,
split=split,
external_predictions_path=external_predictions_path,
)
with open(save_fn, "w") as fd:
json.dump(
@@ -1487,6 +1504,12 @@ def evaluate_cmd(
),
] = None,
split: Annotated[str, typer.Option(help="Dataset split")] = "test",
external_predictions_path: Annotated[
Optional[Path],
typer.Option(
help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
),
] = None,
):
"""Evaluate predictions against ground truth."""
input_dir, output_dir = derive_input_output_dirs(
@@ -1506,6 +1529,7 @@
idir=input_dir,
odir=eval_output_dir,
split=split,
external_predictions_path=external_predictions_path,
)


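The same option is threaded through the `evaluate()` helper and exposed on the `evaluate` CLI command (Typer would typically surface it as `--external-predictions-path`, though the exact flag spelling is not shown here). A sketch of calling the helper programmatically follows; the keyword names `modality` and `benchmark` and the `docling_eval.datamodels.types` import path are assumptions, since those parts of the signature sit outside the hunks above, while `idir`, `odir`, `split`, and `external_predictions_path` are taken from the diff.

```python
from pathlib import Path

from docling_eval.cli.main import evaluate
from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality  # import path assumed

evaluate(
    modality=EvaluationModality.READING_ORDER,    # keyword name assumed; modality shown in the diff
    benchmark=BenchMarkNames.XFUND,               # keyword name assumed; XFUND appears in the diff
    idir=Path("benchmarks/xfund/eval_dataset"),   # illustrative paths
    odir=Path("benchmarks/xfund/evaluations"),
    split="test",
    external_predictions_path=Path("external_preds"),
)
```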
1 change: 1 addition & 0 deletions docling_eval/evaluators/base_evaluator.py
@@ -100,6 +100,7 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetEvaluationType:
r"""
Perform the evaluation
37 changes: 32 additions & 5 deletions docling_eval/evaluators/bbox_text_evaluator.py
@@ -4,7 +4,7 @@

import nltk
from datasets import load_dataset
from docling_core.types.doc.base import BoundingBox
from docling_core.types.doc.base import BoundingBox, CoordOrigin
from docling_core.types.doc.document import DoclingDocument, TextItem
from nltk import edit_distance, word_tokenize
from nltk.metrics import f_measure, precision, recall
@@ -25,6 +25,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

@@ -94,8 +97,16 @@ def __init_(
nltk.download("popular", quiet=True)

def __call__(
self, ds_path: Path, split: str = "test"
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetBoxesTextEvaluation:
r""" """
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"oveview of dataset: {ds}")
@@ -125,15 +136,23 @@ def __call__(
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc

# Load the pred_doc
if ext_docdoc_loader is not None:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc
if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
@@ -212,7 +231,15 @@ def _match_bboxes(
continue
assert len(doc_item.prov) == 1
prov = doc_item.prov[0]
bboxes[doc_key].append(prov.bbox)

# Ensure bbox is in top-left origin
bbox = prov.bbox
if bbox.coord_origin != CoordOrigin.TOPLEFT:
page_no = prov.page_no
page_size = doc.pages[page_no].size
bbox = bbox.to_top_left_origin(page_size.height)

bboxes[doc_key].append(bbox)
texts[doc_key].append(doc_item.text)

# Decide which document is the pivot
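The `_match_bboxes` change above normalizes provenance boxes to a top-left coordinate origin before pairing them with ground truth. A small illustration of that conversion, using the `docling_core` types already imported in this file (the numeric values are made up):

```python
from docling_core.types.doc.base import BoundingBox, CoordOrigin

# A box stored with a bottom-left origin on a page that is 792 points tall:
# t and b are measured upward from the bottom edge.
bbox = BoundingBox(l=72.0, t=720.0, r=300.0, b=700.0, coord_origin=CoordOrigin.BOTTOMLEFT)

# Flip it so t and b are measured downward from the top edge, as the
# evaluator expects when comparing against ground-truth boxes.
top_left_bbox = bbox.to_top_left_origin(792.0)
print(top_left_bbox.coord_origin)  # CoordOrigin.TOPLEFT
```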
18 changes: 16 additions & 2 deletions docling_eval/evaluators/doc_structure_evaluator.py
@@ -18,6 +18,9 @@
UnitEvaluation,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -71,13 +74,18 @@ def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetDocStructureEvaluation:
r"""
Parameters
----------
ds_path: Path to load the parquet files of the dataset
split: Split of the dataset to load
"""
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

parquet_files = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: parquet_files})
_log.info(f"Overview of the dataset: {ds}")
@@ -106,15 +114,21 @@ def __call__(
):
data_record = DatasetRecordWithPrediction.model_validate(data)
doc_id = data_record.doc_id
if data_record.status not in self._accepted_status:
if (
ext_docdoc_loader is None
and data_record.status not in self._accepted_status
):
_log.error(
"Skipping record without successfull conversion status: %s", doc_id
)
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
continue

true_doc = data_record.ground_truth_doc
pred_doc = data_record.predicted_doc
if ext_docdoc_loader:
pred_doc = ext_docdoc_loader(data_record)
else:
pred_doc = data_record.predicted_doc

if pred_doc is None:
_log.error("There is no prediction for doc_id=%s", doc_id)
26 changes: 22 additions & 4 deletions docling_eval/evaluators/keyvalue_evaluator.py
@@ -21,6 +21,9 @@
docling_document_from_doctags,
)
from docling_eval.evaluators.stats import DatasetStatistics, compute_stats
from docling_eval.utils.external_docling_document_loader import (
ExternalDoclingDocumentLoader,
)

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -415,7 +418,17 @@ def __init__(
# --------------------------------------------------------------------- #
# Public API
# --------------------------------------------------------------------- #
def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluation:
def __call__(
self,
ds_path: Path,
split: str = "test",
external_predictions_path: Optional[Path] = None,
) -> DatasetKeyValueEvaluation:
r""" """
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
if external_predictions_path is not None:
ext_docdoc_loader = ExternalDoclingDocumentLoader(external_predictions_path)

split_glob = str(ds_path / split / "*.parquet")
ds = load_dataset("parquet", data_files={split: split_glob})
_log.info("Loaded split '%s' – %d samples", split, len(ds[split]))
@@ -461,13 +474,13 @@ def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluat
doc_id = record.doc_id

# ----- sanity checks --------------------------------------------------
if record.status not in self._accepted_status:
if ext_docdoc_loader is None and record.status not in self._accepted_status:
rejected_samples[EvaluationRejectionType.INVALID_CONVERSION_STATUS] += 1
_log.error("Skipping %s – conversion failed", doc_id)
continue

gt_doc = record.ground_truth_doc
pred_doc = self._get_pred_doc(record)
pred_doc = self._get_pred_doc(record, ext_docdoc_loader)
if pred_doc is None:
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
_log.error("Skipping %s – missing prediction", doc_id)
@@ -635,10 +648,15 @@ def __call__(self, ds_path: Path, split: str = "test") -> DatasetKeyValueEvaluat
# Helpers
# --------------------------------------------------------------------- #
def _get_pred_doc(
self, data_record: DatasetRecordWithPrediction
self,
data_record: DatasetRecordWithPrediction,
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None,
) -> Optional[DoclingDocument]:
"""Fetch the prediction in the first available format declared by `prediction_sources`."""
pred_doc: Optional[DoclingDocument] = None
if ext_docdoc_loader is not None:
pred_doc = ext_docdoc_loader(data_record)
return pred_doc

for fmt in self._prediction_sources:
if fmt == PredictionFormats.DOCLING_DOCUMENT:
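Every evaluator above delegates the actual file lookup to `ExternalDoclingDocumentLoader`, whose implementation is not part of this section. A rough sketch of the contract the callers rely on, covering only the JSON case; the file-resolution logic is an assumption based on the documented `[doc_id].[json|dt|yaml|yml]` naming pattern.

```python
from pathlib import Path
from typing import Optional

from docling_core.types.doc.document import DoclingDocument


class ExternalDoclingDocumentLoaderSketch:
    """Illustrative stand-in for ExternalDoclingDocumentLoader (JSON files only)."""

    def __init__(self, predictions_path: Path) -> None:
        self._predictions_path = predictions_path

    def __call__(self, data_record) -> Optional[DoclingDocument]:
        # Callers pass a DatasetRecordWithPrediction; only its doc_id is needed here.
        candidate = self._predictions_path / f"{data_record.doc_id}.json"
        if not candidate.exists():
            # The real loader also handles .dt / .yaml / .yml; evaluators treat None
            # as a missing prediction and count the sample as rejected.
            return None
        return DoclingDocument.model_validate_json(candidate.read_text(encoding="utf-8"))
```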