Skip to content
60 changes: 27 additions & 33 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1260,6 +1260,23 @@ def _add_formatted_list_item(
)
return elem_ref

def _add_list_item_with_marker(
    self,
    doc: DoclingDocument,
    elements: list,
    numid: int,
    ilevel: int,
    is_numbered: bool,
    level: int,
) -> None:
    """Add a list item, prefixed with its enumeration marker when numbered.

    Numbered items get a marker built from the value returned by
    ``self._get_list_counter(numid, ilevel)`` followed by a dot; every other
    item gets an empty marker. The item itself is added through
    ``self._add_formatted_list_item``.
    """
    # The counter is only queried for numbered items; others carry no marker.
    marker = f"{self._get_list_counter(numid, ilevel)!s}." if is_numbered else ""
    self._add_formatted_list_item(doc, elements, marker, is_numbered, level)

def _add_list_item(
self,
*,
Expand All @@ -1273,7 +1290,6 @@ def _add_list_item(
# this method is always called with is_numbered. Numbered lists should be properly addressed.
if not elements:
return elem_ref
enum_marker = ""

level = self._get_level()
prev_indent = self._prev_indent()
Expand All @@ -1295,14 +1311,8 @@ def _add_list_item(
self.parents[level] = list_gr
elem_ref.append(list_gr.get_ref())

# Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level
self._add_list_item_with_marker(
doc, elements, numid, ilevel, is_numbered, level
)
elif (
self._prev_numid() == numid
Expand All @@ -1322,16 +1332,11 @@ def _add_list_item(
self.parents[i] = list_gr1
elem_ref.append(list_gr1.get_ref())

# TODO: Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
self._add_list_item_with_marker(
doc,
elements,
enum_marker,
numid,
ilevel,
is_numbered,
self.level_at_new_list + ilevel,
)
Expand All @@ -1345,29 +1350,18 @@ def _add_list_item(
if k > self.level_at_new_list + ilevel:
self.parents[k] = None

# TODO: Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
self._add_list_item_with_marker(
doc,
elements,
enum_marker,
numid,
ilevel,
is_numbered,
self.level_at_new_list + ilevel,
)

elif self._prev_numid() == numid or prev_indent == ilevel:
# Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
else:
enum_marker = ""
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, level - 1
self._add_list_item_with_marker(
doc, elements, numid, ilevel, is_numbered, level - 1
)
else:
_log.warning("List item not matching any insert condition.")
Expand Down
8 changes: 4 additions & 4 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,9 @@ def _execute_pipeline(
f"No pipeline could be initialized for {in_doc.file}."
)
else:
_log.warning(
"No pipeline could be initialized for %s.", in_doc.file
)
conv_res = ConversionResult(
input=in_doc,
status=ConversionStatus.FAILURE,
Expand All @@ -646,13 +649,10 @@ def _execute_pipeline(
if raises_on_error:
raise ConversionError(f"Input document {in_doc.file} is not valid.")
else:
# invalid doc or not of desired format
_log.warning("Input document %s is not valid.", in_doc.file)
conv_res = ConversionResult(
input=in_doc,
status=ConversionStatus.FAILURE,
)
_log.warning(
f"Input document {in_doc.file} is not valid, skipping conversion."
)

return conv_res
12 changes: 2 additions & 10 deletions docling/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,16 +201,8 @@ def prepare_element(
# Crop the image from the page
element_prov = element.prov[0]
bbox = element_prov.bbox
width = bbox.r - bbox.l
height = bbox.t - bbox.b

# TODO: move to a utility in the BoundingBox class
expanded_bbox = BoundingBox(
l=bbox.l - width * self.expansion_factor,
t=bbox.t + height * self.expansion_factor,
r=bbox.r + width * self.expansion_factor,
b=bbox.b - height * self.expansion_factor,
coord_origin=bbox.coord_origin,
expanded_bbox = bbox.expand_by_scale(
self.expansion_factor, self.expansion_factor
)

page_ix = element_prov.page_no - conv_res.pages[0].page_no
Expand Down
24 changes: 23 additions & 1 deletion tests/test_invalid_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pytest

from docling.datamodel.base_models import ConversionStatus, DocumentStream
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.document_converter import ConversionError, DocumentConverter


Expand Down Expand Up @@ -42,3 +42,25 @@ def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConv
def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
    """Converting with max_file_size=1 and raises_on_error=True must raise."""
    with pytest.raises(ConversionError):
        converter.convert(
            get_pdf_path(),
            max_file_size=1,
            raises_on_error=True,
        )


def test_convert_no_pipeline_wout_exception():
    """A missing pipeline yields a FAILURE status when raises_on_error is False."""
    converter = DocumentConverter()
    # Clearing pipeline_options after construction sidesteps the model
    # validator, so the conversion reaches the defensive "no pipeline" code
    # path in _execute_pipeline.
    md_options = converter.format_to_options[InputFormat.MD]
    md_options.pipeline_options = None

    markdown_stream = DocumentStream(name="test.md", stream=BytesIO(b"# Hello"))
    result = converter.convert(markdown_stream, raises_on_error=False)

    assert result.status == ConversionStatus.FAILURE


def test_convert_no_pipeline_with_exception():
    """A missing pipeline raises ConversionError when raises_on_error is True."""
    converter = DocumentConverter()
    # Drop the Markdown pipeline options after construction so that no
    # pipeline can be set up for the input below.
    md_options = converter.format_to_options[InputFormat.MD]
    md_options.pipeline_options = None

    markdown_stream = DocumentStream(name="test.md", stream=BytesIO(b"# Hello"))
    with pytest.raises(ConversionError):
        converter.convert(markdown_stream, raises_on_error=True)
80 changes: 80 additions & 0 deletions tests/test_verify_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import pytest
from docling_core.types.doc import DoclingDocument, ProvenanceItem
from docling_core.types.doc.base import BoundingBox, Size
from docling_core.types.doc.labels import DocItemLabel

from tests.verify_utils import verify_docitems


def _make_doc_with_bbox(
    *, left: float, page_width: float = 612.0, page_height: float = 792.0
) -> DoclingDocument:
    """Build a one-page document holding a single paragraph with a bbox.

    Only the left edge of the bbox and the page size vary; the other bbox
    coordinates, the text and the charspan are fixed.
    """
    page_size = Size(width=page_width, height=page_height)
    bbox = BoundingBox(l=left, t=20.0, r=30.0, b=40.0)
    provenance = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, 10))

    document = DoclingDocument(name="test")
    document.add_page(page_no=1, size=page_size)
    document.add_text(
        label=DocItemLabel.PARAGRAPH,
        text="bbox check",
        orig="bbox check",
        prov=provenance,
    )
    return document


def test_verify_docitems_allows_small_bbox_variance_for_non_fuzzy_docs():
    """A left edge of 11.53 vs 10.0 on a default-size page passes the strict check."""
    predicted = _make_doc_with_bbox(left=11.53)
    expected = _make_doc_with_bbox(left=10.0)

    # Must not raise.
    verify_docitems(
        doc_pred=predicted,
        doc_true=expected,
        fuzzy=False,
        pdf_filename="fixture.json",
    )


def test_verify_docitems_rejects_large_bbox_variance_for_non_fuzzy_docs():
    """A left edge of 12.01 vs 10.0 on a default-size page fails the strict check."""
    predicted = _make_doc_with_bbox(left=12.01)
    expected = _make_doc_with_bbox(left=10.0)

    with pytest.raises(AssertionError, match="BBox left mismatch"):
        verify_docitems(
            doc_pred=predicted,
            doc_true=expected,
            fuzzy=False,
            pdf_filename="fixture.json",
        )


def test_verify_docitems_allows_reasonable_bbox_variance_for_fuzzy_docs():
    """A left edge of 17.23 vs 10.0 on a 2000x2829 page passes the fuzzy check."""
    page = {"page_width": 2000.0, "page_height": 2829.0}
    predicted = _make_doc_with_bbox(left=17.23, **page)
    expected = _make_doc_with_bbox(left=10.0, **page)

    # Must not raise.
    verify_docitems(
        doc_pred=predicted,
        doc_true=expected,
        fuzzy=True,
        pdf_filename="fixture.json",
    )


def test_verify_docitems_rejects_gross_bbox_variance_for_fuzzy_docs():
    """A left edge of 25.0 vs 10.0 on a 2000x2829 page fails the fuzzy check."""
    page = {"page_width": 2000.0, "page_height": 2829.0}
    predicted = _make_doc_with_bbox(left=25.0, **page)
    expected = _make_doc_with_bbox(left=10.0, **page)

    with pytest.raises(AssertionError, match="BBox left mismatch"):
        verify_docitems(
            doc_pred=predicted,
            doc_true=expected,
            fuzzy=True,
            pdf_filename="fixture.json",
        )


def test_verify_docitems_rejects_bbox_presence_mismatch():
    """A prediction without a bbox fails against a reference that has one."""
    reference = _make_doc_with_bbox(left=10.0)
    prediction = _make_doc_with_bbox(left=10.0)
    # Remove the bbox from the prediction only, leaving the reference intact.
    prediction.texts[0].prov[0].bbox = None

    with pytest.raises(AssertionError, match="BBox presence mismatch"):
        verify_docitems(
            doc_pred=prediction,
            doc_true=reference,
            fuzzy=False,
            pdf_filename="fixture.json",
        )
58 changes: 57 additions & 1 deletion tests/verify_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import math
import os
from pathlib import Path
from typing import Optional
Expand All @@ -12,6 +13,7 @@
TableItem,
TextItem,
)
from docling_core.types.doc.base import BoundingBox
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from PIL import Image as PILImage
from pydantic import BaseModel, TypeAdapter
Expand All @@ -21,6 +23,10 @@

# Decimal places kept for coordinates and for confidence values.
COORD_PREC = 2
CONFID_PREC = 3

# Bounding-box tolerance ratios.
# Strict: allow minor cross-platform layout variance.
STRICT_BBOX_TOL_RATIO = 0.0025
# Fuzzy: OCR/image output varies more, but gross shifts should fail.
FUZZY_BBOX_TOL_RATIO = 0.005


class _TestPagesMeta(BaseModel):
Expand All @@ -31,6 +37,40 @@ def from_page(cls, page: Page):
return cls(num_cells=len(page.cells))


def _assert_bbox_close(
    *,
    true_bbox: BoundingBox,
    pred_bbox: BoundingBox,
    fuzzy: bool,
    page_extent: Optional[float],
    pdf_filename: str,
):
    """Assert that two bounding boxes agree within an absolute tolerance.

    Each coordinate is rounded to ``COORD_PREC`` decimal places before it is
    compared. The tolerance is ``page_extent`` multiplied by the fuzzy or the
    strict ratio, but never less than ``10 ** -COORD_PREC``; a missing
    ``page_extent`` counts as 0.

    Raises:
        AssertionError: If the coordinate origins differ, or if any of the
            four edges deviates by more than the tolerance.
    """
    ratio = FUZZY_BBOX_TOL_RATIO if fuzzy else STRICT_BBOX_TOL_RATIO
    extent = page_extent or 0.0
    tol = max(10 ** (-COORD_PREC), extent * ratio)

    assert true_bbox.coord_origin == pred_bbox.coord_origin, (
        f"[{pdf_filename}] BBox coord_origin mismatch"
    )

    # Human-readable edge name -> attribute name on the bounding box.
    edges = {"left": "l", "top": "t", "right": "r", "bottom": "b"}
    for label, attr in edges.items():
        true_value = getattr(true_bbox, attr)
        pred_value = getattr(pred_bbox, attr)
        true_rounded = round(true_value, COORD_PREC)
        pred_rounded = round(pred_value, COORD_PREC)
        diff = abs(true_rounded - pred_rounded)

        # Purely absolute comparison: the relative tolerance is disabled.
        within_tol = math.isclose(
            true_rounded, pred_rounded, rel_tol=0.0, abs_tol=tol
        )
        assert within_tol, (
            f"[{pdf_filename}] BBox {label} mismatch:"
            f" {true_rounded} vs {pred_rounded}"
            f" (raw pred: {pred_value}, diff: {diff:.2f}, tol: {tol:.2f})"
        )


def levenshtein(str1: str, str2: str) -> int:
# Ensure str1 is the shorter string to optimize memory usage
if len(str1) > len(str2):
Expand Down Expand Up @@ -258,12 +298,28 @@ def verify_docitems(
if len(true_item.prov) > 0:
true_prov = true_item.prov[0]
pred_prov = pred_item.prov[0]
true_page = doc_true.pages.get(true_prov.page_no)
pred_page = doc_pred.pages.get(pred_prov.page_no)

assert true_prov.page_no == pred_prov.page_no, (
f"[{pdf_filename}] Page provenance mistmatch"
)
assert (true_prov.bbox is None) == (pred_prov.bbox is None), (
f"[{pdf_filename}] BBox presence mismatch"
)

# TODO: add bbox check with tolerance
if true_prov.bbox is not None and pred_prov.bbox is not None:
_assert_bbox_close(
true_bbox=true_prov.bbox,
pred_bbox=pred_prov.bbox,
fuzzy=fuzzy,
page_extent=(
max(page.size.width, page.size.height)
if (page := true_page or pred_page) is not None
else None
),
pdf_filename=pdf_filename,
)

# Validate source
assert bool(true_item.source) == bool(pred_item.source), (
Expand Down
Loading