Skip to content

Commit 4708f93

Browse files
authored
fix: properly handle missing page image case for export_to_html (#166)
* fix: properly handle missing image case

  Signed-off-by: Yusik Kim <[email protected]>

* chore: pre-commit changes

  Signed-off-by: Yusik Kim <[email protected]>

* fix: simplify test and address broken test

  Signed-off-by: Yusik Kim <[email protected]>

---------

Signed-off-by: Yusik Kim <[email protected]>
1 parent 28bd65c commit 4708f93

File tree

2 files changed

+30
-4
lines changed

2 files changed

+30
-4
lines changed

docling_core/types/doc/document.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2764,14 +2764,17 @@ def _image_fallback(item: TextItem):
27642764
"</figure>"
27652765
)
27662766

2767+
img_fallback = _image_fallback(item)
2768+
27672769
# If the formula is not processed correctly, use its image
27682770
if (
27692771
item.text == ""
27702772
and item.orig != ""
27712773
and image_mode == ImageRefMode.EMBEDDED
27722774
and len(item.prov) > 0
2775+
and img_fallback is not None
27732776
):
2774-
text = _image_fallback(item)
2777+
text = img_fallback
27752778

27762779
# Building a math equation in MathML format
27772780
# ref https://www.w3.org/TR/wai-aria-1.1/#math
@@ -2791,9 +2794,13 @@ def _image_fallback(item: TextItem):
27912794
"Malformed formula cannot be rendered. "
27922795
f"Error {err.__class__.__name__}, formula={math_formula}"
27932796
)
2794-
if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
2795-
text = _image_fallback(item)
2796-
else:
2797+
if (
2798+
image_mode == ImageRefMode.EMBEDDED
2799+
and len(item.prov) > 0
2800+
and img_fallback is not None
2801+
):
2802+
text = img_fallback
2803+
elif len(math_formula) > 0:
27972804
text = f"<pre>{math_formula}</pre>"
27982805

27992806
elif math_formula != "":

test/test_docling_doc.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,25 @@ def test_formula_mathml():
775775
assert doc_html == gt_html
776776

777777

778+
def test_formula_with_missing_fallback():
779+
doc = DoclingDocument(name="Dummy")
780+
bbox = BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT)
781+
prov = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, 2))
782+
doc.add_text(label=DocItemLabel.FORMULA, text="", orig="(II.24) 2 Imar", prov=prov)
783+
784+
actual = doc.export_to_html(
785+
formula_to_mathml=True, html_head="", image_mode=ImageRefMode.EMBEDDED
786+
)
787+
788+
expected = """<!DOCTYPE html>
789+
<html lang="en">
790+
791+
<div class="formula-not-decoded">Formula not decoded</div>
792+
</html>"""
793+
794+
assert actual == expected
795+
796+
778797
def test_docitem_get_image():
779798
# Prepare the document
780799
doc = DoclingDocument(name="Dummy")

0 commit comments

Comments
 (0)