Skip to content

Commit eb9b4b3

Browse files
authored
fix: image fallback for malformed equations (#149)
fix MathML for malformed equations

Signed-off-by: Michele Dolfi <[email protected]>
1 parent 285438d commit eb9b4b3

File tree

1 file changed

+34
-17
lines changed

1 file changed

+34
-17
lines changed

docling_core/types/doc/document.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import hashlib
66
import html
77
import json
8+
import logging
89
import mimetypes
910
import os
1011
import re
@@ -20,6 +21,7 @@
2021
from xml.sax.saxutils import unescape
2122

2223
import latex2mathml.converter
24+
import latex2mathml.exceptions
2325
import pandas as pd
2426
import yaml
2527
from PIL import Image as PILImage
@@ -44,6 +46,8 @@
4446
from docling_core.types.doc.tokens import DocumentToken, TableToken
4547
from docling_core.types.doc.utils import relative_path
4648

49+
_logger = logging.getLogger(__name__)
50+
4751
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
4852
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
4953
CURRENT_VERSION: Final = "1.0.0"
@@ -2487,34 +2491,47 @@ def _prepare_tag_content(
24872491
)
24882492
text = ""
24892493

2490-
# If the formula is not processed correctly, use its image
2491-
if (
2492-
item.text == ""
2493-
and item.orig != ""
2494-
and image_mode == ImageRefMode.EMBEDDED
2495-
and len(item.prov) > 0
2496-
):
2494+
def _image_fallback(item: TextItem):
24972495
item_image = item.get_image(doc=self)
24982496
if item_image is not None:
24992497
img_ref = ImageRef.from_pil(item_image, dpi=72)
2500-
text = (
2498+
return (
25012499
"<figure>"
25022500
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
25032501
"</figure>"
25042502
)
25052503

2504+
# If the formula is not processed correctly, use its image
2505+
if (
2506+
item.text == ""
2507+
and item.orig != ""
2508+
and image_mode == ImageRefMode.EMBEDDED
2509+
and len(item.prov) > 0
2510+
):
2511+
text = _image_fallback(item)
2512+
25062513
# Building a math equation in MathML format
25072514
# ref https://www.w3.org/TR/wai-aria-1.1/#math
25082515
elif formula_to_mathml:
2509-
mathml_element = latex2mathml.converter.convert_to_element(
2510-
math_formula, display="block"
2511-
)
2512-
annotation = SubElement(
2513-
mathml_element, "annotation", dict(encoding="TeX")
2514-
)
2515-
annotation.text = math_formula
2516-
mathml = unescape(tostring(mathml_element, encoding="unicode"))
2517-
text = f"<div>{mathml}</div>"
2516+
try:
2517+
mathml_element = latex2mathml.converter.convert_to_element(
2518+
math_formula, display="block"
2519+
)
2520+
annotation = SubElement(
2521+
mathml_element, "annotation", dict(encoding="TeX")
2522+
)
2523+
annotation.text = math_formula
2524+
mathml = unescape(tostring(mathml_element, encoding="unicode"))
2525+
text = f"<div>{mathml}</div>"
2526+
except Exception as err:
2527+
_logger.warning(
2528+
"Malformed formula cannot be rendered. "
2529+
f"Error {err.__class__.__name__}, formula={math_formula}"
2530+
)
2531+
if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
2532+
text = _image_fallback(item)
2533+
else:
2534+
text = f"<pre>{math_formula}</pre>"
25182535

25192536
elif math_formula != "":
25202537
text = f"<pre>{math_formula}</pre>"

0 commit comments

Comments
 (0)