|
5 | 5 | import hashlib |
6 | 6 | import html |
7 | 7 | import json |
| 8 | +import logging |
8 | 9 | import mimetypes |
9 | 10 | import os |
10 | 11 | import re |
|
20 | 21 | from xml.sax.saxutils import unescape |
21 | 22 |
|
22 | 23 | import latex2mathml.converter |
| 24 | +import latex2mathml.exceptions |
23 | 25 | import pandas as pd |
24 | 26 | import yaml |
25 | 27 | from PIL import Image as PILImage |
|
44 | 46 | from docling_core.types.doc.tokens import DocumentToken, TableToken |
45 | 47 | from docling_core.types.doc.utils import relative_path |
46 | 48 |
|
| 49 | +_logger = logging.getLogger(__name__) |
| 50 | + |
47 | 51 | Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))] |
48 | 52 | LevelNumber = typing.Annotated[int, Field(ge=1, le=100)] |
49 | 53 | CURRENT_VERSION: Final = "1.0.0" |
@@ -2487,34 +2491,47 @@ def _prepare_tag_content( |
2487 | 2491 | ) |
2488 | 2492 | text = "" |
2489 | 2493 |
|
2490 | | - # If the formula is not processed correcty, use its image |
2491 | | - if ( |
2492 | | - item.text == "" |
2493 | | - and item.orig != "" |
2494 | | - and image_mode == ImageRefMode.EMBEDDED |
2495 | | - and len(item.prov) > 0 |
2496 | | - ): |
| 2494 | + def _image_fallback(item: TextItem): |
2497 | 2495 | item_image = item.get_image(doc=self) |
2498 | 2496 | if item_image is not None: |
2499 | 2497 | img_ref = ImageRef.from_pil(item_image, dpi=72) |
2500 | | - text = ( |
| 2498 | + return ( |
2501 | 2499 | "<figure>" |
2502 | 2500 | f'<img src="{img_ref.uri}" alt="{item.orig}" />' |
2503 | 2501 | "</figure>" |
2504 | 2502 | ) |
2505 | 2503 |
|
| 2504 | + # If the formula is not processed correctly, use its image |
| 2505 | + if ( |
| 2506 | + item.text == "" |
| 2507 | + and item.orig != "" |
| 2508 | + and image_mode == ImageRefMode.EMBEDDED |
| 2509 | + and len(item.prov) > 0 |
| 2510 | + ): |
| 2511 | + text = _image_fallback(item) |
| 2512 | + |
2506 | 2513 | # Building a math equation in MathML format |
2507 | 2514 | # ref https://www.w3.org/TR/wai-aria-1.1/#math |
2508 | 2515 | elif formula_to_mathml: |
2509 | | - mathml_element = latex2mathml.converter.convert_to_element( |
2510 | | - math_formula, display="block" |
2511 | | - ) |
2512 | | - annotation = SubElement( |
2513 | | - mathml_element, "annotation", dict(encoding="TeX") |
2514 | | - ) |
2515 | | - annotation.text = math_formula |
2516 | | - mathml = unescape(tostring(mathml_element, encoding="unicode")) |
2517 | | - text = f"<div>{mathml}</div>" |
| 2516 | + try: |
| 2517 | + mathml_element = latex2mathml.converter.convert_to_element( |
| 2518 | + math_formula, display="block" |
| 2519 | + ) |
| 2520 | + annotation = SubElement( |
| 2521 | + mathml_element, "annotation", dict(encoding="TeX") |
| 2522 | + ) |
| 2523 | + annotation.text = math_formula |
| 2524 | + mathml = unescape(tostring(mathml_element, encoding="unicode")) |
| 2525 | + text = f"<div>{mathml}</div>" |
| 2526 | + except Exception as err: |
| 2527 | + _logger.warning( |
| 2528 | + "Malformed formula cannot be rendered. " |
| 2529 | + f"Error {err.__class__.__name__}, formula={math_formula}" |
| 2530 | + ) |
| 2531 | + if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0: |
| 2532 | + text = _image_fallback(item) |
| 2533 | + else: |
| 2534 | + text = f"<pre>{math_formula}</pre>" |
2518 | 2535 |
|
2519 | 2536 | elif math_formula != "": |
2520 | 2537 | text = f"<pre>{math_formula}</pre>" |
|
0 commit comments