Skip to content

Commit c6590e8

Browse files
authored
fix: add html escape in md export and fix formula escapes (#143)
* add html escape in md export and fix formula escapes

  Signed-off-by: Michele Dolfi <[email protected]>

* escape also html export

  Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
1 parent 0519d50 commit c6590e8

File tree

4 files changed

+90
-66
lines changed

4 files changed

+90
-66
lines changed

docling_core/types/doc/document.py

Lines changed: 83 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import base64
44
import copy
55
import hashlib
6+
import html
67
import json
78
import mimetypes
89
import os
@@ -1045,7 +1046,7 @@ def export_to_html(
10451046

10461047
text = ""
10471048
if doc is not None and add_caption and len(self.captions):
1048-
text = self.caption_text(doc)
1049+
text = html.escape(self.caption_text(doc))
10491050

10501051
if len(self.data.table_cells) == 0:
10511052
return ""
@@ -1071,7 +1072,7 @@ def export_to_html(
10711072
if colstart != j:
10721073
continue
10731074

1074-
content = cell.text.strip()
1075+
content = html.escape(cell.text.strip())
10751076
celltag = "td"
10761077
if cell.column_header:
10771078
celltag = "th"
@@ -2082,6 +2083,46 @@ def export_to_markdown( # noqa: C901
20822083
previous_level = 0 # Track the previous item's level
20832084
in_list = False # Track if we're currently processing list items
20842085

2086+
# Our export markdown doesn't contain any emphasis styling:
2087+
# Bold, Italic, or Bold-Italic
2088+
# Hence, any underscore that we print into Markdown is coming from document text
2089+
# That means we need to escape it, to properly reflect content in the markdown
2090+
# However, we need to preserve underscores in image URLs
2091+
# to maintain their validity
2092+
# For example: ![image](path/to_image.png) should remain unchanged
2093+
def _escape_underscores(text):
2094+
"""Escape underscores but leave them intact in the URL.."""
2095+
# Firstly, identify all the URL patterns.
2096+
url_pattern = r"!\[.*?\]\((.*?)\)"
2097+
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2098+
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2099+
combined_pattern = f"({url_pattern})|({latex_pattern})"
2100+
2101+
parts = []
2102+
last_end = 0
2103+
2104+
for match in re.finditer(combined_pattern, text):
2105+
# Text to add before the URL (needs to be escaped)
2106+
before_url = text[last_end : match.start()]
2107+
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2108+
2109+
# Add the full URL part (do not escape)
2110+
parts.append(match.group(0))
2111+
last_end = match.end()
2112+
2113+
# Add the final part of the text (which needs to be escaped)
2114+
if last_end < len(text):
2115+
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2116+
2117+
return "".join(parts)
2118+
2119+
def _append_text(
    text: str, do_escape_html: bool = True, do_escape_underscores: bool = True
) -> None:
    """Append ``text`` to ``mdtexts`` after the requested escaping steps.

    Args:
        text: Markdown fragment to append.
        do_escape_html: When true, pass the text through
            ``html.escape(..., quote=False)``.
        do_escape_underscores: When true, and the enclosing
            ``escaping_underscores`` flag is also set, escape underscores
            with ``_escape_underscores``.
    """
    # Underscore escaping runs first, on the raw text; HTML escaping is then
    # applied to that result.
    if do_escape_underscores and escaping_underscores:
        text = _escape_underscores(text)
    if do_escape_html:
        text = html.escape(text, quote=False)
    mdtexts.append(text)
2125+
20852126
for ix, (item, level) in enumerate(
20862127
self.iterate_items(self.body, with_groups=True, page_no=page_no)
20872128
):
@@ -2130,7 +2171,7 @@ def export_to_markdown( # noqa: C901
21302171
in_list = False
21312172
marker = "" if strict_text else "#"
21322173
text = f"{marker} {item.text}"
2133-
mdtexts.append(text.strip() + "\n")
2174+
_append_text(text.strip() + "\n")
21342175

21352176
elif (
21362177
isinstance(item, TextItem)
@@ -2143,12 +2184,12 @@ def export_to_markdown( # noqa: C901
21432184
if len(marker) < 2:
21442185
marker = "##"
21452186
text = f"{marker} {item.text}\n"
2146-
mdtexts.append(text.strip() + "\n")
2187+
_append_text(text.strip() + "\n")
21472188

21482189
elif isinstance(item, CodeItem) and item.label in labels:
21492190
in_list = False
21502191
text = f"```\n{item.text}\n```\n"
2151-
mdtexts.append(text)
2192+
_append_text(text, do_escape_underscores=False, do_escape_html=False)
21522193

21532194
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
21542195
in_list = True
@@ -2165,85 +2206,54 @@ def export_to_markdown( # noqa: C901
21652206
marker = "-" # Markdown needs only dash as item marker.
21662207

21672208
text = f"{list_indent}{marker} {item.text}"
2168-
mdtexts.append(text)
2209+
_append_text(text)
21692210

21702211
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
21712212
in_list = False
2172-
mdtexts.append(f"$${item.text}$$\n")
2213+
_append_text(
2214+
f"$${item.text}$$\n",
2215+
do_escape_underscores=False,
2216+
do_escape_html=False,
2217+
)
21732218

21742219
elif isinstance(item, TextItem) and item.label in labels:
21752220
in_list = False
21762221
if len(item.text) and text_width > 0:
2222+
text = item.text
21772223
wrapped_text = textwrap.fill(text, width=text_width)
2178-
mdtexts.append(wrapped_text + "\n")
2224+
_append_text(wrapped_text + "\n")
21792225
elif len(item.text):
21802226
text = f"{item.text}\n"
2181-
mdtexts.append(text)
2227+
_append_text(text)
21822228

21832229
elif isinstance(item, TableItem) and not strict_text:
21842230
in_list = False
2185-
mdtexts.append(item.caption_text(self))
2231+
_append_text(item.caption_text(self))
21862232
md_table = item.export_to_markdown()
2187-
mdtexts.append("\n" + md_table + "\n")
2233+
_append_text("\n" + md_table + "\n")
21882234

21892235
elif isinstance(item, PictureItem) and not strict_text:
21902236
in_list = False
2191-
mdtexts.append(item.caption_text(self))
2237+
_append_text(item.caption_text(self))
21922238

21932239
line = item.export_to_markdown(
21942240
doc=self,
21952241
image_placeholder=image_placeholder,
21962242
image_mode=image_mode,
21972243
)
21982244

2199-
mdtexts.append(line)
2245+
_append_text(line, do_escape_html=False, do_escape_underscores=False)
22002246

22012247
elif isinstance(item, DocItem) and item.label in labels:
22022248
in_list = False
2203-
text = "<missing-text>"
2204-
mdtexts.append(text)
2249+
text = "<!-- missing-text -->"
2250+
_append_text(text, do_escape_html=False, do_escape_underscores=False)
22052251

22062252
mdtext = (delim.join(mdtexts)).strip()
22072253
mdtext = re.sub(
22082254
r"\n\n\n+", "\n\n", mdtext
22092255
) # remove cases of double or more empty lines.
22102256

2211-
# Our export markdown doesn't contain any emphasis styling:
2212-
# Bold, Italic, or Bold-Italic
2213-
# Hence, any underscore that we print into Markdown is coming from document text
2214-
# That means we need to escape it, to properly reflect content in the markdown
2215-
# However, we need to preserve underscores in image URLs
2216-
# to maintain their validity
2217-
# For example: ![image](path/to_image.png) should remain unchanged
2218-
def escape_underscores(text):
2219-
"""Escape underscores but leave them intact in the URL.."""
2220-
# Firstly, identify all the URL patterns.
2221-
url_pattern = r"!\[.*?\]\((.*?)\)"
2222-
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2223-
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2224-
combined_pattern = f"({url_pattern})|({latex_pattern})"
2225-
2226-
parts = []
2227-
last_end = 0
2228-
2229-
for match in re.finditer(combined_pattern, text):
2230-
# Text to add before the URL (needs to be escaped)
2231-
before_url = text[last_end : match.start()]
2232-
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2233-
2234-
# Add the full URL part (do not escape)
2235-
parts.append(match.group(0))
2236-
last_end = match.end()
2237-
2238-
# Add the final part of the text (which needs to be escaped)
2239-
if last_end < len(text):
2240-
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2241-
2242-
return "".join(parts)
2243-
2244-
if escaping_underscores:
2245-
mdtext = escape_underscores(mdtext)
2246-
22472257
return mdtext
22482258

22492259
def export_to_text( # noqa: C901
@@ -2371,6 +2381,11 @@ def close_lists(
23712381

23722382
in_ordered_list: List[bool] = [] # False
23732383

2384+
def _sanitize_text(text: str, do_escape_html=True) -> str:
2385+
if do_escape_html:
2386+
text = html.escape(text, quote=False)
2387+
return text
2388+
23742389
for ix, (item, curr_level) in enumerate(
23752390
self.iterate_items(self.body, with_groups=True, page_no=page_no)
23762391
):
@@ -2421,14 +2436,17 @@ def close_lists(
24212436

24222437
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
24232438

2424-
text = f"<h1>{item.text}</h1>"
2439+
text = f"<h1>{_sanitize_text(item.text)}</h1>"
24252440
html_texts.append(text.strip())
24262441

24272442
elif isinstance(item, SectionHeaderItem):
24282443

24292444
section_level: int = item.level + 1
24302445

2431-
text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
2446+
text = (
2447+
f"<h{(section_level)}>"
2448+
f"{_sanitize_text(item.text)}</h{(section_level)}>"
2449+
)
24322450
html_texts.append(text.strip())
24332451

24342452
elif isinstance(item, TextItem) and item.label in [
@@ -2443,31 +2461,37 @@ def close_lists(
24432461
if section_level >= 6:
24442462
section_level = 6
24452463

2446-
text = f"<h{section_level}>{item.text}</h{section_level}>"
2464+
text = (
2465+
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
2466+
)
24472467
html_texts.append(text.strip())
24482468

24492469
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
24502470

2451-
text = f"<pre>{item.text}</pre>"
2471+
text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
24522472
html_texts.append(text)
24532473

24542474
elif isinstance(item, ListItem):
24552475

2456-
text = f"<li>{item.text}</li>"
2476+
text = f"<li>{_sanitize_text(item.text)}</li>"
24572477
html_texts.append(text)
24582478

24592479
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
24602480

2461-
text = f"<li>{item.text}</li>"
2481+
text = f"<li>{_sanitize_text(item.text)}</li>"
24622482
html_texts.append(text)
24632483

24642484
elif isinstance(item, CodeItem) and item.label in labels:
2465-
text = f"<pre><code>{item.text}</code></pre>"
2485+
text = (
2486+
"<pre><code>"
2487+
f"{_sanitize_text(item.text, do_escape_html=False)}"
2488+
"</code></pre>"
2489+
)
24662490
html_texts.append(text.strip())
24672491

24682492
elif isinstance(item, TextItem) and item.label in labels:
24692493

2470-
text = f"<p>{item.text}</p>"
2494+
text = f"<p>{_sanitize_text(item.text)}</p>"
24712495
html_texts.append(text.strip())
24722496
elif isinstance(item, TableItem):
24732497

0 commit comments

Comments
 (0)