Skip to content

Commit 0d5cd11

Browse files
authored
feat: escape underscores that are within latex equations (#137)
* Add regex to escape underscores within latex equations

  Signed-off-by: Rafael Teixeira de Lima <[email protected]>

* Remove debug

  Signed-off-by: Rafael Teixeira de Lima <[email protected]>

* Handle block equations

  Signed-off-by: Rafael Teixeira de Lima <[email protected]>

* Add double $ to standalone equations outputs

  Signed-off-by: Rafael Teixeira de Lima <[email protected]>

---------

Signed-off-by: Rafael Teixeira de Lima <[email protected]>
1 parent c9739b2 commit 0d5cd11

File tree

1 file changed

+9
-1
lines changed

1 file changed

+9
-1
lines changed

docling_core/types/doc/document.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2167,6 +2167,10 @@ def export_to_markdown( # noqa: C901
21672167
text = f"{list_indent}{marker} {item.text}"
21682168
mdtexts.append(text)
21692169

2170+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2171+
in_list = False
2172+
mdtexts.append(f"$${item.text}$$")
2173+
21702174
elif isinstance(item, TextItem) and item.label in labels:
21712175
in_list = False
21722176
if len(item.text) and text_width > 0:
@@ -2215,10 +2219,14 @@ def escape_underscores(text):
22152219
"""Escape underscores but leave them intact in the URL.."""
22162220
# Firstly, identify all the URL patterns.
22172221
url_pattern = r"!\[.*?\]\((.*?)\)"
2222+
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2223+
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2224+
combined_pattern = f"({url_pattern})|({latex_pattern})"
2225+
22182226
parts = []
22192227
last_end = 0
22202228

2221-
for match in re.finditer(url_pattern, text):
2229+
for match in re.finditer(combined_pattern, text):
22222230
# Text to add before the URL (needs to be escaped)
22232231
before_url = text[last_end : match.start()]
22242232
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))

0 commit comments

Comments
 (0)