Skip to content

Commit fd7529f

Browse files
authored
fix(markdown): preserve underscores in image URLs during markdown export (#98)
Signed-off-by: Boyce <[email protected]>
1 parent 3a158b9 commit fd7529f

File tree

3 files changed

+24
-5
lines changed

3 files changed

+24
-5
lines changed

docling_core/types/doc/document.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2115,10 +2115,30 @@ def export_to_markdown( # noqa: C901
21152115
# Bold, Italic, or Bold-Italic
21162116
# Hence, any underscore that we print into Markdown is coming from document text
21172117
# That means we need to escape it, to properly reflect content in the markdown
2118+
# However, we need to preserve underscores in image URLs
2119+
# to maintain their validity
2120+
# For example: ![image](path/to_image.png) should remain unchanged
21182121
def escape_underscores(text):
2119-
# Replace "_" with "\_" only if it's not already escaped
2120-
escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
2121-
return escaped_text
2122+
"""Escape underscores but leave them intact in image URLs."""
2123+
# First, find every Markdown image link of the form ![alt](url).
2124+
url_pattern = r"!\[.*?\]\((.*?)\)"
2125+
parts = []
2126+
last_end = 0
2127+
2128+
for match in re.finditer(url_pattern, text):
2129+
# Text to add before the URL (needs to be escaped)
2130+
before_url = text[last_end : match.start()]
2131+
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2132+
2133+
# Add the whole image link, alt text and URL, unchanged (do not escape)
2134+
parts.append(match.group(0))
2135+
last_end = match.end()
2136+
2137+
# Add the final part of the text (which needs to be escaped)
2138+
if last_end < len(text):
2139+
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2140+
2141+
return "".join(parts)
21222142

21232143
mdtext = escape_underscores(mdtext)
21242144

test/data/doc/constructed_doc.referenced.md.gt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ This is the caption of figure 1.
3131

3232
This is the caption of figure 2.
3333

34-
![Image](constructed\_images/image\_000001\_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png)
34+
![Image](constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png)

test/test_docling_doc.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,6 @@ def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
670670

671671
for p in paths:
672672
instr = instr.replace(str(p), str(p.name))
673-
instr = instr.replace(str(p).replace("_", "\\_"), str(p.name))
674673

675674
return instr
676675

0 commit comments

Comments
 (0)