fix(markdown): preserve underscores in image URLs during markdown export (#98)

Aboysky · web-flow · commit fd7529f4096e · 2024-12-09T11:38:23.000+01:00
Signed-off-by: Boyce <aboyskyi@gmail.com>
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -2115,10 +2115,30 @@ def export_to_markdown(  # noqa: C901
         # Bold, Italic, or Bold-Italic
         # Hence, any underscore that we print into Markdown is coming from document text
         # That means we need to escape it, to properly reflect content in the markdown
+        # However, we need to preserve underscores in image URLs
+        # to maintain their validity
+        # For example: ![image](path/to_image.png) should remain unchanged
         def escape_underscores(text):
-            # Replace "_" with "\_" only if it's not already escaped
-            escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
-            return escaped_text
+            """Escape underscores but leave them intact in the URL.."""
+            # Firstly, identify all the URL patterns.
+            url_pattern = r"!\[.*?\]\((.*?)\)"
+            parts = []
+            last_end = 0
+
+            for match in re.finditer(url_pattern, text):
+                # Text to add before the URL (needs to be escaped)
+                before_url = text[last_end : match.start()]
+                parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
+
+                # Add the full URL part (do not escape)
+                parts.append(match.group(0))
+                last_end = match.end()
+
+            # Add the final part of the text (which needs to be escaped)
+            if last_end < len(text):
+                parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
+
+            return "".join(parts)
 
         mdtext = escape_underscores(mdtext)
 
diff --git a/test/data/doc/constructed_doc.referenced.md.gt b/test/data/doc/constructed_doc.referenced.md.gt
@@ -31,4 +31,4 @@ This is the caption of figure 1.
 
 This is the caption of figure 2.
 
-![Image](constructed\_images/image\_000001\_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png)
+![Image](constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png)
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
@@ -670,7 +670,6 @@ def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
 
     for p in paths:
         instr = instr.replace(str(p), str(p.name))
-        instr = instr.replace(str(p).replace("_", "\\_"), str(p.name))
 
     return instr
 

Original file line number	Diff line number	Diff line change
`@@ -31,4 +31,4 @@ This is the caption of figure 1.`
`31`	`31`
`32`	`32`	`This is the caption of figure 2.`
`33`	`33`
`34`		`-![Image](constructed\_images/image\_000001\_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png)`
	`34`	`+![Image](constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png)`