Skip to content

Commit 92965fb

Browse files
add fenced-code extension to the md parser (#4044)
#3578 --------- Co-authored-by: qued <[email protected]> Co-authored-by: Alan Bertl <[email protected]>
1 parent f078cd9 commit 92965fb

File tree

5 files changed

+81
-29
lines changed

5 files changed

+81
-29
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.18.5-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
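- **Improve markdown code block handling** Fenced code blocks in markdown are now parsed with the `fenced_code` extension and returned as plain text. Previously, their contents (e.g. HTML or XML samples) were processed as embedded markup instead of plain text.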
9+
110
## 0.18.4
211

312
### Enhancements

example-docs/codeblock.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
## HTML Example
2+
3+
```html
4+
<!DOCTYPE html>
5+
<html lang="en">
6+
<head>
7+
<meta charset="UTF-8">
8+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
9+
<title>Sample HTML</title>
10+
</head>
11+
<body>
12+
<h1>Hello, World!</h1>
13+
<p>This is a simple HTML example.</p>
14+
</body>
15+
</html>
16+
```
17+
## XML Example
18+
```xml
19+
<note>
20+
<to>Tove</to>
21+
<from>Jani</from>
22+
<heading>Reminder</heading>
23+
<body>Don't forget me this weekend!</body>
24+
</note>
25+
```
26+
```xml
27+
<note>
28+
<to>Tove</to>
29+
<from>Jani</from>
30+
<heading>Reminder</heading>
31+
<body>Don't forget me this weekend!</body>
32+
</note>
33+
```

test_unstructured/partition/test_md.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,3 +296,39 @@ def test_partition_md_non_xml_processing_instruction():
296296

297297
elements = partition_md(text=php_content)
298298
assert len(elements) == 1
299+
300+
301+
def test_partition_fenced_code():
302+
filename = example_doc_path("codeblock.md")
303+
elements = partition_md(filename=filename)
304+
305+
# Should have 5 elements: 2 titles and 3 code blocks
306+
assert len(elements) == 5
307+
308+
assert elements[0].text == "HTML Example"
309+
310+
expected_html = """<!DOCTYPE html>
311+
<html lang="en">
312+
<head>
313+
<meta charset="UTF-8">
314+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
315+
<title>Sample HTML</title>
316+
</head>
317+
<body>
318+
<h1>Hello, World!</h1>
319+
<p>This is a simple HTML example.</p>
320+
</body>
321+
</html>"""
322+
assert elements[1].text == expected_html
323+
324+
assert elements[2].text == "XML Example"
325+
326+
expected_xml = """<note>
327+
<to>Tove</to>
328+
<from>Jani</from>
329+
<heading>Reminder</heading>
330+
<body>Don't forget me this weekend!</body>
331+
</note>"""
332+
assert elements[3].text == expected_xml
333+
334+
assert elements[4].text == expected_xml

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.4" # pragma: no cover
1+
__version__ = "0.18.5-dev0" # pragma: no cover

unstructured/partition/md.py

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import annotations
22

3-
import re
4-
from typing import IO, Any, Match
3+
from typing import IO, Any
54

65
import markdown
76
import requests
@@ -23,30 +22,6 @@ def optional_decode(contents: str | bytes) -> str:
2322
DETECTION_ORIGIN: str = "md"
2423

2524

26-
def _preprocess_markdown_code_blocks(text: str) -> str:
27-
"""Pre-process code blocks so that processing instructions can be properly escaped.
28-
29-
The markdown library can fail to properly escape processing instructions like <?xml>, <?php>,
30-
etc. in code blocks. This function adds minimal indentation to the processing instruction line
31-
to force markdown to treat it as text content rather than XML.
32-
"""
33-
# Breakdown of the regex:
34-
# ```\s*\n - Opening triple backticks + optional whitespace + newline
35-
# ([ \t]{0,3})? - Capture group 1: optional 0-3 spaces/tabs (existing indentation)
36-
# (<\?[a-zA-Z][^>]*\?>.*?) - Capture group 2: processing instruction + any following content
37-
# \n?``` - Optional newline + closing triple backticks
38-
code_block_pattern = r"```\s*\n([ \t]{0,3})?(<\?[a-zA-Z][^>]*\?>.*?)\n?```"
39-
40-
def indent_processing_instruction(match: Match[str]) -> str:
41-
content = match.group(2)
42-
# Ensure processing instruction has at least 4-space indentation
43-
if content.lstrip().startswith("<?"):
44-
content = " " + content.lstrip()
45-
return f"```\n{content}\n```"
46-
47-
return re.sub(code_block_pattern, indent_processing_instruction, text, flags=re.DOTALL)
48-
49-
5025
def partition_md(
5126
filename: str | None = None,
5227
file: IO[bytes] | None = None,
@@ -98,8 +73,7 @@ def partition_md(
9873

9974
text = response.text
10075

101-
processed_text = _preprocess_markdown_code_blocks(text)
102-
html = markdown.markdown(processed_text, extensions=["tables"])
76+
html = markdown.markdown(text, extensions=["tables", "fenced_code"])
10377

10478
return partition_html(
10579
text=html,

0 commit comments

Comments
 (0)