Skip to content

Commit aec29a7

Browse files
authored
fix(markdown): ensure correct parsing of nested lists (#1995)
* fix(markdown): ensure correct parsing of nested lists

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* chore: update dependencies in uv.lock file

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

---------

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent 1985841 commit aec29a7

File tree

6 files changed

+778
-563
lines changed

6 files changed

+778
-563
lines changed

docling/backend/md_backend.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from enum import Enum
66
from io import BytesIO
77
from pathlib import Path
8-
from typing import List, Literal, Optional, Set, Union
8+
from typing import Literal, Optional, Union, cast
99

1010
import marko
1111
import marko.element
@@ -14,6 +14,7 @@
1414
DocItemLabel,
1515
DoclingDocument,
1616
DocumentOrigin,
17+
ListItem,
1718
NodeItem,
1819
TableCell,
1920
TableData,
@@ -89,7 +90,7 @@ def replace_match(match):
8990
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
9091
super().__init__(in_doc, path_or_stream)
9192

92-
_log.debug("MD INIT!!!")
93+
_log.debug("Starting MarkdownDocumentBackend...")
9394

9495
# Markdown file:
9596
self.path_or_stream = path_or_stream
@@ -131,7 +132,7 @@ def _close_table(self, doc: DoclingDocument):
131132
for md_table_row in self.md_table_buffer:
132133
_log.debug(md_table_row)
133134
_log.debug("=== TABLE END ===")
134-
tcells: List[TableCell] = []
135+
tcells: list[TableCell] = []
135136
result_table = []
136137
for n, md_table_row in enumerate(self.md_table_buffer):
137138
data = []
@@ -232,11 +233,12 @@ def _iterate_elements( # noqa: C901
232233
element: marko.element.Element,
233234
depth: int,
234235
doc: DoclingDocument,
235-
visited: Set[marko.element.Element],
236+
visited: set[marko.element.Element],
236237
creation_stack: list[
237238
_CreationPayload
238239
], # stack for lazy item creation triggered deep in marko's AST (on RawText)
239240
list_ordered_flag_by_ref: dict[str, bool],
241+
list_last_item_by_ref: dict[str, ListItem],
240242
parent_item: Optional[NodeItem] = None,
241243
formatting: Optional[Formatting] = None,
242244
hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -279,7 +281,7 @@ def _iterate_elements( # noqa: C901
279281

280282
elif (
281283
isinstance(element, marko.block.ListItem)
282-
and len(element.children) == 1
284+
and len(element.children) > 0
283285
and isinstance((child := element.children[0]), marko.block.Paragraph)
284286
and len(child.children) > 0
285287
):
@@ -291,7 +293,15 @@ def _iterate_elements( # noqa: C901
291293
if parent_item
292294
else False
293295
)
294-
if len(child.children) > 1: # inline group will be created further down
296+
non_list_children: list[marko.element.Element] = [
297+
item
298+
for item in child.children
299+
if not isinstance(item, marko.block.ListItem)
300+
]
301+
if len(non_list_children) > 1: # inline group will be created further down
302+
parent_ref: Optional[str] = (
303+
parent_item.self_ref if parent_item else None
304+
)
295305
parent_item = self._create_list_item(
296306
doc=doc,
297307
parent_item=parent_item,
@@ -300,6 +310,8 @@ def _iterate_elements( # noqa: C901
300310
formatting=formatting,
301311
hyperlink=hyperlink,
302312
)
313+
if parent_ref:
314+
list_last_item_by_ref[parent_ref] = cast(ListItem, parent_item)
303315
else:
304316
creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
305317

@@ -334,9 +346,11 @@ def _iterate_elements( # noqa: C901
334346
element.dest
335347
)
336348

337-
elif isinstance(element, marko.inline.RawText):
338-
_log.debug(f" - Paragraph (raw text): {element.children}")
339-
snippet_text = element.children.strip()
349+
elif isinstance(element, (marko.inline.RawText, marko.inline.Literal)):
350+
_log.debug(f" - RawText/Literal: {element.children}")
351+
snippet_text = (
352+
element.children.strip() if isinstance(element.children, str) else ""
353+
)
340354
# Detect start of the table:
341355
if "|" in snippet_text or self.in_table:
342356
# most likely part of the markdown table
@@ -359,6 +373,7 @@ def _iterate_elements( # noqa: C901
359373
if parent_item
360374
else False
361375
)
376+
parent_ref = parent_item.self_ref if parent_item else None
362377
parent_item = self._create_list_item(
363378
doc=doc,
364379
parent_item=parent_item,
@@ -367,6 +382,11 @@ def _iterate_elements( # noqa: C901
367382
formatting=formatting,
368383
hyperlink=hyperlink,
369384
)
385+
if parent_ref:
386+
list_last_item_by_ref[parent_ref] = cast(
387+
ListItem, parent_item
388+
)
389+
370390
elif isinstance(to_create, _HeadingCreationPayload):
371391
# not keeping as parent_item as logic for correctly tracking
372392
# that not implemented yet (section components not captured
@@ -458,13 +478,25 @@ def _iterate_elements( # noqa: C901
458478
element, processed_block_types
459479
):
460480
for child in element.children:
481+
if (
482+
isinstance(element, marko.block.ListItem)
483+
and isinstance(child, marko.block.List)
484+
and parent_item
485+
and list_last_item_by_ref.get(parent_item.self_ref, None)
486+
):
487+
_log.debug(
488+
f"walking into new List hanging from item of parent list {parent_item.self_ref}"
489+
)
490+
parent_item = list_last_item_by_ref[parent_item.self_ref]
491+
461492
self._iterate_elements(
462493
element=child,
463494
depth=depth + 1,
464495
doc=doc,
465496
visited=visited,
466497
creation_stack=creation_stack,
467498
list_ordered_flag_by_ref=list_ordered_flag_by_ref,
499+
list_last_item_by_ref=list_last_item_by_ref,
468500
parent_item=parent_item,
469501
formatting=formatting,
470502
hyperlink=hyperlink,
@@ -483,7 +515,7 @@ def supports_pagination(cls) -> bool:
483515
return False
484516

485517
@classmethod
486-
def supported_formats(cls) -> Set[InputFormat]:
518+
def supported_formats(cls) -> set[InputFormat]:
487519
return {InputFormat.MD}
488520

489521
def convert(self) -> DoclingDocument:
@@ -510,6 +542,7 @@ def convert(self) -> DoclingDocument:
510542
visited=set(),
511543
creation_stack=[],
512544
list_ordered_flag_by_ref={},
545+
list_last_item_by_ref={},
513546
)
514547
self._close_table(doc=doc) # handle any last hanging table
515548

@@ -534,7 +567,6 @@ def _restore_original_html(txt, regex):
534567
]:
535568
html_str = _restore_original_html(txt=html_str, regex=regex)
536569
self._html_blocks = 0
537-
538570
# delegate to HTML backend
539571
stream = BytesIO(bytes(html_str, encoding="utf-8"))
540572
in_doc = InputDocument(

tests/data/groundtruth/docling_v2/mixed_without_h1.md.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
- A. first
44
- subitem
55
- B. second
6-
1. strange
6+
- 2 . strange
77

88
The end!
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
body:
2+
children:
3+
- $ref: '#/texts/0'
4+
- $ref: '#/texts/1'
5+
- $ref: '#/groups/0'
6+
content_layer: body
7+
label: unspecified
8+
name: _root_
9+
self_ref: '#/body'
10+
form_items: []
11+
furniture:
12+
children: []
13+
content_layer: furniture
14+
label: unspecified
15+
name: _root_
16+
self_ref: '#/furniture'
17+
groups:
18+
- children:
19+
- $ref: '#/texts/2'
20+
content_layer: body
21+
label: section
22+
name: header-1
23+
parent:
24+
$ref: '#/body'
25+
self_ref: '#/groups/0'
26+
- children:
27+
- $ref: '#/texts/3'
28+
- $ref: '#/texts/5'
29+
- $ref: '#/texts/6'
30+
content_layer: body
31+
label: list
32+
name: list
33+
parent:
34+
$ref: '#/texts/2'
35+
self_ref: '#/groups/1'
36+
- children:
37+
- $ref: '#/texts/4'
38+
content_layer: body
39+
label: list
40+
name: list
41+
parent:
42+
$ref: '#/texts/3'
43+
self_ref: '#/groups/2'
44+
key_value_items: []
45+
name: mixed_without_h1
46+
origin:
47+
binary_hash: 7394721163373597328
48+
filename: mixed_without_h1.md
49+
mimetype: text/html
50+
pages: {}
51+
pictures: []
52+
schema_name: DoclingDocument
53+
tables: []
54+
texts:
55+
- children: []
56+
content_layer: furniture
57+
label: title
58+
orig: mixed_without_h1
59+
parent:
60+
$ref: '#/body'
61+
prov: []
62+
self_ref: '#/texts/0'
63+
text: mixed_without_h1
64+
- children: []
65+
content_layer: furniture
66+
label: text
67+
orig: Content before first heading
68+
parent:
69+
$ref: '#/body'
70+
prov: []
71+
self_ref: '#/texts/1'
72+
text: Content before first heading
73+
- children:
74+
- $ref: '#/groups/1'
75+
- $ref: '#/texts/7'
76+
content_layer: body
77+
label: section_header
78+
level: 1
79+
orig: Some heading
80+
parent:
81+
$ref: '#/groups/0'
82+
prov: []
83+
self_ref: '#/texts/2'
84+
text: Some heading
85+
- children:
86+
- $ref: '#/groups/2'
87+
content_layer: body
88+
enumerated: false
89+
label: list_item
90+
marker: ''
91+
orig: A. first
92+
parent:
93+
$ref: '#/groups/1'
94+
prov: []
95+
self_ref: '#/texts/3'
96+
text: A. first
97+
- children: []
98+
content_layer: body
99+
enumerated: false
100+
label: list_item
101+
marker: ''
102+
orig: subitem
103+
parent:
104+
$ref: '#/groups/2'
105+
prov: []
106+
self_ref: '#/texts/4'
107+
text: subitem
108+
- children: []
109+
content_layer: body
110+
enumerated: false
111+
label: list_item
112+
marker: ''
113+
orig: B. second
114+
parent:
115+
$ref: '#/groups/1'
116+
prov: []
117+
self_ref: '#/texts/5'
118+
text: B. second
119+
- children: []
120+
content_layer: body
121+
enumerated: false
122+
label: list_item
123+
marker: ''
124+
orig: 2 . strange
125+
parent:
126+
$ref: '#/groups/1'
127+
prov: []
128+
self_ref: '#/texts/6'
129+
text: 2 . strange
130+
- children: []
131+
content_layer: body
132+
label: text
133+
orig: The end!
134+
parent:
135+
$ref: '#/texts/2'
136+
prov: []
137+
self_ref: '#/texts/7'
138+
text: The end!
139+
version: 1.5.0

tests/data/md/mixed_without_h1.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ Content before first heading
77
- A. first
88
- subitem
99
- B. second
10-
- 2. strange
10+
- 2\. strange
1111

1212
The end!

tests/test_backend_markdown.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def test_convert_valid():
1616
relevant_paths = sorted((root_path / "md").rglob("*.md"))
1717
assert len(relevant_paths) > 0
1818

19-
yaml_filter = ["inline_and_formatting"]
19+
yaml_filter = ["inline_and_formatting", "mixed_without_h1"]
2020

2121
for in_path in relevant_paths:
2222
md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
@@ -41,17 +41,16 @@ def test_convert_valid():
4141
f.write(f"{act_data}\n")
4242

4343
if in_path.stem in yaml_filter:
44-
with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
45-
act_doc.save_as_yaml(
46-
yaml_gt_path,
47-
coord_precision=COORD_PREC,
48-
confid_precision=CONFID_PREC,
49-
)
44+
act_doc.save_as_yaml(
45+
yaml_gt_path,
46+
coord_precision=COORD_PREC,
47+
confid_precision=CONFID_PREC,
48+
)
5049
else:
5150
with open(md_gt_path, encoding="utf-8") as f:
5251
exp_data = f.read().rstrip()
5352
assert act_data == exp_data
5453

5554
if in_path.stem in yaml_filter:
5655
exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
57-
assert act_doc == exp_doc
56+
assert act_doc == exp_doc, f"export to yaml failed on {in_path}"

0 commit comments

Comments
 (0)