Skip to content

Commit c383f64

Browse files
authored
fix: fix misplaced list items (#317)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent ae96129 commit c383f64

File tree

5 files changed

+246
-2
lines changed

5 files changed

+246
-2
lines changed

docling_core/types/doc/document.py

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,6 +1819,18 @@ def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem:
18191819
item.parent = parent_ref
18201820

18211821
self.form_items.append(item)
1822+
1823+
elif isinstance(item, (UnorderedList, OrderedList, InlineGroup)):
1824+
item_label = "groups"
1825+
item_index = len(self.groups)
1826+
1827+
cref = f"#/{item_label}/{item_index}"
1828+
1829+
item.self_ref = cref
1830+
item.parent = parent_ref
1831+
1832+
self.groups.append(item)
1833+
18221834
else:
18231835
raise ValueError(f"Item {item} is not supported for insertion")
18241836

@@ -2154,8 +2166,8 @@ def add_list_item(
21542166
:param parent: Optional[NodeItem]: (Default value = None)
21552167
21562168
"""
2157-
if not parent:
2158-
parent = self.body
2169+
if not isinstance(parent, (OrderedList, UnorderedList)):
2170+
raise ValueError("ListItem's parent must be a list group")
21592171

21602172
if not orig:
21612173
orig = text
@@ -4197,3 +4209,58 @@ def validate_document(cls, d: "DoclingDocument"):
41974209
raise ValueError("Document hierachy is inconsistent.")
41984210

41994211
return d
4212+
4213+
@model_validator(mode="after")
def validate_misplaced_list_items(self):
    """Move list items that are not inside a list group into new list groups.

    Iterates over the whole document (all content layers, pictures included,
    groups included) and collects every ``ListItem`` whose parent is missing
    or does not resolve to an ``OrderedList`` / ``UnorderedList``. Misplaced
    items that immediately follow one another in iteration order form one
    run. For each run a new group is inserted before the run's first item
    (``OrderedList`` if that first item is enumerated, ``UnorderedList``
    otherwise), the run's items are deleted from the document, and
    equivalent list items are re-added as children of the new group.

    :returns: the document itself, after any repair has been applied.
    """
    # Find list items without a list parent, putting successive ones together.
    misplaced_runs: list[list[ListItem]] = []
    # Whether the previously iterated node was itself a misplaced list item.
    # Checking only "previous node is a ListItem" is not enough: a correctly
    # placed list item (last child of a real list group) directly followed by
    # a misplaced one would otherwise be treated as a continuing run and hit
    # an empty ``misplaced_runs`` or extend an unrelated earlier run.
    prev_was_misplaced = False
    for item, _ in self.iterate_items(
        traverse_pictures=True,
        included_content_layers=set(ContentLayer),
        with_groups=True,  # so that we can distinguish neighboring lists
    ):
        if isinstance(item, ListItem) and (
            item.parent is None
            or not isinstance(
                item.parent.resolve(doc=self), (OrderedList, UnorderedList)
            )
        ):
            if prev_was_misplaced:  # continuing the current run
                misplaced_runs[-1].append(item)
            else:  # start of a new run
                misplaced_runs.append([item])
            prev_was_misplaced = True
        else:
            prev_was_misplaced = False

    # NOTE(review): runs are processed last-to-first, presumably so that the
    # insertions/deletions of one run do not disturb the runs still to be
    # processed -- confirm against insert_item_before_sibling/delete_items.
    for curr_list_items in reversed(misplaced_runs):
        first_item = curr_list_items[0]

        # add group; its kind follows the first item of the run
        new_group = (
            OrderedList(self_ref="#")
            if first_item.enumerated
            else UnorderedList(self_ref="#")
        )
        self.insert_item_before_sibling(
            new_item=new_group,
            sibling=first_item,
        )

        # delete list items from document (should not be affected by group addition)
        self.delete_items(node_items=curr_list_items)

        # add list items to new group
        for li in curr_list_items:
            self.add_list_item(
                text=li.text,
                enumerated=li.enumerated,
                marker=li.marker,
                orig=li.orig,
                # only the first provenance entry (if any) is carried over
                prov=li.prov[0] if li.prov else None,
                parent=new_group,
                content_layer=li.content_layer,
                formatting=li.formatting,
                hyperlink=li.hyperlink,
            )
    return self
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
body:
2+
children:
3+
- $ref: '#/groups/1'
4+
- $ref: '#/texts/0'
5+
- $ref: '#/groups/0'
6+
content_layer: body
7+
label: unspecified
8+
name: _root_
9+
self_ref: '#/body'
10+
form_items: []
11+
furniture:
12+
children: []
13+
content_layer: furniture
14+
label: unspecified
15+
name: _root_
16+
self_ref: '#/furniture'
17+
groups:
18+
- children:
19+
- $ref: '#/texts/1'
20+
- $ref: '#/texts/2'
21+
content_layer: body
22+
label: list
23+
name: group
24+
parent:
25+
$ref: '#/body'
26+
self_ref: '#/groups/0'
27+
- children:
28+
- $ref: '#/texts/3'
29+
content_layer: body
30+
label: ordered_list
31+
name: group
32+
parent:
33+
$ref: '#/body'
34+
self_ref: '#/groups/1'
35+
key_value_items: []
36+
name: ''
37+
pages: {}
38+
pictures: []
39+
schema_name: DoclingDocument
40+
tables: []
41+
texts:
42+
- children: []
43+
content_layer: body
44+
label: text
45+
orig: bar
46+
parent:
47+
$ref: '#/body'
48+
prov: []
49+
self_ref: '#/texts/0'
50+
text: bar
51+
- children: []
52+
content_layer: body
53+
enumerated: false
54+
label: list_item
55+
marker: '-'
56+
orig: here
57+
parent:
58+
$ref: '#/groups/0'
59+
prov: []
60+
self_ref: '#/texts/1'
61+
text: here
62+
- children: []
63+
content_layer: body
64+
enumerated: false
65+
label: list_item
66+
marker: '-'
67+
orig: there
68+
parent:
69+
$ref: '#/groups/0'
70+
prov: []
71+
self_ref: '#/texts/2'
72+
text: there
73+
- children: []
74+
content_layer: body
75+
enumerated: true
76+
label: list_item
77+
marker: '1.'
78+
orig: foo
79+
parent:
80+
$ref: '#/groups/1'
81+
prov: []
82+
self_ref: '#/texts/3'
83+
text: foo
84+
version: 1.4.0
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
body:
2+
children:
3+
- $ref: '#/texts/0'
4+
- $ref: '#/texts/1'
5+
- $ref: '#/texts/2'
6+
- $ref: '#/texts/3'
7+
content_layer: body
8+
label: unspecified
9+
name: _root_
10+
self_ref: '#/body'
11+
form_items: []
12+
furniture:
13+
children: []
14+
content_layer: furniture
15+
label: unspecified
16+
name: _root_
17+
self_ref: '#/furniture'
18+
groups: []
19+
key_value_items: []
20+
name: ''
21+
pages: {}
22+
pictures: []
23+
schema_name: DoclingDocument
24+
tables: []
25+
texts:
26+
- children: []
27+
content_layer: body
28+
enumerated: true
29+
label: list_item
30+
marker: '1.'
31+
orig: foo
32+
parent:
33+
$ref: '#/body'
34+
prov: []
35+
self_ref: '#/texts/0'
36+
text: foo
37+
- children: []
38+
content_layer: body
39+
label: text
40+
orig: bar
41+
parent:
42+
$ref: '#/body'
43+
prov: []
44+
self_ref: '#/texts/1'
45+
text: bar
46+
- children: []
47+
content_layer: body
48+
enumerated: false
49+
label: list_item
50+
marker: '-'
51+
orig: here
52+
parent:
53+
$ref: '#/body'
54+
prov: []
55+
self_ref: '#/texts/2'
56+
text: here
57+
- children: []
58+
content_layer: body
59+
enumerated: false
60+
label: list_item
61+
marker: '-'
62+
orig: there
63+
parent:
64+
$ref: '#/body'
65+
prov: []
66+
self_ref: '#/texts/3'
67+
text: there
68+
version: 1.3.0
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<doctag><ordered_list><list_item>foo</list_item>
2+
</ordered_list>
3+
<text>bar</text>
4+
<unordered_list><list_item>here</list_item>
5+
<list_item>there</list_item>
6+
</unordered_list>
7+
</doctag>

test/test_docling_doc.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,9 @@ def _construct_doc() -> DoclingDocument:
757757
leading_list = doc.add_group(parent=None, label=GroupLabel.LIST)
758758
doc.add_list_item(parent=leading_list, text="item of leading list")
759759

760+
with pytest.raises(ValueError, match="list group"):
761+
doc.add_list_item(text="Misplaced list item")
762+
760763
title = doc.add_title(
761764
text="Title of the Document"
762765
) # can be done if such information is present, or omitted.
@@ -1616,3 +1619,18 @@ def _verify(filename: Path, document: DoclingDocument, generate: bool = False):
16161619

16171620
filename = Path("test/data/doc/constructed_doc.replaced_item.json")
16181621
_verify(filename=filename, document=doc, generate=GEN_TEST_DATA)
1622+
1623+
1624+
def test_misplaced_list_items():
    """Check that list items stored outside a list group are repaired on load.

    Loads a YAML document fixture, verifies its doctags export against the
    regression file, and then either regenerates the expected YAML output
    (when GEN_TEST_DATA is set) or compares the loaded document against it.
    """
    src_path = Path("test/data/doc/misplaced_list_items.yaml")
    expected_path = src_path.parent / f"{src_path.stem}.out.yaml"

    doc = DoclingDocument.load_from_yaml(src_path)

    # doctags export must match the stored regression output
    doctags = doc.export_to_doctags()
    _verify_regression_test(doctags, filename=str(src_path), ext="dt")

    if GEN_TEST_DATA:
        # regeneration mode: overwrite the expected document instead of comparing
        doc.save_as_yaml(expected_path)
        return

    expected_doc = DoclingDocument.load_from_yaml(expected_path)
    assert doc == expected_doc

0 commit comments

Comments
 (0)