Skip to content

Commit a7fa2b8

Browse files
authored
feat: add enumerated field inference to ListItemMarkerProcessor (#119)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 295cd33 commit a7fa2b8

File tree

3 files changed

+141
-78
lines changed

3 files changed

+141
-78
lines changed

docling_ibm_models/list_item_normalizer/list_marker_processor.py

Lines changed: 133 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@
77

88
import logging
99
import re
10-
from typing import Union
10+
from typing import Optional, Union
1111

1212
from docling_core.types.doc.document import (
13-
DocItemLabel,
1413
DoclingDocument,
1514
ListItem,
15+
NodeItem,
1616
ProvenanceItem,
1717
RefItem,
1818
TextItem,
@@ -33,8 +33,9 @@ class ListItemMarkerProcessor:
3333
4. Group consecutive ListItems into appropriate list containers
3434
"""
3535

36-
def __init__(self):
36+
def __init__(self, infer_enumerated: bool = True):
3737
"""Initialize the processor with marker patterns."""
38+
self._infer_enumerated = infer_enumerated
3839
# Bullet markers (unordered lists)
3940
self._bullet_patterns = [
4041
r"[\u2022\u2023\u25E6\u2043\u204C\u204D\u2219\u25AA\u25AB\u25CF\u25CB]", # Various bullet symbols
@@ -73,10 +74,6 @@ def __init__(self):
7374
for pattern in self._numbered_patterns
7475
]
7576

76-
self._compiled_item_patterns = (
77-
self._compiled_bullet_item_patterns + self._compiled_numbered_item_patterns
78-
)
79-
8077
def _is_bullet_marker(self, text: str) -> bool:
8178
"""Check if text is a bullet marker."""
8279
text = text.strip()
@@ -87,6 +84,46 @@ def _is_numbered_marker(self, text: str) -> bool:
8784
text = text.strip()
8885
return any(pattern.match(text) for pattern in self._compiled_numbered_patterns)
8986

87+
def _is_bullet_item(self, text: str) -> bool:
    """Report whether *text* matches one of the bullet item patterns.

    The text is tested as given (it is not stripped first) against each
    entry of ``self._compiled_bullet_item_patterns`` using ``match``.

    Args:
        text: Candidate text to test.

    Returns:
        True if at least one pattern matches, otherwise False.
    """
    for bullet_item_pattern in self._compiled_bullet_item_patterns:
        if bullet_item_pattern.match(text):
            return True
    return False
91+
92+
def _is_numbered_item(self, text: str) -> bool:
    """Report whether *text* matches one of the numbered item patterns.

    The text is tested as given (it is not stripped first) against each
    entry of ``self._compiled_numbered_item_patterns`` using ``match``.

    Args:
        text: Candidate text to test.

    Returns:
        True if at least one pattern matches, otherwise False.
    """
    for numbered_item_pattern in self._compiled_numbered_item_patterns:
        if numbered_item_pattern.match(text):
            return True
    return False
96+
97+
@classmethod
def _create_list_item(
    cls,
    self_ref,
    marker: str,
    text: str,
    orig: str,
    prov: list[ProvenanceItem],
    enumerated: Optional[bool] = None,
) -> ListItem:
    """Build a ListItem and optionally set its ``enumerated`` attribute.

    Args:
        self_ref: Reference value passed through to ``ListItem``.
        marker: Marker string for the list item.
        text: Content text of the list item.
        orig: Original text of the list item.
        prov: Provenance entries passed through to ``ListItem``.
        enumerated: When not None, assigned to ``enumerated`` on the new
            item after construction; when None, the attribute is left as
            ``ListItem`` initialised it.

    Returns:
        The newly constructed ListItem.
    """
    new_item = ListItem(
        self_ref=self_ref, marker=marker, text=text, orig=orig, prov=prov
    )
    if enumerated is None:
        # Nothing to override: keep whatever ListItem set by itself.
        return new_item
    new_item.enumerated = enumerated
    return new_item
117+
118+
def _find_match(
    self, text: str, patterns: list[re.Pattern]
) -> Optional[re.Match[str]]:
    """Return the first successful ``match`` of *text* against *patterns*.

    Patterns are tried in list order and evaluation stops at the first hit.

    Args:
        text: Text to test.
        patterns: Compiled patterns to try, in priority order.

    Returns:
        The match object from the first pattern that matches, or None if
        no pattern matches (including when *patterns* is empty).
    """
    # Lazy generator: later patterns are not evaluated once one has matched.
    attempts = (candidate.match(text) for candidate in patterns)
    return next((hit for hit in attempts if hit), None)
126+
90127
def _find_marker_content_pairs(self, doc: DoclingDocument):
91128
"""
92129
Find pairs of marker-only TextItems and their content TextItems.
@@ -95,26 +132,24 @@ def _find_marker_content_pairs(self, doc: DoclingDocument):
95132
List of (marker_item, content_item) tuples. content_item can be None
96133
if the marker item already contains content.
97134
"""
98-
self._matched_items: dict[int, tuple[RefItem, bool]] = (
135+
self._matched_items: dict[int, tuple[RefItem, bool, bool]] = (
99136
{}
100-
) # index to (self_ref, is_pure_marker)
137+
) # index to (self_ref, is_pure_marker, is_enumerated)
101138
self._other: dict[int, RefItem] = {} # index to self_ref
102139

103-
for i, (item, level) in enumerate(doc.iterate_items(with_groups=False)):
140+
for i, (item, _) in enumerate(doc.iterate_items(with_groups=False)):
104141
if not isinstance(item, TextItem):
105142
continue
106143

107144
if self._is_bullet_marker(item.orig):
108-
self._matched_items[i] = (item.get_ref(), True)
145+
self._matched_items[i] = (item.get_ref(), True, False)
109146
elif self._is_numbered_marker(item.orig):
110-
self._matched_items[i] = (item.get_ref(), True)
147+
self._matched_items[i] = (item.get_ref(), True, True)
148+
elif self._is_bullet_item(item.orig):
149+
self._matched_items[i] = (item.get_ref(), False, False)
150+
elif self._is_numbered_item(item.orig):
151+
self._matched_items[i] = (item.get_ref(), False, True)
111152
else:
112-
for pattern in self._compiled_item_patterns:
113-
mtch = pattern.match(item.orig)
114-
if mtch:
115-
self._matched_items[i] = (item.get_ref(), False)
116-
117-
if i not in self._matched_items:
118153
self._other[i] = item.get_ref()
119154

120155
def _group_consecutive_list_items(self, doc: DoclingDocument) -> DoclingDocument:
@@ -142,16 +177,27 @@ def process_list_item(self, item: ListItem) -> ListItem:
142177
The method modifies the input item in place when a pattern matches.
143178
If the item is not actually a ListItem type, a warning is logged.
144179
"""
145-
for pattern in self._compiled_item_patterns:
146-
mtch = pattern.match(item.orig)
147-
if mtch:
148-
if isinstance(item, ListItem): # update item in place
149-
item.marker = mtch[1]
150-
item.text = mtch[2]
151-
else:
152-
_log.warning(
153-
f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
154-
)
180+
181+
is_enumerated: bool
182+
if mtch := self._find_match(
183+
text=item.orig, patterns=self._compiled_bullet_item_patterns
184+
):
185+
is_enumerated = False
186+
elif mtch := self._find_match(
187+
text=item.orig, patterns=self._compiled_numbered_item_patterns
188+
):
189+
is_enumerated = True
190+
191+
if mtch:
192+
if isinstance(item, ListItem): # update item in place
193+
item.marker = mtch[1]
194+
item.text = mtch[2]
195+
if self._infer_enumerated:
196+
item.enumerated = is_enumerated
197+
else:
198+
_log.warning(
199+
f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
200+
)
155201
return item
156202

157203
def process_text_item(self, item: TextItem) -> Union[TextItem, ListItem]:
@@ -177,30 +223,43 @@ def process_text_item(self, item: TextItem) -> Union[TextItem, ListItem]:
177223
their semantic meaning. A warning is logged if pattern matching occurs
178224
on unexpected item types.
179225
"""
180-
for pattern in self._compiled_item_patterns:
181-
mtch = pattern.match(item.orig)
182-
if mtch:
183-
if isinstance(item, ListItem): # update item in place
184-
item.marker = mtch[1]
185-
item.text = mtch[2]
186-
187-
return item
188-
elif isinstance(item, TextItem) and (
189-
item.label
190-
not in [DocItemLabel.SECTION_HEADER, DocItemLabel.FOOTNOTE]
191-
):
192-
# Create new ListItem
193-
return ListItem(
194-
self_ref=item.get_ref().cref,
195-
marker=mtch[1],
196-
text=mtch[2],
197-
orig=item.orig,
198-
prov=item.prov,
199-
)
200-
else:
201-
_log.warning(
202-
f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
203-
)
226+
is_enumerated: bool
227+
if mtch := self._find_match(
228+
text=item.orig, patterns=self._compiled_bullet_item_patterns
229+
):
230+
is_enumerated = False
231+
elif mtch := self._find_match(
232+
text=item.orig, patterns=self._compiled_numbered_item_patterns
233+
):
234+
is_enumerated = True
235+
236+
if mtch:
237+
marker = mtch[1]
238+
text = mtch[2]
239+
240+
if isinstance(item, ListItem): # update item in place
241+
item.marker = marker
242+
item.text = text
243+
if self._infer_enumerated:
244+
item.enumerated = is_enumerated
245+
246+
return item
247+
elif isinstance(item, TextItem) and (
248+
item.label not in [DocItemLabel.SECTION_HEADER, DocItemLabel.FOOTNOTE]
249+
):
250+
# Create new ListItem
251+
return self._create_list_item(
252+
self_ref=item.get_ref().cref,
253+
marker=marker,
254+
text=text,
255+
orig=item.orig,
256+
prov=item.prov,
257+
enumerated=is_enumerated if self._infer_enumerated else None,
258+
)
259+
else:
260+
_log.warning(
261+
f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
262+
)
204263
return item
205264

206265
def update_list_items_in_place(
@@ -217,30 +276,19 @@ def update_list_items_in_place(
217276
def merge_markers_and_text_items_into_list_items(
218277
self, doc: DoclingDocument
219278
) -> DoclingDocument:
220-
def create_listitem(
221-
marker_text: str,
222-
content_text: str,
223-
orig_text: str,
224-
prov: list[ProvenanceItem],
225-
) -> ListItem:
226-
# Create new ListItem
227-
return ListItem(
228-
self_ref="#",
229-
marker=marker_text,
230-
text=content_text,
231-
orig=orig_text,
232-
prov=prov,
233-
)
234279

235280
# Find all marker-content pairs: this function will identify text-items
236281
# with a marker fused into the text
237282
self._find_marker_content_pairs(doc)
238283

284+
# Accumulate items for post-loop deletion to avoid reference validity issues
285+
to_delete: list[NodeItem] = []
286+
239287
# If you find a sole marker-item followed by a text, there are
240288
# good chances we need to merge them into a list-item. This
241289
# function is only necessary as long as the layout-model does not
242290
# recognize list-items properly
243-
for ind, (self_ref, is_marker) in self._matched_items.items():
291+
for ind, (self_ref, is_marker, is_enumerated) in self._matched_items.items():
244292

245293
if is_marker:
246294

@@ -258,21 +306,32 @@ def create_listitem(
258306
prov = marker_item.prov
259307
prov.extend(next_item.prov)
260308

261-
list_item = create_listitem(
262-
marker_text=marker_text,
263-
content_text=content_text,
264-
orig_text=f"{marker_text} {content_text}",
309+
list_item = self._create_list_item(
310+
self_ref="#",
311+
marker=marker_text,
312+
text=content_text,
313+
orig=f"{marker_text} {content_text}",
265314
prov=prov,
315+
enumerated=(
316+
is_enumerated
317+
if self._infer_enumerated
318+
else (
319+
marker_item.enumerated
320+
if isinstance(marker_item, ListItem)
321+
else None
322+
)
323+
),
266324
)
267325

268326
# Insert the new ListItem
269327
doc.insert_item_before_sibling(
270328
new_item=list_item, sibling=marker_item
271329
)
272330

273-
# Delete original items
274-
items_to_delete = [marker_item, next_item]
275-
doc.delete_items(node_items=items_to_delete)
331+
# Accumulate items to delete
332+
to_delete.extend([marker_item, next_item])
333+
334+
doc.delete_items(node_items=to_delete)
276335

277336
return doc
278337

tests/test_listitem_marker_model.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from docling_core.types.doc.document import DoclingDocument, ProvenanceItem
1+
from docling_core.types.doc.document import DoclingDocument, ListItem, ProvenanceItem
22
from docling_core.types.doc.base import BoundingBox, CoordOrigin
33

44
from docling_core.types.doc.labels import DocItemLabel
@@ -62,9 +62,13 @@ def test_listitem_marker_model():
6262

6363
assert processed_doc.texts[0].text=="• Second item with bullet and content"
6464

65+
assert isinstance(processed_doc.texts[1], ListItem)
6566
assert processed_doc.texts[1].text=="Third item with bullet and content"
6667
assert processed_doc.texts[1].marker=="•"
68+
assert not processed_doc.texts[1].enumerated
6769

70+
assert isinstance(processed_doc.texts[2], ListItem)
6871
assert processed_doc.texts[2].label==DocItemLabel.LIST_ITEM
6972
assert processed_doc.texts[2].text=="First item content"
7073
assert processed_doc.texts[2].marker=="1."
74+
assert processed_doc.texts[2].enumerated

uv.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)