Skip to content

Commit e063b97

Browse files
feat: add initial rule-based model to identify ListItem markers (#113)
Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Panos Vagenas <[email protected]> Co-authored-by: Panos Vagenas <[email protected]>
1 parent 3f84f82 commit e063b97

File tree

4 files changed

+372
-4
lines changed

4 files changed

+372
-4
lines changed

docling_ibm_models/list_item_normalizer/__init__.py

Whitespace-only changes.
Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
"""
2+
List Item Marker Processor for Docling Documents
3+
4+
This module provides a rule-based model to identify list item markers and
5+
merge marker-only TextItems with their content to create proper ListItems.
6+
"""
7+
8+
import logging
9+
import re
10+
from typing import Union
11+
12+
from docling_core.types.doc.document import (
13+
DocItemLabel,
14+
DoclingDocument,
15+
ListItem,
16+
ProvenanceItem,
17+
RefItem,
18+
TextItem,
19+
)
20+
from docling_core.types.doc.labels import DocItemLabel
21+
22+
_log = logging.getLogger(__name__)
23+
24+
25+
class ListItemMarkerProcessor:
    """
    A rule-based processor for identifying and processing list item markers.

    This class can:
    1. Identify various list item markers (bullets, numbers, letters)
    2. Detect marker-only TextItems followed by content TextItems
    3. Merge them into proper ListItems
    4. Group consecutive ListItems into appropriate list containers
       (currently a placeholder, see ``_group_consecutive_list_items``)
    """

    def __init__(self):
        """Initialize the processor with marker patterns."""
        # Bullet markers (unordered lists)
        self._bullet_patterns = [
            r"[\u2022\u2023\u25E6\u2043\u204C\u204D\u2219\u25AA\u25AB\u25CF\u25CB]",  # Various bullet symbols
            r"[-*+•·‣⁃]",  # Common ASCII and Unicode bullets
            r"[►▶▸‣➤➢]",  # Arrow-like bullets
            r"[✓✔✗✘]",  # Checkmark bullets
        ]

        # Numbered markers (ordered lists)
        self._numbered_patterns = [
            r"\d+\.",  # 1. 2. 3.
            r"\d+\)",  # 1) 2) 3)
            r"\(\d+\)",  # (1) (2) (3)
            r"\[\d+\]",  # [1] [2] [3]
            r"[ivxlcdm]+\.",  # i. ii. iii. (Roman numerals lowercase)
            r"[IVXLCDM]+\.",  # I. II. III. (Roman numerals uppercase)
            r"[a-z]\.",  # a. b. c.
            r"[A-Z]\.",  # A. B. C.
            r"[a-z]\)",  # a) b) c)
            r"[A-Z]\)",  # A) B) C)
        ]

        # Marker-only patterns: the whole (stripped) text must be the marker.
        self._compiled_bullet_patterns = [
            re.compile(f"^{pattern}$") for pattern in self._bullet_patterns
        ]
        self._compiled_numbered_patterns = [
            re.compile(f"^{pattern}$") for pattern in self._numbered_patterns
        ]

        # Marker + content patterns: group 1 is the marker, group 2 the content.
        # re.DOTALL lets the content group span newlines; without it a
        # multi-line item would be silently truncated at the first line break.
        self._compiled_bullet_item_patterns = [
            re.compile(f"^({pattern})" + r"\s(.+)", re.DOTALL)
            for pattern in self._bullet_patterns
        ]
        self._compiled_numbered_item_patterns = [
            re.compile(f"^({pattern})" + r"\s(.+)", re.DOTALL)
            for pattern in self._numbered_patterns
        ]

        self._compiled_item_patterns = (
            self._compiled_bullet_item_patterns + self._compiled_numbered_item_patterns
        )

        # Scratch state filled by _find_marker_content_pairs(); created here so
        # the attributes always exist on a constructed processor.
        self._matched_items: dict[int, tuple[RefItem, bool]] = (
            {}
        )  # index to (self_ref, is_pure_marker)
        self._other: dict[int, RefItem] = {}  # index to self_ref

    def _match_item(self, text: str):
        """Return the first "marker + content" regex match for text, or None."""
        for pattern in self._compiled_item_patterns:
            mtch = pattern.match(text)
            if mtch:
                return mtch
        return None

    @staticmethod
    def _create_list_item(
        marker_text: str,
        content_text: str,
        orig_text: str,
        prov: list[ProvenanceItem],
    ) -> ListItem:
        """Build a ListItem with a placeholder self_ref ("#")."""
        return ListItem(
            self_ref="#",
            marker=marker_text,
            text=content_text,
            orig=orig_text,
            prov=prov,
        )

    def _is_bullet_marker(self, text: str) -> bool:
        """Check if text (ignoring surrounding whitespace) is a bullet marker."""
        text = text.strip()
        return any(pattern.match(text) for pattern in self._compiled_bullet_patterns)

    def _is_numbered_marker(self, text: str) -> bool:
        """Check if text (ignoring surrounding whitespace) is a numbered marker."""
        text = text.strip()
        return any(pattern.match(text) for pattern in self._compiled_numbered_patterns)

    def _find_marker_content_pairs(self, doc: DoclingDocument) -> None:
        """
        Classify every TextItem of the document by its ``orig`` text.

        Nothing is returned; the result is stored on the instance:

        - ``self._matched_items``: iteration index -> (self_ref, is_pure_marker)
          for items that are a bare marker (True) or start with a marker
          followed by content (False).
        - ``self._other``: iteration index -> self_ref for all remaining
          TextItems.

        The index is the position in ``doc.iterate_items(with_groups=False)``,
        so ``index + 1`` is the item that directly follows in that iteration.
        """
        self._matched_items = {}
        self._other = {}

        for i, (item, _level) in enumerate(doc.iterate_items(with_groups=False)):
            if not isinstance(item, TextItem):
                continue

            if self._is_bullet_marker(item.orig) or self._is_numbered_marker(
                item.orig
            ):
                self._matched_items[i] = (item.get_ref(), True)
            elif self._match_item(item.orig) is not None:
                self._matched_items[i] = (item.get_ref(), False)
            else:
                self._other[i] = item.get_ref()

    def _group_consecutive_list_items(self, doc: DoclingDocument) -> DoclingDocument:
        """
        Placeholder: grouping of consecutive list items is not implemented yet.

        Returns the document unchanged.
        """
        return doc

    def process_list_item(self, item: ListItem) -> ListItem:
        """Split a leading bullet/number marker off a ListItem's original text.

        Args:
            item (ListItem): The list item to process; its ``orig`` text is
                matched against the compiled "marker + content" patterns.

        Returns:
            ListItem: The same instance. If a pattern matched, ``marker`` and
            ``text`` have been updated in place; otherwise it is unchanged.

        Note:
            If a pattern matches but the item is not actually a ListItem, a
            warning is logged and the item is left untouched.
        """
        mtch = self._match_item(item.orig)
        if mtch is None:
            return item

        if isinstance(item, ListItem):  # update item in place
            item.marker = mtch[1]
            item.text = mtch[2]
        else:
            _log.warning(
                f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
            )
        return item

    def process_text_item(self, item: TextItem) -> Union[TextItem, ListItem]:
        """Detect bullet/numbered list formatting in a TextItem.

        Args:
            item (TextItem): The text item to process; its ``orig`` text is
                matched against the compiled "marker + content" patterns.

        Returns:
            Union[TextItem, ListItem]:
                - If item is already a ListItem: the same instance with
                  ``marker`` and ``text`` updated in place
                - If item is a TextItem with list formatting (and not a section
                  header or footnote): a *new* ListItem carrying the extracted
                  marker and text, the original ``orig``/``prov`` and the same
                  reference string
                - Otherwise: the original item unchanged

        Note:
            Section headers and footnotes are never converted; a warning is
            logged when their text matches a list pattern.
        """
        mtch = self._match_item(item.orig)
        if mtch is None:
            return item

        if isinstance(item, ListItem):  # update item in place
            item.marker = mtch[1]
            item.text = mtch[2]
            return item

        if isinstance(item, TextItem) and (
            item.label not in [DocItemLabel.SECTION_HEADER, DocItemLabel.FOOTNOTE]
        ):
            # Create new ListItem
            return ListItem(
                self_ref=item.get_ref().cref,
                marker=mtch[1],
                text=mtch[2],
                orig=item.orig,
                prov=item.prov,
            )

        _log.warning(
            f"matching text for bullet_item_patterns that is not ListItem: {item.label}"
        )
        return item

    def update_list_items_in_place(
        self, doc: DoclingDocument, allow_textitem: bool = False
    ) -> DoclingDocument:
        """Extract markers for the items of the document.

        Args:
            doc: The document whose items are visited.
            allow_textitem: When True, plain TextItems are also passed through
                ``process_text_item``.

        Returns:
            The same document instance.
        """
        for item, _level in doc.iterate_items():
            if isinstance(item, ListItem):
                self.process_list_item(item)
            elif allow_textitem and isinstance(item, TextItem):
                # NOTE(review): for a plain TextItem, process_text_item returns
                # a *new* ListItem that is not written back into the document,
                # so this branch currently has no visible effect on `doc`.
                # Confirm whether the converted item should replace the
                # original.
                self.process_text_item(item)

        return doc

    def merge_markers_and_text_items_into_list_items(
        self, doc: DoclingDocument
    ) -> DoclingDocument:
        """Merge marker-only TextItems with the text that follows them.

        A bare marker item directly followed (in iteration order) by a
        TextItem labelled TEXT or LIST_ITEM is replaced by a single ListItem
        carrying the marker, the following item's text and the provenance of
        both. This is only necessary as long as the layout model does not
        recognize list items properly.

        Args:
            doc: The document to modify.

        Returns:
            The same document instance.
        """
        # Classify all text items; fills self._matched_items / self._other.
        self._find_marker_content_pairs(doc)

        # Resolve every (marker, content) pair *before* touching the document,
        # so stored references are not resolved against an already modified
        # tree.
        pairs = []
        for ind, (self_ref, is_marker) in self._matched_items.items():
            if not is_marker or ind + 1 not in self._other:
                continue

            marker_item = self_ref.resolve(doc=doc)
            next_item = self._other[ind + 1].resolve(doc=doc)

            if isinstance(next_item, TextItem) and (
                next_item.label in [DocItemLabel.TEXT, DocItemLabel.LIST_ITEM]
            ):
                pairs.append((marker_item, next_item))

        for marker_item, next_item in pairs:
            marker_text: str = marker_item.text
            content_text: str = next_item.text

            list_item = self._create_list_item(
                marker_text=marker_text,
                content_text=content_text,
                orig_text=f"{marker_text} {content_text}",
                # Fresh list: do not mutate the marker item's own provenance.
                prov=[*marker_item.prov, *next_item.prov],
            )

            # Insert the new ListItem where the marker used to be ...
            doc.insert_item_before_sibling(new_item=list_item, sibling=marker_item)

            # ... and delete the two original items.
            doc.delete_items(node_items=[marker_item, next_item])

        return doc

    def process_document(
        self,
        doc: DoclingDocument,
        allow_textitem: bool = False,
        merge_items: bool = False,
    ) -> DoclingDocument:
        """
        Process the entire document to identify and convert list markers.

        Args:
            doc: The DoclingDocument to process
            allow_textitem: Also run marker detection on plain TextItems
                (see ``update_list_items_in_place``)
            merge_items: Merge marker-only TextItems with the following text
                item into ListItems

        Returns:
            The processed document (modified in-place)
        """
        doc = self.update_list_items_in_place(doc, allow_textitem=allow_textitem)

        if merge_items:
            doc = self.merge_markers_and_text_items_into_list_items(doc)

        # Group consecutive list items
        doc = self._group_consecutive_list_items(doc)

        return doc
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from docling_core.types.doc.document import DoclingDocument, ProvenanceItem
2+
from docling_core.types.doc.base import BoundingBox, CoordOrigin
3+
4+
from docling_core.types.doc.labels import DocItemLabel
5+
6+
from docling_ibm_models.list_item_normalizer.list_marker_processor import ListItemMarkerProcessor
7+
8+
# Example usage and testing
9+
def test_listitem_marker_model():
    """Check marker extraction on ListItems and merging of marker-only items."""

    # Create a sample document
    doc = DoclingDocument(name="Sample Document")

    # Plain text item with marker and content together; must stay untouched
    # because allow_textitem is left at its default (False).
    doc.add_text(
        label=DocItemLabel.TEXT,
        text="• Second item with bullet and content",  # Marker and content together
        prov=ProvenanceItem(
            page_no=0,
            bbox=BoundingBox(l=0, t=15, r=200, b=25, coord_origin=CoordOrigin.TOPLEFT),
            charspan=(0, 37),
        ),
    )

    # List item with marker and content together; the marker must be split off.
    doc.add_list_item(
        text="• Third item with bullet and content",  # Marker and content together
        prov=ProvenanceItem(
            page_no=0,
            bbox=BoundingBox(l=0, t=15, r=200, b=25, coord_origin=CoordOrigin.TOPLEFT),
            charspan=(0, 36),
        ),
    )

    # Marker-only text item followed by its content: should be merged into
    # one list item.
    doc.add_text(
        label=DocItemLabel.TEXT,
        text="1.",  # Marker only
        prov=ProvenanceItem(
            page_no=0,
            bbox=BoundingBox(l=0, t=0, r=10, b=10, coord_origin=CoordOrigin.TOPLEFT),
            charspan=(0, 2),
        ),
    )

    doc.add_text(
        label=DocItemLabel.TEXT,
        text="First item content",  # Content only
        prov=ProvenanceItem(
            page_no=0,
            bbox=BoundingBox(l=15, t=0, r=100, b=10, coord_origin=CoordOrigin.TOPLEFT),
            charspan=(0, 18),
        ),
    )

    # Process the document. merge_items must be enabled explicitly: the
    # assertions below expect the "1." marker and its content to be merged,
    # which process_document only does when merge_items=True.
    processor = ListItemMarkerProcessor()
    processed_doc = processor.process_document(doc, merge_items=True)

    assert len(processed_doc.texts) == 3, "len(processed_doc.texts)==3"

    assert processed_doc.texts[0].text == "• Second item with bullet and content"

    assert processed_doc.texts[1].text == "Third item with bullet and content"
    assert processed_doc.texts[1].marker == "•"

    assert processed_doc.texts[2].label == DocItemLabel.LIST_ITEM
    assert processed_doc.texts[2].text == "First item content"
    assert processed_doc.texts[2].marker == "1."

tests/test_reading_order.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
#
2-
# Copyright IBM Corp. 2024 - 2024
3-
# SPDX-License-Identifier: MIT
4-
#
51
import os
62
import json
73
import glob

0 commit comments

Comments
 (0)