77
88import logging
99import re
10- from typing import Union
10+ from typing import Optional , Union
1111
1212from docling_core .types .doc .document import (
13- DocItemLabel ,
1413 DoclingDocument ,
1514 ListItem ,
15+ NodeItem ,
1616 ProvenanceItem ,
1717 RefItem ,
1818 TextItem ,
@@ -33,8 +33,9 @@ class ListItemMarkerProcessor:
3333 4. Group consecutive ListItems into appropriate list containers
3434 """
3535
36- def __init__ (self ):
36+ def __init__ (self , infer_enumerated : bool = True ):
3737 """Initialize the processor with marker patterns."""
38+ self ._infer_enumerated = infer_enumerated
3839 # Bullet markers (unordered lists)
3940 self ._bullet_patterns = [
4041 r"[\u2022\u2023\u25E6\u2043\u204C\u204D\u2219\u25AA\u25AB\u25CF\u25CB]" , # Various bullet symbols
@@ -73,10 +74,6 @@ def __init__(self):
7374 for pattern in self ._numbered_patterns
7475 ]
7576
76- self ._compiled_item_patterns = (
77- self ._compiled_bullet_item_patterns + self ._compiled_numbered_item_patterns
78- )
79-
8077 def _is_bullet_marker (self , text : str ) -> bool :
8178 """Check if text is a bullet marker."""
8279 text = text .strip ()
@@ -87,6 +84,46 @@ def _is_numbered_marker(self, text: str) -> bool:
8784 text = text .strip ()
8885 return any (pattern .match (text ) for pattern in self ._compiled_numbered_patterns )
8986
87+ def _is_bullet_item (self , text : str ) -> bool :
88+ return any (
89+ pattern .match (text ) for pattern in self ._compiled_bullet_item_patterns
90+ )
91+
92+ def _is_numbered_item (self , text : str ) -> bool :
93+ return any (
94+ pattern .match (text ) for pattern in self ._compiled_numbered_item_patterns
95+ )
96+
@classmethod
def _create_list_item(
    cls,
    self_ref,
    marker: str,
    text: str,
    orig: str,
    prov: list[ProvenanceItem],
    enumerated: Optional[bool] = None,
) -> ListItem:
    """Build a ``ListItem`` and optionally set its ``enumerated`` flag.

    Args:
        self_ref: Reference passed through as the new item's ``self_ref``.
        marker: Marker string of the list item.
        text: Content of the list item.
        orig: Original text stored on the item.
        prov: Provenance entries attached to the item.
        enumerated: When not None, assigned to ``item.enumerated`` after
            construction. When None, the attribute is left as the
            constructor set it.

    Returns:
        The newly created ``ListItem``.
    """
    new_item = ListItem(
        self_ref=self_ref,
        marker=marker,
        text=text,
        orig=orig,
        prov=prov,
    )
    # Leave the constructor's value alone unless the caller made a choice.
    if enumerated is None:
        return new_item
    new_item.enumerated = enumerated
    return new_item
117+
118+ def _find_match (
119+ self , text : str , patterns : list [re .Pattern ]
120+ ) -> Optional [re .Match [str ]]:
121+ for pattern in patterns :
122+ mtch = pattern .match (text )
123+ if mtch :
124+ return mtch
125+ return None
126+
90127 def _find_marker_content_pairs (self , doc : DoclingDocument ):
91128 """
92129 Find pairs of marker-only TextItems and their content TextItems.
@@ -95,26 +132,24 @@ def _find_marker_content_pairs(self, doc: DoclingDocument):
95132 List of (marker_item, content_item) tuples. content_item can be None
96133 if the marker item already contains content.
97134 """
98- self ._matched_items : dict [int , tuple [RefItem , bool ]] = (
135+ self ._matched_items : dict [int , tuple [RefItem , bool , bool ]] = (
99136 {}
100- ) # index to (self_ref, is_pure_marker)
137+ ) # index to (self_ref, is_pure_marker, is_enumerated )
101138 self ._other : dict [int , RefItem ] = {} # index to self_ref
102139
103- for i , (item , level ) in enumerate (doc .iterate_items (with_groups = False )):
140+ for i , (item , _ ) in enumerate (doc .iterate_items (with_groups = False )):
104141 if not isinstance (item , TextItem ):
105142 continue
106143
107144 if self ._is_bullet_marker (item .orig ):
108- self ._matched_items [i ] = (item .get_ref (), True )
145+ self ._matched_items [i ] = (item .get_ref (), True , False )
109146 elif self ._is_numbered_marker (item .orig ):
110- self ._matched_items [i ] = (item .get_ref (), True )
147+ self ._matched_items [i ] = (item .get_ref (), True , True )
148+ elif self ._is_bullet_item (item .orig ):
149+ self ._matched_items [i ] = (item .get_ref (), False , False )
150+ elif self ._is_numbered_item (item .orig ):
151+ self ._matched_items [i ] = (item .get_ref (), False , True )
111152 else :
112- for pattern in self ._compiled_item_patterns :
113- mtch = pattern .match (item .orig )
114- if mtch :
115- self ._matched_items [i ] = (item .get_ref (), False )
116-
117- if i not in self ._matched_items :
118153 self ._other [i ] = item .get_ref ()
119154
120155 def _group_consecutive_list_items (self , doc : DoclingDocument ) -> DoclingDocument :
@@ -142,16 +177,27 @@ def process_list_item(self, item: ListItem) -> ListItem:
142177 The method modifies the input item in place when a pattern matches.
143178 If the item is not actually a ListItem type, a warning is logged.
144179 """
145- for pattern in self ._compiled_item_patterns :
146- mtch = pattern .match (item .orig )
147- if mtch :
148- if isinstance (item , ListItem ): # update item in place
149- item .marker = mtch [1 ]
150- item .text = mtch [2 ]
151- else :
152- _log .warning (
153- f"matching text for bullet_item_patterns that is not ListItem: { item .label } "
154- )
180+
181+ is_enumerated : bool
182+ if mtch := self ._find_match (
183+ text = item .orig , patterns = self ._compiled_bullet_item_patterns
184+ ):
185+ is_enumerated = False
186+ elif mtch := self ._find_match (
187+ text = item .orig , patterns = self ._compiled_numbered_item_patterns
188+ ):
189+ is_enumerated = True
190+
191+ if mtch :
192+ if isinstance (item , ListItem ): # update item in place
193+ item .marker = mtch [1 ]
194+ item .text = mtch [2 ]
195+ if self ._infer_enumerated :
196+ item .enumerated = is_enumerated
197+ else :
198+ _log .warning (
199+ f"matching text for bullet_item_patterns that is not ListItem: { item .label } "
200+ )
155201 return item
156202
157203 def process_text_item (self , item : TextItem ) -> Union [TextItem , ListItem ]:
@@ -177,30 +223,43 @@ def process_text_item(self, item: TextItem) -> Union[TextItem, ListItem]:
177223 their semantic meaning. A warning is logged if pattern matching occurs
178224 on unexpected item types.
179225 """
180- for pattern in self ._compiled_item_patterns :
181- mtch = pattern .match (item .orig )
182- if mtch :
183- if isinstance (item , ListItem ): # update item in place
184- item .marker = mtch [1 ]
185- item .text = mtch [2 ]
186-
187- return item
188- elif isinstance (item , TextItem ) and (
189- item .label
190- not in [DocItemLabel .SECTION_HEADER , DocItemLabel .FOOTNOTE ]
191- ):
192- # Create new ListItem
193- return ListItem (
194- self_ref = item .get_ref ().cref ,
195- marker = mtch [1 ],
196- text = mtch [2 ],
197- orig = item .orig ,
198- prov = item .prov ,
199- )
200- else :
201- _log .warning (
202- f"matching text for bullet_item_patterns that is not ListItem: { item .label } "
203- )
226+ is_enumerated : bool
227+ if mtch := self ._find_match (
228+ text = item .orig , patterns = self ._compiled_bullet_item_patterns
229+ ):
230+ is_enumerated = False
231+ elif mtch := self ._find_match (
232+ text = item .orig , patterns = self ._compiled_numbered_item_patterns
233+ ):
234+ is_enumerated = True
235+
236+ if mtch :
237+ marker = mtch [1 ]
238+ text = mtch [2 ]
239+
240+ if isinstance (item , ListItem ): # update item in place
241+ item .marker = marker
242+ item .text = text
243+ if self ._infer_enumerated :
244+ item .enumerated = is_enumerated
245+
246+ return item
247+ elif isinstance (item , TextItem ) and (
248+ item .label not in [DocItemLabel .SECTION_HEADER , DocItemLabel .FOOTNOTE ]
249+ ):
250+ # Create new ListItem
251+ return self ._create_list_item (
252+ self_ref = item .get_ref ().cref ,
253+ marker = marker ,
254+ text = text ,
255+ orig = item .orig ,
256+ prov = item .prov ,
257+ enumerated = is_enumerated if self ._infer_enumerated else None ,
258+ )
259+ else :
260+ _log .warning (
261+ f"matching text for bullet_item_patterns that is not ListItem: { item .label } "
262+ )
204263 return item
205264
206265 def update_list_items_in_place (
@@ -217,30 +276,19 @@ def update_list_items_in_place(
217276 def merge_markers_and_text_items_into_list_items (
218277 self , doc : DoclingDocument
219278 ) -> DoclingDocument :
220- def create_listitem (
221- marker_text : str ,
222- content_text : str ,
223- orig_text : str ,
224- prov : list [ProvenanceItem ],
225- ) -> ListItem :
226- # Create new ListItem
227- return ListItem (
228- self_ref = "#" ,
229- marker = marker_text ,
230- text = content_text ,
231- orig = orig_text ,
232- prov = prov ,
233- )
234279
235280 # Find all marker-content pairs: this function will identify text-items
236281 # with a marker fused into the text
237282 self ._find_marker_content_pairs (doc )
238283
284+ # Accumulate items for post-loop deletion to avoid reference validity issues
285+ to_delete : list [NodeItem ] = []
286+
239287 # If you find a sole marker-item followed by a text, there are
240288 # good chances we need to merge them into a list-item. This
241289 # function is only necessary as long as the layout-model does not
242290 # recognize list-items properly
243- for ind , (self_ref , is_marker ) in self ._matched_items .items ():
291+ for ind , (self_ref , is_marker , is_enumerated ) in self ._matched_items .items ():
244292
245293 if is_marker :
246294
@@ -258,21 +306,32 @@ def create_listitem(
258306 prov = marker_item .prov
259307 prov .extend (next_item .prov )
260308
261- list_item = create_listitem (
262- marker_text = marker_text ,
263- content_text = content_text ,
264- orig_text = f"{ marker_text } { content_text } " ,
309+ list_item = self ._create_list_item (
310+ self_ref = "#" ,
311+ marker = marker_text ,
312+ text = content_text ,
313+ orig = f"{ marker_text } { content_text } " ,
265314 prov = prov ,
315+ enumerated = (
316+ is_enumerated
317+ if self ._infer_enumerated
318+ else (
319+ marker_item .enumerated
320+ if isinstance (marker_item , ListItem )
321+ else None
322+ )
323+ ),
266324 )
267325
268326 # Insert the new ListItem
269327 doc .insert_item_before_sibling (
270328 new_item = list_item , sibling = marker_item
271329 )
272330
273- # Delete original items
274- items_to_delete = [marker_item , next_item ]
275- doc .delete_items (node_items = items_to_delete )
331+ # Accumulate items to delete
332+ to_delete .extend ([marker_item , next_item ])
333+
334+ doc .delete_items (node_items = to_delete )
276335
277336 return doc
278337
0 commit comments