55from enum import Enum
66from io import BytesIO
77from pathlib import Path
8- from typing import List , Literal , Optional , Set , Union
8+ from typing import Literal , Optional , Union , cast
99
1010import marko
1111import marko .element
1414 DocItemLabel ,
1515 DoclingDocument ,
1616 DocumentOrigin ,
17+ ListItem ,
1718 NodeItem ,
1819 TableCell ,
1920 TableData ,
@@ -89,7 +90,7 @@ def replace_match(match):
8990 def __init__ (self , in_doc : "InputDocument" , path_or_stream : Union [BytesIO , Path ]):
9091 super ().__init__ (in_doc , path_or_stream )
9192
92- _log .debug ("MD INIT!!! " )
93+ _log .debug ("Starting MarkdownDocumentBackend... " )
9394
9495 # Markdown file:
9596 self .path_or_stream = path_or_stream
@@ -131,7 +132,7 @@ def _close_table(self, doc: DoclingDocument):
131132 for md_table_row in self .md_table_buffer :
132133 _log .debug (md_table_row )
133134 _log .debug ("=== TABLE END ===" )
134- tcells : List [TableCell ] = []
135+ tcells : list [TableCell ] = []
135136 result_table = []
136137 for n , md_table_row in enumerate (self .md_table_buffer ):
137138 data = []
@@ -232,11 +233,12 @@ def _iterate_elements( # noqa: C901
232233 element : marko .element .Element ,
233234 depth : int ,
234235 doc : DoclingDocument ,
235- visited : Set [marko .element .Element ],
236+ visited : set [marko .element .Element ],
236237 creation_stack : list [
237238 _CreationPayload
238239 ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
239240 list_ordered_flag_by_ref : dict [str , bool ],
241+ list_last_item_by_ref : dict [str , ListItem ],
240242 parent_item : Optional [NodeItem ] = None ,
241243 formatting : Optional [Formatting ] = None ,
242244 hyperlink : Optional [Union [AnyUrl , Path ]] = None ,
@@ -279,7 +281,7 @@ def _iterate_elements( # noqa: C901
279281
280282 elif (
281283 isinstance (element , marko .block .ListItem )
282- and len (element .children ) == 1
284+ and len (element .children ) > 0
283285 and isinstance ((child := element .children [0 ]), marko .block .Paragraph )
284286 and len (child .children ) > 0
285287 ):
@@ -291,7 +293,15 @@ def _iterate_elements( # noqa: C901
291293 if parent_item
292294 else False
293295 )
294- if len (child .children ) > 1 : # inline group will be created further down
296+ non_list_children : list [marko .element .Element ] = [
297+ item
298+ for item in child .children
299+ if not isinstance (item , marko .block .ListItem )
300+ ]
301+ if len (non_list_children ) > 1 : # inline group will be created further down
302+ parent_ref : Optional [str ] = (
303+ parent_item .self_ref if parent_item else None
304+ )
295305 parent_item = self ._create_list_item (
296306 doc = doc ,
297307 parent_item = parent_item ,
@@ -300,6 +310,8 @@ def _iterate_elements( # noqa: C901
300310 formatting = formatting ,
301311 hyperlink = hyperlink ,
302312 )
313+ if parent_ref :
314+ list_last_item_by_ref [parent_ref ] = cast (ListItem , parent_item )
303315 else :
304316 creation_stack .append (_ListItemCreationPayload (enumerated = enumerated ))
305317
@@ -334,9 +346,11 @@ def _iterate_elements( # noqa: C901
334346 element .dest
335347 )
336348
337- elif isinstance (element , marko .inline .RawText ):
338- _log .debug (f" - Paragraph (raw text): { element .children } " )
339- snippet_text = element .children .strip ()
349+ elif isinstance (element , (marko .inline .RawText , marko .inline .Literal )):
350+ _log .debug (f" - RawText/Literal: { element .children } " )
351+ snippet_text = (
352+ element .children .strip () if isinstance (element .children , str ) else ""
353+ )
340354 # Detect start of the table:
341355 if "|" in snippet_text or self .in_table :
342356 # most likely part of the markdown table
@@ -359,6 +373,7 @@ def _iterate_elements( # noqa: C901
359373 if parent_item
360374 else False
361375 )
376+ parent_ref = parent_item .self_ref if parent_item else None
362377 parent_item = self ._create_list_item (
363378 doc = doc ,
364379 parent_item = parent_item ,
@@ -367,6 +382,11 @@ def _iterate_elements( # noqa: C901
367382 formatting = formatting ,
368383 hyperlink = hyperlink ,
369384 )
385+ if parent_ref :
386+ list_last_item_by_ref [parent_ref ] = cast (
387+ ListItem , parent_item
388+ )
389+
370390 elif isinstance (to_create , _HeadingCreationPayload ):
371391 # not keeping as parent_item as logic for correctly tracking
372392 # that not implemented yet (section components not captured
@@ -458,13 +478,25 @@ def _iterate_elements( # noqa: C901
458478 element , processed_block_types
459479 ):
460480 for child in element .children :
481+ if (
482+ isinstance (element , marko .block .ListItem )
483+ and isinstance (child , marko .block .List )
484+ and parent_item
485+ and list_last_item_by_ref .get (parent_item .self_ref , None )
486+ ):
487+ _log .debug (
488+ f"walking into new List hanging from item of parent list { parent_item .self_ref } "
489+ )
490+ parent_item = list_last_item_by_ref [parent_item .self_ref ]
491+
461492 self ._iterate_elements (
462493 element = child ,
463494 depth = depth + 1 ,
464495 doc = doc ,
465496 visited = visited ,
466497 creation_stack = creation_stack ,
467498 list_ordered_flag_by_ref = list_ordered_flag_by_ref ,
499+ list_last_item_by_ref = list_last_item_by_ref ,
468500 parent_item = parent_item ,
469501 formatting = formatting ,
470502 hyperlink = hyperlink ,
@@ -483,7 +515,7 @@ def supports_pagination(cls) -> bool:
483515 return False
484516
485517 @classmethod
486- def supported_formats (cls ) -> Set [InputFormat ]:
518+ def supported_formats (cls ) -> set [InputFormat ]:
487519 return {InputFormat .MD }
488520
489521 def convert (self ) -> DoclingDocument :
@@ -510,6 +542,7 @@ def convert(self) -> DoclingDocument:
510542 visited = set (),
511543 creation_stack = [],
512544 list_ordered_flag_by_ref = {},
545+ list_last_item_by_ref = {},
513546 )
514547 self ._close_table (doc = doc ) # handle any last hanging table
515548
@@ -534,7 +567,6 @@ def _restore_original_html(txt, regex):
534567 ]:
535568 html_str = _restore_original_html (txt = html_str , regex = regex )
536569 self ._html_blocks = 0
537-
538570 # delegate to HTML backend
539571 stream = BytesIO (bytes (html_str , encoding = "utf-8" ))
540572 in_doc = InputDocument (
0 commit comments