@@ -3649,6 +3649,52 @@ def parse_key_value_item(
36493649
36503650 return (GraphData (cells = cells , links = links ), overall_prov )
36513651
def _add_text(
    full_chunk: str,
    bbox: Optional[BoundingBox],
    pg_width: int,
    pg_height: int,
    page_no: int,
    tag_name: str,
    doc_label: DocItemLabel,
    doc: DoclingDocument,
    parent: Optional[NodeItem],
) -> None:
    """Append one text-like element to ``doc`` as a heading or a text item.

    The inner text is taken from ``full_chunk`` via ``extract_inner_text``.
    A heading is added when ``doc_label`` is ``DocItemLabel.SECTION_HEADER``;
    any other label is added as a text item carrying that label.

    Args:
        full_chunk: The complete tagged chunk to extract the text from.
        bbox: Bounding box of the element; when falsy, no provenance is
            attached to the new item.
        pg_width: Page width passed to ``bbox.resize_by_scale``.
        pg_height: Page height passed to ``bbox.resize_by_scale``.
        page_no: Page number recorded in the provenance.
        tag_name: Tag the chunk was matched with. Page header/footer tags
            select the furniture layer; for section headers the heading
            level is read from the part after the last underscore.
        doc_label: Label to assign to the created item.
        doc: Document that receives the new item.
        parent: Node to attach the new item to, or ``None``.

    Raises:
        ValueError: If ``doc_label`` is a section header and the part of
            ``tag_name`` after its last underscore is not an integer.
    """
    inner_text = extract_inner_text(full_chunk)

    # Provenance only exists when the chunk carried a bounding box.
    prov = None
    if bbox:
        prov = ProvenanceItem(
            bbox=bbox.resize_by_scale(pg_width, pg_height),
            charspan=(0, len(inner_text)),
            page_no=page_no,
        )

    # Page headers and footers are page furniture, not body content.
    is_furniture = tag_name in (
        DocItemLabel.PAGE_HEADER,
        DocItemLabel.PAGE_FOOTER,
    )
    layer = ContentLayer.FURNITURE if is_furniture else ContentLayer.BODY

    if doc_label == DocItemLabel.SECTION_HEADER:
        # The heading level is the trailing "_<n>" suffix of the tag name.
        heading_level = int(tag_name.rsplit("_", 1)[-1])
        doc.add_heading(
            text=inner_text,
            level=heading_level,
            prov=prov,
            parent=parent,
            content_layer=layer,
        )
        return

    doc.add_text(
        label=doc_label,
        text=inner_text,
        prov=prov,
        parent=parent,
        content_layer=layer,
    )
3697+
36523698 # doc = DoclingDocument(name="Document")
36533699 for pg_idx , doctag_page in enumerate (doctag_document .pages ):
36543700 page_doctags = doctag_page .tokens
@@ -3683,7 +3729,7 @@ def parse_key_value_item(
36833729 tag_pattern = (
36843730 rf"<(?P<tag>{ DocItemLabel .TITLE } |{ DocItemLabel .DOCUMENT_INDEX } |"
36853731 rf"{ DocItemLabel .CHECKBOX_UNSELECTED } |{ DocItemLabel .CHECKBOX_SELECTED } |"
3686- rf"{ DocItemLabel .TEXT } |{ DocItemLabel .PAGE_HEADER } |"
3732+ rf"{ DocItemLabel .TEXT } |{ DocItemLabel .PAGE_HEADER } |{ GroupLabel . INLINE } | "
36873733 rf"{ DocItemLabel .PAGE_FOOTER } |{ DocItemLabel .FORMULA } |"
36883734 rf"{ DocItemLabel .CAPTION } |{ DocItemLabel .PICTURE } |"
36893735 rf"{ DocItemLabel .FOOTNOTE } |{ DocItemLabel .CODE } |"
@@ -3708,7 +3754,7 @@ def parse_key_value_item(
37083754 # no closing tag; only the existence of the item is recovered
37093755 full_chunk = f"<{ tag_name } ></{ tag_name } >"
37103756
3711- doc_label = tag_to_doclabel .get (tag_name , DocItemLabel .PARAGRAPH )
3757+ doc_label = tag_to_doclabel .get (tag_name , DocItemLabel .TEXT )
37123758
37133759 if tag_name == DocumentToken .OTSL .value :
37143760 table_data = parse_table_content (full_chunk )
@@ -3731,6 +3777,24 @@ def parse_key_value_item(
37313777 else :
37323778 doc .add_table (data = table_data , caption = caption )
37333779
3780+ elif tag_name == GroupLabel .INLINE :
3781+ inline_group = doc .add_inline_group ()
3782+ content = match .group ("content" )
3783+ common_bbox = extract_bounding_box (content )
3784+ for item_match in pattern .finditer (content ):
3785+ item_tag = item_match .group ("tag" )
3786+ _add_text (
3787+ full_chunk = item_match .group (0 ),
3788+ bbox = common_bbox ,
3789+ pg_width = pg_width ,
3790+ pg_height = pg_height ,
3791+ page_no = page_no ,
3792+ tag_name = item_tag ,
3793+ doc_label = tag_to_doclabel .get (item_tag , DocItemLabel .TEXT ),
3794+ doc = doc ,
3795+ parent = inline_group ,
3796+ )
3797+
37343798 elif tag_name in [DocItemLabel .PICTURE , DocItemLabel .CHART ]:
37353799 caption , caption_bbox = extract_caption (full_chunk )
37363800 table_data = None
@@ -3880,38 +3944,17 @@ def parse_key_value_item(
38803944 )
38813945 else :
38823946 # For everything else, treat as text
3883- text_content = extract_inner_text (full_chunk )
3884- element_prov = (
3885- ProvenanceItem (
3886- bbox = bbox .resize_by_scale (pg_width , pg_height ),
3887- charspan = (0 , len (text_content )),
3888- page_no = page_no ,
3889- )
3890- if bbox
3891- else None
3947+ _add_text (
3948+ full_chunk = full_chunk ,
3949+ bbox = bbox ,
3950+ pg_width = pg_width ,
3951+ pg_height = pg_height ,
3952+ page_no = page_no ,
3953+ tag_name = tag_name ,
3954+ doc_label = doc_label ,
3955+ doc = doc ,
3956+ parent = None ,
38923957 )
3893-
3894- content_layer = ContentLayer .BODY
3895- if tag_name in [DocItemLabel .PAGE_HEADER , DocItemLabel .PAGE_FOOTER ]:
3896- content_layer = ContentLayer .FURNITURE
3897-
3898- if doc_label == DocItemLabel .SECTION_HEADER :
3899- # Extract level from tag_name (e.g. "section_level_header_1" -> 1)
3900- level = int (tag_name .split ("_" )[- 1 ])
3901- doc .add_heading (
3902- text = text_content ,
3903- level = level ,
3904- prov = element_prov ,
3905- content_layer = content_layer ,
3906- )
3907- else :
3908- doc .add_text (
3909- label = doc_label ,
3910- text = text_content ,
3911- prov = element_prov ,
3912- content_layer = content_layer ,
3913- )
3914-
39153958 return doc
39163959
39173960 @deprecated ("Use save_as_doctags instead." )
0 commit comments