@@ -3195,9 +3195,9 @@ def export_to_html( # noqa: C901
31953195
31963196 return ser_res .text
31973197
3198+ @staticmethod
31983199 def load_from_doctags ( # noqa: C901
3199- self ,
3200- doctag_document : DocTagsDocument ,
3200+ doctag_document : DocTagsDocument , document_name : str = "Document"
32013201 ) -> "DoclingDocument" :
32023202 r"""Load Docling document from lists of DocTags and Images."""
32033203 # Maps the recognized tag to a Docling label.
@@ -3221,6 +3221,8 @@ def load_from_doctags( # noqa: C901
32213221 "key_value_region" : DocItemLabel .KEY_VALUE_REGION ,
32223222 }
32233223
3224+ doc = DoclingDocument (name = document_name )
3225+
32243226 def extract_bounding_box (text_chunk : str ) -> Optional [BoundingBox ]:
32253227 """Extract <loc_...> coords from the chunk, normalized by / 500."""
32263228 coords = re .findall (r"<loc_(\d+)>" , text_chunk )
@@ -3244,7 +3246,7 @@ def extract_caption(
32443246 caption_content = caption .group (1 )
32453247 bbox = extract_bounding_box (caption_content )
32463248 caption_text = extract_inner_text (caption_content )
3247- caption_item = self .add_text (
3249+ caption_item = doc .add_text (
32483250 label = DocItemLabel .CAPTION ,
32493251 text = caption_text ,
32503252 parent = None ,
@@ -3567,7 +3569,7 @@ def parse_key_value_item(
35673569 pg_width = 1
35683570 pg_height = 1
35693571
3570- self .add_page (
3572+ doc .add_page (
35713573 page_no = page_no ,
35723574 size = Size (width = pg_width , height = pg_height ),
35733575 image = ImageRef .from_pil (image = image , dpi = 72 ) if image else None ,
@@ -3624,9 +3626,9 @@ def parse_key_value_item(
36243626 charspan = (0 , 0 ),
36253627 page_no = page_no ,
36263628 )
3627- self .add_table (data = table_data , prov = prov , caption = caption )
3629+ doc .add_table (data = table_data , prov = prov , caption = caption )
36283630 else :
3629- self .add_table (data = table_data , caption = caption )
3631+ doc .add_table (data = table_data , caption = caption )
36303632
36313633 elif tag_name in [DocItemLabel .PICTURE , DocItemLabel .CHART ]:
36323634 caption , caption_bbox = extract_caption (full_chunk )
@@ -3646,7 +3648,7 @@ def parse_key_value_item(
36463648 int (bbox .b * im_height ),
36473649 )
36483650 cropped_image = image .crop (crop_box )
3649- pic = self .add_picture (
3651+ pic = doc .add_picture (
36503652 parent = None ,
36513653 image = ImageRef .from_pil (image = cropped_image , dpi = 72 ),
36523654 prov = (
@@ -3692,7 +3694,7 @@ def parse_key_value_item(
36923694 else :
36933695 if bbox :
36943696 # In case we don't have access to a binary of an image
3695- pic = self .add_picture (
3697+ pic = doc .add_picture (
36963698 parent = None ,
36973699 prov = ProvenanceItem (
36983700 bbox = bbox , charspan = (0 , 0 ), page_no = page_no
@@ -3733,7 +3735,7 @@ def parse_key_value_item(
37333735 key_value_data , kv_item_prov = parse_key_value_item (
37343736 full_chunk , image
37353737 )
3736- self .add_key_values (graph = key_value_data , prov = kv_item_prov )
3738+ doc .add_key_values (graph = key_value_data , prov = kv_item_prov )
37373739 elif tag_name in [
37383740 DocumentToken .ORDERED_LIST .value ,
37393741 DocumentToken .UNORDERED_LIST .value ,
@@ -3749,7 +3751,7 @@ def parse_key_value_item(
37493751 )
37503752 li_pattern = re .compile (list_item_pattern , re .DOTALL )
37513753 # Add list group:
3752- new_list = self .add_group (label = list_label , name = "list" )
3754+ new_list = doc .add_group (label = list_label , name = "list" )
37533755 # Process list items
37543756 for li_match in li_pattern .finditer (full_chunk ):
37553757 enum_value += 1
@@ -3760,7 +3762,7 @@ def parse_key_value_item(
37603762 li_bbox = extract_bounding_box (li_full_chunk ) if image else None
37613763 text_content = extract_inner_text (li_full_chunk )
37623764 # Add list item
3763- self .add_list_item (
3765+ doc .add_list_item (
37643766 marker = enum_marker ,
37653767 enumerated = (tag_name == DocumentToken .ORDERED_LIST .value ),
37663768 parent = new_list ,
@@ -3792,13 +3794,13 @@ def parse_key_value_item(
37923794 if tag_name in [DocItemLabel .PAGE_HEADER , DocItemLabel .PAGE_FOOTER ]:
37933795 content_layer = ContentLayer .FURNITURE
37943796
3795- self .add_text (
3797+ doc .add_text (
37963798 label = doc_label ,
37973799 text = text_content ,
37983800 prov = element_prov ,
37993801 content_layer = content_layer ,
38003802 )
3801- return self
3803+ return doc
38023804
38033805 @deprecated ("Use save_as_doctags instead." )
38043806 def save_as_document_tokens (self , * args , ** kwargs ):
0 commit comments