Skip to content

Commit 8f85d05

Browse files
authored
fix: make load_from_doctags method static (#273)
Make the load_from_doctags method static.

Signed-off-by: Saidgurbuz <[email protected]>
1 parent c66d8dd commit 8f85d05

File tree

1 file changed

+15
-13
lines changed

1 file changed

+15
-13
lines changed

docling_core/types/doc/document.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3195,9 +3195,9 @@ def export_to_html( # noqa: C901
31953195

31963196
return ser_res.text
31973197

3198+
@staticmethod
31983199
def load_from_doctags( # noqa: C901
3199-
self,
3200-
doctag_document: DocTagsDocument,
3200+
doctag_document: DocTagsDocument, document_name: str = "Document"
32013201
) -> "DoclingDocument":
32023202
r"""Load Docling document from lists of DocTags and Images."""
32033203
# Maps the recognized tag to a Docling label.
@@ -3221,6 +3221,8 @@ def load_from_doctags( # noqa: C901
32213221
"key_value_region": DocItemLabel.KEY_VALUE_REGION,
32223222
}
32233223

3224+
doc = DoclingDocument(name=document_name)
3225+
32243226
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
32253227
"""Extract <loc_...> coords from the chunk, normalized by / 500."""
32263228
coords = re.findall(r"<loc_(\d+)>", text_chunk)
@@ -3244,7 +3246,7 @@ def extract_caption(
32443246
caption_content = caption.group(1)
32453247
bbox = extract_bounding_box(caption_content)
32463248
caption_text = extract_inner_text(caption_content)
3247-
caption_item = self.add_text(
3249+
caption_item = doc.add_text(
32483250
label=DocItemLabel.CAPTION,
32493251
text=caption_text,
32503252
parent=None,
@@ -3567,7 +3569,7 @@ def parse_key_value_item(
35673569
pg_width = 1
35683570
pg_height = 1
35693571

3570-
self.add_page(
3572+
doc.add_page(
35713573
page_no=page_no,
35723574
size=Size(width=pg_width, height=pg_height),
35733575
image=ImageRef.from_pil(image=image, dpi=72) if image else None,
@@ -3624,9 +3626,9 @@ def parse_key_value_item(
36243626
charspan=(0, 0),
36253627
page_no=page_no,
36263628
)
3627-
self.add_table(data=table_data, prov=prov, caption=caption)
3629+
doc.add_table(data=table_data, prov=prov, caption=caption)
36283630
else:
3629-
self.add_table(data=table_data, caption=caption)
3631+
doc.add_table(data=table_data, caption=caption)
36303632

36313633
elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
36323634
caption, caption_bbox = extract_caption(full_chunk)
@@ -3646,7 +3648,7 @@ def parse_key_value_item(
36463648
int(bbox.b * im_height),
36473649
)
36483650
cropped_image = image.crop(crop_box)
3649-
pic = self.add_picture(
3651+
pic = doc.add_picture(
36503652
parent=None,
36513653
image=ImageRef.from_pil(image=cropped_image, dpi=72),
36523654
prov=(
@@ -3692,7 +3694,7 @@ def parse_key_value_item(
36923694
else:
36933695
if bbox:
36943696
# In case we don't have access to a binary of an image
3695-
pic = self.add_picture(
3697+
pic = doc.add_picture(
36963698
parent=None,
36973699
prov=ProvenanceItem(
36983700
bbox=bbox, charspan=(0, 0), page_no=page_no
@@ -3733,7 +3735,7 @@ def parse_key_value_item(
37333735
key_value_data, kv_item_prov = parse_key_value_item(
37343736
full_chunk, image
37353737
)
3736-
self.add_key_values(graph=key_value_data, prov=kv_item_prov)
3738+
doc.add_key_values(graph=key_value_data, prov=kv_item_prov)
37373739
elif tag_name in [
37383740
DocumentToken.ORDERED_LIST.value,
37393741
DocumentToken.UNORDERED_LIST.value,
@@ -3749,7 +3751,7 @@ def parse_key_value_item(
37493751
)
37503752
li_pattern = re.compile(list_item_pattern, re.DOTALL)
37513753
# Add list group:
3752-
new_list = self.add_group(label=list_label, name="list")
3754+
new_list = doc.add_group(label=list_label, name="list")
37533755
# Process list items
37543756
for li_match in li_pattern.finditer(full_chunk):
37553757
enum_value += 1
@@ -3760,7 +3762,7 @@ def parse_key_value_item(
37603762
li_bbox = extract_bounding_box(li_full_chunk) if image else None
37613763
text_content = extract_inner_text(li_full_chunk)
37623764
# Add list item
3763-
self.add_list_item(
3765+
doc.add_list_item(
37643766
marker=enum_marker,
37653767
enumerated=(tag_name == DocumentToken.ORDERED_LIST.value),
37663768
parent=new_list,
@@ -3792,13 +3794,13 @@ def parse_key_value_item(
37923794
if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
37933795
content_layer = ContentLayer.FURNITURE
37943796

3795-
self.add_text(
3797+
doc.add_text(
37963798
label=doc_label,
37973799
text=text_content,
37983800
prov=element_prov,
37993801
content_layer=content_layer,
38003802
)
3801-
return self
3803+
return doc
38023804

38033805
@deprecated("Use save_as_doctags instead.")
38043806
def save_as_document_tokens(self, *args, **kwargs):

0 commit comments

Comments
 (0)