Skip to content

Commit dcc198f

Browse files
authored
feat(doctags): add enclosing bbox to inline (#302)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c383f64 commit dcc198f

16 files changed

+45349
-47
lines changed

docling_core/transforms/serializer/doctags.py

Lines changed: 53 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -18,12 +18,14 @@
1818
BaseTableSerializer,
1919
BaseTextSerializer,
2020
SerializationResult,
21+
Span,
2122
)
2223
from docling_core.transforms.serializer.common import (
2324
CommonParams,
2425
DocSerializer,
2526
create_ser_result,
2627
)
28+
from docling_core.types.doc.base import BoundingBox
2729
from docling_core.types.doc.document import (
2830
CodeItem,
2931
DocItem,
@@ -39,6 +41,7 @@
3941
PictureItem,
4042
PictureMoleculeData,
4143
PictureTabularChartData,
44+
ProvenanceItem,
4245
TableItem,
4346
TextItem,
4447
UnorderedList,
@@ -415,6 +418,39 @@ def serialize(
415418
class DocTagsInlineSerializer(BaseInlineSerializer):
416419
"""DocTags-specific inline group serializer."""
417420

421+
# Builds one location tag for an inline group: gathers the bounding boxes of the
# provenance entries found on DocItems under `item`, takes their enclosing box,
# and renders it with DocumentToken.get_location.
# Returns create_ser_result() (called with no arguments) when no provenance
# entry was seen; otherwise a SerializationResult whose text is the location
# string and whose spans reference the collected items.
def _get_inline_location_tags(
422+
self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams
423+
) -> SerializationResult:
424+
425+
# `prov` doubles as a "seen any provenance?" sentinel: it stays None unless
# the inner loop below runs at least once.
prov: Optional[ProvenanceItem] = None
426+
boxes: list[BoundingBox] = []
427+
doc_items: list[DocItem] = []
428+
for it, _ in doc.iterate_items(root=item):
429+
if isinstance(it, DocItem):
430+
# The loop variable reuses `prov`, so after the loops it holds the last
# provenance entry visited.
for prov in it.prov:
431+
boxes.append(prov.bbox)
432+
# NOTE(review): indentation was lost in this rendering, so it is unclear
# whether this append runs once per provenance entry or once per DocItem;
# confirm against the actual file.
doc_items.append(it)
433+
if prov is None:
434+
return create_ser_result()
435+
436+
bbox = BoundingBox.enclosing_bbox(boxes=boxes)
437+
438+
# using last seen prov as reference for page dims
# NOTE(review): boxes collected from any other page would still be scaled with
# this page's size; presumably an inline group stays on one page (confirm).
439+
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
440+
441+
loc_str = DocumentToken.get_location(
442+
# The enclosing box is converted with to_top_left_origin(page_h) before
# being passed on as a tuple.
bbox=bbox.to_top_left_origin(page_h).as_tuple(),
443+
page_w=page_w,
444+
page_h=page_h,
445+
xsize=params.xsize,
446+
ysize=params.ysize,
447+
)
448+
449+
return SerializationResult(
450+
text=loc_str,
451+
# One Span per entry accumulated in doc_items.
spans=[Span(item=it) for it in doc_items],
452+
)
453+
418454
@override
419455
def serialize(
420456
self,
@@ -429,12 +465,23 @@ def serialize(
429465
"""Serializes the passed item."""
430466
my_visited = visited if visited is not None else set()
431467
params = DocTagsParams(**kwargs)
432-
parts = doc_serializer.get_parts(
433-
item=item,
434-
list_level=list_level,
435-
is_inline_scope=True,
436-
visited=my_visited,
437-
**kwargs,
468+
parts: List[SerializationResult] = []
469+
if params.add_location:
470+
inline_loc_tags_ser_res = self._get_inline_location_tags(
471+
doc=doc,
472+
item=item,
473+
params=params,
474+
)
475+
parts.append(inline_loc_tags_ser_res)
476+
params.add_location = False # suppress children location serialization
477+
parts.extend(
478+
doc_serializer.get_parts(
479+
item=item,
480+
list_level=list_level,
481+
is_inline_scope=True,
482+
visited=my_visited,
483+
**{**kwargs, **params.model_dump()},
484+
)
438485
)
439486
wrap_tag = DocumentToken.INLINE.value
440487
delim = _get_delim(params=params)

docling_core/types/doc/document.py

Lines changed: 76 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -3649,6 +3649,52 @@ def parse_key_value_item(
36493649

36503650
return (GraphData(cells=cells, links=links), overall_prov)
36513651

3652+
def _add_text(
    full_chunk: str,
    bbox: Optional[BoundingBox],
    pg_width: int,
    pg_height: int,
    page_no: int,
    tag_name: str,
    doc_label: DocItemLabel,
    doc: DoclingDocument,
    parent: Optional[NodeItem],
):
    """Add the text-like element contained in ``full_chunk`` to ``doc``.

    The inner text is obtained with ``extract_inner_text``. When ``bbox`` is
    truthy, a ``ProvenanceItem`` is attached whose box is
    ``bbox.resize_by_scale(pg_width, pg_height)`` and whose charspan covers
    the whole extracted text; otherwise no provenance is recorded.

    Page header / page footer tags go to the furniture content layer, all
    other tags to the body layer. Items labelled as section headers are added
    via ``doc.add_heading`` with the level parsed from the trailing number of
    ``tag_name``; everything else is added via ``doc.add_text``.

    Args:
        full_chunk: The complete tag chunk (opening tag, content, closing tag).
        bbox: Bounding box extracted for the element, if any.
        pg_width: Page width passed to ``resize_by_scale``.
        pg_height: Page height passed to ``resize_by_scale``.
        page_no: Page number stored in the provenance.
        tag_name: Name of the tag that produced this element.
        doc_label: Label to assign to the created item.
        doc: Document the item is added to.
        parent: Parent node for the new item (``None`` is passed through).

    Raises:
        ValueError: If ``doc_label`` is a section header and the last
            ``_``-separated part of ``tag_name`` is not an integer.
    """
    inner_text = extract_inner_text(full_chunk)

    # Provenance is only recorded when a bounding box is available.
    provenance: Optional[ProvenanceItem] = None
    if bbox:
        provenance = ProvenanceItem(
            bbox=bbox.resize_by_scale(pg_width, pg_height),
            charspan=(0, len(inner_text)),
            page_no=page_no,
        )

    is_furniture = tag_name in (DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER)
    layer = ContentLayer.FURNITURE if is_furniture else ContentLayer.BODY

    if doc_label != DocItemLabel.SECTION_HEADER:
        # For everything else, treat as text
        doc.add_text(
            label=doc_label,
            text=inner_text,
            prov=provenance,
            parent=parent,
            content_layer=layer,
        )
        return

    # Extract level from tag_name (e.g. "section_level_header_1" -> 1)
    heading_level = int(tag_name.rsplit("_", 1)[-1])
    doc.add_heading(
        text=inner_text,
        level=heading_level,
        prov=provenance,
        parent=parent,
        content_layer=layer,
    )
3697+
36523698
# doc = DoclingDocument(name="Document")
36533699
for pg_idx, doctag_page in enumerate(doctag_document.pages):
36543700
page_doctags = doctag_page.tokens
@@ -3683,7 +3729,7 @@ def parse_key_value_item(
36833729
tag_pattern = (
36843730
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
36853731
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
3686-
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
3732+
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|{GroupLabel.INLINE}|"
36873733
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
36883734
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
36893735
rf"{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
@@ -3708,7 +3754,7 @@ def parse_key_value_item(
37083754
# no closing tag; only the existence of the item is recovered
37093755
full_chunk = f"<{tag_name}></{tag_name}>"
37103756

3711-
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
3757+
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
37123758

37133759
if tag_name == DocumentToken.OTSL.value:
37143760
table_data = parse_table_content(full_chunk)
@@ -3731,6 +3777,24 @@ def parse_key_value_item(
37313777
else:
37323778
doc.add_table(data=table_data, caption=caption)
37333779

3780+
elif tag_name == GroupLabel.INLINE:
3781+
inline_group = doc.add_inline_group()
3782+
content = match.group("content")
3783+
common_bbox = extract_bounding_box(content)
3784+
for item_match in pattern.finditer(content):
3785+
item_tag = item_match.group("tag")
3786+
_add_text(
3787+
full_chunk=item_match.group(0),
3788+
bbox=common_bbox,
3789+
pg_width=pg_width,
3790+
pg_height=pg_height,
3791+
page_no=page_no,
3792+
tag_name=item_tag,
3793+
doc_label=tag_to_doclabel.get(item_tag, DocItemLabel.TEXT),
3794+
doc=doc,
3795+
parent=inline_group,
3796+
)
3797+
37343798
elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
37353799
caption, caption_bbox = extract_caption(full_chunk)
37363800
table_data = None
@@ -3880,38 +3944,17 @@ def parse_key_value_item(
38803944
)
38813945
else:
38823946
# For everything else, treat as text
3883-
text_content = extract_inner_text(full_chunk)
3884-
element_prov = (
3885-
ProvenanceItem(
3886-
bbox=bbox.resize_by_scale(pg_width, pg_height),
3887-
charspan=(0, len(text_content)),
3888-
page_no=page_no,
3889-
)
3890-
if bbox
3891-
else None
3947+
_add_text(
3948+
full_chunk=full_chunk,
3949+
bbox=bbox,
3950+
pg_width=pg_width,
3951+
pg_height=pg_height,
3952+
page_no=page_no,
3953+
tag_name=tag_name,
3954+
doc_label=doc_label,
3955+
doc=doc,
3956+
parent=None,
38923957
)
3893-
3894-
content_layer = ContentLayer.BODY
3895-
if tag_name in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
3896-
content_layer = ContentLayer.FURNITURE
3897-
3898-
if doc_label == DocItemLabel.SECTION_HEADER:
3899-
# Extract level from tag_name (e.g. "section_level_header_1" -> 1)
3900-
level = int(tag_name.split("_")[-1])
3901-
doc.add_heading(
3902-
text=text_content,
3903-
level=level,
3904-
prov=element_prov,
3905-
content_layer=content_layer,
3906-
)
3907-
else:
3908-
doc.add_text(
3909-
label=doc_label,
3910-
text=text_content,
3911-
prov=element_prov,
3912-
content_layer=content_layer,
3913-
)
3914-
39153958
return doc
39163959

39173960
@deprecated("Use save_as_doctags instead.")

0 commit comments

Comments
 (0)