Skip to content

Commit d8a5256

Browse files
authored
feat: add table annotations (#304)
* feat: add table annotations
  Signed-off-by: Panos Vagenas <[email protected]>
* refactor annotation types
  Signed-off-by: Panos Vagenas <[email protected]>
* expand to HTML
  Signed-off-by: Panos Vagenas <[email protected]>
* introduce annotation serializer
  Signed-off-by: Panos Vagenas <[email protected]>
* Update dummy_doc.yaml
  Signed-off-by: Panos Vagenas <[email protected]>
---------
Signed-off-by: Panos Vagenas <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 58d93e6 commit d8a5256

33 files changed

+46023
-176
lines changed

docling_core/transforms/serializer/base.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,15 @@ def serialize_captions(
239239
"""Serialize the item's captions."""
240240
...
241241

242+
@abstractmethod
def serialize_annotations(
    self,
    item: DocItem,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize the annotations attached to the given item.

    Args:
        item: The document item whose annotations are to be serialized.
        **kwargs: Additional, implementation-specific options.

    Returns:
        The serialization result for the item's annotations.
    """
    ...
250+
242251
@abstractmethod
243252
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
244253
"""Get references to excluded items."""
@@ -257,3 +266,18 @@ class BaseSerializerProvider(ABC):
257266
def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
258267
"""Get a the associated serializer."""
259268
...
269+
270+
271+
class BaseAnnotationSerializer(ABC):
    """Abstract interface that every annotation serializer implements."""

    @abstractmethod
    def serialize(
        self,
        *,
        item: DocItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the annotations of the passed item.

        All arguments are keyword-only.

        Args:
            item: The document item whose annotations are to be serialized.
            doc: The document the item belongs to.
            **kwargs: Additional, implementation-specific options.

        Returns:
            The serialization result for the item's annotations.
        """
        ...

docling_core/transforms/serializer/common.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from typing_extensions import Self, override
1616

1717
from docling_core.transforms.serializer.base import (
18+
BaseAnnotationSerializer,
1819
BaseDocSerializer,
1920
BaseFallbackSerializer,
2021
BaseFormSerializer,
@@ -30,6 +31,7 @@
3031
from docling_core.types.doc.document import (
3132
DOCUMENT_TOKENS_EXPORT_LABELS,
3233
ContentLayer,
34+
DescriptionAnnotation,
3335
DocItem,
3436
DoclingDocument,
3537
FloatingItem,
@@ -41,9 +43,9 @@
4143
OrderedList,
4244
PictureClassificationData,
4345
PictureDataType,
44-
PictureDescriptionData,
4546
PictureItem,
4647
PictureMoleculeData,
48+
TableAnnotationType,
4749
TableItem,
4850
TextItem,
4951
UnorderedList,
@@ -122,7 +124,9 @@ def _iterate_items(
122124
yield item
123125

124126

125-
def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
127+
def _get_annotation_text(
128+
annotation: Union[PictureDataType, TableAnnotationType],
129+
) -> Optional[str]:
126130
result = None
127131
if isinstance(annotation, PictureClassificationData):
128132
predicted_class = (
@@ -132,7 +136,7 @@ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
132136
)
133137
if predicted_class is not None:
134138
result = predicted_class.replace("_", " ")
135-
elif isinstance(annotation, PictureDescriptionData):
139+
elif isinstance(annotation, DescriptionAnnotation):
136140
result = annotation.text
137141
elif isinstance(annotation, PictureMoleculeData):
138142
result = annotation.smi
@@ -211,6 +215,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
211215
list_serializer: BaseListSerializer
212216
inline_serializer: BaseInlineSerializer
213217

218+
annotation_serializer: BaseAnnotationSerializer
219+
214220
params: CommonParams = CommonParams()
215221

216222
_excluded_refs_cache: dict[str, set[str]] = {}
@@ -505,6 +511,19 @@ def serialize_captions(
505511
text_res = ""
506512
return create_ser_result(text=text_res, span_source=results)
507513

514+
@override
def serialize_annotations(
    self,
    item: DocItem,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize the item's annotations via the configured annotation serializer.

    Args:
        item: The document item whose annotations are to be serialized.
        **kwargs: Extra options forwarded unchanged to the annotation serializer.

    Returns:
        Whatever ``self.annotation_serializer.serialize`` returns for the item.
    """
    # Pure delegation: this serializer only supplies its own document.
    annotation_serializer = self.annotation_serializer
    return annotation_serializer.serialize(item=item, doc=self.doc, **kwargs)
526+
508527
def _get_applicable_pages(self) -> Optional[list[int]]:
509528
pages = {
510529
item.prov[0].page_no: ...

docling_core/transforms/serializer/doctags.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing_extensions import override
88

99
from docling_core.transforms.serializer.base import (
10+
BaseAnnotationSerializer,
1011
BaseDocSerializer,
1112
BaseFallbackSerializer,
1213
BaseFormSerializer,
@@ -460,6 +461,15 @@ def serialize(
460461
return create_ser_result()
461462

462463

464+
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
    """Annotation serializer for the DocTags format."""

    @override
    def serialize(self, *, item: DocItem, **kwargs: Any) -> SerializationResult:
        """Return a default serialization result for the item's annotations.

        The item and any further keyword arguments are accepted but not
        inspected; the result is ``create_ser_result()`` called without
        arguments.

        Args:
            item: The document item whose annotations would be serialized.
            **kwargs: Additional keyword arguments; not used here.

        Returns:
            The result of ``create_ser_result()`` with no arguments.
        """
        default_res = create_ser_result()
        return default_res
471+
472+
463473
class DocTagsDocSerializer(DocSerializer):
464474
"""DocTags-specific document serializer."""
465475

@@ -473,6 +483,8 @@ class DocTagsDocSerializer(DocSerializer):
473483
list_serializer: BaseListSerializer = DocTagsListSerializer()
474484
inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()
475485

486+
annotation_serializer: BaseAnnotationSerializer = DocTagsAnnotationSerializer()
487+
476488
params: DocTagsParams = DocTagsParams()
477489

478490
@override

docling_core/transforms/serializer/html.py

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from typing_extensions import override
2222

2323
from docling_core.transforms.serializer.base import (
24+
BaseAnnotationSerializer,
2425
BaseDocSerializer,
2526
BaseFallbackSerializer,
2627
BaseFormSerializer,
@@ -35,7 +36,7 @@
3536
from docling_core.transforms.serializer.common import (
3637
CommonParams,
3738
DocSerializer,
38-
_get_picture_annotation_text,
39+
_get_annotation_text,
3940
create_ser_result,
4041
)
4142
from docling_core.transforms.serializer.html_styles import (
@@ -47,6 +48,7 @@
4748
from docling_core.types.doc.document import (
4849
CodeItem,
4950
ContentLayer,
51+
DescriptionAnnotation,
5052
DocItem,
5153
DoclingDocument,
5254
FloatingItem,
@@ -59,7 +61,9 @@
5961
ListItem,
6062
NodeItem,
6163
OrderedList,
64+
PictureClassificationData,
6265
PictureItem,
66+
PictureMoleculeData,
6367
PictureTabularChartData,
6468
SectionHeaderItem,
6569
TableCell,
@@ -758,14 +762,7 @@ class HTMLFallbackSerializer(BaseFallbackSerializer):
758762
"""HTML-specific fallback serializer."""
759763

760764
@override
761-
def serialize(
762-
self,
763-
*,
764-
item: NodeItem,
765-
doc_serializer: "BaseDocSerializer",
766-
doc: DoclingDocument,
767-
**kwargs: Any,
768-
) -> SerializationResult:
765+
def serialize(self, *, item: NodeItem, **kwargs: Any) -> SerializationResult:
769766
"""Fallback serializer for items not handled by other serializers."""
770767
if isinstance(item, DocItem):
771768
return create_ser_result(
@@ -777,6 +774,42 @@ def serialize(
777774
return create_ser_result()
778775

779776

777+
class HTMLAnnotationSerializer(BaseModel, BaseAnnotationSerializer):
    """HTML-specific annotation serializer."""

    @override
    def serialize(
        self,
        *,
        item: DocItem,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the annotations of the passed item to HTML.

        Each classification, description, or molecule annotation that yields a
        non-empty text becomes one ``<div>`` element carrying a
        ``data-annotation-kind`` attribute. Annotations of other types, or
        without text, are skipped. The elements are joined with single spaces.

        Args:
            item: The document item whose annotations are to be serialized.
            doc: The document the item belongs to; not used by this
                implementation.
            **kwargs: Additional keyword arguments; not used here.

        Returns:
            A serialization result holding the joined HTML for all rendered
            annotations, with the per-annotation results as span source.
        """
        supported_types = (
            PictureClassificationData,
            DescriptionAnnotation,
            PictureMoleculeData,
        )
        res_parts: list[SerializationResult] = []
        for ann in item.get_annotations():
            if not isinstance(ann, supported_types):
                continue
            ann_text = _get_annotation_text(ann)
            if not ann_text:
                continue
            text_dir = get_text_direction(ann_text)
            # Only right-to-left text gets an explicit direction attribute.
            dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
            res_parts.append(
                create_ser_result(
                    text=(
                        f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
                        f"{html.escape(ann_text)}"
                        f"</div>"
                    ),
                    span_source=item,
                )
            )

        return create_ser_result(
            text=" ".join([r.text for r in res_parts if r.text]),
            span_source=res_parts,
        )
811+
812+
780813
class HTMLDocSerializer(DocSerializer):
781814
"""HTML-specific document serializer."""
782815

@@ -790,6 +823,8 @@ class HTMLDocSerializer(DocSerializer):
790823
list_serializer: BaseListSerializer = HTMLListSerializer()
791824
inline_serializer: BaseInlineSerializer = HTMLInlineSerializer()
792825

826+
annotation_serializer: BaseAnnotationSerializer = HTMLAnnotationSerializer()
827+
793828
params: HTMLParams = HTMLParams()
794829

795830
@override
@@ -968,20 +1003,13 @@ def serialize_captions(
9681003
results.append(cap_ser_res)
9691004

9701005
if params.include_annotations and item.self_ref not in excluded_refs:
971-
if isinstance(item, PictureItem):
972-
for ann in item.annotations:
973-
if ann_text := _get_picture_annotation_text(annotation=ann):
974-
text_dir = get_text_direction(ann_text)
975-
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
976-
ann_ser_res = create_ser_result(
977-
text=(
978-
f'<div data-annotation-kind="{ann.kind}"{dir_str}>'
979-
f"{html.escape(ann_text)}"
980-
f"</div>"
981-
),
982-
span_source=item,
983-
)
984-
results.append(ann_ser_res)
1006+
if isinstance(item, (PictureItem, TableItem)):
1007+
ann_res = self.serialize_annotations(
1008+
item=item,
1009+
**kwargs,
1010+
)
1011+
if ann_res.text:
1012+
results.append(ann_res)
9851013

9861014
text_res = params.caption_delim.join([r.text for r in results])
9871015
if text_res:

0 commit comments

Comments
 (0)