Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions docling_core/experimental/idoctags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
"""Define classes for DocTags serialization."""

from typing import Any, Final, Optional
from xml.dom.minidom import parseString

from pydantic import BaseModel
from typing_extensions import override

from docling_core.transforms.serializer.base import (
BaseDocSerializer,
BaseMetaSerializer,
BasePictureSerializer,
SerializationResult,
)
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.doctags import (
DocTagsDocSerializer,
DocTagsParams,
DocTagsPictureSerializer,
_get_delim,
_wrap,
)
from docling_core.types.doc import (
BaseMeta,
DescriptionMetaField,
DocItem,
DoclingDocument,
MetaFieldName,
MoleculeMetaField,
NodeItem,
PictureClassificationMetaField,
PictureItem,
SummaryMetaField,
TableData,
TabularChartMetaField,
)
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc.tokens import DocumentToken

# Format version string; IDocTagsDocSerializer.serialize_doc writes it into the
# <version> element placed right after the opening document tag.
DOCTAGS_VERSION: Final = "1.0.0"


class IDocTagsParams(DocTagsParams):
    """Serialization parameters for the experimental IDocTags output.

    Adds pretty-printing control on top of ``DocTagsParams`` and switches the
    ``do_self_closing`` default on.
    """

    # Forwarded as ``self_closing`` to the location-token / OTSL helpers.
    do_self_closing: bool = True

    # Indentation unit handed to ``toprettyxml``; a falsy value (``None`` or
    # ``""``) turns pretty-printing off.
    pretty_indentation: Optional[str] = " " * 2


class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
    """DocTags-specific meta serializer."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the meta fields of ``item`` into a ``<meta>`` element.

        Args:
            item: Node whose ``meta`` is serialized.
            **kwargs: Serialization parameters; parsed as ``IDocTagsParams``.

        Returns:
            A result whose text is ``<meta>...</meta>`` wrapping every field
            that passed the allow/block filters and produced output, or an
            empty string when there is no meta or nothing was emitted.
        """
        params = IDocTagsParams(**kwargs)

        texts: list[str] = []
        if item.meta:
            # Declared model fields first, then any custom (extra) entries.
            candidate_names = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in candidate_names:
                if (
                    params.allowed_meta_names is not None
                    and key not in params.allowed_meta_names
                ):
                    continue
                if key in params.blocked_meta_names:
                    continue
                if tmp := self._serialize_meta_field(item.meta, key):
                    texts.append(tmp)

        if texts:
            texts = ["<meta>", *texts, "</meta>"]
        return create_ser_result(
            text="".join(texts),
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Render a single meta field as a tagged string.

        Args:
            meta: Meta object to read the field from.
            name: Attribute name of the field on ``meta``.

        Returns:
            The tagged text for the field, or ``None`` when the field is unset,
            is a tabular chart (deliberately suppressed here), or is a standard
            field whose value is not of the expected type.
        """
        field_val = getattr(meta, name)
        if field_val is None:
            return None

        if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
            return f"<summary>{field_val.text}</summary>"

        if name == MetaFieldName.DESCRIPTION and isinstance(
            field_val, DescriptionMetaField
        ):
            return f"<description>{field_val.text}</description>"

        if name == MetaFieldName.CLASSIFICATION and isinstance(
            field_val, PictureClassificationMetaField
        ):
            class_name = self._humanize_text(
                field_val.get_main_prediction().class_name
            )
            return f"<classification>{class_name}</classification>"

        if name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
            return f"<molecule>{field_val.smi}</molecule>"

        if name == MetaFieldName.TABULAR_CHART and isinstance(
            field_val, TabularChartMetaField
        ):
            # suppressing tabular chart serialization
            return None

        if name not in {v.value for v in MetaFieldName}:
            # Custom (non-standard) field: wrap its string form in a tag named
            # after the field itself.
            return _wrap(text=str(field_val or ""), wrap_tag=name)

        # Standard field name carrying a value of an unexpected type: nothing
        # sensible to emit. (Previously this path hit an unbound local.)
        return None


class IDocTagsPictureSerializer(DocTagsPictureSerializer):
    """DocTags-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize a picture item, embedding its meta inside the wrapper tag.

        Args:
            item: Picture to serialize.
            doc_serializer: Document serializer used for the excluded-refs
                lookup and for meta / caption serialization.
            doc: Document the picture belongs to.
            **kwargs: Serialization parameters; parsed as ``IDocTagsParams``.

        Returns:
            The picture wrapped in its document token: meta, then location
            tokens and OTSL chart data, then captions. The text is empty when
            the item is excluded or produced no content.
        """
        # Use the IDocTags params (as the sibling meta/doc serializers do) so
        # that the IDocTags defaults, e.g. do_self_closing=True, apply when a
        # value is not passed explicitly.
        params = IDocTagsParams(**kwargs)
        res_parts: list[SerializationResult] = []
        # NOTE(review): nothing in this method ever sets is_chart to True, so
        # the CHART token below is currently never emitted -- confirm intent.
        is_chart = False

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

            if item.meta and not params.use_legacy_annotations:
                meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
                if meta_res.text:
                    res_parts.append(meta_res)

            body = ""
            if params.add_location:
                body += item.get_location_tokens(
                    doc=doc,
                    xsize=params.xsize,
                    ysize=params.ysize,
                    self_closing=params.do_self_closing,
                )

            # handle tabular chart data
            chart_data: Optional[TableData] = None
            if item.meta and item.meta.tabular_chart:
                chart_data = item.meta.tabular_chart.chart_data
            if chart_data and chart_data.table_cells:
                # Export the chart table through a throwaway document to reuse
                # the table's OTSL export.
                temp_doc = DoclingDocument(name="temp")
                temp_table = temp_doc.add_table(data=chart_data)
                otsl_content = temp_table.export_to_otsl(
                    temp_doc,
                    add_cell_location=False,
                    self_closing=params.do_self_closing,
                )
                body += otsl_content
            res_parts.append(create_ser_result(text=body, span_source=item))

            if params.add_caption:
                cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
                if cap_res.text:
                    res_parts.append(cap_res)

        text_res = "".join(r.text for r in res_parts)
        if text_res:
            token = DocumentToken.create_token_name_from_doc_item_label(
                label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
            )
            text_res = _wrap(text=text_res, wrap_tag=token)
        return create_ser_result(text=text_res, span_source=res_parts)


class IDocTagsDocSerializer(DocTagsDocSerializer):
    """DocTags document serializer."""

    picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
    meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
    params: IDocTagsParams = IDocTagsParams()

    @override
    def _meta_is_wrapped(self) -> bool:
        """Return True: this serializer reports its meta as wrapped."""
        return True

    @override
    def serialize_doc(
        self,
        *,
        parts: list[SerializationResult],
        **kwargs: Any,
    ) -> SerializationResult:
        """Assemble the serialized parts into the final document text.

        Args:
            parts: Serialized parts; those with empty text are skipped.
            **kwargs: Unused here; accepted for interface compatibility.

        Returns:
            The parts joined by the mode delimiter, prefixed with a
            ``<version>`` element and wrapped in the document tag. When
            ``params.pretty_indentation`` is set, the text is re-indented
            via ``xml.dom.minidom``.
        """
        delim = _get_delim(params=self.params)
        text_res = delim.join([p.text for p in parts if p.text])

        if self.params.add_page_break:
            # Honour do_self_closing: an unclosed <page_break> is not
            # well-formed XML and makes the pretty-printing parse below fail.
            closing_slash = "/" if self.params.do_self_closing else ""
            page_sep = f"<{DocumentToken.PAGE_BREAK.value}{closing_slash}>"
            for full_match, _, _ in self._get_page_breaks(text=text_res):
                text_res = text_res.replace(full_match, page_sep)

        wrap_tag = DocumentToken.DOCUMENT.value
        text_res = f"<{wrap_tag}><version>{DOCTAGS_VERSION}</version>{text_res}{delim}</{wrap_tag}>"

        # Pretty-printing parses the text as XML, so it requires the output
        # above to be well-formed.
        if self.params.pretty_indentation and (
            my_root := parseString(text_res).documentElement
        ):
            text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
            # Drop the whitespace-only lines left by toprettyxml.
            text_res = "\n".join(
                [line for line in text_res.split("\n") if line.strip()]
            )
        return create_ser_result(text=text_res, span_source=parts)
16 changes: 13 additions & 3 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,9 @@ def _serialize_body(self, **kwargs) -> SerializationResult:
res = self.serialize_doc(parts=subparts, **kwargs)
return res

def _meta_is_wrapped(self) -> bool:
return False

@override
def serialize(
self,
Expand All @@ -336,7 +339,11 @@ def serialize(
my_item = item or self.doc.body

if my_item == self.doc.body:
if my_item.meta and not my_params.use_legacy_annotations:
if (
my_item.meta
and not my_params.use_legacy_annotations
and not self._meta_is_wrapped()
):
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
Expand All @@ -355,7 +362,11 @@ def serialize(

my_visited.add(my_item.self_ref)

if my_item.meta and not my_params.use_legacy_annotations:
if (
my_item.meta
and not my_params.use_legacy_annotations
and not self._meta_is_wrapped()
):
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
Expand Down Expand Up @@ -602,7 +613,6 @@ def serialize_meta(
text="", span_source=item if isinstance(item, DocItem) else []
)
else:
_logger.warning("No meta serializer found.")
return create_ser_result(
text="", span_source=item if isinstance(item, DocItem) else []
)
Expand Down
51 changes: 37 additions & 14 deletions docling_core/transforms/serializer/doctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class Mode(str, Enum):

mode: Mode = Mode.HUMAN_FRIENDLY

do_self_closing: bool = False


def _get_delim(params: DocTagsParams) -> str:
if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
Expand Down Expand Up @@ -109,11 +111,17 @@ def serialize(
)
parts: list[str] = []

if item.meta and not params.use_legacy_annotations:
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
if meta_res.text:
parts.append(meta_res.text)

if params.add_location:
location = item.get_location_tokens(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)
if location:
parts.append(location)
Expand Down Expand Up @@ -183,6 +191,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)
res_parts.append(create_ser_result(text=loc_text, span_source=item))

Expand Down Expand Up @@ -232,6 +241,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)

# handle classification data
Expand All @@ -241,12 +251,16 @@ def serialize(
item.meta.classification.get_main_prediction().class_name
)
elif (
classifications := [
ann
for ann in item.annotations
if isinstance(ann, PictureClassificationData)
]
) and classifications[0].predicted_classes:
params.use_legacy_annotations
and (
classifications := [
ann
for ann in item.annotations
if isinstance(ann, PictureClassificationData)
]
)
and classifications[0].predicted_classes
):
predicted_class = classifications[0].predicted_classes[0].class_name
if predicted_class:
body += DocumentToken.get_picture_classification_token(predicted_class)
Expand All @@ -265,9 +279,13 @@ def serialize(
smi: Optional[str] = None
if item.meta and item.meta.molecule:
smi = item.meta.molecule.smi
elif smiles_annotations := [
ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
]:
elif params.use_legacy_annotations and (
smiles_annotations := [
ann
for ann in item.annotations
if isinstance(ann, PictureMoleculeData)
]
):
smi = smiles_annotations[0].smi
if smi:
body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value)
Expand All @@ -276,11 +294,13 @@ def serialize(
chart_data: Optional[TableData] = None
if item.meta and item.meta.tabular_chart:
chart_data = item.meta.tabular_chart.chart_data
elif tabular_chart_annotations := [
ann
for ann in item.annotations
if isinstance(ann, PictureTabularChartData)
]:
elif params.use_legacy_annotations and (
tabular_chart_annotations := [
ann
for ann in item.annotations
if isinstance(ann, PictureTabularChartData)
]
):
chart_data = tabular_chart_annotations[0].chart_data
if chart_data and chart_data.table_cells:
temp_doc = DoclingDocument(name="temp")
Expand Down Expand Up @@ -331,6 +351,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)

# mapping from source_cell_id to a list of target_cell_ids
Expand Down Expand Up @@ -471,6 +492,7 @@ def _get_inline_location_tags(
page_h=page_h,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)

return SerializationResult(
Expand Down Expand Up @@ -606,6 +628,7 @@ def serialize_captions(
doc=self.doc,
xsize=params.xsize,
ysize=params.ysize,
self_closing=params.do_self_closing,
)
results.append(create_ser_result(text=loc_txt))
results.append(cap_res)
Expand Down
Loading
Loading