diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
new file mode 100644
index 00000000..79f39b78
--- /dev/null
+++ b/docling_core/experimental/idoctags.py
@@ -0,0 +1,224 @@
+"""Define classes for DocTags serialization."""
+
+from typing import Any, Final, Optional
+from xml.dom.minidom import parseString
+
+from pydantic import BaseModel
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import (
+ BaseDocSerializer,
+ BaseMetaSerializer,
+ BasePictureSerializer,
+ SerializationResult,
+)
+from docling_core.transforms.serializer.common import create_ser_result
+from docling_core.transforms.serializer.doctags import (
+ DocTagsDocSerializer,
+ DocTagsParams,
+ DocTagsPictureSerializer,
+ _get_delim,
+ _wrap,
+)
+from docling_core.types.doc import (
+ BaseMeta,
+ DescriptionMetaField,
+ DocItem,
+ DoclingDocument,
+ MetaFieldName,
+ MoleculeMetaField,
+ NodeItem,
+ PictureClassificationMetaField,
+ PictureItem,
+ SummaryMetaField,
+ TableData,
+ TabularChartMetaField,
+)
+from docling_core.types.doc.labels import DocItemLabel
+from docling_core.types.doc.tokens import DocumentToken
+
+# Version string of the DocTags format; IDocTagsDocSerializer.serialize_doc
+# embeds it at the start of the serialized document.
+DOCTAGS_VERSION: Final = "1.0.0"
+
+
+class IDocTagsParams(DocTagsParams):
+ """DocTags-specific serialization parameters."""
+
+ do_self_closing: bool = True
+ pretty_indentation: Optional[str] = 2 * " "
+
+
+class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
+ """DocTags-specific meta serializer."""
+
+ @override
+ def serialize(
+ self,
+ *,
+ item: NodeItem,
+ **kwargs: Any,
+ ) -> SerializationResult:
+ """DocTags-specific meta serializer."""
+ params = IDocTagsParams(**kwargs)
+
+ elem_delim = ""
+ texts = (
+ [
+ tmp
+ for key in (
+ list(item.meta.__class__.model_fields)
+ + list(item.meta.get_custom_part())
+ )
+ if (
+ (
+ params.allowed_meta_names is None
+ or key in params.allowed_meta_names
+ )
+ and (key not in params.blocked_meta_names)
+ and (tmp := self._serialize_meta_field(item.meta, key))
+ )
+ ]
+ if item.meta
+ else []
+ )
+ if texts:
+ texts.insert(0, "")
+ texts.append("")
+ return create_ser_result(
+ text=elem_delim.join(texts),
+ span_source=item if isinstance(item, DocItem) else [],
+ )
+
+ def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
+ if (field_val := getattr(meta, name)) is not None:
+ if name == MetaFieldName.SUMMARY and isinstance(
+ field_val, SummaryMetaField
+ ):
+ txt = f"{field_val.text}"
+ elif name == MetaFieldName.DESCRIPTION and isinstance(
+ field_val, DescriptionMetaField
+ ):
+ txt = f"{field_val.text}"
+ elif name == MetaFieldName.CLASSIFICATION and isinstance(
+ field_val, PictureClassificationMetaField
+ ):
+ class_name = self._humanize_text(
+ field_val.get_main_prediction().class_name
+ )
+ txt = f"{class_name}"
+ elif name == MetaFieldName.MOLECULE and isinstance(
+ field_val, MoleculeMetaField
+ ):
+ txt = f"{field_val.smi}"
+ elif name == MetaFieldName.TABULAR_CHART and isinstance(
+ field_val, TabularChartMetaField
+ ):
+ # suppressing tabular chart serialization
+ return None
+ # elif tmp := str(field_val or ""):
+ # txt = tmp
+ elif name not in {v.value for v in MetaFieldName}:
+ txt = _wrap(text=str(field_val or ""), wrap_tag=name)
+ return txt
+ return None
+
+
+class IDocTagsPictureSerializer(DocTagsPictureSerializer):
+ """DocTags-specific picture item serializer."""
+
+ @override
+ def serialize(
+ self,
+ *,
+ item: PictureItem,
+ doc_serializer: BaseDocSerializer,
+ doc: DoclingDocument,
+ **kwargs: Any,
+ ) -> SerializationResult:
+ """Serializes the passed item."""
+ params = DocTagsParams(**kwargs)
+ res_parts: list[SerializationResult] = []
+ is_chart = False
+
+ if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
+
+ if item.meta and not params.use_legacy_annotations:
+ meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
+ if meta_res.text:
+ res_parts.append(meta_res)
+
+ body = ""
+ if params.add_location:
+ body += item.get_location_tokens(
+ doc=doc,
+ xsize=params.xsize,
+ ysize=params.ysize,
+ self_closing=params.do_self_closing,
+ )
+
+ # handle tabular chart data
+ chart_data: Optional[TableData] = None
+ if item.meta and item.meta.tabular_chart:
+ chart_data = item.meta.tabular_chart.chart_data
+ if chart_data and chart_data.table_cells:
+ temp_doc = DoclingDocument(name="temp")
+ temp_table = temp_doc.add_table(data=chart_data)
+ otsl_content = temp_table.export_to_otsl(
+ temp_doc,
+ add_cell_location=False,
+ self_closing=params.do_self_closing,
+ )
+ body += otsl_content
+ res_parts.append(create_ser_result(text=body, span_source=item))
+
+ if params.add_caption:
+ cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
+ if cap_res.text:
+ res_parts.append(cap_res)
+
+ text_res = "".join([r.text for r in res_parts])
+ if text_res:
+ token = DocumentToken.create_token_name_from_doc_item_label(
+ label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
+ )
+ text_res = _wrap(text=text_res, wrap_tag=token)
+ return create_ser_result(text=text_res, span_source=res_parts)
+
+
+class IDocTagsDocSerializer(DocTagsDocSerializer):
+ """DocTags document serializer."""
+
+ picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
+ meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
+ params: IDocTagsParams = IDocTagsParams()
+
+ @override
+ def _meta_is_wrapped(self) -> bool:
+ return True
+
+ @override
+ def serialize_doc(
+ self,
+ *,
+ parts: list[SerializationResult],
+ **kwargs: Any,
+ ) -> SerializationResult:
+ """DocTags-specific document serializer."""
+ delim = _get_delim(params=self.params)
+ text_res = delim.join([p.text for p in parts if p.text])
+
+ if self.params.add_page_break:
+ page_sep = f"<{DocumentToken.PAGE_BREAK.value}>"
+ for full_match, _, _ in self._get_page_breaks(text=text_res):
+ text_res = text_res.replace(full_match, page_sep)
+
+ wrap_tag = DocumentToken.DOCUMENT.value
+ text_res = f"<{wrap_tag}>{DOCTAGS_VERSION}{text_res}{delim}{wrap_tag}>"
+
+ if self.params.pretty_indentation and (
+ my_root := parseString(text_res).documentElement
+ ):
+ text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
+ text_res = "\n".join(
+ [line for line in text_res.split("\n") if line.strip()]
+ )
+ return create_ser_result(text=text_res, span_source=parts)
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index 4720ada0..54f83df8 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -315,6 +315,9 @@ def _serialize_body(self, **kwargs) -> SerializationResult:
res = self.serialize_doc(parts=subparts, **kwargs)
return res
+ def _meta_is_wrapped(self) -> bool:
+ """Return False by default; when True, serialize() does not emit a separate meta part."""
+ return False
+
@override
def serialize(
self,
@@ -336,7 +339,11 @@ def serialize(
my_item = item or self.doc.body
if my_item == self.doc.body:
- if my_item.meta and not my_params.use_legacy_annotations:
+ if (
+ my_item.meta
+ and not my_params.use_legacy_annotations
+ and not self._meta_is_wrapped()
+ ):
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
@@ -355,7 +362,11 @@ def serialize(
my_visited.add(my_item.self_ref)
- if my_item.meta and not my_params.use_legacy_annotations:
+ if (
+ my_item.meta
+ and not my_params.use_legacy_annotations
+ and not self._meta_is_wrapped()
+ ):
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
@@ -602,7 +613,6 @@ def serialize_meta(
text="", span_source=item if isinstance(item, DocItem) else []
)
else:
- _logger.warning("No meta serializer found.")
return create_ser_result(
text="", span_source=item if isinstance(item, DocItem) else []
)
diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py
index 0195cd8e..1c5b9090 100644
--- a/docling_core/transforms/serializer/doctags.py
+++ b/docling_core/transforms/serializer/doctags.py
@@ -76,6 +76,8 @@ class Mode(str, Enum):
mode: Mode = Mode.HUMAN_FRIENDLY
+ do_self_closing: bool = False
+
def _get_delim(params: DocTagsParams) -> str:
if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
@@ -109,11 +111,17 @@ def serialize(
)
parts: list[str] = []
+ if item.meta and not params.use_legacy_annotations:
+ meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
+ if meta_res.text:
+ parts.append(meta_res.text)
+
if params.add_location:
location = item.get_location_tokens(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
if location:
parts.append(location)
@@ -183,6 +191,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
res_parts.append(create_ser_result(text=loc_text, span_source=item))
@@ -232,6 +241,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
# handle classification data
@@ -241,12 +251,16 @@ def serialize(
item.meta.classification.get_main_prediction().class_name
)
elif (
- classifications := [
- ann
- for ann in item.annotations
- if isinstance(ann, PictureClassificationData)
- ]
- ) and classifications[0].predicted_classes:
+ params.use_legacy_annotations
+ and (
+ classifications := [
+ ann
+ for ann in item.annotations
+ if isinstance(ann, PictureClassificationData)
+ ]
+ )
+ and classifications[0].predicted_classes
+ ):
predicted_class = classifications[0].predicted_classes[0].class_name
if predicted_class:
body += DocumentToken.get_picture_classification_token(predicted_class)
@@ -265,9 +279,13 @@ def serialize(
smi: Optional[str] = None
if item.meta and item.meta.molecule:
smi = item.meta.molecule.smi
- elif smiles_annotations := [
- ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
- ]:
+ elif params.use_legacy_annotations and (
+ smiles_annotations := [
+ ann
+ for ann in item.annotations
+ if isinstance(ann, PictureMoleculeData)
+ ]
+ ):
smi = smiles_annotations[0].smi
if smi:
body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value)
@@ -276,11 +294,13 @@ def serialize(
chart_data: Optional[TableData] = None
if item.meta and item.meta.tabular_chart:
chart_data = item.meta.tabular_chart.chart_data
- elif tabular_chart_annotations := [
- ann
- for ann in item.annotations
- if isinstance(ann, PictureTabularChartData)
- ]:
+ elif params.use_legacy_annotations and (
+ tabular_chart_annotations := [
+ ann
+ for ann in item.annotations
+ if isinstance(ann, PictureTabularChartData)
+ ]
+ ):
chart_data = tabular_chart_annotations[0].chart_data
if chart_data and chart_data.table_cells:
temp_doc = DoclingDocument(name="temp")
@@ -331,6 +351,7 @@ def serialize(
doc=doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
# mapping from source_cell_id to a list of target_cell_ids
@@ -471,6 +492,7 @@ def _get_inline_location_tags(
page_h=page_h,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
return SerializationResult(
@@ -606,6 +628,7 @@ def serialize_captions(
doc=self.doc,
xsize=params.xsize,
ysize=params.ysize,
+ self_closing=params.do_self_closing,
)
results.append(create_ser_result(text=loc_txt))
results.append(cap_res)
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 626a9734..ccb55d28 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -1259,6 +1259,7 @@ def get_location_tokens(
new_line: str = "", # deprecated
xsize: int = 500,
ysize: int = 500,
+ self_closing: bool = False,
) -> str:
"""Get the location string for the BaseCell."""
if not len(self.prov):
@@ -1274,6 +1275,7 @@ def get_location_tokens(
page_h=page_h,
xsize=xsize,
ysize=ysize,
+ self_closing=self_closing,
)
location += loc_str
@@ -1995,6 +1997,7 @@ def export_to_otsl(
add_cell_text: bool = True,
xsize: int = 500,
ysize: int = 500,
+ self_closing: bool = False,
**kwargs: Any,
) -> str:
"""Export the table as OTSL."""
@@ -2050,6 +2053,7 @@ def export_to_otsl(
page_h=page_h,
xsize=xsize,
ysize=ysize,
+ self_closing=self_closing,
)
if rowstart == i and colstart == j:
diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py
index 5edbc5dc..028afcaa 100644
--- a/docling_core/types/doc/tokens.py
+++ b/docling_core/types/doc/tokens.py
@@ -267,12 +267,14 @@ def get_code_language_token(code_language: str) -> str:
return _CodeLanguageToken(f"<_{code_language}_>").value
@staticmethod
- def get_location_token(val: float, rnorm: int = 500): # TODO review
+ def get_location_token(
+ val: float, rnorm: int = 500, self_closing: bool = False
+ ): # TODO review
"""Function to get location tokens."""
val_ = round(rnorm * val)
val_ = max(val_, 0)
val_ = min(val_, rnorm - 1)
- return f"<{_LOC_PREFIX}{val_}>"
+ return f"<{_LOC_PREFIX}{val_}{'/' if self_closing else ''}>"
@staticmethod
def get_location(
@@ -281,6 +283,7 @@ def get_location(
page_h: float,
xsize: int = 500, # TODO review
ysize: int = 500, # TODO review
+ self_closing: bool = False,
):
"""Get the location string give bbox and page-dim."""
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
@@ -291,10 +294,18 @@ def get_location(
x1 = bbox[2] / page_w
y1 = bbox[3] / page_h
- x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
- y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
- x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
- y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
+ x0_tok = DocumentToken.get_location_token(
+ val=min(x0, x1), rnorm=xsize, self_closing=self_closing
+ )
+ y0_tok = DocumentToken.get_location_token(
+ val=min(y0, y1), rnorm=ysize, self_closing=self_closing
+ )
+ x1_tok = DocumentToken.get_location_token(
+ val=max(x0, x1), rnorm=xsize, self_closing=self_closing
+ )
+ y1_tok = DocumentToken.get_location_token(
+ val=max(y0, y1), rnorm=ysize, self_closing=self_closing
+ )
loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
diff --git a/test/data/doc/dummy_doc_with_meta.gt.dt b/test/data/doc/dummy_doc_with_meta.gt.dt
new file mode 100644
index 00000000..1979777c
--- /dev/null
+++ b/test/data/doc/dummy_doc_with_meta.gt.dt
@@ -0,0 +1,4 @@
+DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis
+CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1Figure 1: Four examples of complex page layouts across different document categories
+
+
diff --git a/test/data/doc/dummy_doc_with_meta.gt.idt.xml b/test/data/doc/dummy_doc_with_meta.gt.idt.xml
new file mode 100644
index 00000000..2d6b544b
--- /dev/null
+++ b/test/data/doc/dummy_doc_with_meta.gt.idt.xml
@@ -0,0 +1,39 @@
+
+ 1.0.0
+
+
+ This is a title.
+ More stuff here.
+
+
+
+
+
+ DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis
+
+
+
+ ...
+ Bar chart
+ CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
+ {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}}
+
+
+
+
+
+
+
+
+
+
+ Figure 1: Four examples of complex page layouts across different document categories
+
+
+
+
+
+
+
+
+
diff --git a/test/test_serialization.py b/test/test_serialization.py
index a8ebcdaa..52ede8ed 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -5,6 +5,7 @@
from typing_extensions import override
+from docling_core.experimental.idoctags import IDocTagsDocSerializer
from docling_core.transforms.serializer.base import (
BaseDocSerializer,
SerializationResult,
@@ -637,3 +638,26 @@ def test_doctags_inline_and_formatting():
ser = DocTagsDocSerializer(doc=doc)
actual = ser.serialize().text
verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)
+
+
+def test_doctags_meta():
+ src = Path("./test/data/doc/dummy_doc_with_meta.yaml")
+ doc = DoclingDocument.load_from_yaml(src)
+
+ ser = DocTagsDocSerializer(doc=doc)
+ actual = ser.serialize().text
+ verify(exp_file=src.with_suffix(".gt.dt"), actual=actual)
+
+
+# ===============================
+# IDocTags tests
+# ===============================
+
+
+def test_idoctags_meta():
+ src = Path("./test/data/doc/dummy_doc_with_meta.yaml")
+ doc = DoclingDocument.load_from_yaml(src)
+
+ ser = IDocTagsDocSerializer(doc=doc)
+ actual = ser.serialize().text
+ verify(exp_file=src.with_suffix(".gt.idt.xml"), actual=actual)