"""Define classes for DocTags serialization."""

from typing import Any, Final, Optional
from xml.dom.minidom import parseString

from pydantic import BaseModel
from typing_extensions import override

from docling_core.transforms.serializer.base import (
    BaseDocSerializer,
    BaseMetaSerializer,
    BasePictureSerializer,
    SerializationResult,
)
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.transforms.serializer.doctags import (
    DocTagsDocSerializer,
    DocTagsParams,
    DocTagsPictureSerializer,
    _get_delim,
    _wrap,
)
from docling_core.types.doc import (
    BaseMeta,
    DescriptionMetaField,
    DocItem,
    DoclingDocument,
    MetaFieldName,
    MoleculeMetaField,
    NodeItem,
    PictureClassificationMetaField,
    PictureItem,
    SummaryMetaField,
    TableData,
    TabularChartMetaField,
)
from docling_core.types.doc.labels import DocItemLabel
from docling_core.types.doc.tokens import DocumentToken

DOCTAGS_VERSION: Final = "1.0.0"


class IDocTagsParams(DocTagsParams):
    """DocTags-specific serialization parameters."""

    # Emit location tokens (and page breaks) in self-closing form.
    do_self_closing: bool = True
    # Indentation unit used when pretty-printing the serialized document;
    # a falsy value (None or "") disables pretty-printing.
    pretty_indentation: Optional[str] = 2 * " "


class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer):
    """DocTags-specific meta serializer."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the meta fields of the passed item.

        The declared model fields of ``item.meta`` are visited first, then the
        keys of its custom part. A field is emitted only if it passes the
        ``allowed_meta_names`` / ``blocked_meta_names`` filters and serializes
        to a non-empty string.

        Args:
            item: Node whose ``meta`` is serialized.
            **kwargs: Overrides used to build ``IDocTagsParams``.

        Returns:
            A result whose text is the concatenation of the serialized fields,
            or an empty text when there is nothing to emit.
        """
        params = IDocTagsParams(**kwargs)

        elem_delim = ""
        texts: list[str] = []
        if item.meta:
            keys = list(item.meta.__class__.model_fields) + list(
                item.meta.get_custom_part()
            )
            for key in keys:
                if (
                    params.allowed_meta_names is not None
                    and key not in params.allowed_meta_names
                ):
                    continue
                if key in params.blocked_meta_names:
                    continue
                if field_text := self._serialize_meta_field(item.meta, key):
                    texts.append(field_text)

        if texts:
            # NOTE(review): the expected-output fixture for this serializer
            # shows the fields enclosed in a <meta>…</meta> wrapper, while the
            # two literals below are empty — confirm the intended wrapper.
            texts.insert(0, "")
            texts.append("")
        return create_ser_result(
            text=elem_delim.join(texts),
            span_source=item if isinstance(item, DocItem) else [],
        )

    def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
        """Serialize a single meta field.

        Args:
            meta: Meta object holding the field.
            name: Attribute name of the field on ``meta``.

        Returns:
            The serialized text, or ``None`` when the field is unset, is a
            tabular chart (serialized by the picture serializer instead), or
            is a standard field whose value has an unexpected type.
        """
        field_val = getattr(meta, name)
        if field_val is None:
            return None

        # NOTE(review): the expected-output fixture shows e.g.
        # <summary>…</summary> around the summary text, while the literals for
        # the standard fields below carry no tag — confirm the intended markup.
        if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
            return f"{field_val.text}"
        if name == MetaFieldName.DESCRIPTION and isinstance(
            field_val, DescriptionMetaField
        ):
            return f"{field_val.text}"
        if name == MetaFieldName.CLASSIFICATION and isinstance(
            field_val, PictureClassificationMetaField
        ):
            class_name = self._humanize_text(
                field_val.get_main_prediction().class_name
            )
            return f"{class_name}"
        if name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
            return f"{field_val.smi}"
        if name == MetaFieldName.TABULAR_CHART and isinstance(
            field_val, TabularChartMetaField
        ):
            # suppressing tabular chart serialization
            return None
        if name not in {v.value for v in MetaFieldName}:
            # Custom (non-standard) field: wrap its string form in a tag
            # named after the field.
            return _wrap(text=str(field_val or ""), wrap_tag=name)

        # Standard field name whose value matched none of the expected types.
        # Previously this fell through to an unbound local and raised
        # UnboundLocalError; skip the field instead.
        return None


class IDocTagsPictureSerializer(DocTagsPictureSerializer):
    """DocTags-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs: Any,
    ) -> SerializationResult:
        """Serialize the passed picture item.

        Emits, in order: the item's meta (unless legacy annotations are
        requested), its location tokens, the OTSL form of any tabular chart
        data found in the meta, and its captions. A non-empty result is
        wrapped in the picture token.

        Args:
            item: Picture to serialize.
            doc_serializer: Serializer used for meta, captions and the set of
                excluded references.
            doc: Document the picture belongs to.
            **kwargs: Overrides used to build ``DocTagsParams``; also forwarded
                to the ``doc_serializer`` calls.

        Returns:
            The serialized picture; its text is empty when nothing was emitted.
        """
        params = DocTagsParams(**kwargs)
        res_parts: list[SerializationResult] = []
        # NOTE(review): never set to True in this method, so the CHART label
        # below is currently unreachable — confirm whether that is intended.
        is_chart = False

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

            if item.meta and not params.use_legacy_annotations:
                meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
                if meta_res.text:
                    res_parts.append(meta_res)

            body = ""
            if params.add_location:
                body += item.get_location_tokens(
                    doc=doc,
                    xsize=params.xsize,
                    ysize=params.ysize,
                    self_closing=params.do_self_closing,
                )

            # handle tabular chart data
            chart_data: Optional[TableData] = None
            if item.meta and item.meta.tabular_chart:
                chart_data = item.meta.tabular_chart.chart_data
            if chart_data and chart_data.table_cells:
                # Render the chart table through a throw-away document so the
                # regular OTSL table export can be reused.
                temp_doc = DoclingDocument(name="temp")
                temp_table = temp_doc.add_table(data=chart_data)
                otsl_content = temp_table.export_to_otsl(
                    temp_doc,
                    add_cell_location=False,
                    self_closing=params.do_self_closing,
                )
                body += otsl_content
            res_parts.append(create_ser_result(text=body, span_source=item))

        if params.add_caption:
            cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
            if cap_res.text:
                res_parts.append(cap_res)

        text_res = "".join([r.text for r in res_parts])
        if text_res:
            token = DocumentToken.create_token_name_from_doc_item_label(
                label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
            )
            text_res = _wrap(text=text_res, wrap_tag=token)
        return create_ser_result(text=text_res, span_source=res_parts)


class IDocTagsDocSerializer(DocTagsDocSerializer):
    """DocTags document serializer."""

    picture_serializer: BasePictureSerializer = IDocTagsPictureSerializer()
    meta_serializer: BaseMetaSerializer = IDocTagsMetaSerializer()
    params: IDocTagsParams = IDocTagsParams()

    @override
    def _meta_is_wrapped(self) -> bool:
        """Report that item serializers embed the meta themselves.

        The base ``serialize`` consults this hook before emitting an item's
        meta as a separate part.
        """
        return True

    @override
    def serialize_doc(
        self,
        *,
        parts: list[SerializationResult],
        **kwargs: Any,
    ) -> SerializationResult:
        """Assemble the serialized parts into one DocTags document.

        Args:
            parts: Serialized document parts; parts with empty text are
                skipped.
            **kwargs: Unused here; accepted for interface compatibility.

        Returns:
            The document text wrapped in the document tag, pretty-printed when
            ``params.pretty_indentation`` is set.
        """
        delim = _get_delim(params=self.params)
        text_res = delim.join([p.text for p in parts if p.text])

        if self.params.add_page_break:
            # Honour do_self_closing like the location tokens do: the text is
            # parsed as XML below, and an unclosed page-break tag would make
            # it ill-formed.
            closer = "/" if self.params.do_self_closing else ""
            page_sep = f"<{DocumentToken.PAGE_BREAK.value}{closer}>"
            for full_match, _, _ in self._get_page_breaks(text=text_res):
                text_res = text_res.replace(full_match, page_sep)

        wrap_tag = DocumentToken.DOCUMENT.value
        # The root element must be closed, otherwise parseString() below
        # rejects the document.
        text_res = f"<{wrap_tag}>{DOCTAGS_VERSION}{text_res}{delim}</{wrap_tag}>"

        if self.params.pretty_indentation and (
            my_root := parseString(text_res).documentElement
        ):
            text_res = my_root.toprettyxml(indent=self.params.pretty_indentation)
            # toprettyxml() leaves whitespace-only lines around text nodes;
            # drop them.
            text_res = "\n".join(
                [line for line in text_res.split("\n") if line.strip()]
            )
        return create_ser_result(text=text_res, span_source=parts)
b/docling_core/transforms/serializer/doctags.py @@ -76,6 +76,8 @@ class Mode(str, Enum): mode: Mode = Mode.HUMAN_FRIENDLY + do_self_closing: bool = False + def _get_delim(params: DocTagsParams) -> str: if params.mode == DocTagsParams.Mode.HUMAN_FRIENDLY: @@ -109,11 +111,17 @@ def serialize( ) parts: list[str] = [] + if item.meta and not params.use_legacy_annotations: + meta_res = doc_serializer.serialize_meta(item=item, **kwargs) + if meta_res.text: + parts.append(meta_res.text) + if params.add_location: location = item.get_location_tokens( doc=doc, xsize=params.xsize, ysize=params.ysize, + self_closing=params.do_self_closing, ) if location: parts.append(location) @@ -183,6 +191,7 @@ def serialize( doc=doc, xsize=params.xsize, ysize=params.ysize, + self_closing=params.do_self_closing, ) res_parts.append(create_ser_result(text=loc_text, span_source=item)) @@ -232,6 +241,7 @@ def serialize( doc=doc, xsize=params.xsize, ysize=params.ysize, + self_closing=params.do_self_closing, ) # handle classification data @@ -241,12 +251,16 @@ def serialize( item.meta.classification.get_main_prediction().class_name ) elif ( - classifications := [ - ann - for ann in item.annotations - if isinstance(ann, PictureClassificationData) - ] - ) and classifications[0].predicted_classes: + params.use_legacy_annotations + and ( + classifications := [ + ann + for ann in item.annotations + if isinstance(ann, PictureClassificationData) + ] + ) + and classifications[0].predicted_classes + ): predicted_class = classifications[0].predicted_classes[0].class_name if predicted_class: body += DocumentToken.get_picture_classification_token(predicted_class) @@ -265,9 +279,13 @@ def serialize( smi: Optional[str] = None if item.meta and item.meta.molecule: smi = item.meta.molecule.smi - elif smiles_annotations := [ - ann for ann in item.annotations if isinstance(ann, PictureMoleculeData) - ]: + elif params.use_legacy_annotations and ( + smiles_annotations := [ + ann + for ann in item.annotations + if 
isinstance(ann, PictureMoleculeData) + ] + ): smi = smiles_annotations[0].smi if smi: body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value) @@ -276,11 +294,13 @@ def serialize( chart_data: Optional[TableData] = None if item.meta and item.meta.tabular_chart: chart_data = item.meta.tabular_chart.chart_data - elif tabular_chart_annotations := [ - ann - for ann in item.annotations - if isinstance(ann, PictureTabularChartData) - ]: + elif params.use_legacy_annotations and ( + tabular_chart_annotations := [ + ann + for ann in item.annotations + if isinstance(ann, PictureTabularChartData) + ] + ): chart_data = tabular_chart_annotations[0].chart_data if chart_data and chart_data.table_cells: temp_doc = DoclingDocument(name="temp") @@ -331,6 +351,7 @@ def serialize( doc=doc, xsize=params.xsize, ysize=params.ysize, + self_closing=params.do_self_closing, ) # mapping from source_cell_id to a list of target_cell_ids @@ -471,6 +492,7 @@ def _get_inline_location_tags( page_h=page_h, xsize=params.xsize, ysize=params.ysize, + self_closing=params.do_self_closing, ) return SerializationResult( @@ -606,6 +628,7 @@ def serialize_captions( doc=self.doc, xsize=params.xsize, ysize=params.ysize, + self_closing=params.do_self_closing, ) results.append(create_ser_result(text=loc_txt)) results.append(cap_res) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 626a9734..ccb55d28 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1259,6 +1259,7 @@ def get_location_tokens( new_line: str = "", # deprecated xsize: int = 500, ysize: int = 500, + self_closing: bool = False, ) -> str: """Get the location string for the BaseCell.""" if not len(self.prov): @@ -1274,6 +1275,7 @@ def get_location_tokens( page_h=page_h, xsize=xsize, ysize=ysize, + self_closing=self_closing, ) location += loc_str @@ -1995,6 +1997,7 @@ def export_to_otsl( add_cell_text: bool = True, xsize: int = 500, ysize: int = 500, + self_closing: 
bool = False, **kwargs: Any, ) -> str: """Export the table as OTSL.""" @@ -2050,6 +2053,7 @@ def export_to_otsl( page_h=page_h, xsize=xsize, ysize=ysize, + self_closing=self_closing, ) if rowstart == i and colstart == j: diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py index 5edbc5dc..028afcaa 100644 --- a/docling_core/types/doc/tokens.py +++ b/docling_core/types/doc/tokens.py @@ -267,12 +267,14 @@ def get_code_language_token(code_language: str) -> str: return _CodeLanguageToken(f"<_{code_language}_>").value @staticmethod - def get_location_token(val: float, rnorm: int = 500): # TODO review + def get_location_token( + val: float, rnorm: int = 500, self_closing: bool = False + ): # TODO review """Function to get location tokens.""" val_ = round(rnorm * val) val_ = max(val_, 0) val_ = min(val_, rnorm - 1) - return f"<{_LOC_PREFIX}{val_}>" + return f"<{_LOC_PREFIX}{val_}{'/' if self_closing else ''}>" @staticmethod def get_location( @@ -281,6 +283,7 @@ def get_location( page_h: float, xsize: int = 500, # TODO review ysize: int = 500, # TODO review + self_closing: bool = False, ): """Get the location string give bbox and page-dim.""" assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}" @@ -291,10 +294,18 @@ def get_location( x1 = bbox[2] / page_w y1 = bbox[3] / page_h - x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize) - y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize) - x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize) - y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize) + x0_tok = DocumentToken.get_location_token( + val=min(x0, x1), rnorm=xsize, self_closing=self_closing + ) + y0_tok = DocumentToken.get_location_token( + val=min(y0, y1), rnorm=ysize, self_closing=self_closing + ) + x1_tok = DocumentToken.get_location_token( + val=max(x0, x1), rnorm=xsize, self_closing=self_closing + ) + y1_tok = DocumentToken.get_location_token( + 
val=max(y0, y1), rnorm=ysize, self_closing=self_closing + ) loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}" diff --git a/test/data/doc/dummy_doc_with_meta.gt.dt b/test/data/doc/dummy_doc_with_meta.gt.dt new file mode 100644 index 00000000..1979777c --- /dev/null +++ b/test/data/doc/dummy_doc_with_meta.gt.dt @@ -0,0 +1,4 @@ +<loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis +CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1Figure 1: Four examples of complex page layouts across different document categories + + diff --git a/test/data/doc/dummy_doc_with_meta.gt.idt.xml b/test/data/doc/dummy_doc_with_meta.gt.idt.xml new file mode 100644 index 00000000..2d6b544b --- /dev/null +++ b/test/data/doc/dummy_doc_with_meta.gt.idt.xml @@ -0,0 +1,39 @@ + + 1.0.0 + + <meta> + <summary>This is a title.</summary> + <my_corp__foo>More stuff here.</my_corp__foo> + </meta> + <loc_42/> + <loc_26/> + <loc_406/> + <loc_46/> + DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis + + + + ... 
+ Bar chart + CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 + {'myanalysis': {'prediction': 'abc'}, 'something_else': {'text': 'aaa'}} + + + + + + + + + + + Figure 1: Four examples of complex page layouts across different document categories + + + + + + + + + diff --git a/test/test_serialization.py b/test/test_serialization.py index a8ebcdaa..52ede8ed 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -5,6 +5,7 @@ from typing_extensions import override +from docling_core.experimental.idoctags import IDocTagsDocSerializer from docling_core.transforms.serializer.base import ( BaseDocSerializer, SerializationResult, @@ -637,3 +638,26 @@ def test_doctags_inline_and_formatting(): ser = DocTagsDocSerializer(doc=doc) actual = ser.serialize().text verify(exp_file=src.with_suffix(".gt.dt"), actual=actual) + + +def test_doctags_meta(): + src = Path("./test/data/doc/dummy_doc_with_meta.yaml") + doc = DoclingDocument.load_from_yaml(src) + + ser = DocTagsDocSerializer(doc=doc) + actual = ser.serialize().text + verify(exp_file=src.with_suffix(".gt.dt"), actual=actual) + + +# =============================== +# IDocTags tests +# =============================== + + +def test_idoctags_meta(): + src = Path("./test/data/doc/dummy_doc_with_meta.yaml") + doc = DoclingDocument.load_from_yaml(src) + + ser = IDocTagsDocSerializer(doc=doc) + actual = ser.serialize().text + verify(exp_file=src.with_suffix(".gt.idt.xml"), actual=actual)