Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docling_core/transforms/serializer/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ def _humanize_text(self, text: str, title: bool = False) -> str:
return tmp.title() if title else tmp.capitalize()


@deprecated("Use BaseMetaSerializer() instead.")
# deprecated: use BaseMetaSerializer instead
class BaseAnnotationSerializer(ABC):
"""Base class for annotation serializers."""

Expand Down
36 changes: 33 additions & 3 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import logging
import re
import sys
import warnings
from abc import abstractmethod
from functools import cached_property
from pathlib import Path
Expand Down Expand Up @@ -206,7 +207,9 @@ class CommonParams(BaseModel):
include_hyperlinks: bool = True
caption_delim: str = " "
use_legacy_annotations: bool = Field(
default=False, description="Use legacy annotation serialization."
default=False,
description="Use legacy annotation serialization.",
deprecated="Legacy annotations considered only when meta not present.",
)
allowed_meta_names: Optional[set[str]] = Field(
default=None,
Expand Down Expand Up @@ -336,7 +339,7 @@ def serialize(
my_item = item or self.doc.body

if my_item == self.doc.body:
if my_item.meta and not my_params.use_legacy_annotations:
if my_item.meta:
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
Expand All @@ -355,7 +358,7 @@ def serialize(

my_visited.add(my_item.self_ref)

if my_item.meta and not my_params.use_legacy_annotations:
if my_item.meta:
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
if meta_part.text:
parts.append(meta_part)
Expand Down Expand Up @@ -655,3 +658,30 @@ def _get_page_breaks(self, text: str) -> Iterable[Tuple[str, int, int]]:
prev_page_nr = int(match.group(1))
next_page_nr = int(match.group(2))
yield (full_match, prev_page_nr, next_page_nr)


def _should_use_legacy_annotations(
    *,
    params: CommonParams,
    item: Union[PictureItem, TableItem],
    kind: Optional[str] = None,
) -> bool:
    """Decide whether the legacy ``annotations`` of ``item`` should be used.

    Legacy annotations are only a fallback: they are never used when the item
    carries ``meta``, nor when ``params`` exposes an ``include_annotations``
    attribute that is set to a falsy (non-``None``) value.

    Args:
        params: Serialization parameters; ``include_annotations`` is read
            with ``getattr`` so parameter classes lacking it are accepted.
        item: The item whose ``meta`` and ``annotations`` are inspected.
        kind: When given, only annotations whose ``kind`` equals this value
            count; when ``None``, any annotation counts.

    Returns:
        ``True`` if at least one matching legacy annotation exists and
        nothing above rules out the legacy path, ``False`` otherwise.
    """
    if item.meta:
        return False

    with warnings.catch_warnings(record=True) as caught_warnings:
        # DeprecationWarnings raised while probing the attributes below are
        # dropped; any other warning category is recorded for later.
        warnings.simplefilter("ignore", DeprecationWarning)

        incl_attr = getattr(params, "include_annotations", None)
        if incl_attr is not None and not incl_attr:
            return False

        # any() short-circuits instead of materializing a throwaway list.
        use_legacy = any(
            kind is None or ann.kind == kind for ann in item.annotations
        )

    # Re-emit only after the recording context has been left: while it is
    # active, warnings.warn() would append to the very list being iterated
    # and the warnings would never reach the caller.
    if use_legacy:
        for w in caught_warnings:
            warnings.warn(w.message, w.category)
    return use_legacy
68 changes: 45 additions & 23 deletions docling_core/transforms/serializer/doctags.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from docling_core.transforms.serializer.common import (
CommonParams,
DocSerializer,
_should_use_legacy_annotations,
create_ser_result,
)
from docling_core.types.doc.base import BoundingBox
Expand Down Expand Up @@ -236,18 +237,25 @@ def serialize(

# handle classification data
predicted_class: Optional[str] = None
if item.meta and item.meta.classification:
predicted_class = (
item.meta.classification.get_main_prediction().class_name
)
elif (
classifications := [
if item.meta:
if item.meta.classification:
predicted_class = (
item.meta.classification.get_main_prediction().class_name
)
elif _should_use_legacy_annotations(
params=params,
item=item,
kind=PictureClassificationData.model_fields["kind"].default,
):
if classifications := [
ann
for ann in item.annotations
if isinstance(ann, PictureClassificationData)
]
) and classifications[0].predicted_classes:
predicted_class = classifications[0].predicted_classes[0].class_name
]:
if classifications[0].predicted_classes:
predicted_class = (
classifications[0].predicted_classes[0].class_name
)
if predicted_class:
body += DocumentToken.get_picture_classification_token(predicted_class)
if predicted_class in [
Expand All @@ -263,25 +271,39 @@ def serialize(

# handle molecule data
smi: Optional[str] = None
if item.meta and item.meta.molecule:
smi = item.meta.molecule.smi
elif smiles_annotations := [
ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
]:
smi = smiles_annotations[0].smi
if item.meta:
if item.meta.molecule:
smi = item.meta.molecule.smi
elif _should_use_legacy_annotations(
params=params,
item=item,
kind=PictureMoleculeData.model_fields["kind"].default,
):
if smiles_annotations := [
ann
for ann in item.annotations
if isinstance(ann, PictureMoleculeData)
]:
smi = smiles_annotations[0].smi
if smi:
body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value)

# handle tabular chart data
chart_data: Optional[TableData] = None
if item.meta and item.meta.tabular_chart:
chart_data = item.meta.tabular_chart.chart_data
elif tabular_chart_annotations := [
ann
for ann in item.annotations
if isinstance(ann, PictureTabularChartData)
]:
chart_data = tabular_chart_annotations[0].chart_data
if item.meta:
if item.meta.tabular_chart:
chart_data = item.meta.tabular_chart.chart_data
elif _should_use_legacy_annotations(
params=params,
item=item,
kind=PictureTabularChartData.model_fields["kind"].default,
):
if tabular_chart_annotations := [
ann
for ann in item.annotations
if isinstance(ann, PictureTabularChartData)
]:
chart_data = tabular_chart_annotations[0].chart_data
if chart_data and chart_data.table_cells:
temp_doc = DoclingDocument(name="temp")
temp_table = temp_doc.add_table(data=chart_data)
Expand Down
35 changes: 22 additions & 13 deletions docling_core/transforms/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
CommonParams,
DocSerializer,
_get_annotation_text,
_should_use_legacy_annotations,
create_ser_result,
)
from docling_core.transforms.serializer.html_styles import (
Expand Down Expand Up @@ -496,7 +497,11 @@ def get_img_row(imgb64: str, ind: int) -> str:
if img_text:
res_parts.append(create_ser_result(text=img_text, span_source=item))

if params.enable_chart_tables:
if params.enable_chart_tables and _should_use_legacy_annotations(
params=params,
item=item,
kind=PictureTabularChartData.model_fields["kind"].default,
):
# Check if picture has attached PictureTabularChartData
tabular_chart_annotations = [
ann
Expand Down Expand Up @@ -867,8 +872,13 @@ def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
elif isinstance(field_val, MoleculeMetaField):
txt = field_val.smi
elif isinstance(field_val, TabularChartMetaField):
# suppressing tabular chart serialization
return None
temp_doc = DoclingDocument(name="temp")
temp_table = temp_doc.add_table(data=field_val.chart_data)
table_content = temp_table.export_to_html(temp_doc).strip()
if table_content:
txt = table_content
else:
return None
elif tmp := str(field_val or ""):
txt = tmp
else:
Expand Down Expand Up @@ -1119,17 +1129,16 @@ def serialize_captions(
results.append(cap_ser_res)

if (
params.use_legacy_annotations
and params.include_annotations
and item.self_ref not in excluded_refs
item.self_ref not in excluded_refs
and isinstance(item, (PictureItem, TableItem))
and _should_use_legacy_annotations(params=params, item=item)
):
if isinstance(item, (PictureItem, TableItem)):
ann_res = self.serialize_annotations(
item=item,
**kwargs,
)
if ann_res.text:
results.append(ann_res)
ann_res = self.serialize_annotations(
item=item,
**kwargs,
)
if ann_res.text:
results.append(ann_res)

text_res = params.caption_delim.join([r.text for r in results])
if text_res:
Expand Down
37 changes: 15 additions & 22 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
CommonParams,
DocSerializer,
_get_annotation_text,
_should_use_legacy_annotations,
create_ser_result,
)
from docling_core.types.doc.base import ImageRefMode
Expand Down Expand Up @@ -72,23 +73,6 @@
)


def _get_annotation_ser_result(
    ann_kind: str, ann_text: str, mark_annotation: bool, doc_item: DocItem
):
    """Create the serialization result for one annotation text.

    When ``mark_annotation`` is true, ``ann_text`` is wrapped in HTML-comment
    markers that carry ``ann_kind``; otherwise it is passed through as is.
    The result is built with ``create_ser_result`` using ``doc_item`` as the
    span source.
    """
    if mark_annotation:
        opening = f'<!--<annotation kind="{ann_kind}">-->'
        closing = "<!--<annotation/>-->"
        rendered = f"{opening}{ann_text}{closing}"
    else:
        rendered = ann_text
    return create_ser_result(text=rendered, span_source=doc_item)


class OrigListItemMarkerMode(str, Enum):
"""Display mode for original list item marker."""

Expand Down Expand Up @@ -315,8 +299,13 @@ def _serialize_meta_field(
elif isinstance(field_val, MoleculeMetaField):
txt = field_val.smi
elif isinstance(field_val, TabularChartMetaField):
# suppressing tabular chart serialization
return None
temp_doc = DoclingDocument(name="temp")
temp_table = temp_doc.add_table(data=field_val.chart_data)
table_content = temp_table.export_to_markdown(temp_doc).strip()
if table_content:
txt = table_content
else:
return None
elif tmp := str(field_val or ""):
txt = tmp
else:
Expand Down Expand Up @@ -397,7 +386,7 @@ def serialize(

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):

if params.use_legacy_annotations and params.include_annotations:
if _should_use_legacy_annotations(params=params, item=item):

ann_res = doc_serializer.serialize_annotations(
item=item,
Expand Down Expand Up @@ -466,7 +455,7 @@ def serialize(
res_parts.append(cap_res)

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
if params.use_legacy_annotations and params.include_annotations:
if _should_use_legacy_annotations(params=params, item=item):
ann_res = doc_serializer.serialize_annotations(
item=item,
**kwargs,
Expand All @@ -483,7 +472,11 @@ def serialize(
if img_res.text:
res_parts.append(img_res)

if params.enable_chart_tables:
if params.enable_chart_tables and _should_use_legacy_annotations(
params=params,
item=item,
kind=PictureTabularChartData.model_fields["kind"].default,
):
# Check if picture has attached PictureTabularChartData
tabular_chart_annotations = [
ann
Expand Down
Loading
Loading