Skip to content

Commit 3d13b02

Browse files
authored
fix: improve meta migration and warning handling (#417)
* fix: improve meta migration and warning handling
  Signed-off-by: Panos Vagenas <[email protected]>
* remove commented-out code
  Signed-off-by: Panos Vagenas <[email protected]>
---------
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 567d3ad commit 3d13b02

17 files changed

+1072
-291
lines changed

docling_core/transforms/serializer/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ def _humanize_text(self, text: str, title: bool = False) -> str:
317317
return tmp.title() if title else tmp.capitalize()
318318

319319

320-
@deprecated("Use BaseMetaSerializer() instead.")
320+
# deprecated: use BaseMetaSerializer instead
321321
class BaseAnnotationSerializer(ABC):
322322
"""Base class for annotation serializers."""
323323

docling_core/transforms/serializer/common.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import logging
88
import re
99
import sys
10+
import warnings
1011
from abc import abstractmethod
1112
from functools import cached_property
1213
from pathlib import Path
@@ -206,7 +207,9 @@ class CommonParams(BaseModel):
206207
include_hyperlinks: bool = True
207208
caption_delim: str = " "
208209
use_legacy_annotations: bool = Field(
209-
default=False, description="Use legacy annotation serialization."
210+
default=False,
211+
description="Use legacy annotation serialization.",
212+
deprecated="Legacy annotations considered only when meta not present.",
210213
)
211214
allowed_meta_names: Optional[set[str]] = Field(
212215
default=None,
@@ -336,7 +339,7 @@ def serialize(
336339
my_item = item or self.doc.body
337340

338341
if my_item == self.doc.body:
339-
if my_item.meta and not my_params.use_legacy_annotations:
342+
if my_item.meta:
340343
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
341344
if meta_part.text:
342345
parts.append(meta_part)
@@ -355,7 +358,7 @@ def serialize(
355358

356359
my_visited.add(my_item.self_ref)
357360

358-
if my_item.meta and not my_params.use_legacy_annotations:
361+
if my_item.meta:
359362
meta_part = self.serialize_meta(item=my_item, **my_kwargs)
360363
if meta_part.text:
361364
parts.append(meta_part)
@@ -655,3 +658,30 @@ def _get_page_breaks(self, text: str) -> Iterable[Tuple[str, int, int]]:
655658
prev_page_nr = int(match.group(1))
656659
next_page_nr = int(match.group(2))
657660
yield (full_match, prev_page_nr, next_page_nr)
661+
662+
663+
def _should_use_legacy_annotations(
664+
*,
665+
params: CommonParams,
666+
item: Union[PictureItem, TableItem],
667+
kind: Optional[str] = None,
668+
) -> bool:
669+
if item.meta:
670+
return False
671+
with warnings.catch_warnings(record=True) as caught_warnings:
672+
warnings.simplefilter("ignore", DeprecationWarning)
673+
if (
674+
incl_attr := getattr(params, "include_annotations", None)
675+
) is not None and not incl_attr:
676+
return False
677+
use_legacy = bool(
678+
[
679+
ann
680+
for ann in item.annotations
681+
if ((ann.kind == kind) if kind is not None else True)
682+
]
683+
)
684+
if use_legacy:
685+
for w in caught_warnings:
686+
warnings.warn(w.message, w.category)
687+
return use_legacy

docling_core/transforms/serializer/doctags.py

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from docling_core.transforms.serializer.common import (
2424
CommonParams,
2525
DocSerializer,
26+
_should_use_legacy_annotations,
2627
create_ser_result,
2728
)
2829
from docling_core.types.doc.base import BoundingBox
@@ -236,18 +237,25 @@ def serialize(
236237

237238
# handle classification data
238239
predicted_class: Optional[str] = None
239-
if item.meta and item.meta.classification:
240-
predicted_class = (
241-
item.meta.classification.get_main_prediction().class_name
242-
)
243-
elif (
244-
classifications := [
240+
if item.meta:
241+
if item.meta.classification:
242+
predicted_class = (
243+
item.meta.classification.get_main_prediction().class_name
244+
)
245+
elif _should_use_legacy_annotations(
246+
params=params,
247+
item=item,
248+
kind=PictureClassificationData.model_fields["kind"].default,
249+
):
250+
if classifications := [
245251
ann
246252
for ann in item.annotations
247253
if isinstance(ann, PictureClassificationData)
248-
]
249-
) and classifications[0].predicted_classes:
250-
predicted_class = classifications[0].predicted_classes[0].class_name
254+
]:
255+
if classifications[0].predicted_classes:
256+
predicted_class = (
257+
classifications[0].predicted_classes[0].class_name
258+
)
251259
if predicted_class:
252260
body += DocumentToken.get_picture_classification_token(predicted_class)
253261
if predicted_class in [
@@ -263,25 +271,39 @@ def serialize(
263271

264272
# handle molecule data
265273
smi: Optional[str] = None
266-
if item.meta and item.meta.molecule:
267-
smi = item.meta.molecule.smi
268-
elif smiles_annotations := [
269-
ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
270-
]:
271-
smi = smiles_annotations[0].smi
274+
if item.meta:
275+
if item.meta.molecule:
276+
smi = item.meta.molecule.smi
277+
elif _should_use_legacy_annotations(
278+
params=params,
279+
item=item,
280+
kind=PictureMoleculeData.model_fields["kind"].default,
281+
):
282+
if smiles_annotations := [
283+
ann
284+
for ann in item.annotations
285+
if isinstance(ann, PictureMoleculeData)
286+
]:
287+
smi = smiles_annotations[0].smi
272288
if smi:
273289
body += _wrap(text=smi, wrap_tag=DocumentToken.SMILES.value)
274290

275291
# handle tabular chart data
276292
chart_data: Optional[TableData] = None
277-
if item.meta and item.meta.tabular_chart:
278-
chart_data = item.meta.tabular_chart.chart_data
279-
elif tabular_chart_annotations := [
280-
ann
281-
for ann in item.annotations
282-
if isinstance(ann, PictureTabularChartData)
283-
]:
284-
chart_data = tabular_chart_annotations[0].chart_data
293+
if item.meta:
294+
if item.meta.tabular_chart:
295+
chart_data = item.meta.tabular_chart.chart_data
296+
elif _should_use_legacy_annotations(
297+
params=params,
298+
item=item,
299+
kind=PictureTabularChartData.model_fields["kind"].default,
300+
):
301+
if tabular_chart_annotations := [
302+
ann
303+
for ann in item.annotations
304+
if isinstance(ann, PictureTabularChartData)
305+
]:
306+
chart_data = tabular_chart_annotations[0].chart_data
285307
if chart_data and chart_data.table_cells:
286308
temp_doc = DoclingDocument(name="temp")
287309
temp_table = temp_doc.add_table(data=chart_data)

docling_core/transforms/serializer/html.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
CommonParams,
3939
DocSerializer,
4040
_get_annotation_text,
41+
_should_use_legacy_annotations,
4142
create_ser_result,
4243
)
4344
from docling_core.transforms.serializer.html_styles import (
@@ -496,7 +497,11 @@ def get_img_row(imgb64: str, ind: int) -> str:
496497
if img_text:
497498
res_parts.append(create_ser_result(text=img_text, span_source=item))
498499

499-
if params.enable_chart_tables:
500+
if params.enable_chart_tables and _should_use_legacy_annotations(
501+
params=params,
502+
item=item,
503+
kind=PictureTabularChartData.model_fields["kind"].default,
504+
):
500505
# Check if picture has attached PictureTabularChartData
501506
tabular_chart_annotations = [
502507
ann
@@ -867,8 +872,13 @@ def _serialize_meta_field(self, meta: BaseMeta, name: str) -> Optional[str]:
867872
elif isinstance(field_val, MoleculeMetaField):
868873
txt = field_val.smi
869874
elif isinstance(field_val, TabularChartMetaField):
870-
# suppressing tabular chart serialization
871-
return None
875+
temp_doc = DoclingDocument(name="temp")
876+
temp_table = temp_doc.add_table(data=field_val.chart_data)
877+
table_content = temp_table.export_to_html(temp_doc).strip()
878+
if table_content:
879+
txt = table_content
880+
else:
881+
return None
872882
elif tmp := str(field_val or ""):
873883
txt = tmp
874884
else:
@@ -1119,17 +1129,16 @@ def serialize_captions(
11191129
results.append(cap_ser_res)
11201130

11211131
if (
1122-
params.use_legacy_annotations
1123-
and params.include_annotations
1124-
and item.self_ref not in excluded_refs
1132+
item.self_ref not in excluded_refs
1133+
and isinstance(item, (PictureItem, TableItem))
1134+
and _should_use_legacy_annotations(params=params, item=item)
11251135
):
1126-
if isinstance(item, (PictureItem, TableItem)):
1127-
ann_res = self.serialize_annotations(
1128-
item=item,
1129-
**kwargs,
1130-
)
1131-
if ann_res.text:
1132-
results.append(ann_res)
1136+
ann_res = self.serialize_annotations(
1137+
item=item,
1138+
**kwargs,
1139+
)
1140+
if ann_res.text:
1141+
results.append(ann_res)
11331142

11341143
text_res = params.caption_delim.join([r.text for r in results])
11351144
if text_res:

docling_core/transforms/serializer/markdown.py

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
CommonParams,
3434
DocSerializer,
3535
_get_annotation_text,
36+
_should_use_legacy_annotations,
3637
create_ser_result,
3738
)
3839
from docling_core.types.doc.base import ImageRefMode
@@ -72,23 +73,6 @@
7273
)
7374

7475

75-
def _get_annotation_ser_result(
76-
ann_kind: str, ann_text: str, mark_annotation: bool, doc_item: DocItem
77-
):
78-
return create_ser_result(
79-
text=(
80-
(
81-
f'<!--<annotation kind="{ann_kind}">-->'
82-
f"{ann_text}"
83-
f"<!--<annotation/>-->"
84-
)
85-
if mark_annotation
86-
else ann_text
87-
),
88-
span_source=doc_item,
89-
)
90-
91-
9276
class OrigListItemMarkerMode(str, Enum):
9377
"""Display mode for original list item marker."""
9478

@@ -322,8 +306,13 @@ def _serialize_meta_field(
322306
elif isinstance(field_val, MoleculeMetaField):
323307
txt = field_val.smi
324308
elif isinstance(field_val, TabularChartMetaField):
325-
# suppressing tabular chart serialization
326-
return None
309+
temp_doc = DoclingDocument(name="temp")
310+
temp_table = temp_doc.add_table(data=field_val.chart_data)
311+
table_content = temp_table.export_to_markdown(temp_doc).strip()
312+
if table_content:
313+
txt = table_content
314+
else:
315+
return None
327316
elif tmp := str(field_val or ""):
328317
txt = tmp
329318
else:
@@ -404,7 +393,7 @@ def serialize(
404393

405394
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
406395

407-
if params.use_legacy_annotations and params.include_annotations:
396+
if _should_use_legacy_annotations(params=params, item=item):
408397

409398
ann_res = doc_serializer.serialize_annotations(
410399
item=item,
@@ -473,7 +462,7 @@ def serialize(
473462
res_parts.append(cap_res)
474463

475464
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
476-
if params.use_legacy_annotations and params.include_annotations:
465+
if _should_use_legacy_annotations(params=params, item=item):
477466
ann_res = doc_serializer.serialize_annotations(
478467
item=item,
479468
**kwargs,
@@ -490,7 +479,11 @@ def serialize(
490479
if img_res.text:
491480
res_parts.append(img_res)
492481

493-
if params.enable_chart_tables:
482+
if params.enable_chart_tables and _should_use_legacy_annotations(
483+
params=params,
484+
item=item,
485+
kind=PictureTabularChartData.model_fields["kind"].default,
486+
):
494487
# Check if picture has attached PictureTabularChartData
495488
tabular_chart_annotations = [
496489
ann

0 commit comments

Comments
 (0)