fix: allow captions without holding item (#215)

vagenas · web-flow · commit 2efb71a0ca16 · 2025-03-25T11:13:59.000+01:00
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
diff --git a/docling_core/experimental/serializer/common.py b/docling_core/experimental/serializer/common.py
@@ -7,10 +7,11 @@
 import sys
 from abc import abstractmethod
 from copy import deepcopy
+from functools import cached_property
 from pathlib import Path
 from typing import Any, Optional, Union
 
-from pydantic import AnyUrl, BaseModel, NonNegativeInt
+from pydantic import AnyUrl, BaseModel, NonNegativeInt, computed_field
 from typing_extensions import Self, override
 
 from docling_core.experimental.serializer.base import (
@@ -96,6 +97,21 @@ class Config:
 
     _excluded_refs_cache: dict[str, list[str]] = {}
 
+    @computed_field  # type: ignore[misc]
+    @cached_property
+    def _captions_of_some_item(self) -> set[str]:  # crefs of all held captions
+        layers = {cl for cl in ContentLayer}  # TODO review: scans every layer
+        refs = {
+            cap.cref  # reference of a caption attached to a floating item
+            for (item, _) in self.doc.iterate_items(
+                with_groups=True,
+                traverse_pictures=True,
+                included_content_layers=layers,
+            )
+            for cap in (item.captions if isinstance(item, FloatingItem) else [])
+        }
+        return refs  # cached_property: computed on first access, then reused
+
+
     @override
     def get_excluded_refs(self, **kwargs) -> list[str]:
         """References to excluded items."""
@@ -201,11 +217,6 @@ def serialize(
             else:
                 return empty_res
 
-        label_blocklist = {
-            # captions only considered in context of floating items (pictures, tables)
-            DocItemLabel.CAPTION,
-        }
-
         ########
         # groups
         ########
@@ -231,20 +242,22 @@ def serialize(
         ###########
         # doc items
         ###########
-        elif isinstance(item, DocItem) and item.label in label_blocklist:
-            return empty_res
         elif isinstance(item, TextItem):
-            part = (
-                self.text_serializer.serialize(
-                    item=item,
-                    doc_serializer=self,
-                    doc=self.doc,
-                    is_inline_scope=is_inline_scope,
-                    **kwargs,
+            if item.self_ref in self._captions_of_some_item:
+                # those captions will be handled by the floating item holding them
+                return empty_res
+            else:
+                part = (
+                    self.text_serializer.serialize(
+                        item=item,
+                        doc_serializer=self,
+                        doc=self.doc,
+                        is_inline_scope=is_inline_scope,
+                        **kwargs,
+                    )
+                    if item.self_ref not in self.get_excluded_refs(**kwargs)
+                    else empty_res
                 )
-                if item.self_ref not in self.get_excluded_refs(**kwargs)
-                else empty_res
-            )
         elif isinstance(item, TableItem):
             part = self.table_serializer.serialize(
                 item=item,
diff --git a/test/data/doc/2206.01062.yaml.dt b/test/data/doc/2206.01062.yaml.dt
@@ -82,6 +82,7 @@
 <text><loc_44><loc_364><loc_241><loc_445>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</text>
 <picture><loc_258><loc_54><loc_457><loc_290></picture>
 <text><loc_327><loc_290><loc_389><loc_291>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</text>
+<caption><loc_260><loc_299><loc_457><loc_318>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
 <text><loc_259><loc_332><loc_456><loc_344>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</text>
 <text><loc_259><loc_346><loc_457><loc_448>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</text>
 <page_break>
diff --git a/test/data/doc/2206.01062.yaml.md b/test/data/doc/2206.01062.yaml.md
@@ -152,6 +152,8 @@ Phase 3: Training. After a first trial with a small group of people, we realised
 
 05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0
 
+Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
+
 were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.
 
 Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted
diff --git a/test/data/doc/2206.01062.yaml.min.dt b/test/data/doc/2206.01062.yaml.min.dt
diff --git a/test/data/doc/2206.01062.yaml.paged.md b/test/data/doc/2206.01062.yaml.paged.md
@@ -160,6 +160,8 @@ Phase 3: Training. After a first trial with a small group of people, we realised
 
 05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0
 
+Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
+
 were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.
 
 Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted