feat: enable heading-only chunks for empty-section headings

vagenas · vagenas · commit 759656c3f0f9 · 2025-12-18T13:50:01.000+01:00
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Iterator, Optional
+from typing import Any, Iterator, Optional, Union
 
 from pydantic import ConfigDict, Field
 from typing_extensions import Annotated, override
@@ -121,12 +121,14 @@ class HierarchicalChunker(BaseChunker):
         code_chunking_strategy (CodeChunkingStrategy): Optional strategy for chunking code items.
             If provided, code items will be processed using this strategy instead of being
             treated as regular text. Defaults to None (no special code processing).
+        always_emit_headings (bool): Whether to emit headings even for empty sections. Defaults to False.
     """
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
     code_chunking_strategy: Optional[BaseCodeChunkingStrategy] = Field(default=None)
+    always_emit_headings: bool = False
 
     # deprecated:
     merge_list_items: Annotated[bool, Field(deprecated=True)] = True
@@ -145,7 +147,8 @@ def chunk(
             Iterator[Chunk]: iterator over extracted chunks
         """
         my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
-        heading_by_level: dict[LevelNumber, str] = {}
+        heading_by_level: dict[LevelNumber, Union[TitleItem, SectionHeaderItem]] = {}
+        heading_emitted: set[str] = set()
         visited: set[str] = set()
         ser_res = create_ser_result()
         excluded_refs = my_doc_ser.get_excluded_refs(**kwargs)
@@ -154,12 +157,34 @@ def chunk(
                 continue
             if isinstance(item, (TitleItem, SectionHeaderItem)):
                 level = item.level if isinstance(item, SectionHeaderItem) else 0
-                heading_by_level[level] = item.text
 
-                # remove headings of higher level as they just went out of scope
-                keys_to_del = [k for k in heading_by_level if k > level]
+                # prepare to remove shadowed headings as they just went out of scope
+                sorted_keys = sorted(heading_by_level)
+                keys_to_del = [k for k in sorted_keys if k >= level]
+
+                # before removing, check if headings need to be emitted
+                if (
+                    keys_to_del
+                    and self.always_emit_headings
+                    and (leaf_ref := heading_by_level[sorted_keys[-1]].self_ref)
+                    not in heading_emitted
+                ):
+                    yield DocChunk(
+                        text="",
+                        meta=DocMeta(
+                            doc_items=[heading_by_level[k] for k in sorted_keys],
+                            headings=[heading_by_level[k].text for k in sorted_keys],
+                        ),
+                    )
+                    heading_emitted.add(leaf_ref)
+
+                # actually remove shadowed headings
                 for k in keys_to_del:
                     heading_by_level.pop(k, None)
+
+                # capture current heading
+                heading_by_level[level] = item
+
                 continue
             elif (
                 isinstance(item, (ListGroup, InlineGroup, DocItem))
@@ -184,13 +209,35 @@ def chunk(
             if not ser_res.text:
                 continue
             if doc_items := [u.item for u in ser_res.spans]:
+                sorted_keys = sorted(heading_by_level)
+                headings = [heading_by_level[k].text for k in sorted_keys] or None
                 c = DocChunk(
                     text=ser_res.text,
                     meta=DocMeta(
                         doc_items=doc_items,
-                        headings=[heading_by_level[k] for k in sorted(heading_by_level)]
-                        or None,
+                        headings=headings,
                         origin=dl_doc.origin,
                     ),
                 )
+                if self.always_emit_headings and headings:
+                    leaf_ref = heading_by_level[sorted_keys[-1]].self_ref
+                    heading_emitted.add(leaf_ref)
                 yield c
+
+        # if applicable, emit any remaining headings
+        if (
+            self.always_emit_headings
+            and (sorted_keys := sorted(heading_by_level))
+            and (
+                (leaf_ref := heading_by_level[sorted_keys[-1]].self_ref)
+                not in heading_emitted
+            )
+        ):
+            yield DocChunk(
+                text="",
+                meta=DocMeta(
+                    doc_items=[heading_by_level[k] for k in sorted_keys],
+                    headings=[heading_by_level[k].text for k in sorted_keys],
+                ),
+            )
+            heading_emitted.add(leaf_ref)
diff --git a/docling_core/transforms/chunker/hybrid_chunker.py b/docling_core/transforms/chunker/hybrid_chunker.py
@@ -10,6 +10,7 @@
     ChunkingSerializerProvider,
 )
 from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
+from docling_core.types.doc.document import SectionHeaderItem, TitleItem
 
 try:
     import semchunk
@@ -55,6 +56,7 @@ class HybridChunker(BaseChunker):
         max_tokens: The maximum number of tokens per chunk. If not set, limit is
             resolved from the tokenizer
         merge_peers: Whether to merge undersized chunks sharing same relevant metadata
+        always_emit_headings: Whether to emit headings even for empty sections
     """
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -63,6 +65,7 @@ class HybridChunker(BaseChunker):
     merge_peers: bool = True
 
     serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
+    always_emit_headings: bool = False
 
     @model_validator(mode="before")
     @classmethod
@@ -110,7 +113,10 @@ def max_tokens(self) -> int:
     @computed_field  # type: ignore[misc]
     @cached_property
     def _inner_chunker(self) -> HierarchicalChunker:
-        return HierarchicalChunker(serializer_provider=self.serializer_provider)
+        return HierarchicalChunker(
+            serializer_provider=self.serializer_provider,
+            always_emit_headings=self.always_emit_headings,
+        )
 
     def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
         if text is None:
@@ -162,6 +168,7 @@ def _make_chunk_from_doc_items(
                     res_text
                     for doc_item in doc_items
                     if (res_text := doc_serializer.serialize(item=doc_item).text)
+                    and not isinstance(doc_item, (TitleItem, SectionHeaderItem))
                 ]
             )
         )
diff --git a/test/data/chunker/2h_out_chunks_hier_emit_false.json b/test/data/chunker/2h_out_chunks_hier_emit_false.json
@@ -0,0 +1,27 @@
+{
+    "root": [
+        {
+            "text": "Foo",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/texts/8",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "text",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Section 3",
+                    "Section 3.1"
+                ]
+            }
+        }
+    ]
+}
diff --git a/test/data/chunker/2h_out_chunks_hier_emit_true.json b/test/data/chunker/2h_out_chunks_hier_emit_true.json
@@ -0,0 +1,170 @@
+{
+    "root": [
+        {
+            "text": "",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/texts/0",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    },
+                    {
+                        "self_ref": "#/texts/1",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Section 1",
+                    "Section 1.1"
+                ]
+            }
+        },
+        {
+            "text": "",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/texts/0",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    },
+                    {
+                        "self_ref": "#/texts/2",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Section 1",
+                    "Section 1.2"
+                ]
+            }
+        },
+        {
+            "text": "",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/texts/3",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    },
+                    {
+                        "self_ref": "#/texts/4",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    },
+                    {
+                        "self_ref": "#/texts/5",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Section 2",
+                    "Section 2.1",
+                    "Section 2.1.1"
+                ]
+            }
+        },
+        {
+            "text": "Foo",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/texts/8",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "text",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Section 3",
+                    "Section 3.1"
+                ]
+            }
+        },
+        {
+            "text": "",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/texts/9",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    },
+                    {
+                        "self_ref": "#/texts/10",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "section_header",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Section 4",
+                    "Section 4.1"
+                ]
+            }
+        }
+    ]
+}
diff --git a/test/data/chunker/2h_out_chunks_hybr_emit_false.json b/test/data/chunker/2h_out_chunks_hybr_emit_false.json
@@ -0,0 +1,27 @@
+{
+    "root": [
+        {
+            "text": "Foo",
+            "meta": {
+                "schema_name": "docling_core.transforms.chunker.DocMeta",
+                "version": "1.0.0",
+                "doc_items": [
+                    {
+                        "self_ref": "#/texts/8",
+                        "parent": {
+                            "$ref": "#/body"
+                        },
+                        "children": [],
+                        "content_layer": "body",
+                        "label": "text",
+                        "prov": []
+                    }
+                ],
+                "headings": [
+                    "Section 3",
+                    "Section 3.1"
+                ]
+            }
+        }
+    ]
+}
diff --git a/test/data/chunker/2h_out_chunks_hybr_emit_true.json b/test/data/chunker/2h_out_chunks_hybr_emit_true.json
diff --git a/test/test_hybrid_chunker.py b/test/test_hybrid_chunker.py