feat: support heading as chunk metadata (#36)

vagenas · web-flow · commit 4bde51528d23 · 2024-09-30T11:08:09.000+02:00
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py
@@ -22,8 +22,9 @@ class Chunk(BaseModel):
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
 
-    page: Optional[int]
-    bbox: Optional[BoundingBox]
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
+    heading: Optional[str] = None
 
 
 class BaseChunker(BaseModel, ABC):
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -26,6 +26,7 @@ class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
     include_metadata: bool = True
+    heading_as_metadata: bool = False
     min_chunk_len: PositiveInt = 64
 
     class _NodeType(str, Enum):
@@ -184,7 +185,7 @@ class _TextEntry(BaseModel):
 
     def _build_chunk_impl(
         self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
-    ) -> list[_TextEntry]:
+    ) -> tuple[list[_TextEntry], Optional[str]]:
         if doc.main_text:
             item = doc.main_text[idx]
             item_type = _HC._norm(item.obj_type)
@@ -193,7 +194,7 @@ def _build_chunk_impl(
                 item_type not in self._allowed_types
                 or item_name in self._disallowed_names_by_type.get(item_type, [])
             ):
-                return []
+                return [], None
 
             c2p = doc_map.dmap
 
@@ -219,7 +220,7 @@ def _build_chunk_impl(
                         else []
                     )
                 else:
-                    return []
+                    return [], None
             elif isinstance(item, BaseText):
                 text_entries = [
                     self._TextEntry(
@@ -248,21 +249,29 @@ def _build_chunk_impl(
                     _HC._NodeName.LIST_ITEM,
                     _HC._NodeName.SUBTITLE_LEVEL_1,
                 ]:
-                    return []
+                    return [], None
 
             if (parent := c2p[idx].parent) is not None:
                 # prepend with ancestors
+
+                parent_res = self._build_chunk_impl(
+                    doc=doc, doc_map=doc_map, idx=parent, rec=True
+                )
                 return (
-                    self._build_chunk_impl(
-                        doc=doc, doc_map=doc_map, idx=parent, rec=True
-                    )
-                    + text_entries
+                    parent_res[0] + text_entries,  # expanded text
+                    parent_res[1],  # heading
                 )
             else:
-                # if root, augment with title (if available and different)
-                return text_entries
+                if (
+                    self.heading_as_metadata
+                    and isinstance(item, BaseText)
+                    and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
+                ):
+                    return [], text_entries[0].text
+                else:
+                    return text_entries, None
         else:
-            return []
+            return [], None
 
     def _build_chunk(
         self,
@@ -272,7 +281,9 @@ def _build_chunk(
         delim: str,
         rec: bool = False,
     ) -> Optional[Chunk]:
-        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        texts = res[0]
+        heading = res[1]
         concat = delim.join([t.text for t in texts if t.text])
         assert doc.main_text is not None
         if len(concat) >= self.min_chunk_len:
@@ -295,6 +306,7 @@ def _build_chunk(
                     path=path,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
+                    heading=heading,
                 )
             else:
                 return Chunk(
@@ -315,6 +327,11 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
+        if (not self.include_metadata) and self.heading_as_metadata:
+            raise RuntimeError(
+                "To enable `heading_as_metadata`, also `include_metadata` must be True."
+            )
+
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)
diff --git a/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json b/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json
diff --git a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json b/test/data/chunker/0_out_chunks_with_meta_incl_heading.json
@@ -0,0 +1,51 @@
+{
+    "root": [
+        {
+            "path": "$.main-text[0]",
+            "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
+            "page": 1,
+            "bbox": [
+                0.0,
+                1.0,
+                2.0,
+                3.0
+            ]
+        },
+        {
+            "path": "$.main-text[4]",
+            "text": "This one should also include the subtitle above since it is long enough.",
+            "page": 3,
+            "bbox": [
+                5.0,
+                6.0,
+                7.0,
+                8.0
+            ],
+            "heading": "Some subtitle"
+        },
+        {
+            "path": "$.tables[0]",
+            "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+            "page": 4,
+            "bbox": [
+                8.0,
+                9.0,
+                10.0,
+                11.0
+            ],
+            "heading": "Acquisitions"
+        },
+        {
+            "path": "$.main-text[8]",
+            "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+            "page": 4,
+            "bbox": [
+                8.0,
+                9.0,
+                10.0,
+                11.0
+            ],
+            "heading": "Acquisitions"
+        }
+    ]
+}
diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py
@@ -21,13 +21,25 @@ def test_chunk_without_metadata():
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata():
+def test_chunk_with_metadata_heading_in_text():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True)
+    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
-    act_data = dict(root=[n.model_dump() for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta.json") as f:
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
+        exp_data = json.load(fp=f)
+    assert exp_data == act_data
+
+
+def test_chunk_with_metadata_incl_heading():
+    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+        data_json = f.read()
+    dl_doc = DLDocument.model_validate_json(data_json)
+    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
+    chunks = chunker.chunk(dl_doc=dl_doc)
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data