
Commit b5592ad

fix: align chunk ref format with one used in Document (#37)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 4496b44 commit b5592ad

7 files changed (+104 additions, -46 deletions)


docling_core/transforms/chunker/base.py

Lines changed: 25 additions & 2 deletions
@@ -7,24 +7,40 @@
 from abc import ABC, abstractmethod
 from typing import Iterator, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 
 from docling_core.types import BoundingBox, Document
 
 
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
+
+
 class Chunk(BaseModel):
     """Data model for Chunk."""
 
     path: str
     text: str
+    heading: Optional[str] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, data):
+        path = data.get("path")
+        if path.startswith("$."):
+            parts = path.split("[")
+            data["path"] = _create_path(
+                pos=parts[1][:-1],
+                path_prefix=parts[0][2:],
+            )
+        return data
 
 
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
 
     page: Optional[int] = None
     bbox: Optional[BoundingBox] = None
-    heading: Optional[str] = None
 
 
 class BaseChunker(BaseModel, ABC):
@@ -44,3 +60,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
             Iterator[Chunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 14 additions & 14 deletions
@@ -12,7 +12,7 @@
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,9 +25,17 @@
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
-    include_metadata: bool = True
-    heading_as_metadata: bool = False
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata (instead of text)",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
 
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
@@ -83,10 +91,6 @@ def _triplet_serialize(cls, table) -> Optional[str]:
 
         return output_text
 
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
-
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
@@ -304,14 +308,15 @@ def _build_chunk(
                 return ChunkWithMetadata(
                     text=concat,
                     path=path,
+                    heading=heading,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
-                    heading=heading,
                 )
             else:
                 return Chunk(
                     text=concat,
                     path=path,
+                    heading=heading,
                 )
         else:
             return None
@@ -327,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
-        if (not self.include_metadata) and self.heading_as_metadata:
-            raise RuntimeError(
-                "To enable `heading_as_metadata`, also `include_metadata` must be True."
-            )
-
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)
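
A minimal configuration sketch (illustrative, not part of this diff), assuming the module path shown above; with the RuntimeError guard dropped, heading_as_metadata=True no longer requires include_metadata=True:

import json

from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types import Document as DLDocument

# Input document taken from the test data referenced further below.
with open("test/data/chunker/0_inp_dl_doc.json") as f:
    dl_doc = DLDocument.model_validate_json(f.read())

# Headings go into chunk metadata; page/bbox extras are left out.
chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
for chunk in chunker.chunk(dl_doc=dl_doc):
    print(json.dumps(chunk.model_dump(exclude_none=True)))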

test/data/chunker/0_out_chunks_with_meta_incl_heading.json renamed to test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json

Lines changed: 10 additions & 10 deletions
@@ -1,7 +1,7 @@
 {
   "root": [
     {
-      "path": "$.main-text[0]",
+      "path": "#/main-text/0",
       "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
       "page": 1,
       "bbox": [
@@ -12,40 +12,40 @@
       ]
     },
     {
-      "path": "$.main-text[4]",
+      "path": "#/main-text/4",
       "text": "This one should also include the subtitle above since it is long enough.",
+      "heading": "Some subtitle",
       "page": 3,
       "bbox": [
         5.0,
         6.0,
         7.0,
         8.0
-      ],
-      "heading": "Some subtitle"
+      ]
     },
     {
-      "path": "$.tables[0]",
+      "path": "#/tables/0",
       "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+      "heading": "Acquisitions",
       "page": 4,
       "bbox": [
         8.0,
         9.0,
         10.0,
         11.0
-      ],
-      "heading": "Acquisitions"
+      ]
     },
     {
-      "path": "$.main-text[8]",
+      "path": "#/main-text/8",
       "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+      "heading": "Acquisitions",
       "page": 4,
       "bbox": [
         8.0,
         9.0,
         10.0,
         11.0
-      ],
-      "heading": "Acquisitions"
+      ]
     }
   ]
 }

test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+{
+  "root": [
+    {
+      "path": "#/main-text/0",
+      "text": "This paragraph is marginally long enough for getting accepted as a chunk."
+    },
+    {
+      "path": "#/main-text/4",
+      "text": "This one should also include the subtitle above since it is long enough.",
+      "heading": "Some subtitle"
+    },
+    {
+      "path": "#/tables/0",
+      "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+      "heading": "Acquisitions"
+    },
+    {
+      "path": "#/main-text/8",
+      "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+      "heading": "Acquisitions"
+    }
+  ]
+}
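
As an illustrative sketch (not part of this diff), entries in this new fixture validate directly as Chunk objects, with the heading carried as metadata rather than prepended to the text:

from docling_core.transforms.chunker import Chunk

chunk = Chunk.model_validate(
    {
        "path": "#/main-text/4",
        "text": "This one should also include the subtitle above since it is long enough.",
        "heading": "Some subtitle",
    }
)
assert chunk.heading == "Some subtitle"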

test/data/chunker/0_out_chunks_with_meta_heading_in_text.json renamed to test/data/chunker/0_out_chunks_heading_in_text_with_extras.json

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 {
   "root": [
     {
-      "path": "$.main-text[0]",
+      "path": "#/main-text/0",
       "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
       "page": 1,
       "bbox": [
@@ -12,7 +12,7 @@
       ]
     },
     {
-      "path": "$.main-text[4]",
+      "path": "#/main-text/4",
       "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.",
       "page": 3,
       "bbox": [
@@ -23,7 +23,7 @@
       ]
     },
     {
-      "path": "$.tables[0]",
+      "path": "#/tables/0",
       "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
       "page": 4,
       "bbox": [
@@ -34,7 +34,7 @@
       ]
     },
     {
-      "path": "$.main-text[7]",
+      "path": "#/main-text/7",
       "text": "Acquisitions\nThis paragraph should actually include the latest subtitle.",
       "page": 4,
       "bbox": [
@@ -45,7 +45,7 @@
       ]
     },
     {
-      "path": "$.main-text[8]",
+      "path": "#/main-text/8",
       "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
       "page": 4,
       "bbox": [

test/data/chunker/0_out_chunks_wout_meta.json renamed to test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json

Lines changed: 5 additions & 5 deletions
@@ -1,23 +1,23 @@
 {
   "root": [
     {
-      "path": "$.main-text[0]",
+      "path": "#/main-text/0",
       "text": "This paragraph is marginally long enough for getting accepted as a chunk."
     },
     {
-      "path": "$.main-text[4]",
+      "path": "#/main-text/4",
       "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough."
     },
     {
-      "path": "$.tables[0]",
+      "path": "#/tables/0",
       "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany"
     },
     {
-      "path": "$.main-text[7]",
+      "path": "#/main-text/7",
       "text": "Acquisitions\nThis paragraph should actually include the latest subtitle."
     },
     {
-      "path": "$.main-text[8]",
+      "path": "#/main-text/8",
       "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here."
     }
   ]

test/test_hierarchical_chunker.py

Lines changed: 22 additions & 10 deletions
@@ -9,37 +9,49 @@
 from docling_core.types import Document as DLDocument
 
 
-def test_chunk_without_metadata():
+def test_chunk_heading_in_text_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
-    act_data = dict(root=[n.model_dump() for n in chunks])
-    with open("test/data/chunker/0_out_chunks_wout_meta.json") as f:
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f:
+        exp_data = json.load(fp=f)
+    assert exp_data == act_data
+
+
+def test_chunk_heading_in_text_with_extras():
+    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+        data_json = f.read()
+    dl_doc = DLDocument.model_validate_json(data_json)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True)
+    chunks = chunker.chunk(dl_doc=dl_doc)
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_heading_in_text():
+def test_chunk_heading_in_meta_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_incl_heading():
+def test_chunk_heading_in_meta_with_extras():
    with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
