
Commit b5592ad

fix: align chunk ref format with one used in Document (#37)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 4496b44 commit b5592ad

7 files changed (+104 additions, -46 deletions)


docling_core/transforms/chunker/base.py

Lines changed: 25 additions & 2 deletions
@@ -7,24 +7,40 @@
 from abc import ABC, abstractmethod
 from typing import Iterator, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 
 from docling_core.types import BoundingBox, Document
 
 
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
+
+
 class Chunk(BaseModel):
     """Data model for Chunk."""
 
     path: str
     text: str
+    heading: Optional[str] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, data):
+        path = data.get("path")
+        if path.startswith("$."):
+            parts = path.split("[")
+            data["path"] = _create_path(
+                pos=parts[1][:-1],
+                path_prefix=parts[0][2:],
+            )
+        return data
 
 
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
 
     page: Optional[int] = None
     bbox: Optional[BoundingBox] = None
-    heading: Optional[str] = None
 
 
 class BaseChunker(BaseModel, ABC):
@@ -44,3 +60,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
             Iterator[Chunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 14 additions & 14 deletions
@@ -12,7 +12,7 @@
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,9 +25,17 @@
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
-    include_metadata: bool = True
-    heading_as_metadata: bool = False
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata (instead of text)",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
 
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
@@ -83,10 +91,6 @@ def _triplet_serialize(cls, table) -> Optional[str]:
 
         return output_text
 
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
-
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
@@ -304,14 +308,15 @@ def _build_chunk(
                 return ChunkWithMetadata(
                     text=concat,
                     path=path,
+                    heading=heading,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
-                    heading=heading,
                 )
             else:
                 return Chunk(
                     text=concat,
                     path=path,
+                    heading=heading,
                 )
         else:
             return None
@@ -327,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
-        if (not self.include_metadata) and self.heading_as_metadata:
-            raise RuntimeError(
-                "To enable `heading_as_metadata`, also `include_metadata` must be True."
-            )
-
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)
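
A minimal configuration sketch (illustrative, not part of this diff), assuming the module path shown above; with the RuntimeError guard dropped, heading_as_metadata=True no longer requires include_metadata=True:

import json

from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types import Document as DLDocument

# Input document taken from the test data referenced further below.
with open("test/data/chunker/0_inp_dl_doc.json") as f:
    dl_doc = DLDocument.model_validate_json(f.read())

# Headings go into chunk metadata; page/bbox extras are left out.
chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
for chunk in chunker.chunk(dl_doc=dl_doc):
    print(json.dumps(chunk.model_dump(exclude_none=True)))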

test/data/chunker/0_out_chunks_with_meta_incl_heading.json renamed to test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json

Lines changed: 10 additions & 10 deletions
@@ -1,7 +1,7 @@
 {
   "root": [
     {
-      "path": "$.main-text[0]",
+      "path": "#/main-text/0",
       "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
       "page": 1,
       "bbox": [
@@ -12,40 +12,40 @@
       ]
     },
     {
-      "path": "$.main-text[4]",
+      "path": "#/main-text/4",
       "text": "This one should also include the subtitle above since it is long enough.",
+      "heading": "Some subtitle",
       "page": 3,
       "bbox": [
         5.0,
         6.0,
         7.0,
         8.0
-      ],
-      "heading": "Some subtitle"
+      ]
     },
     {
-      "path": "$.tables[0]",
+      "path": "#/tables/0",
       "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+      "heading": "Acquisitions",
       "page": 4,
       "bbox": [
         8.0,
         9.0,
         10.0,
         11.0
-      ],
-      "heading": "Acquisitions"
+      ]
     },
     {
-      "path": "$.main-text[8]",
+      "path": "#/main-text/8",
       "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+      "heading": "Acquisitions",
       "page": 4,
       "bbox": [
         8.0,
         9.0,
         10.0,
         11.0
-      ],
-      "heading": "Acquisitions"
+      ]
     }
   ]
 }

test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+{
+  "root": [
+    {
+      "path": "#/main-text/0",
+      "text": "This paragraph is marginally long enough for getting accepted as a chunk."
+    },
+    {
+      "path": "#/main-text/4",
+      "text": "This one should also include the subtitle above since it is long enough.",
+      "heading": "Some subtitle"
+    },
+    {
+      "path": "#/tables/0",
+      "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+      "heading": "Acquisitions"
+    },
+    {
+      "path": "#/main-text/8",
+      "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+      "heading": "Acquisitions"
+    }
+  ]
+}
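
As an illustrative sketch (not part of this diff), entries in this new fixture validate directly as Chunk objects, with the heading carried as metadata rather than prepended to the text:

from docling_core.transforms.chunker import Chunk

chunk = Chunk.model_validate(
    {
        "path": "#/main-text/4",
        "text": "This one should also include the subtitle above since it is long enough.",
        "heading": "Some subtitle",
    }
)
assert chunk.heading == "Some subtitle"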

test/data/chunker/0_out_chunks_with_meta_heading_in_text.json renamed to test/data/chunker/0_out_chunks_heading_in_text_with_extras.json

Lines changed: 5 additions & 5 deletions
@@ -1,7 +1,7 @@
 {
   "root": [
     {
-      "path": "$.main-text[0]",
+      "path": "#/main-text/0",
       "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
       "page": 1,
       "bbox": [
@@ -12,7 +12,7 @@
       ]
     },
     {
-      "path": "$.main-text[4]",
+      "path": "#/main-text/4",
       "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.",
       "page": 3,
       "bbox": [
@@ -23,7 +23,7 @@
       ]
     },
     {
-      "path": "$.tables[0]",
+      "path": "#/tables/0",
       "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
       "page": 4,
       "bbox": [
@@ -34,7 +34,7 @@
       ]
     },
     {
-      "path": "$.main-text[7]",
+      "path": "#/main-text/7",
       "text": "Acquisitions\nThis paragraph should actually include the latest subtitle.",
       "page": 4,
       "bbox": [
@@ -45,7 +45,7 @@
       ]
     },
     {
-      "path": "$.main-text[8]",
+      "path": "#/main-text/8",
       "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
       "page": 4,
       "bbox": [

test/data/chunker/0_out_chunks_wout_meta.json renamed to test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json

Lines changed: 5 additions & 5 deletions
@@ -1,23 +1,23 @@
 {
   "root": [
     {
-      "path": "$.main-text[0]",
+      "path": "#/main-text/0",
       "text": "This paragraph is marginally long enough for getting accepted as a chunk."
     },
     {
-      "path": "$.main-text[4]",
+      "path": "#/main-text/4",
       "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough."
     },
     {
-      "path": "$.tables[0]",
+      "path": "#/tables/0",
       "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany"
     },
     {
-      "path": "$.main-text[7]",
+      "path": "#/main-text/7",
       "text": "Acquisitions\nThis paragraph should actually include the latest subtitle."
     },
     {
-      "path": "$.main-text[8]",
+      "path": "#/main-text/8",
       "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here."
     }
   ]

test/test_hierarchical_chunker.py

Lines changed: 22 additions & 10 deletions
@@ -9,37 +9,49 @@
 from docling_core.types import Document as DLDocument
 
 
-def test_chunk_without_metadata():
+def test_chunk_heading_in_text_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
-    act_data = dict(root=[n.model_dump() for n in chunks])
-    with open("test/data/chunker/0_out_chunks_wout_meta.json") as f:
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f:
+        exp_data = json.load(fp=f)
+    assert exp_data == act_data
+
+
+def test_chunk_heading_in_text_with_extras():
+    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+        data_json = f.read()
+    dl_doc = DLDocument.model_validate_json(data_json)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True)
+    chunks = chunker.chunk(dl_doc=dl_doc)
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_heading_in_text():
+def test_chunk_heading_in_meta_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_incl_heading():
+def test_chunk_heading_in_meta_with_extras():
    with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
