Skip to content

Commit 4bde515

Browse files
authored
feat: support heading as chunk metadata (#36)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 17af1fc commit 4bde515

File tree

5 files changed, with 99 additions and 18 deletions.

docling_core/transforms/chunker/base.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ class Chunk(BaseModel):
2222
class ChunkWithMetadata(Chunk):
2323
"""Data model for Chunk including metadata."""
2424

25-
page: Optional[int]
26-
bbox: Optional[BoundingBox]
25+
page: Optional[int] = None
26+
bbox: Optional[BoundingBox] = None
27+
heading: Optional[str] = None
2728

2829

2930
class BaseChunker(BaseModel, ABC):

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class HierarchicalChunker(BaseChunker):
2626
"""Chunker implementation leveraging the document layout."""
2727

2828
include_metadata: bool = True
29+
heading_as_metadata: bool = False
2930
min_chunk_len: PositiveInt = 64
3031

3132
class _NodeType(str, Enum):
@@ -184,7 +185,7 @@ class _TextEntry(BaseModel):
184185

185186
def _build_chunk_impl(
186187
self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
187-
) -> list[_TextEntry]:
188+
) -> tuple[list[_TextEntry], Optional[str]]:
188189
if doc.main_text:
189190
item = doc.main_text[idx]
190191
item_type = _HC._norm(item.obj_type)
@@ -193,7 +194,7 @@ def _build_chunk_impl(
193194
item_type not in self._allowed_types
194195
or item_name in self._disallowed_names_by_type.get(item_type, [])
195196
):
196-
return []
197+
return [], None
197198

198199
c2p = doc_map.dmap
199200

@@ -219,7 +220,7 @@ def _build_chunk_impl(
219220
else []
220221
)
221222
else:
222-
return []
223+
return [], None
223224
elif isinstance(item, BaseText):
224225
text_entries = [
225226
self._TextEntry(
@@ -248,21 +249,29 @@ def _build_chunk_impl(
248249
_HC._NodeName.LIST_ITEM,
249250
_HC._NodeName.SUBTITLE_LEVEL_1,
250251
]:
251-
return []
252+
return [], None
252253

253254
if (parent := c2p[idx].parent) is not None:
254255
# prepend with ancestors
256+
257+
parent_res = self._build_chunk_impl(
258+
doc=doc, doc_map=doc_map, idx=parent, rec=True
259+
)
255260
return (
256-
self._build_chunk_impl(
257-
doc=doc, doc_map=doc_map, idx=parent, rec=True
258-
)
259-
+ text_entries
261+
parent_res[0] + text_entries, # expanded text
262+
parent_res[1], # heading
260263
)
261264
else:
262-
# if root, augment with title (if available and different)
263-
return text_entries
265+
if (
266+
self.heading_as_metadata
267+
and isinstance(item, BaseText)
268+
and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
269+
):
270+
return [], text_entries[0].text
271+
else:
272+
return text_entries, None
264273
else:
265-
return []
274+
return [], None
266275

267276
def _build_chunk(
268277
self,
@@ -272,7 +281,9 @@ def _build_chunk(
272281
delim: str,
273282
rec: bool = False,
274283
) -> Optional[Chunk]:
275-
texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
284+
res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
285+
texts = res[0]
286+
heading = res[1]
276287
concat = delim.join([t.text for t in texts if t.text])
277288
assert doc.main_text is not None
278289
if len(concat) >= self.min_chunk_len:
@@ -295,6 +306,7 @@ def _build_chunk(
295306
path=path,
296307
page=item.prov[0].page if item.prov else None,
297308
bbox=item.prov[0].bbox if item.prov else None,
309+
heading=heading,
298310
)
299311
else:
300312
return Chunk(
@@ -315,6 +327,11 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
315327
Yields:
316328
Iterator[Chunk]: iterator over extracted chunks
317329
"""
330+
if (not self.include_metadata) and self.heading_as_metadata:
331+
raise RuntimeError(
332+
"To enable `heading_as_metadata`, also `include_metadata` must be True."
333+
)
334+
318335
if dl_doc.main_text:
319336
# extract doc structure incl. metadata for
320337
# each item (e.g. parent, children)
test/data/chunker/0_out_chunks_with_meta_incl_heading.json

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
{
2+
"root": [
3+
{
4+
"path": "$.main-text[0]",
5+
"text": "This paragraph is marginally long enough for getting accepted as a chunk.",
6+
"page": 1,
7+
"bbox": [
8+
0.0,
9+
1.0,
10+
2.0,
11+
3.0
12+
]
13+
},
14+
{
15+
"path": "$.main-text[4]",
16+
"text": "This one should also include the subtitle above since it is long enough.",
17+
"page": 3,
18+
"bbox": [
19+
5.0,
20+
6.0,
21+
7.0,
22+
8.0
23+
],
24+
"heading": "Some subtitle"
25+
},
26+
{
27+
"path": "$.tables[0]",
28+
"text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
29+
"page": 4,
30+
"bbox": [
31+
8.0,
32+
9.0,
33+
10.0,
34+
11.0
35+
],
36+
"heading": "Acquisitions"
37+
},
38+
{
39+
"path": "$.main-text[8]",
40+
"text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
41+
"page": 4,
42+
"bbox": [
43+
8.0,
44+
9.0,
45+
10.0,
46+
11.0
47+
],
48+
"heading": "Acquisitions"
49+
}
50+
]
51+
}

test/test_hierarchical_chunker.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,25 @@ def test_chunk_without_metadata():
2121
assert exp_data == act_data
2222

2323

24-
def test_chunk_with_metadata():
24+
def test_chunk_with_metadata_heading_in_text():
2525
with open("test/data/chunker/0_inp_dl_doc.json") as f:
2626
data_json = f.read()
2727
dl_doc = DLDocument.model_validate_json(data_json)
28-
chunker = HierarchicalChunker(include_metadata=True)
28+
chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
2929
chunks = chunker.chunk(dl_doc=dl_doc)
30-
act_data = dict(root=[n.model_dump() for n in chunks])
31-
with open("test/data/chunker/0_out_chunks_with_meta.json") as f:
30+
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
31+
with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
32+
exp_data = json.load(fp=f)
33+
assert exp_data == act_data
34+
35+
36+
def test_chunk_with_metadata_incl_heading():
37+
with open("test/data/chunker/0_inp_dl_doc.json") as f:
38+
data_json = f.read()
39+
dl_doc = DLDocument.model_validate_json(data_json)
40+
chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
41+
chunks = chunker.chunk(dl_doc=dl_doc)
42+
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
43+
with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
3244
exp_data = json.load(fp=f)
3345
assert exp_data == act_data

0 commit comments

Comments
 (0)