Skip to content

Commit 759656c

Browse files
committed
feat: enable heading-only chunks for empty-section headings
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 2920f24 commit 759656c

File tree

7 files changed

+513
-8
lines changed

7 files changed

+513
-8
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 54 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from __future__ import annotations
44

55
import logging
6-
from typing import Any, Iterator, Optional
6+
from typing import Any, Iterator, Optional, Union
77

88
from pydantic import ConfigDict, Field
99
from typing_extensions import Annotated, override
@@ -121,12 +121,14 @@ class HierarchicalChunker(BaseChunker):
121121
code_chunking_strategy (CodeChunkingStrategy): Optional strategy for chunking code items.
122122
If provided, code items will be processed using this strategy instead of being
123123
treated as regular text. Defaults to None (no special code processing).
124+
always_emit_headings (bool): Whether to emit a heading-only chunk (empty text, with the heading path in the chunk metadata) for headings whose section has no content. Defaults to False.
124125
"""
125126

126127
model_config = ConfigDict(arbitrary_types_allowed=True)
127128

128129
serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
129130
code_chunking_strategy: Optional[BaseCodeChunkingStrategy] = Field(default=None)
131+
always_emit_headings: bool = False
130132

131133
# deprecated:
132134
merge_list_items: Annotated[bool, Field(deprecated=True)] = True
@@ -145,7 +147,8 @@ def chunk(
145147
Iterator[Chunk]: iterator over extracted chunks
146148
"""
147149
my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
148-
heading_by_level: dict[LevelNumber, str] = {}
150+
heading_by_level: dict[LevelNumber, Union[TitleItem, SectionHeaderItem]] = {}
151+
heading_emitted: set[str] = set()
149152
visited: set[str] = set()
150153
ser_res = create_ser_result()
151154
excluded_refs = my_doc_ser.get_excluded_refs(**kwargs)
@@ -154,12 +157,34 @@ def chunk(
154157
continue
155158
if isinstance(item, (TitleItem, SectionHeaderItem)):
156159
level = item.level if isinstance(item, SectionHeaderItem) else 0
157-
heading_by_level[level] = item.text
158160

159-
# remove headings of higher level as they just went out of scope
160-
keys_to_del = [k for k in heading_by_level if k > level]
161+
# prepare to remove shadowed headings as they just went out of scope
162+
sorted_keys = sorted(heading_by_level)
163+
keys_to_del = [k for k in sorted_keys if k >= level]
164+
165+
# before removing, check if headings need to be emitted
166+
if (
167+
keys_to_del
168+
and self.always_emit_headings
169+
and (leaf_ref := heading_by_level[sorted_keys[-1]].self_ref)
170+
not in heading_emitted
171+
):
172+
yield DocChunk(
173+
text="",
174+
meta=DocMeta(
175+
doc_items=[heading_by_level[k] for k in sorted_keys],
176+
headings=[heading_by_level[k].text for k in sorted_keys],
177+
),
178+
)
179+
heading_emitted.add(leaf_ref)
180+
181+
# actually remove shadowed headings
161182
for k in keys_to_del:
162183
heading_by_level.pop(k, None)
184+
185+
# capture current heading
186+
heading_by_level[level] = item
187+
163188
continue
164189
elif (
165190
isinstance(item, (ListGroup, InlineGroup, DocItem))
@@ -184,13 +209,35 @@ def chunk(
184209
if not ser_res.text:
185210
continue
186211
if doc_items := [u.item for u in ser_res.spans]:
212+
sorted_keys = sorted(heading_by_level)
213+
headings = [heading_by_level[k].text for k in sorted_keys] or None
187214
c = DocChunk(
188215
text=ser_res.text,
189216
meta=DocMeta(
190217
doc_items=doc_items,
191-
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
192-
or None,
218+
headings=headings,
193219
origin=dl_doc.origin,
194220
),
195221
)
222+
if self.always_emit_headings and headings:
223+
leaf_ref = heading_by_level[sorted_keys[-1]].self_ref
224+
heading_emitted.add(leaf_ref)
196225
yield c
226+
227+
# if applicable, emit any remaining headings
228+
if (
229+
self.always_emit_headings
230+
and (sorted_keys := sorted(heading_by_level))
231+
and (
232+
(leaf_ref := heading_by_level[sorted_keys[-1]].self_ref)
233+
not in heading_emitted
234+
)
235+
):
236+
yield DocChunk(
237+
text="",
238+
meta=DocMeta(
239+
doc_items=[heading_by_level[k] for k in sorted_keys],
240+
headings=[heading_by_level[k].text for k in sorted_keys],
241+
),
242+
)
243+
heading_emitted.add(leaf_ref)

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
ChunkingSerializerProvider,
1111
)
1212
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
13+
from docling_core.types.doc.document import SectionHeaderItem, TitleItem
1314

1415
try:
1516
import semchunk
@@ -55,6 +56,7 @@ class HybridChunker(BaseChunker):
5556
max_tokens: The maximum number of tokens per chunk. If not set, limit is
5657
resolved from the tokenizer
5758
merge_peers: Whether to merge undersized chunks sharing same relevant metadata
59+
always_emit_headings: Whether to emit heading-only chunks for headings whose sections are empty. Defaults to False.
5860
"""
5961

6062
model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -63,6 +65,7 @@ class HybridChunker(BaseChunker):
6365
merge_peers: bool = True
6466

6567
serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
68+
always_emit_headings: bool = False
6669

6770
@model_validator(mode="before")
6871
@classmethod
@@ -110,7 +113,10 @@ def max_tokens(self) -> int:
110113
@computed_field # type: ignore[misc]
111114
@cached_property
112115
def _inner_chunker(self) -> HierarchicalChunker:
113-
return HierarchicalChunker(serializer_provider=self.serializer_provider)
116+
return HierarchicalChunker(
117+
serializer_provider=self.serializer_provider,
118+
always_emit_headings=self.always_emit_headings,
119+
)
114120

115121
def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
116122
if text is None:
@@ -162,6 +168,7 @@ def _make_chunk_from_doc_items(
162168
res_text
163169
for doc_item in doc_items
164170
if (res_text := doc_serializer.serialize(item=doc_item).text)
171+
and not isinstance(doc_item, (TitleItem, SectionHeaderItem))
165172
]
166173
)
167174
)
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"root": [
3+
{
4+
"text": "Foo",
5+
"meta": {
6+
"schema_name": "docling_core.transforms.chunker.DocMeta",
7+
"version": "1.0.0",
8+
"doc_items": [
9+
{
10+
"self_ref": "#/texts/8",
11+
"parent": {
12+
"$ref": "#/body"
13+
},
14+
"children": [],
15+
"content_layer": "body",
16+
"label": "text",
17+
"prov": []
18+
}
19+
],
20+
"headings": [
21+
"Section 3",
22+
"Section 3.1"
23+
]
24+
}
25+
}
26+
]
27+
}
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
{
2+
"root": [
3+
{
4+
"text": "",
5+
"meta": {
6+
"schema_name": "docling_core.transforms.chunker.DocMeta",
7+
"version": "1.0.0",
8+
"doc_items": [
9+
{
10+
"self_ref": "#/texts/0",
11+
"parent": {
12+
"$ref": "#/body"
13+
},
14+
"children": [],
15+
"content_layer": "body",
16+
"label": "section_header",
17+
"prov": []
18+
},
19+
{
20+
"self_ref": "#/texts/1",
21+
"parent": {
22+
"$ref": "#/body"
23+
},
24+
"children": [],
25+
"content_layer": "body",
26+
"label": "section_header",
27+
"prov": []
28+
}
29+
],
30+
"headings": [
31+
"Section 1",
32+
"Section 1.1"
33+
]
34+
}
35+
},
36+
{
37+
"text": "",
38+
"meta": {
39+
"schema_name": "docling_core.transforms.chunker.DocMeta",
40+
"version": "1.0.0",
41+
"doc_items": [
42+
{
43+
"self_ref": "#/texts/0",
44+
"parent": {
45+
"$ref": "#/body"
46+
},
47+
"children": [],
48+
"content_layer": "body",
49+
"label": "section_header",
50+
"prov": []
51+
},
52+
{
53+
"self_ref": "#/texts/2",
54+
"parent": {
55+
"$ref": "#/body"
56+
},
57+
"children": [],
58+
"content_layer": "body",
59+
"label": "section_header",
60+
"prov": []
61+
}
62+
],
63+
"headings": [
64+
"Section 1",
65+
"Section 1.2"
66+
]
67+
}
68+
},
69+
{
70+
"text": "",
71+
"meta": {
72+
"schema_name": "docling_core.transforms.chunker.DocMeta",
73+
"version": "1.0.0",
74+
"doc_items": [
75+
{
76+
"self_ref": "#/texts/3",
77+
"parent": {
78+
"$ref": "#/body"
79+
},
80+
"children": [],
81+
"content_layer": "body",
82+
"label": "section_header",
83+
"prov": []
84+
},
85+
{
86+
"self_ref": "#/texts/4",
87+
"parent": {
88+
"$ref": "#/body"
89+
},
90+
"children": [],
91+
"content_layer": "body",
92+
"label": "section_header",
93+
"prov": []
94+
},
95+
{
96+
"self_ref": "#/texts/5",
97+
"parent": {
98+
"$ref": "#/body"
99+
},
100+
"children": [],
101+
"content_layer": "body",
102+
"label": "section_header",
103+
"prov": []
104+
}
105+
],
106+
"headings": [
107+
"Section 2",
108+
"Section 2.1",
109+
"Section 2.1.1"
110+
]
111+
}
112+
},
113+
{
114+
"text": "Foo",
115+
"meta": {
116+
"schema_name": "docling_core.transforms.chunker.DocMeta",
117+
"version": "1.0.0",
118+
"doc_items": [
119+
{
120+
"self_ref": "#/texts/8",
121+
"parent": {
122+
"$ref": "#/body"
123+
},
124+
"children": [],
125+
"content_layer": "body",
126+
"label": "text",
127+
"prov": []
128+
}
129+
],
130+
"headings": [
131+
"Section 3",
132+
"Section 3.1"
133+
]
134+
}
135+
},
136+
{
137+
"text": "",
138+
"meta": {
139+
"schema_name": "docling_core.transforms.chunker.DocMeta",
140+
"version": "1.0.0",
141+
"doc_items": [
142+
{
143+
"self_ref": "#/texts/9",
144+
"parent": {
145+
"$ref": "#/body"
146+
},
147+
"children": [],
148+
"content_layer": "body",
149+
"label": "section_header",
150+
"prov": []
151+
},
152+
{
153+
"self_ref": "#/texts/10",
154+
"parent": {
155+
"$ref": "#/body"
156+
},
157+
"children": [],
158+
"content_layer": "body",
159+
"label": "section_header",
160+
"prov": []
161+
}
162+
],
163+
"headings": [
164+
"Section 4",
165+
"Section 4.1"
166+
]
167+
}
168+
}
169+
]
170+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"root": [
3+
{
4+
"text": "Foo",
5+
"meta": {
6+
"schema_name": "docling_core.transforms.chunker.DocMeta",
7+
"version": "1.0.0",
8+
"doc_items": [
9+
{
10+
"self_ref": "#/texts/8",
11+
"parent": {
12+
"$ref": "#/body"
13+
},
14+
"children": [],
15+
"content_layer": "body",
16+
"label": "text",
17+
"prov": []
18+
}
19+
],
20+
"headings": [
21+
"Section 3",
22+
"Section 3.1"
23+
]
24+
}
25+
}
26+
]
27+
}

0 commit comments

Comments
 (0)