Skip to content

Commit 3a0b747

Browse files
authored
feat: add page chunking (#337)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent ebd9147 commit 3a0b747

File tree

6 files changed

+904
-1
lines changed

6 files changed

+904
-1
lines changed

docling_core/transforms/chunker/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@
1111
DocMeta,
1212
HierarchicalChunker,
1313
)
14+
from docling_core.transforms.chunker.page_chunker import PageChunker
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Page-based chunker implementation: each chunk corresponds to a single page."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Any, Iterator
6+
7+
from pydantic import ConfigDict
8+
from typing_extensions import override
9+
10+
from docling_core.transforms.chunker import BaseChunker, DocChunk, DocMeta
11+
from docling_core.transforms.chunker.hierarchical_chunker import (
12+
ChunkingSerializerProvider,
13+
)
14+
from docling_core.types import DoclingDocument as DLDocument
15+
16+
17+
class PageChunker(BaseChunker):
    """Chunker that emits one chunk for every page with serialized text.

    When the document has no pages, the whole document is serialized and
    emitted as a single chunk instead.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Supplies the document serializer used to render each page's text.
    serializer_provider: ChunkingSerializerProvider = ChunkingSerializerProvider()

    @override
    def chunk(
        self,
        dl_doc: DLDocument,
        **kwargs: Any,
    ) -> Iterator[DocChunk]:
        """Split the given document into page-level chunks.

        Args:
            dl_doc: Document to chunk.
            **kwargs: Accepted for interface compatibility; not used here.

        Yields:
            One chunk per page (in ascending page-number order) whose
            serialization is non-empty, or a single chunk for the whole
            document when it has no pages. Nothing is yielded for empty text.
        """
        serializer = self.serializer_provider.get_serializer(doc=dl_doc)

        def build_chunk(result):
            # Wrap one serialization result into a chunk carrying its items.
            return DocChunk(
                text=result.text,
                meta=DocMeta(
                    doc_items=result.get_unique_doc_items(),
                    headings=None,
                    captions=None,
                    origin=dl_doc.origin,
                ),
            )

        if not dl_doc.pages:
            # no page information: the entire document becomes a single chunk
            whole_doc_result = serializer.serialize()
            if whole_doc_result.text:
                yield build_chunk(whole_doc_result)
            return

        for page_no in sorted(dl_doc.pages):
            page_result = serializer.serialize(pages={page_no})
            # pages that serialize to empty text produce no chunk
            if page_result.text:
                yield build_chunk(page_result)

docling_core/transforms/serializer/base.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ class SerializationResult(BaseModel):
3939
spans: list[Span] = []
4040
# group: Optional[GroupItem] = None # set when result reflects specific group item
4141

42+
def get_unique_doc_items(self) -> list[DocItem]:
    """Return the doc items of this result's spans, without duplicates.

    Items are identified by their ``self_ref``; for each reference only the
    first item encountered is kept, and the order of first occurrence in
    ``self.spans`` is preserved.
    """
    # dicts keep insertion order, so this both de-duplicates and orders
    first_item_by_ref: dict[str, DocItem] = {}
    for span in self.spans:
        item = span.item
        first_item_by_ref.setdefault(item.self_ref, item)
    return list(first_item_by_ref.values())
51+
4252

4353
class BaseTextSerializer(ABC):
4454
"""Base class for text item serializers."""

docling_core/transforms/serializer/common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ def serialize_doc(
285285

286286
def _serialize_body(self, **kwargs) -> SerializationResult:
287287
"""Serialize the document body."""
288-
subparts = self.get_parts()
288+
subparts = self.get_parts(**kwargs)
289289
res = self.serialize_doc(parts=subparts, **kwargs)
290290
return res
291291

0 commit comments

Comments
 (0)