Skip to content

Commit 9698d30

Browse files
authored
feat: add hierarchical chunker (#18)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c482610 commit 9698d30

File tree

10 files changed

+1259
-4
lines changed

10 files changed

+1259
-4
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2024
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Data transformations package."""
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2024
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Define the chunker types."""
7+
8+
from docling_core.transforms.chunker.base import ( # noqa
9+
BaseChunker,
10+
Chunk,
11+
ChunkWithMetadata,
12+
)
13+
from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
14+
HierarchicalChunker,
15+
)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2024
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Define base classes for chunking."""
7+
from abc import ABC, abstractmethod
8+
from typing import Iterator, Optional
9+
10+
from pydantic import BaseModel
11+
12+
from docling_core.types import BoundingBox, Document
13+
14+
15+
class Chunk(BaseModel):
    """Basic chunk record: a piece of text plus the path it is associated with.

    Both fields are required strings.
    """

    # Path string identifying the chunk (exact format is set by the chunker
    # that produces it -- not visible from this class).
    path: str
    # Text content of the chunk.
    text: str
20+
21+
22+
class ChunkWithMetadata(Chunk):
    """Data model for Chunk including metadata.

    Extends :class:`Chunk` with two optional metadata fields. Both default to
    ``None`` so that a chunk without this metadata can be created without
    passing the fields explicitly.
    """

    # NOTE: under pydantic v2 an ``Optional[...]`` annotation without a default
    # is still a *required* field; the explicit ``None`` defaults make these
    # fields genuinely optional (and are harmless under pydantic v1).

    # Page associated with the chunk, if known (presumably a page number --
    # confirm against the chunker that fills it in).
    page: Optional[int] = None
    # Bounding box associated with the chunk, if known.
    bbox: Optional[BoundingBox] = None
27+
28+
29+
class BaseChunker(BaseModel, ABC):
    """Abstract base model that every chunker derives from.

    Concrete chunkers are expected to override :meth:`chunk`.
    """

    @abstractmethod
    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
        """Split the given document into chunks.

        Args:
            dl_doc (Document): document to chunk
            **kwargs: additional keyword arguments; not used by this abstract
                implementation

        Raises:
            NotImplementedError: always, as this base method has no
                implementation of its own

        Yields:
            Iterator[Chunk]: iterator over extracted chunks
        """
        raise NotImplementedError

0 commit comments

Comments
 (0)