Skip to content

Commit d09fe7e

Browse files
authored
feat: extend chunk meta with schema, version, origin (#49)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 1388e67 commit d09fe7e

File tree

5 files changed

+1880
-236
lines changed

5 files changed

+1880
-236
lines changed

docling_core/transforms/chunker/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,7 @@
66
"""Define the chunker types."""
77

88
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9-
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
9+
from docling_core.transforms.chunker.hierarchical_chunker import (
10+
DocMeta,
11+
HierarchicalChunker,
12+
)

docling_core/transforms/chunker/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414

1515
class BaseMeta(BaseModel):
16-
"""Metadata base class."""
16+
"""Chunk metadata base class."""
1717

1818
excluded_embed: ClassVar[list[str]] = []
1919
excluded_llm: ClassVar[list[str]] = []

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,19 @@
88
from __future__ import annotations
99

1010
import logging
11-
from typing import Any, ClassVar, Iterator, Optional
11+
import re
12+
from typing import Any, ClassVar, Final, Iterator, Literal, Optional
1213

1314
from pandas import DataFrame
14-
from pydantic import Field
15+
from pydantic import Field, StringConstraints, field_validator
16+
from typing_extensions import Annotated
1517

18+
from docling_core.search.package import VERSION_PATTERN
1619
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
17-
from docling_core.types.doc import DoclingDocument as DLDocument
20+
from docling_core.types import DoclingDocument as DLDocument
1821
from docling_core.types.doc.document import (
1922
DocItem,
23+
DocumentOrigin,
2024
LevelNumber,
2125
ListItem,
2226
SectionHeaderItem,
@@ -25,16 +29,31 @@
2529
)
2630
from docling_core.types.doc.labels import DocItemLabel
2731

32+
_VERSION: Final = "1.0.0"
33+
34+
_KEY_SCHEMA_NAME = "schema_name"
35+
_KEY_VERSION = "version"
2836
_KEY_DOC_ITEMS = "doc_items"
2937
_KEY_HEADINGS = "headings"
3038
_KEY_CAPTIONS = "captions"
39+
_KEY_ORIGIN = "origin"
3140

3241
_logger = logging.getLogger(__name__)
3342

3443

3544
class DocMeta(BaseMeta):
36-
"""Data model for Hierarchical Chunker metadata."""
45+
"""Data model for Hierarchical Chunker chunk metadata."""
3746

47+
schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
48+
default="docling_core.transforms.chunker.DocMeta",
49+
alias=_KEY_SCHEMA_NAME,
50+
)
51+
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
52+
Field(
53+
default=_VERSION,
54+
alias=_KEY_VERSION,
55+
)
56+
)
3857
doc_items: list[DocItem] = Field(
3958
alias=_KEY_DOC_ITEMS,
4059
min_length=1,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
4968
alias=_KEY_CAPTIONS,
5069
min_length=1,
5170
)
71+
origin: Optional[DocumentOrigin] = Field(
72+
default=None,
73+
alias=_KEY_ORIGIN,
74+
)
5275

53-
excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
54-
excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
76+
excluded_embed: ClassVar[list[str]] = [
77+
_KEY_SCHEMA_NAME,
78+
_KEY_VERSION,
79+
_KEY_DOC_ITEMS,
80+
_KEY_ORIGIN,
81+
]
82+
excluded_llm: ClassVar[list[str]] = [
83+
_KEY_SCHEMA_NAME,
84+
_KEY_VERSION,
85+
_KEY_DOC_ITEMS,
86+
_KEY_ORIGIN,
87+
]
88+
89+
@field_validator(_KEY_VERSION)
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check if this meta item version is compatible with current version.

    A version is accepted when it matches VERSION_PATTERN, its major
    component equals that of the current schema version, and its minor
    component is not newer than the current one.

    Args:
        v: Version string of the metadata being validated.

    Returns:
        The current schema version (_VERSION); an accepted input value is
        replaced by the current version.

    Raises:
        ValueError: If either version does not match VERSION_PATTERN, the
            major components differ, or the minor component of ``v`` is
            greater than the current one.
    """
    current_match = re.match(VERSION_PATTERN, _VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        # Compare components numerically: as strings "10" sorts before "9",
        # which would accept 1.10.x against 1.9.x and reject the reverse.
        # NOTE(review): assumes the "major"/"minor" groups of VERSION_PATTERN
        # capture decimal digits only -- confirm against its definition.
        or int(doc_match["major"]) != int(current_match["major"])
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
    return _VERSION
55104

56105

57106
class DocChunk(BaseChunk):
@@ -129,6 +178,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
129178
for k in sorted(heading_by_level)
130179
]
131180
or None,
181+
origin=dl_doc.origin,
132182
),
133183
)
134184
list_items = [] # reset
@@ -171,6 +221,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
171221
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
172222
or None,
173223
captions=captions,
224+
origin=dl_doc.origin,
174225
),
175226
)
176227
yield c
@@ -182,5 +233,6 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
182233
doc_items=list_items,
183234
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
184235
or None,
236+
origin=dl_doc.origin,
185237
),
186238
)

0 commit comments

Comments
 (0)