88from __future__ import annotations
99
1010import logging
11- from typing import Any , ClassVar , Iterator , Optional
11+ import re
12+ from typing import Any , ClassVar , Final , Iterator , Literal , Optional
1213
1314from pandas import DataFrame
14- from pydantic import Field
15+ from pydantic import Field , StringConstraints , field_validator
16+ from typing_extensions import Annotated
1517
18+ from docling_core .search .package import VERSION_PATTERN
1619from docling_core .transforms .chunker import BaseChunk , BaseChunker , BaseMeta
17- from docling_core .types . doc import DoclingDocument as DLDocument
20+ from docling_core .types import DoclingDocument as DLDocument
1821from docling_core .types .doc .document import (
1922 DocItem ,
23+ DocumentOrigin ,
2024 LevelNumber ,
2125 ListItem ,
2226 SectionHeaderItem ,
2529)
2630from docling_core .types .doc .labels import DocItemLabel
2731
# Schema version of the chunk metadata model: used as the default value of
# DocMeta.version and as the reference in its version-compatibility validator.
32+ _VERSION : Final = "1.0.0"
33+
# String keys used as pydantic field aliases on DocMeta and in its
# excluded_embed / excluded_llm lists.
# NOTE(review): the use of _KEY_HEADINGS is outside the visible hunks -- confirm.
34+ _KEY_SCHEMA_NAME = "schema_name"
35+ _KEY_VERSION = "version"
2836_KEY_DOC_ITEMS = "doc_items"
2937_KEY_HEADINGS = "headings"
3038_KEY_CAPTIONS = "captions"
39+ _KEY_ORIGIN = "origin"
3140
# Module-level logger, named after this module.
3241_logger = logging .getLogger (__name__ )
3342
3443
3544class DocMeta (BaseMeta ):
36- """Data model for Hierarchical Chunker metadata."""
45+ """Data model for Hierarchical Chunker chunk metadata."""
3746
47+ schema_name : Literal ["docling_core.transforms.chunker.DocMeta" ] = Field (
48+ default = "docling_core.transforms.chunker.DocMeta" ,
49+ alias = _KEY_SCHEMA_NAME ,
50+ )
51+ version : Annotated [str , StringConstraints (pattern = VERSION_PATTERN , strict = True )] = (
52+ Field (
53+ default = _VERSION ,
54+ alias = _KEY_VERSION ,
55+ )
56+ )
3857 doc_items : list [DocItem ] = Field (
3958 alias = _KEY_DOC_ITEMS ,
4059 min_length = 1 ,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
4968 alias = _KEY_CAPTIONS ,
5069 min_length = 1 ,
5170 )
71+ origin : Optional [DocumentOrigin ] = Field (
72+ default = None ,
73+ alias = _KEY_ORIGIN ,
74+ )
5275
53- excluded_embed : ClassVar [list [str ]] = [_KEY_DOC_ITEMS ]
54- excluded_llm : ClassVar [list [str ]] = [_KEY_DOC_ITEMS ]
76+ excluded_embed : ClassVar [list [str ]] = [
77+ _KEY_SCHEMA_NAME ,
78+ _KEY_VERSION ,
79+ _KEY_DOC_ITEMS ,
80+ _KEY_ORIGIN ,
81+ ]
82+ excluded_llm : ClassVar [list [str ]] = [
83+ _KEY_SCHEMA_NAME ,
84+ _KEY_VERSION ,
85+ _KEY_DOC_ITEMS ,
86+ _KEY_ORIGIN ,
87+ ]
88+
@field_validator(_KEY_VERSION)
@classmethod
def check_version_is_compatible(cls, v: str) -> str:
    """Check that a metadata version is compatible with the current schema.

    A version is accepted when it matches ``VERSION_PATTERN``, has the same
    major component as ``_VERSION`` and a minor component that is not newer
    than the one in ``_VERSION``.

    Args:
        v: Version string supplied for the ``version`` field.

    Returns:
        The current schema version ``_VERSION`` (not ``v``), so an accepted
        value is always stored as the current version.

    Raises:
        ValueError: If ``v`` or ``_VERSION`` does not match
            ``VERSION_PATTERN``, the major components differ, or the minor
            component of ``v`` is greater than the current one.
    """
    current_match = re.match(VERSION_PATTERN, _VERSION)
    doc_match = re.match(VERSION_PATTERN, v)
    if (
        doc_match is None
        or current_match is None
        or doc_match["major"] != current_match["major"]
        # Compare minors numerically: as strings the ordering is
        # lexicographic, so "10" > "9" would be False and "9" > "10" True.
        # Assumes the "minor" group only captures digits (as in a semver
        # pattern); a non-numeric capture would still surface as ValueError.
        or int(doc_match["minor"]) > int(current_match["minor"])
    ):
        raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
    return _VERSION
55104
56105
57106class DocChunk (BaseChunk ):
@@ -129,6 +178,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
129178 for k in sorted (heading_by_level )
130179 ]
131180 or None ,
181+ origin = dl_doc .origin ,
132182 ),
133183 )
134184 list_items = [] # reset
@@ -171,6 +221,7 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
171221 headings = [heading_by_level [k ] for k in sorted (heading_by_level )]
172222 or None ,
173223 captions = captions ,
224+ origin = dl_doc .origin ,
174225 ),
175226 )
176227 yield c
@@ -182,5 +233,6 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
182233 doc_items = list_items ,
183234 headings = [heading_by_level [k ] for k in sorted (heading_by_level )]
184235 or None ,
236+ origin = dl_doc .origin ,
185237 ),
186238 )
0 commit comments