Skip to content

Commit 39ae6cd

Browse files
authored
chore: minor refactor incl. explicit typing (#39)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 246627f commit 39ae6cd

File tree

5 files changed

+42
-20
lines changed

5 files changed

+42
-20
lines changed

docling_core/transforms/chunker/base.py

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,17 @@
44
#
55

66
"""Define base classes for chunking."""
7+
import re
78
from abc import ABC, abstractmethod
8-
from typing import Iterator, Optional
9+
from typing import Final, Iterator, Optional
910

10-
from pydantic import BaseModel, model_validator
11+
from pydantic import BaseModel, Field, field_validator
1112

1213
from docling_core.types import BoundingBox, Document
14+
from docling_core.types.base import _JSON_POINTER_REGEX
15+
16+
# (subset of) JSONPath format, e.g. "$.main-text[84]" (for migration purposes)
17+
_DEPRECATED_JSON_PATH_PATTERN: Final = re.compile(r"^\$\.([\w-]+)\[(\d+)\]$")
1318

1419

1520
def _create_path(pos: int, path_prefix: str = "main-text") -> str:
@@ -19,21 +24,21 @@ def _create_path(pos: int, path_prefix: str = "main-text") -> str:
1924
class Chunk(BaseModel):
2025
"""Data model for Chunk."""
2126

22-
path: str
27+
path: str = Field(pattern=_JSON_POINTER_REGEX)
2328
text: str
2429
heading: Optional[str] = None
2530

26-
@model_validator(mode="before")
31+
@field_validator("path", mode="before")
2732
@classmethod
28-
def _json_pointer_from_json_path(cls, data):
29-
path = data.get("path")
30-
if path.startswith("$."):
31-
parts = path.split("[")
32-
data["path"] = _create_path(
33-
pos=parts[1][:-1],
34-
path_prefix=parts[0][2:],
35-
)
36-
return data
33+
def _json_pointer_from_json_path(cls, path: str):
34+
if (match := _DEPRECATED_JSON_PATH_PATTERN.match(path)) is not None:
35+
groups = match.groups()
36+
if len(groups) == 2 and groups[0] is not None and groups[1] is not None:
37+
return _create_path(
38+
pos=int(groups[1]),
39+
path_prefix=groups[0],
40+
)
41+
return path
3742

3843

3944
class ChunkWithMetadata(Chunk):

docling_core/transforms/metadata_extractor/simple_metadata_extractor.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,13 +6,13 @@
66
"""Simple metadata extractor module."""
77

88

9-
from typing import Any
9+
from typing import Any, Final
1010

1111
from docling_core.transforms.metadata_extractor import BaseMetadataExtractor
1212
from docling_core.types import Document as DLDocument
1313

14-
_DL_DOC_HASH = "dl_doc_hash"
15-
_ORIGIN = "origin"
14+
_DL_DOC_HASH: Final[str] = "dl_doc_hash"
15+
_ORIGIN: Final[str] = "origin"
1616

1717

1818
class SimpleMetadataExtractor(BaseMetadataExtractor):

docling_core/types/base.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
"""Define common models across types."""
77
from datetime import datetime, timezone
88
from enum import Enum
9-
from typing import Generic, Hashable, List, Literal, Optional, TypeVar
9+
from typing import Final, Generic, Hashable, List, Literal, Optional, TypeVar
1010

1111
from pydantic import (
1212
AfterValidator,
@@ -28,6 +28,9 @@
2828
from docling_core.utils.alias import AliasModel
2929
from docling_core.utils.validators import validate_datetime, validate_unique_list
3030

31+
# (subset of) JSON Pointer URI fragment id format, e.g. "#/main-text/84":
32+
_JSON_POINTER_REGEX: Final[str] = r"^#(?:/([\w-]+)(?:/(\d+))?)?$"
33+
3134
LanguageT = TypeVar("LanguageT", bound=str)
3235
IdentifierTypeT = TypeVar("IdentifierTypeT", bound=str)
3336
DescriptionAdvancedT = TypeVar("DescriptionAdvancedT", bound=BaseModel)

docling_core/types/experimental/document.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from typing_extensions import Annotated
2121

2222
from docling_core.search.package import VERSION_PATTERN
23+
from docling_core.types.base import _JSON_POINTER_REGEX
2324
from docling_core.types.doc.tokens import DocumentToken
2425
from docling_core.types.experimental import BoundingBox, Size
2526
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
@@ -28,9 +29,6 @@
2829
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
2930
CURRENT_VERSION: Final = "1.0.0"
3031

31-
# (subset of) JSON Pointer URI fragment identifier format:
32-
_JSON_POINTER_REGEX = r"^#(/[\w\-]+(/\d+)?)?$"
33-
3432

3533
class BasePictureData(BaseModel): # TBD
3634
"""BasePictureData."""

test/test_chunk.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2024
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
from docling_core.transforms.chunker.base import Chunk
7+
8+
9+
def test_chunk_migration():
10+
input_path = "$.main-text[42]" # deprecated path format
11+
expected_path = "#/main-text/42"
12+
chunk = Chunk(
13+
path=input_path,
14+
text="foo",
15+
)
16+
assert chunk.path == expected_path

0 commit comments

Comments
 (0)