Skip to content

Commit 656f563

Browse files
authored
fix: fix definition issues in record type (#2)
* fix(test): set static typing compatible to python 3.10
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
* style: enforce some pre-commit hooks on tests
  Enforce pre-commit hooks black, isort, autoflake, and mypy on test modules.
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
* fix(rec): fix definition issues in attribute, predicate, subject
  Remove duplicate generic types across base and predicate modules. Create an identifier class for subject names. Remove unnecessary type variables in attribute model.
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
* docs: refer to Docling data objects
  Signed-off-by: Cesar Berrospi Ramis <[email protected]>
---------
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent 9134ffe commit 656f563

29 files changed

+549
-411
lines changed

.pre-commit-config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,31 @@ repos:
44
hooks:
55
- id: black
66
name: Black
7-
entry: poetry run black docling_core
7+
entry: poetry run black docling_core test
88
pass_filenames: false
99
language: system
1010
files: '\.py$'
1111
- repo: local
1212
hooks:
1313
- id: isort
1414
name: isort
15-
entry: poetry run isort docling_core
15+
entry: poetry run isort docling_core test
1616
pass_filenames: false
1717
language: system
1818
files: '\.py$'
1919
- repo: local
2020
hooks:
2121
- id: autoflake
2222
name: autoflake
23-
entry: poetry run autoflake docling_core
23+
entry: poetry run autoflake docling_core test
2424
pass_filenames: false
2525
language: system
2626
files: '\.py$'
2727
- repo: local
2828
hooks:
2929
- id: mypy
3030
name: MyPy
31-
entry: poetry run mypy docling_core
31+
entry: poetry run mypy docling_core test
3232
pass_filenames: false
3333
language: system
3434
files: '\.py$'

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is
183183

184184
## Documentation
185185

186-
We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Deep Search objects.
186+
We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects.
187187

188188
The documentation pages are stored in the [docs](./docs/) folder and are updated at every commit, as part of the `pre-commit` check hooks.
189189
To generate the documentation on-demand, run:

docling_core/types/base.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
4040
ProvenanceTypeT = TypeVar("ProvenanceTypeT", bound=str)
4141
CollectionNameTypeT = TypeVar("CollectionNameTypeT", bound=str)
42+
Coordinates = Annotated[
43+
list[float],
44+
Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
45+
]
4246
T = TypeVar("T", bound=Hashable)
4347

4448
UniqueList = Annotated[
@@ -61,7 +65,7 @@
6165

6266

6367
class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
64-
"""Unique identifier of a Deep Search data object."""
68+
"""Unique identifier of a Docling data object."""
6569

6670
type_: IdentifierTypeT = Field(
6771
alias="type",
@@ -81,7 +85,7 @@ class Identifier(AliasModel, Generic[IdentifierTypeT], extra="forbid"):
8185
alias="_name",
8286
title="_Name",
8387
description=(
84-
"A unique identifier of the data object across Deep Search, consisting of "
88+
"A unique identifier of the data object across Docling, consisting of "
8589
"the concatenation of type and value in lower case, separated by hash "
8690
"(#)."
8791
),
@@ -118,7 +122,7 @@ class Log(AliasModel, extra="forbid"):
118122
json_schema_extra=es_field(type="keyword", ignore_above=8191),
119123
)
120124
agent: StrictStr = Field(
121-
description="The Deep Search agent that performed the task, e.g., CCS or CXS.",
125+
description="The Docling agent that performed the task, e.g., CCS or CXS.",
122126
json_schema_extra=es_field(type="keyword", ignore_above=8191),
123127
)
124128
type_: StrictStr = Field(
@@ -138,7 +142,7 @@ class Log(AliasModel, extra="forbid"):
138142

139143

140144
class FileInfoObject(AliasModel):
141-
"""Filing information for any data object to be stored in a Deep Search database."""
145+
"""Filing information for any data object to be stored in a Docling database."""
142146

143147
filename: StrictStr = Field(
144148
description="The name of a persistent object that created this data object",
@@ -156,15 +160,15 @@ class FileInfoObject(AliasModel):
156160
document_hash: StrictStr = Field(
157161
description=(
158162
"A unique identifier of this data object within a collection of a "
159-
"Deep Search database"
163+
"Docling database"
160164
),
161165
alias="document-hash",
162166
json_schema_extra=es_field(type="keyword", ignore_above=8191),
163167
)
164168

165169

166170
class CollectionTypeEnum(str, Enum):
167-
"""Enumeration of valid Deep Search collection types."""
171+
"""Enumeration of valid Docling collection types."""
168172

169173
generic = "Generic"
170174
document = "Document"

docling_core/types/doc/document.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# SPDX-License-Identifier: MIT
44
#
55

6-
"""Models for the Deep Search Document data type."""
6+
"""Models for the Docling Document data type."""
77

88
from datetime import datetime
99
from typing import Generic, Optional, Union
@@ -352,7 +352,7 @@ class ExportedCCSDocument(
352352
CollectionNameTypeT,
353353
],
354354
):
355-
"""Document model for Deep Search."""
355+
"""Document model for Docling."""
356356

357357
obj_type: StrictStr = Field(
358358
"pdf-document",

docling_core/types/rec/attribute.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"""Define the model Attribute."""
77
from typing import Generic, Optional
88

9-
from pydantic import BaseModel, Field
9+
from pydantic import Field
1010
from typing_extensions import Annotated
1111

1212
from docling_core.search.mapping import es_field
@@ -16,23 +16,20 @@
1616
PredicateKeyTypeT,
1717
PredicateValueTypeT,
1818
ProvenanceTypeT,
19-
SubjectNameTypeT,
20-
SubjectTypeT,
2119
)
2220
from docling_core.types.rec.base import ProvenanceItem
2321
from docling_core.types.rec.predicate import Predicate
22+
from docling_core.utils.alias import AliasModel
2423

2524

2625
class Attribute(
27-
BaseModel,
26+
AliasModel,
2827
Generic[
2928
IdentifierTypeT,
3029
PredicateValueTypeT,
3130
PredicateKeyNameT,
3231
PredicateKeyTypeT,
3332
ProvenanceTypeT,
34-
SubjectTypeT,
35-
SubjectNameTypeT,
3633
],
3734
extra="forbid",
3835
):

docling_core/types/rec/predicate.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
"""Define the model Predicate."""
77
from datetime import datetime
8-
from typing import Annotated, Generic, Optional, TypeVar
8+
from typing import Annotated, Generic, Optional
99

1010
from pydantic import (
1111
BaseModel,
@@ -17,16 +17,14 @@
1717
)
1818

1919
from docling_core.search.mapping import es_field
20+
from docling_core.types.base import (
21+
Coordinates,
22+
PredicateKeyNameT,
23+
PredicateKeyTypeT,
24+
PredicateValueTypeT,
25+
)
2026
from docling_core.utils.alias import AliasModel
2127

22-
PredicateValueTypeT = TypeVar("PredicateValueTypeT", bound=str)
23-
PredicateKeyNameT = TypeVar("PredicateKeyNameT", bound=str)
24-
PredicateKeyTypeT = TypeVar("PredicateKeyTypeT", bound=str)
25-
Coordinates = Annotated[
26-
list[float],
27-
Field(min_length=2, max_length=2, json_schema_extra=es_field(type="geo_point")),
28-
]
29-
3028

3129
class NumericalValue(BaseModel, extra="forbid"):
3230
"""Model for numerical values."""
@@ -117,7 +115,7 @@ class PredicateValue(AliasModel, Generic[PredicateValueTypeT], extra="forbid"):
117115

118116

119117
class Predicate(
120-
BaseModel,
118+
AliasModel,
121119
Generic[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT],
122120
extra="forbid",
123121
):

docling_core/types/rec/record.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,6 @@ class Record(
8080
PredicateKeyNameT,
8181
PredicateKeyTypeT,
8282
ProvenanceTypeT,
83-
SubjectTypeT,
84-
SubjectNameTypeT,
8583
]
8684
]
8785
] = None

docling_core/types/rec/subject.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919
from docling_core.utils.alias import AliasModel
2020

2121

22+
class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]):
23+
"""Identifier of subject names."""
24+
25+
2226
class Subject(
2327
AliasModel,
2428
Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT],
@@ -53,7 +57,7 @@ class Subject(
5357
),
5458
json_schema_extra=es_field(type="keyword", ignore_above=8191),
5559
)
56-
names: list[Identifier[SubjectNameTypeT]] = Field(
60+
names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field(
5761
description=(
5862
"List of given names for this subject. They may not be unique across "
5963
"different subjects."

docling_core/utils/ds_generate_docs.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def _prepare_directory(folder: str, clean: bool = False) -> None:
4444

4545

4646
def generate_collection_jsonschema(folder: str):
47-
"""Generate the JSON schema of Deep Search collections and export them to a folder.
47+
"""Generate the JSON schema of Docling collections and export them to a folder.
4848
4949
Args:
5050
folder: The name of the directory.
@@ -58,7 +58,7 @@ def generate_collection_jsonschema(folder: str):
5858

5959

6060
def generate_collection_html(folder: str):
61-
"""Generate HTML pages documenting the data model of Deep Search collections.
61+
"""Generate HTML pages documenting the data model of Docling collections.
6262
6363
The JSON schemas files need to be in a folder and the generated HTML pages will be
6464
written in the same folder.
@@ -79,7 +79,7 @@ def generate_collection_html(folder: str):
7979

8080

8181
def generate_collection_markdown(folder: str):
82-
"""Generate Markdown pages documenting the data model of Deep Search collections.
82+
"""Generate Markdown pages documenting the data model of Docling collections.
8383
8484
The JSON schemas files need to be in a folder and the generated markdown pages will
8585
be written in the same folder.
@@ -101,7 +101,7 @@ def generate_collection_markdown(folder: str):
101101

102102

103103
def main() -> None:
104-
"""Generate the JSON Schema of Deep Search collections and export documentation."""
104+
"""Generate the JSON Schema of Docling collections and export documentation."""
105105
argparser = argparse.ArgumentParser()
106106
argparser.add_argument(
107107
"directory",

docs/Document.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,7 @@
781781
"x-es-type": "keyword"
782782
},
783783
"document-hash": {
784-
"description": "A unique identifier of this data object within a collection of a Deep Search database",
784+
"description": "A unique identifier of this data object within a collection of a Docling database",
785785
"title": "Document-Hash",
786786
"type": "string",
787787
"x-es-ignore_above": 8191,
@@ -1086,7 +1086,7 @@
10861086
},
10871087
"Identifier": {
10881088
"additionalProperties": false,
1089-
"description": "Unique identifier of a Deep Search data object.",
1089+
"description": "Unique identifier of a Docling data object.",
10901090
"properties": {
10911091
"type": {
10921092
"description": "A string representing a collection or database that contains this data object.",
@@ -1103,7 +1103,7 @@
11031103
"x-es-type": "keyword"
11041104
},
11051105
"_name": {
1106-
"description": "A unique identifier of the data object across Deep Search, consisting of the concatenation of type and value in lower case, separated by hash (#).",
1106+
"description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).",
11071107
"pattern": "^.+#.+$",
11081108
"title": "_Name",
11091109
"type": "string",
@@ -1139,7 +1139,7 @@
11391139
"x-es-type": "keyword"
11401140
},
11411141
"agent": {
1142-
"description": "The Deep Search agent that performed the task, e.g., CCS or CXS.",
1142+
"description": "The Docling agent that performed the task, e.g., CCS or CXS.",
11431143
"title": "Agent",
11441144
"type": "string",
11451145
"x-es-ignore_above": 8191,
@@ -1725,7 +1725,7 @@
17251725
"type": "object"
17261726
}
17271727
},
1728-
"description": "Document model for Deep Search.",
1728+
"description": "Document model for Docling.",
17291729
"properties": {
17301730
"_name": {
17311731
"title": " Name",

0 commit comments

Comments
 (0)