Skip to content

Commit f0be592

Browse files
authored
Merge pull request #278 from candleindark/validate-json
Improve (better arguments validation, avoiding repeated creation of validator objects, etc) `_validate_obj_json()` in `metadata.py` and supporting funcs
2 parents 87cc55f + e32f32d commit f0be592

File tree

4 files changed

+902
-46
lines changed

4 files changed

+902
-46
lines changed

dandischema/metadata.py

Lines changed: 116 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
from copy import deepcopy
22
from enum import Enum
3-
from functools import lru_cache
3+
from functools import cache
44
from inspect import isclass
55
import json
66
from pathlib import Path
77
from typing import Any, Dict, Iterable, Optional, TypeVar, Union, cast, get_args
88

9-
import jsonschema
9+
from jsonschema.protocols import Validator as JsonschemaValidator
1010
import pydantic
1111
import requests
1212

@@ -21,12 +21,16 @@
2121
from .utils import (
2222
TransitionalGenerateJsonSchema,
2323
_ensure_newline,
24+
dandi_jsonschema_validator,
25+
json_object_adapter,
2426
sanitize_value,
2527
strip_top_level_optional,
28+
validate_json,
2629
version2tuple,
2730
)
2831

29-
schema_map = {
32+
# A mapping of the schema keys of DANDI models to the names of their JSON schema files
33+
SCHEMA_MAP = {
3034
"Dandiset": "dandiset.json",
3135
"PublishedDandiset": "published-dandiset.json",
3236
"Asset": "asset.json",
@@ -130,7 +134,7 @@ def publish_model_schemata(releasedir: Union[str, Path]) -> Path:
130134
version = models.get_schema_version()
131135
vdir = Path(releasedir, version)
132136
vdir.mkdir(exist_ok=True, parents=True)
133-
for class_, filename in schema_map.items():
137+
for class_, filename in SCHEMA_MAP.items():
134138
(vdir / filename).write_text(
135139
_ensure_newline(
136140
json.dumps(
@@ -147,49 +151,122 @@ def publish_model_schemata(releasedir: Union[str, Path]) -> Path:
147151
return vdir
148152

149153

150-
def _validate_obj_json(data: dict, schema: dict, missing_ok: bool = False) -> None:
151-
validator: Union[jsonschema.Draft202012Validator, jsonschema.Draft7Validator]
152-
153-
if version2tuple(data["schemaVersion"]) >= version2tuple("0.6.5"):
154-
# schema version 0.7.0 and above is produced with Pydantic V2
155-
# which is compliant with JSON Schema Draft 2020-12
156-
validator = jsonschema.Draft202012Validator(
157-
schema, format_checker=jsonschema.Draft202012Validator.FORMAT_CHECKER
158-
)
159-
else:
160-
validator = jsonschema.Draft7Validator(
161-
schema, format_checker=jsonschema.Draft7Validator.FORMAT_CHECKER
162-
)
163-
164-
error_list = []
165-
for error in sorted(validator.iter_errors(data), key=str):
166-
if missing_ok and "is a required property" in error.message:
167-
continue
168-
error_list.append(error)
169-
if error_list:
170-
raise JsonschemaValidationError(error_list)
154+
def _validate_obj_json(
155+
instance: Any, validator: JsonschemaValidator, *, missing_ok: bool = False
156+
) -> None:
157+
"""
158+
Validate a data instance using a jsonschema validator with an option to filter out
159+
errors related to missing required properties
160+
161+
:param instance: The data instance to validate
162+
:param validator: The JSON schema validator to use
163+
:param missing_ok: Indicates whether to filter out errors related to missing
164+
required properties
165+
:raises JsonschemaValidationError: If the metadata instance is invalid, and there
166+
are errors detected in the validation, optionally discounting errors
167+
related to missing required properties. The raised exception contains
168+
a list of `jsonschema.exceptions.ValidationError` instances representing all the
169+
(remaining) errors detected in the validation
170+
"""
171+
try:
172+
validate_json(instance, validator)
173+
except JsonschemaValidationError as e:
174+
if missing_ok:
175+
remaining_errs = [
176+
err for err in e.errors if "is a required property" not in err.message
177+
]
178+
# Raise an exception only if there are errors left after filtering
179+
if remaining_errs:
180+
raise JsonschemaValidationError(remaining_errs) from e
181+
else:
182+
raise e
171183

172184

173185
def _validate_dandiset_json(data: dict, schema_dir: Union[str, Path]) -> None:
174186
with Path(schema_dir, "dandiset.json").open() as fp:
175187
schema = json.load(fp)
176-
_validate_obj_json(data, schema)
188+
_validate_obj_json(data, dandi_jsonschema_validator(schema))
177189

178190

179191
def _validate_asset_json(data: dict, schema_dir: Union[str, Path]) -> None:
180192
with Path(schema_dir, "asset.json").open() as fp:
181193
schema = json.load(fp)
182-
_validate_obj_json(data, schema)
194+
_validate_obj_json(data, dandi_jsonschema_validator(schema))
195+
183196

197+
@cache
198+
def _get_jsonschema_validator(
199+
schema_version: str, schema_key: str
200+
) -> JsonschemaValidator:
201+
"""
202+
Get jsonschema validator for validating instances against a specific DANDI schema
203+
204+
:param schema_version: The version of the specific DANDI schema
205+
:param schema_key: The schema key that identifies the specific DANDI schema
206+
:return: The jsonschema validator appropriate for validating instances against the
207+
specific DANDI schema
208+
:raises ValueError: If the provided schema version is not among the allowed versions,
209+
`ALLOWED_VALIDATION_SCHEMAS`
210+
:raises ValueError: If the provided schema key is not among the keys in `SCHEMA_MAP`
211+
:raises requests.HTTPError: If the schema cannot be fetched from the `dandi/schema`
212+
repository
213+
:raises RuntimeError: If the fetched schema is not a valid JSON object
214+
"""
215+
if schema_version not in ALLOWED_VALIDATION_SCHEMAS:
216+
raise ValueError(
217+
f"DANDI schema version {schema_version} is not allowed. "
218+
f"Allowed are: {', '.join(ALLOWED_VALIDATION_SCHEMAS)}."
219+
)
220+
if schema_key not in SCHEMA_MAP:
221+
raise ValueError(
222+
f"Schema key must be one of {', '.join(map(repr, SCHEMA_MAP.keys()))}"
223+
)
184224

185-
@lru_cache
186-
def _get_schema(schema_version: str, schema_name: str) -> Any:
187-
r = requests.get(
188-
"https://raw.githubusercontent.com/dandi/schema/"
189-
f"master/releases/{schema_version}/{schema_name}"
225+
# Fetch the schema from the `dandi/schema` repository
226+
schema_url = (
227+
f"https://raw.githubusercontent.com/dandi/schema/"
228+
f"master/releases/{schema_version}/{SCHEMA_MAP[schema_key]}"
190229
)
230+
r = requests.get(schema_url)
191231
r.raise_for_status()
192-
return r.json()
232+
schema = r.json()
233+
234+
# Validate that the retrieved schema is a valid JSON object, i.e., a dictionary
235+
# This step is needed because the `jsonschema` package requires the schema to be a
236+
# `Mapping[str, Any]` object
237+
try:
238+
json_object_adapter.validate_python(schema)
239+
except pydantic.ValidationError as e:
240+
msg = (
241+
f"The JSON schema at {schema_url} is not a valid JSON object. "
242+
f"Received: {schema}"
243+
)
244+
raise RuntimeError(msg) from e
245+
246+
# Create a jsonschema validator for the schema
247+
return dandi_jsonschema_validator(schema)
248+
249+
250+
@cache
251+
def _get_jsonschema_validator_local(schema_key: str) -> JsonschemaValidator:
252+
"""
253+
Get jsonschema validator for validating instances against a specific DANDI schema
254+
generated from the corresponding locally defined Pydantic model
255+
256+
:param schema_key: The schema key that identifies the specific DANDI schema
257+
:raises ValueError: If the provided schema key is not among the keys in `SCHEMA_MAP`
258+
"""
259+
if schema_key not in SCHEMA_MAP:
260+
raise ValueError(
261+
f"Schema key must be one of {', '.join(map(repr, SCHEMA_MAP.keys()))}"
262+
)
263+
264+
# The pydantic model with the specified schema key
265+
m: type[pydantic.BaseModel] = getattr(models, schema_key)
266+
267+
return dandi_jsonschema_validator(
268+
m.model_json_schema(schema_generator=TransitionalGenerateJsonSchema)
269+
)
193270

194271

195272
def validate(
@@ -232,25 +309,22 @@ def validate(
232309
if schema_key is None:
233310
raise ValueError("Provided object has no known schemaKey")
234311
schema_version = schema_version or obj.get("schemaVersion")
235-
if schema_version not in ALLOWED_VALIDATION_SCHEMAS and schema_key in schema_map:
312+
if schema_version not in ALLOWED_VALIDATION_SCHEMAS and schema_key in SCHEMA_MAP:
236313
raise ValueError(
237314
f"Metadata version {schema_version} is not allowed. "
238315
f"Allowed are: {', '.join(ALLOWED_VALIDATION_SCHEMAS)}."
239316
)
240317
if json_validation:
241318
if schema_version == DANDI_SCHEMA_VERSION:
242-
klass = getattr(models, schema_key)
243-
schema = klass.model_json_schema(
244-
schema_generator=TransitionalGenerateJsonSchema
245-
)
319+
jvalidator = _get_jsonschema_validator_local(schema_key)
246320
else:
247-
if schema_key not in schema_map:
321+
if schema_key not in SCHEMA_MAP:
248322
raise ValueError(
249323
"Only dandisets and assets can be validated "
250324
"using json schema for older versions"
251325
)
252-
schema = _get_schema(schema_version, schema_map[schema_key])
253-
_validate_obj_json(obj, schema, missing_ok)
326+
jvalidator = _get_jsonschema_validator(schema_version, schema_key)
327+
_validate_obj_json(obj, jvalidator, missing_ok=missing_ok)
254328
klass = getattr(models, schema_key)
255329
try:
256330
klass(**obj)
@@ -358,8 +432,7 @@ def migrate(
358432
# Optionally validate the instance against the DANDI schema it specifies
359433
# before migration
360434
if not skip_validation:
361-
schema = _get_schema(obj_ver, "dandiset.json")
362-
_validate_obj_json(obj, schema)
435+
_validate_obj_json(obj, _get_jsonschema_validator(obj_ver, "Dandiset"))
363436

364437
obj_migrated = deepcopy(obj)
365438

0 commit comments

Comments
 (0)