Skip to content

Commit f9645ee

Browse files
committed
Fix rebuild of validator per instance & cache bug
The schema loader is asked to retrieve a validator per instance, in order to support the interface of metaschema evaluation. However, this results in a complete rebuild of the validator even when there is one schema being used for many instances (as in the `--schemafile` usage mode). Introducing an LRU cache on the validator builder results in the same validator being reused (the cache is keyed on the settings/parameters, so changing them still yields a fresh validator). Additionally, fix a bug in the remote ref download caching in which the key used for lookup was incorrect (membership was tested with `full_uri`, but entries were stored and read under `uri`), rendering the cache completely ineffective. A new regression test confirms that the caching is fully effective by observing only the number of requests made by the program over "N" instancefiles, where N>=1.
1 parent 74b003b commit f9645ee

File tree

3 files changed

+72
-4
lines changed

3 files changed

+72
-4
lines changed

src/check_jsonschema/schema_loader/main.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import functools
34
import pathlib
45
import typing as t
56
import urllib.error
@@ -130,11 +131,21 @@ def get_validator(
130131
instance_doc: dict[str, t.Any],
131132
format_opts: FormatOptions,
132133
fill_defaults: bool,
134+
) -> jsonschema.protocols.Validator:
135+
return self._get_validator(format_opts, fill_defaults)
136+
137+
@functools.lru_cache
138+
def _get_validator(
139+
self,
140+
format_opts: FormatOptions,
141+
fill_defaults: bool,
133142
) -> jsonschema.protocols.Validator:
134143
retrieval_uri = self.get_schema_retrieval_uri()
135144
schema = self.get_schema()
136145

137146
schema_dialect = schema.get("$schema")
147+
if schema_dialect is not None and not isinstance(schema_dialect, str):
148+
schema_dialect = None
138149

139150
# format checker (which may be None)
140151
format_checker = make_format_checker(format_opts, schema_dialect)

src/check_jsonschema/schema_loader/resolver.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ def retrieve_reference(uri: str) -> referencing.Resource[Schema]:
7979
else:
8080
full_uri = uri
8181

82-
if full_uri in cache._cache:
83-
return cache[uri]
82+
if full_uri in cache:
83+
return cache[full_uri]
8484

8585
full_uri_scheme = urllib.parse.urlsplit(full_uri).scheme
8686
if full_uri_scheme in ("http", "https"):
@@ -100,8 +100,8 @@ def validation_callback(content: bytes) -> None:
100100
else:
101101
parsed_object = get_local_file(full_uri)
102102

103-
cache[uri] = parsed_object
104-
return cache[uri]
103+
cache[full_uri] = parsed_object
104+
return cache[full_uri]
105105

106106
return retrieve_reference
107107

tests/acceptance/test_remote_ref_resolution.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,3 +244,60 @@ def test_ref_resolution_with_custom_base_uri(run_line, tmp_path, check_passes):
244244
assert result.exit_code == 0, output
245245
else:
246246
assert result.exit_code == 1, output
247+
248+
249+
@pytest.mark.parametrize("num_instances", (1, 2, 10))
250+
@pytest.mark.parametrize("check_passes", (True, False))
251+
def test_remote_ref_resolution_callout_count_is_scale_free_in_instancefiles(
252+
run_line, tmp_path, num_instances, check_passes
253+
):
254+
"""
255+
Test that for any N > 1, validation of a schema with a ref against N instance files
256+
has exactly the same number of callouts as validation when N=1
257+
258+
This proves that the validator and caching are working correctly, and we aren't
259+
repeating callouts to rebuild state.
260+
"""
261+
schema_uri = "https://example.org/schemas/main.json"
262+
ref_uri = "https://example.org/schemas/title_schema.json"
263+
264+
main_schema = {
265+
"$id": schema_uri,
266+
"$schema": "http://json-schema.org/draft-07/schema",
267+
"properties": {
268+
"title": {"$ref": "./title_schema.json"},
269+
},
270+
"additionalProperties": False,
271+
}
272+
title_schema = {"type": "string"}
273+
responses.add("GET", schema_uri, json=main_schema)
274+
responses.add("GET", ref_uri, json=title_schema)
275+
276+
# write N documents
277+
instance_doc = {"title": "doc one" if check_passes else 2}
278+
instance_paths = []
279+
for i in range(num_instances):
280+
instance_path = tmp_path / f"instance{i}.json"
281+
instance_path.write_text(json.dumps(instance_doc))
282+
instance_paths.append(str(instance_path))
283+
284+
result = run_line(
285+
[
286+
"check-jsonschema",
287+
"--schemafile",
288+
schema_uri,
289+
]
290+
+ instance_paths
291+
)
292+
output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}"
293+
if check_passes:
294+
assert result.exit_code == 0, output
295+
else:
296+
assert result.exit_code == 1, output
297+
298+
# this is the moment of the "real" test run here:
299+
# no matter how many instances there were, there should only have been two calls
300+
# (one for the schema and one for the $ref)
301+
assert len(responses.calls) == 2
302+
assert len([c for c in responses.calls if c.request.url == schema_uri]) == 1
303+
assert len([c for c in responses.calls if c.request.url == ref_uri]) == 1

0 commit comments

Comments
 (0)