Skip to content

Commit f7feab9

Browse files
committed
Caching for canonicalised JSON
1 parent 7fa9377 commit f7feab9

File tree

1 file changed

+58
-4
lines changed

1 file changed

+58
-4
lines changed

src/hypothesis_jsonschema/_canonicalise.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212
most things by construction instead of by filtering. That's the difference
1313
between "I'd like it to be faster" and "doesn't finish at all".
1414
"""
15-
15+
import functools
1616
import itertools
1717
import json
1818
import math
1919
import re
2020
from copy import deepcopy
2121
from json.encoder import _make_iterencode, encode_basestring_ascii # type: ignore
22-
from typing import Any, Dict, List, NoReturn, Optional, Tuple, Union
22+
from typing import Any, Callable, Dict, List, NoReturn, Optional, Tuple, Type, Union
2323

2424
import jsonschema
2525
from hypothesis.errors import InvalidArgument
@@ -108,9 +108,63 @@ class HypothesisRefResolutionError(jsonschema.exceptions.RefResolutionError):
108108
pass
109109

110110

111-
def encode_canonical_json(value: JSONType) -> str:
111+
def _make_cache_key(
    value: JSONType,
) -> Tuple[Type, Union[Tuple, None, bool, float, str]]:
    """Make a hashable object from any JSON value.

    Mutable containers are recursively converted to tuples, and every level
    is tagged with its type as a discriminant.  Object members are ordered by
    key, so two dicts that differ only in insertion order produce the same
    cache key.

    NOTE(review): ordering by key assumes the serialiser treats such dicts
    as identical (it is invoked with ``sort_keys=True``) - confirm against
    the encoder.  Like ``sort_keys=True``, it also requires the keys of one
    object to be mutually orderable.
    """
    if isinstance(value, dict):
        # Sort on the key alone: keys are unique within a dict, and the
        # nested cache keys are not guaranteed to be orderable.
        members = sorted(value.items(), key=lambda item: item[0])
        return (dict, tuple((k, _make_cache_key(v)) for k, v in members))
    if isinstance(value, list):
        return (list, tuple(map(_make_cache_key, value)))
    # Primitive types are hashable.
    # The type tag keeps apart values that compare equal across types,
    # e.g. `0 == False` and `1 == 1.0`.
    return (type(value), value)
125+
126+
127+
class HashedJSON:
    """A proxy that holds a JSON value.

    Adds a capability for the inner value to be cached, loosely based on `functools._HashedSeq`.

    Two proxies are equal only when their structural cache keys are equal.
    Comparing the hashes alone is not enough: distinct JSON values can share
    a hash, and a cache lookup would then return the result stored for a
    different value.
    """

    __slots__ = ("value", "hashedvalue", "cache_key")

    def __init__(self, value: JSONType):
        # The original JSON value, handed to the wrapped function on a miss.
        self.value = value
        # Immutable snapshot of the value's structure, used for equality.
        self.cache_key = _make_cache_key(value)
        # `hash` is called multiple times on cache miss, therefore it is evaluated only once
        self.hashedvalue = hash(self.cache_key)

    def __hash__(self) -> int:
        return self.hashedvalue

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, HashedJSON):
            return NotImplemented
        # Cheap integer comparison first; the key comparison settles collisions.
        return (
            self.hashedvalue == other.hashedvalue
            and self.cache_key == other.cache_key
        )
147+
148+
149+
def cached_json(func: Callable[[HashedJSON], str]) -> Callable[[JSONType], str]:
    """Memoise a serialiser such as `encode_canonical_json`.

    The same schemas are encoded many times during canonicalisation, so
    caching the results has a visible performance impact.  The returned
    callable accepts a plain JSON value and wraps it in a `HashedJSON`
    proxy before delegating to the memoised function.
    """
    memoised = functools.lru_cache()(func)

    def wrapped(value: JSONType) -> str:
        # Wrap the raw value so that `lru_cache` receives a hashable argument.
        return memoised(HashedJSON(value))

    # Same effect as decorating `wrapped` with `functools.wraps(memoised)`.
    return functools.update_wrapper(wrapped, memoised)
162+
163+
164+
@cached_json
def encode_canonical_json(value: HashedJSON) -> str:
    """Serialise a JSON value to its canonical string form, for uniqueness testing.

    The `cached_json` decorator wraps the incoming value in a `HashedJSON`
    proxy; the raw JSON value is read back from its `value` attribute here.
    """
    raw = value.value
    return json.dumps(raw, sort_keys=True, cls=CanonicalisingJsonEncoder)
114168

115169

116170
def sort_key(value: JSONType) -> Tuple[int, float, Union[float, str]]:

0 commit comments

Comments
 (0)