Skip to content

Commit cc9fcb1

Browse files
Stranger6667 authored and Zac-HD committed
Caching for canonicalised JSON
1 parent 7b352de commit cc9fcb1

File tree

1 file changed

+57
-4
lines changed

1 file changed

+57
-4
lines changed

src/hypothesis_jsonschema/_canonicalise.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212
most things by construction instead of by filtering. That's the difference
1313
between "I'd like it to be faster" and "doesn't finish at all".
1414
"""
15-
15+
import functools
1616
import itertools
1717
import json
1818
import math
1919
import re
2020
from copy import deepcopy
2121
from json.encoder import _make_iterencode, encode_basestring_ascii # type: ignore
22-
from typing import Any, Dict, List, NoReturn, Optional, Tuple, Union
22+
from typing import Any, Callable, Dict, List, NoReturn, Optional, Tuple, Type, Union
2323

2424
import jsonschema
2525
from hypothesis.errors import InvalidArgument
@@ -116,9 +116,62 @@ class HypothesisRefResolutionError(jsonschema.exceptions.RefResolutionError):
116116
pass
117117

118118

119-
def encode_canonical_json(value: JSONType) -> str:
119+
def _make_cache_key(
    value: JSONType,
) -> Tuple[Type, Union[Tuple, None, bool, float, str]]:
    """Make a hashable object from any JSON value.

    Dicts and lists are recursively converted to tuples, and every value is
    paired with its type as a discriminant.  Dict items are kept in iteration
    order, so two dicts that differ only in key order produce different keys.
    """
    if isinstance(value, dict):
        return (dict, tuple((k, _make_cache_key(v)) for k, v in value.items()))
    if isinstance(value, list):
        return (list, tuple(map(_make_cache_key, value)))
    # Primitive types are hashable.
    # `type` is needed to distinguish values that hash and compare equal
    # across types, e.g. 0 and False.
    return (type(value), value)


class HashedJSON:
    """A proxy that holds a JSON value so that it can be used as a cache key.

    Loosely based on `functools._HashedSeq`: the hash is computed once, at
    construction time, from an immutable structural key built from the value.

    Equality compares the structural keys, not just the hashes.  Distinct
    values can share a hash (in CPython ``hash(-1) == hash(-2)``), and a
    hash-only comparison would make a cache return the result stored for a
    different value.
    """

    __slots__ = ("value", "key", "hashedvalue")

    def __init__(self, value: JSONType):
        self.value = value
        # Immutable snapshot of the value, used for equality checks.
        self.key = _make_cache_key(value)
        # `hash` is called multiple times on cache miss, therefore it is evaluated only once
        self.hashedvalue = hash(self.key)

    def __hash__(self) -> int:
        return self.hashedvalue

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, HashedJSON):
            return NotImplemented
        # Cheap integer comparison first; the structural comparison only runs
        # when the hashes match, i.e. on a real hit or a hash collision.
        return self.hashedvalue == other.hashedvalue and self.key == other.key
155+
156+
157+
def cached_json(func: Callable[[HashedJSON], str]) -> Callable[[JSONType], str]:
    """Decorator that memoises a JSON encoder such as `encode_canonical_json`.

    Canonicalisation encodes the same schemas over and over, so results are
    kept in an LRU cache of up to 1024 entries.  Raw JSON values are not
    hashable, so each argument is wrapped in `HashedJSON` before the lookup.
    """
    memoised = functools.lru_cache(maxsize=1024)(func)

    def wrapped(value: JSONType) -> str:
        hashable = HashedJSON(value)
        return memoised(hashable)

    # Equivalent to decorating `wrapped` with `functools.wraps(memoised)`.
    return functools.update_wrapper(wrapped, memoised)
169+
170+
171+
@cached_json
def encode_canonical_json(value: HashedJSON) -> str:
    """Serialise a JSON value to its canonical string form, for uniqueness testing.

    The `cached_json` decorator wraps the caller's value in `HashedJSON`;
    the underlying JSON value is unwrapped here before it is dumped.
    """
    payload = value.value
    return json.dumps(payload, cls=CanonicalisingJsonEncoder, sort_keys=True)
122175

123176

124177
def sort_key(value: JSONType) -> Tuple[int, float, Union[float, str]]:

0 commit comments

Comments
 (0)