Skip to content

Commit 843be8c

Browse files
committed
Separate encoding module
1 parent cc9fcb1 commit 843be8c

File tree

5 files changed

+131
-120
lines changed

5 files changed

+131
-120
lines changed

src/hypothesis_jsonschema/_canonicalise.py

Lines changed: 3 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,19 @@
1212
most things by construction instead of by filtering. That's the difference
1313
between "I'd like it to be faster" and "doesn't finish at all".
1414
"""
15-
import functools
1615
import itertools
1716
import json
1817
import math
1918
import re
2019
from copy import deepcopy
21-
from json.encoder import _make_iterencode, encode_basestring_ascii # type: ignore
22-
from typing import Any, Callable, Dict, List, NoReturn, Optional, Tuple, Type, Union
20+
from typing import Any, Dict, List, NoReturn, Optional, Tuple, Union
2321

2422
import jsonschema
2523
from hypothesis.errors import InvalidArgument
2624
from hypothesis.internal.floats import next_down as ieee_next_down, next_up
2725

28-
# Mypy does not (yet!) support recursive type definitions.
29-
# (and writing a few steps by hand is a DoS attack on the AST walker in Pytest)
30-
JSONType = Union[None, bool, float, str, list, Dict[str, Any]]
26+
from ._encode import JSONType, encode_canonical_json, sort_key
27+
3128
Schema = Dict[str, JSONType]
3229
JSONSchemaValidator = Union[
3330
jsonschema.validators.Draft4Validator,
@@ -86,110 +83,10 @@ def make_validator(schema: Schema) -> JSONSchemaValidator:
8683
return validator(schema)
8784

8885

89-
class CanonicalisingJsonEncoder(json.JSONEncoder):
90-
def iterencode(self, o: Any, _one_shot: bool = False) -> Any:
91-
"""Replace a stdlib method, so we encode integer-valued floats as ints."""
92-
93-
def floatstr(o: float) -> str:
94-
# This is the bit we're overriding - integer-valued floats are
95-
# encoded as integers, to support JSONschemas's uniqueness.
96-
assert math.isfinite(o)
97-
if o == int(o):
98-
return repr(int(o))
99-
return repr(o)
100-
101-
return _make_iterencode(
102-
{},
103-
self.default,
104-
encode_basestring_ascii,
105-
self.indent,
106-
floatstr,
107-
self.key_separator,
108-
self.item_separator,
109-
self.sort_keys,
110-
self.skipkeys,
111-
_one_shot,
112-
)(o, 0)
113-
114-
11586
class HypothesisRefResolutionError(jsonschema.exceptions.RefResolutionError):
11687
pass
11788

11889

119-
def _make_cache_key(
120-
value: JSONType,
121-
) -> Tuple[Type, Union[Tuple, None, bool, float, str]]:
122-
"""Make a hashable object from any JSON value.
123-
124-
The idea is to recursively convert all mutable values to immutable and adding values types as a discriminant.
125-
"""
126-
if isinstance(value, dict):
127-
return (dict, tuple((k, _make_cache_key(v)) for k, v in value.items()))
128-
if isinstance(value, list):
129-
return (list, tuple(map(_make_cache_key, value)))
130-
# Primitive types are hashable
131-
# `type` is needed to distinguish false-ish values - 0, "", False have the same hash (0)
132-
return (type(value), value)
133-
134-
135-
class HashedJSON:
136-
"""A proxy that holds a JSON value.
137-
138-
Adds a capability for the inner value to be cached, loosely based on `functools._HashedSeq`.
139-
"""
140-
141-
__slots__ = ("value", "hashedvalue")
142-
143-
def __init__(self, value: JSONType):
144-
self.value = value
145-
# `hash` is called multiple times on cache miss, therefore it is evaluated only once
146-
self.hashedvalue = hash(_make_cache_key(value))
147-
148-
def __hash__(self) -> int:
149-
return self.hashedvalue
150-
151-
def __eq__(self, other: "HashedJSON") -> bool: # type: ignore
152-
# TYPES: This class should be used only for caching purposes and there should be
153-
# no values of other types to compare
154-
return self.hashedvalue == other.hashedvalue
155-
156-
157-
def cached_json(func: Callable[[HashedJSON], str]) -> Callable[[JSONType], str]:
158-
"""Cache calls to `encode_canonical_json`.
159-
160-
The same schemas are encoded multiple times during canonicalisation and caching gives visible performance impact.
161-
"""
162-
cached_func = functools.lru_cache(maxsize=1024)(func)
163-
164-
@functools.wraps(cached_func)
165-
def wrapped(value: JSONType) -> str:
166-
return cached_func(HashedJSON(value))
167-
168-
return wrapped
169-
170-
171-
@cached_json
172-
def encode_canonical_json(value: HashedJSON) -> str:
173-
"""Canonical form serialiser, for uniqueness testing."""
174-
return json.dumps(value.value, sort_keys=True, cls=CanonicalisingJsonEncoder)
175-
176-
177-
def sort_key(value: JSONType) -> Tuple[int, float, Union[float, str]]:
178-
"""Return a sort key (type, guess, tiebreak) that can compare any JSON value.
179-
180-
Sorts scalar types before collections, and within each type tries for a
181-
sensible ordering similar to Hypothesis' idea of simplicity.
182-
"""
183-
if value is None:
184-
return (0, 0, 0)
185-
if isinstance(value, bool):
186-
return (1, int(value), 0)
187-
if isinstance(value, (int, float)):
188-
return (2 if int(value) == value else 3, abs(value), value >= 0)
189-
type_key = {str: 4, list: 5, dict: 6}[type(value)]
190-
return (type_key, len(value), encode_canonical_json(value))
191-
192-
19390
def get_type(schema: Schema) -> List[str]:
19491
"""Return a canonical value for the "type" key.
19592

src/hypothesis_jsonschema/_encode.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""Canonical encoding for the JSONSchema semantics, where 1 == 1.0."""
2+
import functools
3+
import json
4+
import math
5+
from json.encoder import _make_iterencode, encode_basestring_ascii # type: ignore
6+
from typing import Any, Callable, Dict, Tuple, Type, Union
7+
8+
# Mypy does not (yet!) support recursive type definitions.
9+
# (and writing a few steps by hand is a DoS attack on the AST walker in Pytest)
10+
JSONType = Union[None, bool, float, str, list, Dict[str, Any]]
11+
12+
13+
class CanonicalisingJsonEncoder(json.JSONEncoder):
    """JSON encoder that writes integer-valued floats as plain integers."""

    def iterencode(self, o: Any, _one_shot: bool = False) -> Any:
        """Override the stdlib method so that e.g. ``1.0`` is emitted as ``1``.

        The stdlib pure-Python encoder factory is reused unchanged; only the
        float formatter passed to it is swapped out.
        """

        def floatstr(number: float) -> str:
            # The one customised piece: floats holding a whole number are
            # rendered as integers, so that 1 and 1.0 encode identically.
            assert math.isfinite(number)
            whole = int(number)
            if number == whole:
                return repr(whole)
            return repr(number)

        encode_chunks = _make_iterencode(
            {},  # fresh markers dict for this call
            self.default,
            encode_basestring_ascii,
            self.indent,
            floatstr,
            self.key_separator,
            self.item_separator,
            self.sort_keys,
            self.skipkeys,
            _one_shot,
        )
        return encode_chunks(o, 0)
37+
38+
39+
def _make_cache_key(
    value: JSONType,
) -> Tuple[Type, Union[None, bool, float, str, tuple, frozenset]]:
    """Make a hashable object from any JSON value.

    Mutable containers are recursively converted to immutable ones (dicts to
    frozensets of ``(key, sub_key)`` pairs, lists to tuples), and every value
    is paired with its type as a discriminant.
    """
    if isinstance(value, dict):
        return (dict, frozenset((k, _make_cache_key(v)) for k, v in value.items()))
    if isinstance(value, list):
        return (list, tuple(map(_make_cache_key, value)))
    # Primitive types are hashable.
    # `type` is needed to distinguish false-ish values - 0, "", False have the same hash (0)
    return (type(value), value)


class HashedJSON:
    """A proxy that holds a JSON value.

    Adds a capability for the inner value to be cached, loosely based on
    `functools._HashedSeq`. The hash is computed once at construction time.

    Equality compares the structural cache key, not just the hash: distinct
    values can share a hash (on CPython ``hash(-1) == hash(-2)``), and a
    hash-only comparison would make a cache return the entry stored for a
    different value.
    """

    __slots__ = ("value", "hashedvalue", "_key")

    def __init__(self, value: JSONType):
        self.value = value
        # Snapshot of the value's structure, used to confirm equality when
        # two hashes coincide.
        self._key = _make_cache_key(value)
        # `hash` is called multiple times on cache miss, therefore it is evaluated only once
        self.hashedvalue = hash(self._key)

    def __hash__(self) -> int:
        return self.hashedvalue

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, HashedJSON):
            return NotImplemented
        # Cheap integer comparison first; the full key comparison only runs
        # when the hashes match (a genuine hit or a collision).
        return self.hashedvalue == other.hashedvalue and self._key == other._key
75+
76+
77+
def cached_json(func: Callable[[HashedJSON], str]) -> Callable[[JSONType], str]:
    """Decorate a JSON encoding function with a bounded LRU cache.

    Plain JSON values may be unhashable, so each argument is wrapped in a
    `HashedJSON` proxy before it reaches the cached function. The same schemas
    are encoded multiple times during canonicalisation and caching gives
    visible performance impact.
    """
    memoised = functools.lru_cache(maxsize=1024)(func)

    def wrapped(value: JSONType) -> str:
        proxy = HashedJSON(value)
        return memoised(proxy)

    return functools.wraps(memoised)(wrapped)
89+
90+
91+
@cached_json
def encode_canonical_json(value: HashedJSON) -> str:
    """Serialise a JSON value to its canonical string form, for uniqueness testing.

    The decorator hands this function a `HashedJSON` proxy, so the real
    payload is read from ``value.value``.
    """
    payload = value.value
    return json.dumps(payload, cls=CanonicalisingJsonEncoder, sort_keys=True)
95+
96+
97+
def sort_key(value: JSONType) -> Tuple[int, float, Union[float, str]]:
    """Build a ``(type, guess, tiebreak)`` key that can order any JSON value.

    Scalars sort before collections; within a type the ordering aims to be
    sensible and similar to Hypothesis' idea of simplicity.
    """
    if value is None:
        return (0, 0, 0)
    if isinstance(value, bool):
        # Checked before the numeric branch because bool is a subclass of int.
        return (1, int(value), 0)
    if isinstance(value, (int, float)):
        # Integer-valued numbers rank ahead of fractional ones; the magnitude
        # orders within a rank, and the sign flag breaks ties.
        is_integral = int(value) == value
        rank = 2 if is_integral else 3
        return (rank, abs(value), value >= 0)
    collection_ranks = {str: 4, list: 5, dict: 6}
    rank = collection_ranks[type(value)]
    return (rank, len(value), encode_canonical_json(value))

src/hypothesis_jsonschema/_from_schema.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,16 @@
1818
TRUTHY,
1919
TYPE_STRINGS,
2020
HypothesisRefResolutionError,
21-
JSONType,
2221
Schema,
2322
canonicalish,
24-
encode_canonical_json,
2523
get_integer_bounds,
2624
get_number_bounds,
2725
get_type,
2826
make_validator,
2927
merged,
3028
resolve_all_refs,
3129
)
30+
from ._encode import JSONType, encode_canonical_json
3231

3332
JSON_STRATEGY: st.SearchStrategy[JSONType] = st.recursive(
3433
st.none()

tests/test_canonicalise.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
"""Tests for the hypothesis-jsonschema library."""
22

3-
import json
4-
53
import jsonschema
64
import pytest
75
from gen_schemas import gen_number, json_schemata, schema_strategy_params
@@ -12,7 +10,6 @@
1210
from hypothesis_jsonschema._canonicalise import (
1311
FALSEY,
1412
canonicalish,
15-
encode_canonical_json,
1613
get_type,
1714
make_validator,
1815
merged,
@@ -26,15 +23,6 @@ def is_valid(instance, schema):
2623
return make_validator(schema).is_valid(instance)
2724

2825

29-
@given(JSON_STRATEGY)
30-
def test_canonical_json_encoding(v):
31-
"""Test our hand-rolled canonicaljson implementation."""
32-
encoded = encode_canonical_json(v)
33-
v2 = json.loads(encoded)
34-
assert v == v2
35-
assert encode_canonical_json(v2) == encoded
36-
37-
3826
@settings(suppress_health_check=[HealthCheck.too_slow], deadline=None)
3927
@given(data=st.data())
4028
@schema_strategy_params

tests/test_encode.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Tests for the hypothesis-jsonschema library."""
2+
3+
import json
4+
5+
from hypothesis import given
6+
7+
from hypothesis_jsonschema._encode import encode_canonical_json
8+
from hypothesis_jsonschema._from_schema import JSON_STRATEGY
9+
10+
11+
@given(JSON_STRATEGY)
def test_canonical_json_encoding(v):
    """Check that the hand-rolled canonical JSON encoding round-trips.

    Decoding the encoded form must give back an equal value, and encoding
    that decoded value again must reproduce the same string.
    """
    first_pass = encode_canonical_json(v)
    decoded = json.loads(first_pass)
    assert v == decoded
    second_pass = encode_canonical_json(decoded)
    assert second_pass == first_pass

0 commit comments

Comments
 (0)