Skip to content

Commit 1df24cc

Browse files
check duplicate object keys (#81)
Co-authored-by: David Hewitt <[email protected]>
1 parent 75699eb commit 1df24cc

File tree

8 files changed

+197
-51
lines changed

8 files changed

+197
-51
lines changed

crates/jiter-python/bench.py

Lines changed: 31 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -7,22 +7,31 @@
77
import json
88

99
cases = [
10-
('medium_response', Path('../benches/medium_response.json').read_bytes()),
11-
('massive_ints_array', Path('../benches/massive_ints_array.json').read_bytes()),
12-
('array_short_strings', '[{}]'.format(', '.join('"123"' for _ in range(100_000)))),
13-
('object_short_strings', '{%s}' % ', '.join(f'"{i}": "{i}x"' for i in range(100_000))),
14-
('array_short_arrays', '[{}]'.format(', '.join('["a", "b", "c", "d"]' for _ in range(10_000)))),
15-
('one_long_string', json.dumps('x' * 100)),
16-
('one_short_string', b'"foobar"'),
17-
('1m_strings', json.dumps([str(i) for i in range(1_000_000)])),
10+
("medium_response", Path("../jiter/benches/medium_response.json").read_bytes()),
11+
(
12+
"massive_ints_array",
13+
Path("../jiter/benches/massive_ints_array.json").read_bytes(),
14+
),
15+
("array_short_strings", "[{}]".format(", ".join('"123"' for _ in range(100_000)))),
16+
(
17+
"object_short_strings",
18+
"{%s}" % ", ".join(f'"{i}": "{i}x"' for i in range(100_000)),
19+
),
20+
(
21+
"array_short_arrays",
22+
"[{}]".format(", ".join('["a", "b", "c", "d"]' for _ in range(10_000))),
23+
),
24+
("one_long_string", json.dumps("x" * 100)),
25+
("one_short_string", b'"foobar"'),
26+
("1m_strings", json.dumps([str(i) for i in range(1_000_000)])),
1827
]
1928

2029

2130
def run_bench(func, d):
2231
if isinstance(d, str):
2332
d = d.encode()
2433
timer = timeit.Timer(
25-
'func(json_data)', setup='', globals={'func': func, 'json_data': d}
34+
"func(json_data)", setup="", globals={"func": func, "json_data": d}
2635
)
2736
n, t = timer.autorange()
2837
iter_time = t / n
@@ -31,13 +40,18 @@ def run_bench(func, d):
3140

3241

3342
for name, json_data in cases:
34-
print(f'Case: {name}')
43+
print(f"Case: {name}")
3544
times = [
36-
('orjson', run_bench(lambda d: orjson.loads(d), json_data)),
37-
('jiter-cache', run_bench(lambda d: jiter_python.from_json(d), json_data)),
38-
('jiter', run_bench(lambda d: jiter_python.from_json(d, cache_strings=False), json_data)),
39-
('ujson', run_bench(lambda d: ujson.loads(d), json_data)),
40-
('json', run_bench(lambda d: json.loads(d), json_data)),
45+
("orjson", run_bench(lambda d: orjson.loads(d), json_data)),
46+
("jiter-cache", run_bench(lambda d: jiter_python.from_json(d), json_data)),
47+
(
48+
"jiter",
49+
run_bench(
50+
lambda d: jiter_python.from_json(d, cache_strings=False), json_data
51+
),
52+
),
53+
("ujson", run_bench(lambda d: ujson.loads(d), json_data)),
54+
("json", run_bench(lambda d: json.loads(d), json_data)),
4155
]
4256

4357
times.sort(key=lambda x: x[1])
@@ -46,5 +60,5 @@ def run_bench(func, d):
4660
print(f'{"package":>12} | {"time µs":>10} | slowdown')
4761
print(f'{"-" * 13}|{"-" * 12}|{"-" * 9}')
4862
for name, time in times:
49-
print(f'{name:>12} | {time * 1_000_000:10.2f} | {time / best:8.2f}')
50-
print('')
63+
print(f"{name:>12} | {time * 1_000_000:10.2f} | {time / best:8.2f}")
64+
print("")

crates/jiter-python/src/lib.rs

Lines changed: 21 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,16 +2,35 @@ use pyo3::prelude::*;
22

33
use jiter::{map_json_error, python_parse};
44

5-
#[pyfunction(signature = (data, *, allow_inf_nan=true, cache_strings=true))]
5+
#[pyfunction(
6+
signature = (
7+
data,
8+
*,
9+
allow_inf_nan=true,
10+
cache_strings=true,
11+
allow_partial=false,
12+
catch_duplicate_keys=false
13+
)
14+
)]
615
pub fn from_json<'py>(
716
py: Python<'py>,
817
data: &[u8],
918
allow_inf_nan: bool,
1019
cache_strings: bool,
20+
allow_partial: bool,
21+
catch_duplicate_keys: bool,
1122
) -> PyResult<Bound<'py, PyAny>> {
1223
let cache_mode = cache_strings.into();
1324
let json_bytes = data;
14-
python_parse(py, json_bytes, allow_inf_nan, cache_mode, false).map_err(|e| map_json_error(json_bytes, &e))
25+
python_parse(
26+
py,
27+
json_bytes,
28+
allow_inf_nan,
29+
cache_mode,
30+
allow_partial,
31+
catch_duplicate_keys,
32+
)
33+
.map_err(|e| map_json_error(json_bytes, &e))
1534
}
1635

1736
#[pymodule]

crates/jiter/benches/python.rs

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ fn python_parse_numeric(bench: &mut Bencher) {
1717
false,
1818
StringCacheMode::All,
1919
false,
20+
false,
2021
)
2122
.unwrap()
2223
});
@@ -33,6 +34,7 @@ fn python_parse_other(bench: &mut Bencher) {
3334
false,
3435
StringCacheMode::All,
3536
false,
37+
false,
3638
)
3739
.unwrap()
3840
});
@@ -47,7 +49,7 @@ fn _python_parse_file(path: &str, bench: &mut Bencher, cache_mode: StringCacheMo
4749

4850
Python::with_gil(|py| {
4951
cache_clear(py);
50-
bench.iter(|| python_parse(py, json_data, false, cache_mode, false).unwrap());
52+
bench.iter(|| python_parse(py, json_data, false, cache_mode, false, false).unwrap());
5153
})
5254
}
5355

crates/jiter/src/errors.rs

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,14 @@ use std::fmt;
44
///
55
/// Almost all of `JsonErrorType` is copied from [serde_json](https://github.com/serde-rs) so errors match
66
/// those expected from `serde_json`.
7-
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
7+
#[derive(Debug, PartialEq, Eq, Clone)]
88
pub enum JsonErrorType {
99
/// float value was found where an int was expected
1010
FloatExpectingInt,
1111

12+
/// duplicate keys in an object
13+
DuplicateKey(String),
14+
1215
/// NOTE: all errors from here on are copied from serde_json
1316
/// [src/error.rs](https://github.com/serde-rs/json/blob/v1.0.107/src/error.rs#L236)
1417
/// with `Io` and `Message` removed
@@ -79,6 +82,7 @@ impl std::fmt::Display for JsonErrorType {
7982
// Messages for enum members copied from serde_json are unchanged
8083
match self {
8184
Self::FloatExpectingInt => f.write_str("float value was found where an int was expected"),
85+
Self::DuplicateKey(s) => write!(f, "Detected duplicate key {s:?}"),
8286
Self::EofWhileParsingList => f.write_str("EOF while parsing a list"),
8387
Self::EofWhileParsingObject => f.write_str("EOF while parsing an object"),
8488
Self::EofWhileParsingString => f.write_str("EOF while parsing a string"),

crates/jiter/src/lazy_index_map.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -77,10 +77,10 @@ where
7777
self.vec.is_empty()
7878
}
7979

80-
pub fn get<Q: ?Sized>(&self, key: &Q) -> Option<&V>
80+
pub fn get<Q>(&self, key: &Q) -> Option<&V>
8181
where
8282
K: Borrow<Q> + PartialEq<Q>,
83-
Q: Hash + Eq,
83+
Q: Hash + Eq + ?Sized,
8484
{
8585
let vec_len = self.vec.len();
8686
// if the vec is longer than the threshold, we use the hashmap for lookups

crates/jiter/src/python.rs

Lines changed: 59 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
use ahash::AHashSet;
12
use std::marker::PhantomData;
23

34
use pyo3::exceptions::PyValueError;
@@ -21,6 +22,8 @@ use crate::JsonErrorType;
2122
/// - `json_data`: The JSON data to parse.
2223
/// - `allow_inf_nan`: Whether to allow `(-)Infinity` and `NaN` values.
2324
/// - `cache_strings`: Whether to cache strings to avoid constructing new Python objects,
25+
/// - `allow_partial`: Whether to allow partial JSON data.
26+
/// - `catch_duplicate_keys`: Whether to catch duplicate keys in objects.
2427
/// this should have a significant improvement on performance but increases memory slightly.
2528
///
2629
/// # Returns
@@ -32,11 +35,27 @@ pub fn python_parse<'py>(
3235
allow_inf_nan: bool,
3336
cache_mode: StringCacheMode,
3437
allow_partial: bool,
38+
catch_duplicate_keys: bool,
3539
) -> JsonResult<Bound<'py, PyAny>> {
40+
macro_rules! ppp {
41+
($string_cache:ident, $key_check:ident) => {
42+
PythonParser::<$string_cache, $key_check>::parse(py, json_data, allow_inf_nan, allow_partial)
43+
};
44+
}
45+
3646
match cache_mode {
37-
StringCacheMode::All => PythonParser::<StringCacheAll>::parse(py, json_data, allow_inf_nan, allow_partial),
38-
StringCacheMode::Keys => PythonParser::<StringCacheKeys>::parse(py, json_data, allow_inf_nan, allow_partial),
39-
StringCacheMode::None => PythonParser::<StringNoCache>::parse(py, json_data, allow_inf_nan, allow_partial),
47+
StringCacheMode::All => match catch_duplicate_keys {
48+
true => ppp!(StringCacheAll, DuplicateKeyCheck),
49+
false => ppp!(StringCacheAll, NoopKeyCheck),
50+
},
51+
StringCacheMode::Keys => match catch_duplicate_keys {
52+
true => ppp!(StringCacheKeys, DuplicateKeyCheck),
53+
false => ppp!(StringCacheKeys, NoopKeyCheck),
54+
},
55+
StringCacheMode::None => match catch_duplicate_keys {
56+
true => ppp!(StringNoCache, DuplicateKeyCheck),
57+
false => ppp!(StringNoCache, NoopKeyCheck),
58+
},
4059
}
4160
}
4261

@@ -45,16 +64,17 @@ pub fn map_json_error(json_data: &[u8], json_error: &JsonError) -> PyErr {
4564
PyValueError::new_err(json_error.description(json_data))
4665
}
4766

48-
struct PythonParser<'j, StringCache> {
67+
struct PythonParser<'j, StringCache, KeyCheck> {
4968
_string_cache: PhantomData<StringCache>,
69+
_key_check: PhantomData<KeyCheck>,
5070
parser: Parser<'j>,
5171
tape: Tape,
5272
recursion_limit: u8,
5373
allow_inf_nan: bool,
5474
allow_partial: bool,
5575
}
5676

57-
impl<'j, StringCache: StringMaybeCache> PythonParser<'j, StringCache> {
77+
impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck> PythonParser<'j, StringCache, KeyCheck> {
5878
fn parse<'py>(
5979
py: Python<'py>,
6080
json_data: &[u8],
@@ -63,6 +83,7 @@ impl<'j, StringCache: StringMaybeCache> PythonParser<'j, StringCache> {
6383
) -> JsonResult<Bound<'py, PyAny>> {
6484
let mut slf = PythonParser {
6585
_string_cache: PhantomData::<StringCache>,
86+
_key_check: PhantomData::<KeyCheck>,
6687
parser: Parser::new(json_data),
6788
tape: Tape::default(),
6889
recursion_limit: DEFAULT_RECURSION_LIMIT,
@@ -166,13 +187,18 @@ impl<'j, StringCache: StringMaybeCache> PythonParser<'j, StringCache> {
166187
panic!("PyDict_SetItem failed")
167188
}
168189
};
190+
let mut check_keys = KeyCheck::default();
169191
if let Some(first_key) = self.parser.object_first::<StringDecoder>(&mut self.tape)? {
170-
let first_key = StringCache::get_key(py, first_key.as_str(), first_key.ascii_only());
192+
let first_key_s = first_key.as_str();
193+
check_keys.check(first_key_s, self.parser.index)?;
194+
let first_key = StringCache::get_key(py, first_key_s, first_key.ascii_only());
171195
let peek = self.parser.peek()?;
172196
let first_value = self._check_take_value(py, peek)?;
173197
set_item(first_key, first_value);
174198
while let Some(key) = self.parser.object_step::<StringDecoder>(&mut self.tape)? {
175-
let key = StringCache::get_key(py, key.as_str(), key.ascii_only());
199+
let key_s = key.as_str();
200+
check_keys.check(key_s, self.parser.index)?;
201+
let key = StringCache::get_key(py, key_s, key.ascii_only());
176202
let peek = self.parser.peek()?;
177203
let value = self._check_take_value(py, peek)?;
178204
set_item(key, value);
@@ -209,3 +235,29 @@ impl<'j, StringCache: StringMaybeCache> PythonParser<'j, StringCache> {
209235
r
210236
}
211237
}
238+
239+
trait MaybeKeyCheck: Default {
240+
fn check(&mut self, key: &str, index: usize) -> JsonResult<()>;
241+
}
242+
243+
#[derive(Default)]
244+
struct NoopKeyCheck;
245+
246+
impl MaybeKeyCheck for NoopKeyCheck {
247+
fn check(&mut self, _key: &str, _index: usize) -> JsonResult<()> {
248+
Ok(())
249+
}
250+
}
251+
252+
#[derive(Default)]
253+
struct DuplicateKeyCheck(AHashSet<String>);
254+
255+
impl MaybeKeyCheck for DuplicateKeyCheck {
256+
fn check(&mut self, key: &str, index: usize) -> JsonResult<()> {
257+
if self.0.insert(key.to_owned()) {
258+
Ok(())
259+
} else {
260+
Err(JsonError::new(JsonErrorType::DuplicateKey(key.to_owned()), index))
261+
}
262+
}
263+
}

crates/jiter/tests/main.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ macro_rules! single_expect_ok_or_error {
116116
let position = jiter.error_position(e.index);
117117
// no wrong type errors, so unwrap the json error
118118
let error_type = match e.error_type {
119-
JiterErrorType::JsonError(e) => e,
119+
JiterErrorType::JsonError(ref e) => e,
120120
_ => panic!("unexpected error type: {:?}", e.error_type),
121121
};
122122
let actual_error = format!("{:?} @ {}", error_type, position.short());

0 commit comments

Comments
 (0)