Skip to content

Commit cda94cd

Browse files
committed
adding Batson
1 parent 43a493b commit cda94cd

File tree

17 files changed

+2573
-4
lines changed

17 files changed

+2573
-4
lines changed

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
members = [
33
"crates/jiter",
44
"crates/jiter-python",
5+
"crates/batson",
56
"crates/fuzz",
67
]
78
resolver = "2"
@@ -28,5 +29,9 @@ inherits = "release"
2829
debug = true
2930

3031
[workspace.dependencies]
32+
jiter = { path = "crates/jiter", version = "0.5.0" }
3133
pyo3 = { version = "0.22.0" }
3234
pyo3-build-config = { version = "0.22.0" }
35+
bencher = "0.1.5"
36+
paste = "1.0.7"
37+
codspeed-bencher-compat = "2.7.1"

crates/batson/Cargo.toml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
[package]
2+
name = "batson"
3+
description = "Binary Alternative To (J)SON. Designed to be very fast to query."
4+
readme = "../../README.md"
5+
version = {workspace = true}
6+
edition = {workspace = true}
7+
authors = {workspace = true}
8+
license = {workspace = true}
9+
keywords = {workspace = true}
10+
categories = {workspace = true}
11+
homepage = {workspace = true}
12+
repository = {workspace = true}
13+
14+
[dependencies]
15+
bytemuck = { version = "1.17.1", features = ["aarch64_simd", "derive", "align_offset"] }
16+
jiter = { workspace = true }
17+
serde = "1.0.210"
18+
serde_json = "1.0.128"
19+
simdutf8 = { version = "0.1.4", features = ["aarch64_neon"] }
20+
smallvec = "2.0.0-alpha.7"
21+
22+
[dev-dependencies]
23+
bencher = { workspace = true }
24+
paste = { workspace = true }
25+
codspeed-bencher-compat = { workspace = true }
26+
27+
[[bench]]
28+
name = "main"
29+
harness = false
30+
31+
[lints.clippy]
32+
dbg_macro = "deny"
33+
print_stdout = "deny"
34+
print_stderr = "deny"
35+
# in general we lint against the pedantic group, but we will whitelist
36+
# certain lints which we don't want to enforce (for now)
37+
pedantic = { level = "deny", priority = -1 }
38+
missing_errors_doc = "allow"
39+
cast_possible_truncation = "allow" # TODO remove
40+
cast_sign_loss = "allow" # TODO remove
41+
cast_possible_wrap = "allow" # TODO remove
42+
checked_conversions = "allow" # TODO remove

crates/batson/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# batson
2+
3+
Binary Alternative To (J)SON. Designed to be very fast to query.
4+
5+
Inspired by Postgres' [JSONB type](https://github.com/postgres/postgres/commit/d9134d0a355cfa447adc80db4505d5931084278a?diff=unified&w=0) and Snowflake's [VARIANT type](https://www.youtube.com/watch?v=jtjOfggD4YY).
6+
7+
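For looking up a single string by path in a relatively small JSON document (3KB) that has already been encoded, batson is 14x (key found) to 126x (key missing) faster than Jiter, and 106x (key found) to 588x (key missing) faster than Serde.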
8+
9+
```
10+
test medium_get_str_found_batson ... bench: 51 ns/iter (+/- 1)
11+
test medium_get_str_found_jiter ... bench: 755 ns/iter (+/- 66)
12+
test medium_get_str_found_serde ... bench: 5,420 ns/iter (+/- 93)
13+
test medium_get_str_missing_batson ... bench: 9 ns/iter (+/- 0)
14+
test medium_get_str_missing_jiter ... bench: 1,135 ns/iter (+/- 46)
15+
test medium_get_str_missing_serde ... bench: 5,292 ns/iter (+/- 324)
16+
```

crates/batson/benches/main.rs

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
use codspeed_bencher_compat::{benchmark_group, benchmark_main, Bencher};
2+
use std::hint::black_box;
3+
4+
use std::fs::File;
5+
use std::io::Read;
6+
7+
use batson::get::{get_str, BatsonPath};
8+
use batson::{batson_to_json_string, encode_from_json};
9+
use jiter::JsonValue;
10+
11+
/// Read the whole file at `path` into a `String`.
///
/// # Panics
///
/// Panics if the file cannot be opened or is not valid UTF-8. The panic message names `path`,
/// because the benchmarks load fixtures through paths relative to the current working directory
/// and a bare `unwrap()` gives no hint about which file was missing.
fn read_file(path: &str) -> String {
    let mut file = File::open(path).unwrap_or_else(|err| panic!("failed to open {path}: {err}"));
    let mut contents = String::new();
    file.read_to_string(&mut contents)
        .unwrap_or_else(|err| panic!("failed to read {path}: {err}"));
    contents
}
17+
18+
/// taken from <https://github.com/datafusion-contrib/datafusion-functions-json/blob/v0.41.0/src/common.rs#L184-L216>
19+
mod jiter_find {
20+
use jiter::{Jiter, Peek};
21+
22+
#[derive(Debug)]
23+
pub enum JsonPath<'s> {
24+
Key(&'s str),
25+
Index(usize),
26+
None,
27+
}
28+
29+
impl From<u64> for JsonPath<'_> {
30+
fn from(index: u64) -> Self {
31+
JsonPath::Index(usize::try_from(index).unwrap())
32+
}
33+
}
34+
35+
impl From<i32> for JsonPath<'_> {
36+
fn from(index: i32) -> Self {
37+
match usize::try_from(index) {
38+
Ok(i) => Self::Index(i),
39+
Err(_) => Self::None,
40+
}
41+
}
42+
}
43+
44+
impl<'s> From<&'s str> for JsonPath<'s> {
45+
fn from(key: &'s str) -> Self {
46+
JsonPath::Key(key)
47+
}
48+
}
49+
50+
pub fn jiter_json_find<'j>(opt_json: Option<&'j str>, path: &[JsonPath]) -> Option<(Jiter<'j>, Peek)> {
51+
let json_str = opt_json?;
52+
let mut jiter = Jiter::new(json_str.as_bytes());
53+
let mut peek = jiter.peek().ok()?;
54+
for element in path {
55+
match element {
56+
JsonPath::Key(key) if peek == Peek::Object => {
57+
let mut next_key = jiter.known_object().ok()??;
58+
59+
while next_key != *key {
60+
jiter.next_skip().ok()?;
61+
next_key = jiter.next_key().ok()??;
62+
}
63+
64+
peek = jiter.peek().ok()?;
65+
}
66+
JsonPath::Index(index) if peek == Peek::Array => {
67+
let mut array_item = jiter.known_array().ok()??;
68+
69+
for _ in 0..*index {
70+
jiter.known_skip(array_item).ok()?;
71+
array_item = jiter.array_step().ok()??;
72+
}
73+
74+
peek = array_item;
75+
}
76+
_ => {
77+
return None;
78+
}
79+
}
80+
}
81+
Some((jiter, peek))
82+
}
83+
84+
pub fn get_str(json_data: Option<&str>, path: &[JsonPath]) -> Option<String> {
85+
if let Some((mut jiter, peek)) = jiter_json_find(json_data, path) {
86+
match peek {
87+
Peek::String => Some(jiter.known_str().ok()?.to_owned()),
88+
_ => None,
89+
}
90+
} else {
91+
None
92+
}
93+
}
94+
}
95+
96+
mod serde_find {
97+
use batson::get::BatsonPath;
98+
use serde_json::Value;
99+
100+
pub fn get_str(json_data: &[u8], path: &[BatsonPath]) -> Option<String> {
101+
let json_value: Value = serde_json::from_slice(json_data).ok()?;
102+
let mut current = &json_value;
103+
for key in path {
104+
current = match (key, current) {
105+
(BatsonPath::Key(k), Value::Object(map)) => map.get(*k)?,
106+
(BatsonPath::Index(i), Value::Array(vec)) => vec.get(*i)?,
107+
_ => return None,
108+
}
109+
}
110+
match current {
111+
Value::String(s) => Some(s.clone()),
112+
_ => None,
113+
}
114+
}
115+
}
116+
117+
fn json_to_batson(json: &[u8]) -> Vec<u8> {
118+
let json_value = JsonValue::parse(json, false).unwrap();
119+
encode_from_json(&json_value).unwrap()
120+
}
121+
122+
fn medium_get_str_found_batson(bench: &mut Bencher) {
123+
let json = read_file("../jiter/benches/medium_response.json");
124+
let json_data = json.as_bytes();
125+
let batson_data = json_to_batson(json_data);
126+
let path: Vec<BatsonPath> = vec!["person".into(), "linkedin".into(), "handle".into()];
127+
bench.iter(|| {
128+
let v = get_str(black_box(&batson_data), &path);
129+
black_box(v)
130+
});
131+
}
132+
133+
/// Benchmark the jiter-based `get_str` for the path `person.linkedin.handle` in the medium fixture.
///
/// The JSON text is parsed from scratch on every iteration.
fn medium_get_str_found_jiter(bench: &mut Bencher) {
    let source = read_file("../jiter/benches/medium_response.json");
    let lookup: Vec<jiter_find::JsonPath> = vec!["person".into(), "linkedin".into(), "handle".into()];
    bench.iter(|| black_box(jiter_find::get_str(black_box(Some(source.as_str())), &lookup)));
}
141+
142+
fn medium_get_str_found_serde(bench: &mut Bencher) {
143+
let json = read_file("../jiter/benches/medium_response.json");
144+
let json_data = json.as_bytes();
145+
let path: Vec<BatsonPath> = vec!["person".into(), "linkedin".into(), "handle".into()];
146+
bench.iter(|| {
147+
let v = serde_find::get_str(black_box(json_data), &path).unwrap();
148+
black_box(v)
149+
});
150+
}
151+
152+
fn medium_get_str_missing_batson(bench: &mut Bencher) {
153+
let json = read_file("../jiter/benches/medium_response.json");
154+
let json_data = json.as_bytes();
155+
let batson_data = json_to_batson(json_data);
156+
let path: Vec<BatsonPath> = vec!["squid".into(), "linkedin".into(), "handle".into()];
157+
bench.iter(|| {
158+
let v = get_str(black_box(&batson_data), &path);
159+
black_box(v)
160+
});
161+
}
162+
163+
/// Benchmark the jiter-based `get_str` for the path `squid.linkedin.handle` in the medium fixture.
///
/// The JSON text is parsed from scratch on every iteration.
fn medium_get_str_missing_jiter(bench: &mut Bencher) {
    let source = read_file("../jiter/benches/medium_response.json");
    let lookup: Vec<jiter_find::JsonPath> = vec!["squid".into(), "linkedin".into(), "handle".into()];
    bench.iter(|| black_box(jiter_find::get_str(black_box(Some(source.as_str())), &lookup)));
}
171+
172+
fn medium_get_str_missing_serde(bench: &mut Bencher) {
173+
let json = read_file("../jiter/benches/medium_response.json");
174+
let json_data = json.as_bytes();
175+
let path: Vec<BatsonPath> = vec!["squid".into(), "linkedin".into(), "handle".into()];
176+
bench.iter(|| {
177+
let v = serde_find::get_str(black_box(json_data), &path);
178+
black_box(v)
179+
});
180+
}
181+
182+
/// Benchmark converting the batson encoding of the medium fixture back into a JSON string.
///
/// The batson bytes are produced once, outside the timed closure.
fn medium_convert_batson_to_json(bench: &mut Bencher) {
    let source = read_file("../jiter/benches/medium_response.json");
    let encoded = json_to_batson(source.as_bytes());
    bench.iter(|| black_box(batson_to_json_string(black_box(&encoded)).unwrap()));
}
191+
192+
fn medium_convert_json_to_batson(bench: &mut Bencher) {
193+
let json = read_file("../jiter/benches/medium_response.json");
194+
let json = json.as_bytes();
195+
bench.iter(|| {
196+
let json_value = JsonValue::parse(json, false).unwrap();
197+
let b = encode_from_json(&json_value).unwrap();
198+
black_box(b)
199+
});
200+
}
201+
202+
// Collect every benchmark function defined in this file into the `benches` group, which is then
// handed to `benchmark_main!` (this file defines no `fn main` by hand).
benchmark_group!(
203+
benches,
204+
medium_get_str_found_batson,
205+
medium_get_str_found_jiter,
206+
medium_get_str_found_serde,
207+
medium_get_str_missing_batson,
208+
medium_get_str_missing_jiter,
209+
medium_get_str_missing_serde,
210+
medium_convert_batson_to_json,
211+
medium_convert_json_to_batson
212+
);
213+
benchmark_main!(benches);
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
use batson::get::BatsonPath;
2+
use batson::{batson_to_json_string, encode_from_json};
3+
use jiter::JsonValue;
4+
use std::fs::File;
5+
use std::io::Read;
6+
7+
fn main() {
8+
let filename = std::env::args().nth(1).expect(
9+
r#"
10+
No arguments provided!
11+
12+
Usage:
13+
cargo run --example read_file file.json [path]
14+
"#,
15+
);
16+
17+
let mut file = File::open(&filename).expect("failed to open file");
18+
let mut json = Vec::new();
19+
file.read_to_end(&mut json).expect("failed to read file");
20+
21+
let json_value = JsonValue::parse(&json, false).expect("invalid JSON");
22+
let batson = encode_from_json(&json_value).expect("failed to construct batson data");
23+
println!("json length: {}", json.len());
24+
println!("batson length: {}", batson.len());
25+
26+
let output_json = batson_to_json_string(&batson).expect("failed to convert batson to JSON");
27+
println!("output json length: {}", output_json.len());
28+
29+
if let Some(path) = std::env::args().nth(2) {
30+
let path: Vec<BatsonPath> = path.split('.').map(to_batson_path).collect();
31+
let start = std::time::Instant::now();
32+
let value = batson::get::get_str(&batson, &path).expect("failed to get value");
33+
let elapsed = start.elapsed();
34+
println!("Found value: {value:?} (time taken: {elapsed:?})");
35+
}
36+
37+
println!("reloading to check round-trip");
38+
let json_value = JsonValue::parse(output_json.as_bytes(), false).expect("invalid JSON");
39+
let batson = encode_from_json(&json_value).expect("failed to construct batson data");
40+
let output_json2 = batson_to_json_string(&batson).expect("failed to convert batson to JSON");
41+
println!("JSON unchanged after re-encoding: {:?}", output_json == output_json2);
42+
43+
println!("\n\noutput json:\n{}", output_json);
44+
}
45+
46+
fn to_batson_path(s: &str) -> BatsonPath {
47+
if s.chars().all(char::is_numeric) {
48+
let index: usize = s.parse().unwrap();
49+
index.into()
50+
} else {
51+
s.into()
52+
}
53+
}

0 commit comments

Comments
 (0)