Skip to content

Commit c214c3c

Browse files
authored
Add benchmark for infer_json_schema (#9546)
# Which issue does this PR close?

Split out from #9494 to make review easier. It simply adds a benchmark for JSON schema inference.

# Rationale for this change

I have an open PR that significantly refactors the JSON schema inference code, so I want confidence not only that the new code is correct, but also that it performs better than the existing code.

# What changes are included in this PR?

Adds a benchmark.

# Are these changes tested?

N/A

# Are there any user-facing changes?

No
1 parent 92a239a commit c214c3c

File tree

2 files changed

+75
-2
lines changed

2 files changed

+75
-2
lines changed

arrow-json/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ tokio = { version = "1.27", default-features = false, features = ["io-util"] }
6161
bytes = "1.4"
6262
criterion = { workspace = true, default-features = false }
6363
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
64+
arbitrary = { version = "1.4.2", features = ["derive"] }
6465

6566
[[bench]]
6667
name = "serde"

arrow-json/benches/json_reader.rs

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use arbitrary::{Arbitrary, Unstructured};
1819
use arrow_json::ReaderBuilder;
19-
use arrow_json::reader::Decoder;
20+
use arrow_json::reader::{Decoder, infer_json_schema};
2021
use arrow_schema::{DataType, Field, Schema};
2122
use criterion::{
2223
BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main,
2324
};
25+
use serde::Serialize;
2426
use serde_json::{Map, Number, Value};
2527
use std::fmt::Write;
2628
use std::hint::black_box;
@@ -323,13 +325,83 @@ fn bench_serialize_list(c: &mut Criterion) {
323325
});
324326
}
325327

328+
fn bench_schema_inference(c: &mut Criterion) {
329+
const ROWS: usize = 1000;
330+
331+
#[derive(Serialize, Arbitrary, Debug)]
332+
struct Row {
333+
a: Option<i16>,
334+
b: Option<String>,
335+
c: Option<[i16; 8]>,
336+
d: Option<[bool; 8]>,
337+
e: Option<Inner>,
338+
f: f64,
339+
}
340+
341+
#[derive(Serialize, Arbitrary, Debug)]
342+
struct Inner {
343+
a: Option<i16>,
344+
b: Option<String>,
345+
c: Option<bool>,
346+
}
347+
348+
let mut data = vec![];
349+
for row in pseudorandom_sequence::<Row>(ROWS) {
350+
serde_json::to_writer(&mut data, &row).unwrap();
351+
data.push(b'\n');
352+
}
353+
354+
let mut group = c.benchmark_group("infer_json_schema");
355+
group.throughput(Throughput::Bytes(data.len() as u64));
356+
group.sample_size(50);
357+
group.measurement_time(std::time::Duration::from_secs(5));
358+
group.warm_up_time(std::time::Duration::from_secs(2));
359+
group.sampling_mode(SamplingMode::Flat);
360+
group.bench_function(BenchmarkId::from_parameter(ROWS), |b| {
361+
b.iter(|| infer_json_schema(black_box(&data[..]), None).unwrap())
362+
});
363+
group.finish();
364+
}
365+
366+
fn pseudorandom_sequence<T: for<'a> Arbitrary<'a>>(len: usize) -> Vec<T> {
367+
static RAND_BYTES: &[u8; 255] = &[
368+
12, 135, 254, 243, 18, 5, 38, 175, 60, 58, 204, 103, 15, 88, 201, 199, 57, 63, 56, 234,
369+
106, 111, 238, 119, 214, 50, 110, 89, 129, 185, 112, 115, 35, 239, 188, 189, 49, 184, 194,
370+
146, 108, 131, 213, 43, 236, 81, 61, 20, 21, 52, 223, 220, 215, 74, 210, 27, 190, 107, 174,
371+
142, 237, 66, 75, 1, 53, 181, 82, 158, 68, 134, 176, 229, 157, 116, 233, 153, 84, 139, 151,
372+
8, 171, 59, 105, 242, 40, 69, 94, 170, 4, 187, 212, 156, 65, 90, 192, 216, 29, 222, 122,
373+
230, 198, 154, 155, 245, 45, 178, 123, 23, 117, 168, 149, 17, 177, 48, 54, 241, 202, 44,
374+
232, 64, 221, 252, 161, 91, 93, 143, 240, 102, 172, 209, 224, 186, 197, 219, 247, 71, 36,
375+
101, 133, 113, 6, 137, 231, 162, 31, 7, 22, 138, 47, 136, 2, 244, 141, 173, 99, 25, 95, 96,
376+
85, 249, 42, 251, 217, 16, 205, 98, 203, 92, 114, 14, 163, 150, 144, 10, 125, 13, 195, 72,
377+
41, 67, 246, 11, 77, 132, 83, 37, 24, 183, 226, 250, 109, 248, 33, 76, 9, 55, 159, 34, 62,
378+
196, 87, 3, 39, 28, 166, 167, 255, 206, 79, 191, 228, 193, 179, 97, 182, 148, 73, 120, 211,
379+
253, 70, 227, 51, 169, 130, 145, 218, 78, 180, 165, 46, 127, 152, 26, 140, 207, 19, 100,
380+
104, 80, 164, 126, 118, 200, 128, 86, 160, 32, 30, 225, 147, 124, 121, 235, 208,
381+
];
382+
383+
let bytes: Vec<u8> = RAND_BYTES
384+
.iter()
385+
.flat_map(|i| RAND_BYTES.map(|j| i.wrapping_add(j)))
386+
.take(1000 * len)
387+
.collect();
388+
389+
let mut u = Unstructured::new(&bytes);
390+
391+
(0..len)
392+
.map(|_| u.arbitrary::<T>().unwrap())
393+
.take(len)
394+
.collect()
395+
}
396+
326397
criterion_group!(
327398
benches,
328399
bench_decode_wide_object,
329400
bench_serialize_wide_object,
330401
bench_binary_hex,
331402
bench_wide_projection,
332403
bench_decode_list,
333-
bench_serialize_list
404+
bench_serialize_list,
405+
bench_schema_inference
334406
);
335407
criterion_main!(benches);

0 commit comments

Comments
 (0)