|
15 | 15 | // specific language governing permissions and limitations |
16 | 16 | // under the License. |
17 | 17 |
|
| 18 | +use arbitrary::{Arbitrary, Unstructured}; |
18 | 19 | use arrow_json::ReaderBuilder; |
19 | | -use arrow_json::reader::Decoder; |
| 20 | +use arrow_json::reader::{Decoder, infer_json_schema}; |
20 | 21 | use arrow_schema::{DataType, Field, Schema}; |
21 | 22 | use criterion::{ |
22 | 23 | BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, |
23 | 24 | }; |
| 25 | +use serde::Serialize; |
24 | 26 | use serde_json::{Map, Number, Value}; |
25 | 27 | use std::fmt::Write; |
26 | 28 | use std::hint::black_box; |
@@ -323,13 +325,83 @@ fn bench_serialize_list(c: &mut Criterion) { |
323 | 325 | }); |
324 | 326 | } |
325 | 327 |
|
| 328 | +fn bench_schema_inference(c: &mut Criterion) { |
| 329 | + const ROWS: usize = 1000; |
| 330 | + |
| 331 | + #[derive(Serialize, Arbitrary, Debug)] |
| 332 | + struct Row { |
| 333 | + a: Option<i16>, |
| 334 | + b: Option<String>, |
| 335 | + c: Option<[i16; 8]>, |
| 336 | + d: Option<[bool; 8]>, |
| 337 | + e: Option<Inner>, |
| 338 | + f: f64, |
| 339 | + } |
| 340 | + |
| 341 | + #[derive(Serialize, Arbitrary, Debug)] |
| 342 | + struct Inner { |
| 343 | + a: Option<i16>, |
| 344 | + b: Option<String>, |
| 345 | + c: Option<bool>, |
| 346 | + } |
| 347 | + |
| 348 | + let mut data = vec![]; |
| 349 | + for row in pseudorandom_sequence::<Row>(ROWS) { |
| 350 | + serde_json::to_writer(&mut data, &row).unwrap(); |
| 351 | + data.push(b'\n'); |
| 352 | + } |
| 353 | + |
| 354 | + let mut group = c.benchmark_group("infer_json_schema"); |
| 355 | + group.throughput(Throughput::Bytes(data.len() as u64)); |
| 356 | + group.sample_size(50); |
| 357 | + group.measurement_time(std::time::Duration::from_secs(5)); |
| 358 | + group.warm_up_time(std::time::Duration::from_secs(2)); |
| 359 | + group.sampling_mode(SamplingMode::Flat); |
| 360 | + group.bench_function(BenchmarkId::from_parameter(ROWS), |b| { |
| 361 | + b.iter(|| infer_json_schema(black_box(&data[..]), None).unwrap()) |
| 362 | + }); |
| 363 | + group.finish(); |
| 364 | +} |
| 365 | + |
| 366 | +fn pseudorandom_sequence<T: for<'a> Arbitrary<'a>>(len: usize) -> Vec<T> { |
| 367 | + static RAND_BYTES: &[u8; 255] = &[ |
| 368 | + 12, 135, 254, 243, 18, 5, 38, 175, 60, 58, 204, 103, 15, 88, 201, 199, 57, 63, 56, 234, |
| 369 | + 106, 111, 238, 119, 214, 50, 110, 89, 129, 185, 112, 115, 35, 239, 188, 189, 49, 184, 194, |
| 370 | + 146, 108, 131, 213, 43, 236, 81, 61, 20, 21, 52, 223, 220, 215, 74, 210, 27, 190, 107, 174, |
| 371 | + 142, 237, 66, 75, 1, 53, 181, 82, 158, 68, 134, 176, 229, 157, 116, 233, 153, 84, 139, 151, |
| 372 | + 8, 171, 59, 105, 242, 40, 69, 94, 170, 4, 187, 212, 156, 65, 90, 192, 216, 29, 222, 122, |
| 373 | + 230, 198, 154, 155, 245, 45, 178, 123, 23, 117, 168, 149, 17, 177, 48, 54, 241, 202, 44, |
| 374 | + 232, 64, 221, 252, 161, 91, 93, 143, 240, 102, 172, 209, 224, 186, 197, 219, 247, 71, 36, |
| 375 | + 101, 133, 113, 6, 137, 231, 162, 31, 7, 22, 138, 47, 136, 2, 244, 141, 173, 99, 25, 95, 96, |
| 376 | + 85, 249, 42, 251, 217, 16, 205, 98, 203, 92, 114, 14, 163, 150, 144, 10, 125, 13, 195, 72, |
| 377 | + 41, 67, 246, 11, 77, 132, 83, 37, 24, 183, 226, 250, 109, 248, 33, 76, 9, 55, 159, 34, 62, |
| 378 | + 196, 87, 3, 39, 28, 166, 167, 255, 206, 79, 191, 228, 193, 179, 97, 182, 148, 73, 120, 211, |
| 379 | + 253, 70, 227, 51, 169, 130, 145, 218, 78, 180, 165, 46, 127, 152, 26, 140, 207, 19, 100, |
| 380 | + 104, 80, 164, 126, 118, 200, 128, 86, 160, 32, 30, 225, 147, 124, 121, 235, 208, |
| 381 | + ]; |
| 382 | + |
| 383 | + let bytes: Vec<u8> = RAND_BYTES |
| 384 | + .iter() |
| 385 | + .flat_map(|i| RAND_BYTES.map(|j| i.wrapping_add(j))) |
| 386 | + .take(1000 * len) |
| 387 | + .collect(); |
| 388 | + |
| 389 | + let mut u = Unstructured::new(&bytes); |
| 390 | + |
| 391 | + (0..len) |
| 392 | + .map(|_| u.arbitrary::<T>().unwrap()) |
| 393 | + .take(len) |
| 394 | + .collect() |
| 395 | +} |
| 396 | + |
326 | 397 | criterion_group!( |
327 | 398 | benches, |
328 | 399 | bench_decode_wide_object, |
329 | 400 | bench_serialize_wide_object, |
330 | 401 | bench_binary_hex, |
331 | 402 | bench_wide_projection, |
332 | 403 | bench_decode_list, |
333 | | - bench_serialize_list |
| 404 | + bench_serialize_list, |
| 405 | + bench_schema_inference |
334 | 406 | ); |
335 | 407 | criterion_main!(benches); |
0 commit comments