Add benchmark for infer_json_schema (#9546)

Rafferty97 · web-flow · commit c214c3c6f539 · 2026-03-13T10:54:04.000+01:00
# Which issue does this PR close?

Split out from #9494 to make review easier. It simply adds a benchmark for JSON schema inference.

# Rationale for this change

I have an open PR that significantly refactors the JSON schema inference code, so I want confidence not only that the new code is correct, but also that it performs better than the existing code.

# What changes are included in this PR?

Adds a benchmark.

# Are these changes tested?

N/A

# Are there any user-facing changes?

No
diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml
@@ -61,6 +61,7 @@ tokio = { version = "1.27", default-features = false, features = ["io-util"] }
 bytes = "1.4"
 criterion = { workspace = true, default-features = false }
 rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
+arbitrary = { version = "1.4.2", features = ["derive"] }
 
 [[bench]]
 name = "serde"
diff --git a/arrow-json/benches/json_reader.rs b/arrow-json/benches/json_reader.rs
@@ -15,12 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use arbitrary::{Arbitrary, Unstructured};
 use arrow_json::ReaderBuilder;
-use arrow_json::reader::Decoder;
+use arrow_json::reader::{Decoder, infer_json_schema};
 use arrow_schema::{DataType, Field, Schema};
 use criterion::{
     BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main,
 };
+use serde::Serialize;
 use serde_json::{Map, Number, Value};
 use std::fmt::Write;
 use std::hint::black_box;
@@ -323,13 +325,83 @@ fn bench_serialize_list(c: &mut Criterion) {
     });
 }
 
+/// Benchmarks `infer_json_schema` on `ROWS` lines of generated newline-delimited JSON.
+fn bench_schema_inference(c: &mut Criterion) {
+    const ROWS: usize = 1000;
+
+    // Nested object stored in `Row::e`; every field is optional.
+    #[derive(Serialize, Arbitrary, Debug)]
+    struct Inner {
+        a: Option<i16>,
+        b: Option<String>,
+        c: Option<bool>,
+    }
+
+    // One generated record: optional scalars, optional fixed-size arrays, an optional
+    // nested object and a non-optional `f64`.
+    #[derive(Serialize, Arbitrary, Debug)]
+    struct Row {
+        a: Option<i16>,
+        b: Option<String>,
+        c: Option<[i16; 8]>,
+        d: Option<[bool; 8]>,
+        e: Option<Inner>,
+        f: f64,
+    }
+
+    // Serialize every row up front, one JSON document per line, so the timed closure
+    // below contains nothing but the schema inference call.
+    let mut ndjson: Vec<u8> = Vec::new();
+    for record in pseudorandom_sequence::<Row>(ROWS) {
+        serde_json::to_writer(&mut ndjson, &record).unwrap();
+        ndjson.push(b'\n');
+    }
+
+    let mut bench_group = c.benchmark_group("infer_json_schema");
+    bench_group.sampling_mode(SamplingMode::Flat);
+    bench_group.sample_size(50);
+    bench_group.warm_up_time(std::time::Duration::from_secs(2));
+    bench_group.measurement_time(std::time::Duration::from_secs(5));
+    // Throughput is reported relative to the size of the serialized input.
+    bench_group.throughput(Throughput::Bytes(ndjson.len() as u64));
+    bench_group.bench_function(BenchmarkId::from_parameter(ROWS), |bencher| {
+        bencher.iter(|| infer_json_schema(black_box(ndjson.as_slice()), None).unwrap())
+    });
+    bench_group.finish();
+}
+
+/// Builds `len` values of `T` by decoding a fixed, hard-coded byte stream with `arbitrary`.
+///
+/// The stream is derived only from the constant table below, so the bytes handed to
+/// `Unstructured` are identical on every run. The table expands to at most
+/// 255 * 255 = 65,025 bytes, which is then capped at `1000 * len`.
+///
+/// NOTE(review): for `len >= 66` the cap never applies and the stream is 65,025 bytes long;
+/// what later items look like once it is used up depends on `T`'s `Arbitrary` impl - confirm
+/// the generated rows stay varied enough for the benchmark.
+///
+/// # Panics
+///
+/// Panics if `Unstructured::arbitrary` returns an error for any item.
+fn pseudorandom_sequence<T: for<'a> Arbitrary<'a>>(len: usize) -> Vec<T> {
+    static RAND_BYTES: &[u8; 255] = &[
+        12, 135, 254, 243, 18, 5, 38, 175, 60, 58, 204, 103, 15, 88, 201, 199, 57, 63, 56, 234,
+        106, 111, 238, 119, 214, 50, 110, 89, 129, 185, 112, 115, 35, 239, 188, 189, 49, 184, 194,
+        146, 108, 131, 213, 43, 236, 81, 61, 20, 21, 52, 223, 220, 215, 74, 210, 27, 190, 107, 174,
+        142, 237, 66, 75, 1, 53, 181, 82, 158, 68, 134, 176, 229, 157, 116, 233, 153, 84, 139, 151,
+        8, 171, 59, 105, 242, 40, 69, 94, 170, 4, 187, 212, 156, 65, 90, 192, 216, 29, 222, 122,
+        230, 198, 154, 155, 245, 45, 178, 123, 23, 117, 168, 149, 17, 177, 48, 54, 241, 202, 44,
+        232, 64, 221, 252, 161, 91, 93, 143, 240, 102, 172, 209, 224, 186, 197, 219, 247, 71, 36,
+        101, 133, 113, 6, 137, 231, 162, 31, 7, 22, 138, 47, 136, 2, 244, 141, 173, 99, 25, 95, 96,
+        85, 249, 42, 251, 217, 16, 205, 98, 203, 92, 114, 14, 163, 150, 144, 10, 125, 13, 195, 72,
+        41, 67, 246, 11, 77, 132, 83, 37, 24, 183, 226, 250, 109, 248, 33, 76, 9, 55, 159, 34, 62,
+        196, 87, 3, 39, 28, 166, 167, 255, 206, 79, 191, 228, 193, 179, 97, 182, 148, 73, 120, 211,
+        253, 70, 227, 51, 169, 130, 145, 218, 78, 180, 165, 46, 127, 152, 26, 140, 207, 19, 100,
+        104, 80, 164, 126, 118, 200, 128, 86, 160, 32, 30, 225, 147, 124, 121, 235, 208,
+    ];
+
+    // Stretch the table into a longer stream: for each table byte, emit its wrapping sum
+    // with every table byte in turn.
+    let bytes: Vec<u8> = RAND_BYTES
+        .iter()
+        .flat_map(|i| RAND_BYTES.map(|j| i.wrapping_add(j)))
+        .take(1000 * len)
+        .collect();
+
+    let mut u = Unstructured::new(&bytes);
+
+    // `0..len` already yields exactly `len` items, so no trailing `take(len)` is needed.
+    (0..len)
+        .map(|_| u.arbitrary::<T>().expect("pseudorandom bytes should decode into a value"))
+        .collect()
+}
+
+// Every benchmark function listed here belongs to the `benches` group run by `criterion_main!`.
 criterion_group!(
     benches,
     bench_decode_wide_object,
     bench_serialize_wide_object,
     bench_binary_hex,
     bench_wide_projection,
     bench_decode_list,
-    bench_serialize_list
+    bench_serialize_list,
+    bench_schema_inference
 );
 criterion_main!(benches);