Skip to content

Commit 097c203

Browse files
authored
Add some benchmarks for decoding delta encoded Parquet (#9500)
# Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. -->

- Part of #9476.

# Rationale for this change

Add benchmarks to show the benefit of the optimizations in #9477.

# What changes are included in this PR?

Adds some benchmarks for DELTA_BINARY_PACKED, DELTA_BYTE_ARRAY, and DELTA_LENGTH_BYTE_ARRAY. The generated data is meant to show the benefit of special-casing miniblocks with a bit width of 0.

# Are these changes tested?

Just benches.

# Are there any user-facing changes?

No.
1 parent 5ba4515 commit 097c203

File tree

1 file changed

+251
-0
lines changed

1 file changed

+251
-0
lines changed

parquet/benches/arrow_reader.rs

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,58 @@ where
326326
InMemoryPageIterator::new(pages)
327327
}
328328

329+
fn build_delta_encoded_incr_primitive_page_iterator<T>(
330+
column_desc: ColumnDescPtr,
331+
null_density: f32,
332+
increment: usize,
333+
stepped: bool,
334+
) -> impl PageIterator + Clone
335+
where
336+
T: parquet::data_type::DataType,
337+
T::T: SampleUniform + FromPrimitive,
338+
{
339+
let max_def_level = column_desc.max_def_level();
340+
let max_rep_level = column_desc.max_rep_level();
341+
let rep_levels = vec![0; VALUES_PER_PAGE];
342+
let mut rng = seedable_rng();
343+
let mut pages: Vec<Vec<parquet::column::page::Page>> = Vec::new();
344+
let mut running_val: usize = 1;
345+
for _i in 0..NUM_ROW_GROUPS {
346+
let mut column_chunk_pages = Vec::new();
347+
for _j in 0..PAGES_PER_GROUP {
348+
// generate page
349+
let mut values = Vec::with_capacity(VALUES_PER_PAGE);
350+
let mut def_levels = Vec::with_capacity(VALUES_PER_PAGE);
351+
for k in 0..VALUES_PER_PAGE {
352+
let def_level = if rng.random::<f32>() < null_density {
353+
max_def_level - 1
354+
} else {
355+
max_def_level
356+
};
357+
if def_level == max_def_level {
358+
let value = FromPrimitive::from_usize(running_val).unwrap();
359+
running_val = if !stepped || k % 2 == 1 {
360+
running_val + increment
361+
} else {
362+
running_val
363+
};
364+
values.push(value);
365+
}
366+
def_levels.push(def_level);
367+
}
368+
let mut page_builder =
369+
DataPageBuilderImpl::new(column_desc.clone(), values.len() as u32, true);
370+
page_builder.add_rep_levels(max_rep_level, &rep_levels);
371+
page_builder.add_def_levels(max_def_level, &def_levels);
372+
page_builder.add_values::<T>(Encoding::DELTA_BINARY_PACKED, &values);
373+
column_chunk_pages.push(page_builder.consume());
374+
}
375+
pages.push(column_chunk_pages);
376+
}
377+
378+
InMemoryPageIterator::new(pages)
379+
}
380+
329381
fn build_dictionary_encoded_primitive_page_iterator<T>(
330382
column_desc: ColumnDescPtr,
331383
null_density: f32,
@@ -439,6 +491,52 @@ fn build_plain_encoded_byte_array_page_iterator_inner(
439491
InMemoryPageIterator::new(pages)
440492
}
441493

494+
fn build_constant_prefix_byte_array_page_iterator(
495+
column_desc: ColumnDescPtr,
496+
null_density: f32,
497+
encoding: Encoding,
498+
const_string: bool,
499+
) -> impl PageIterator + Clone {
500+
let max_def_level = column_desc.max_def_level();
501+
let max_rep_level = column_desc.max_rep_level();
502+
let rep_levels = vec![0; VALUES_PER_PAGE];
503+
let mut rng = seedable_rng();
504+
let mut pages: Vec<Vec<parquet::column::page::Page>> = Vec::new();
505+
for i in 0..NUM_ROW_GROUPS {
506+
let mut column_chunk_pages = Vec::new();
507+
for j in 0..PAGES_PER_GROUP {
508+
// generate page
509+
let mut values = Vec::with_capacity(VALUES_PER_PAGE);
510+
let mut def_levels = Vec::with_capacity(VALUES_PER_PAGE);
511+
for k in 0..VALUES_PER_PAGE {
512+
let def_level = if rng.random::<f32>() < null_density {
513+
max_def_level - 1
514+
} else {
515+
max_def_level
516+
};
517+
if def_level == max_def_level {
518+
let string_value = if const_string {
519+
"01234567890123456789012345678901".to_string()
520+
} else {
521+
format!("01234567890123456789012345678901:{:x}{j}{i}", (k % 16))
522+
};
523+
values.push(parquet::data_type::ByteArray::from(string_value.as_str()));
524+
}
525+
def_levels.push(def_level);
526+
}
527+
let mut page_builder =
528+
DataPageBuilderImpl::new(column_desc.clone(), values.len() as u32, true);
529+
page_builder.add_rep_levels(max_rep_level, &rep_levels);
530+
page_builder.add_def_levels(max_def_level, &def_levels);
531+
page_builder.add_values::<ByteArrayType>(encoding, &values);
532+
column_chunk_pages.push(page_builder.consume());
533+
}
534+
pages.push(column_chunk_pages);
535+
}
536+
537+
InMemoryPageIterator::new(pages)
538+
}
539+
442540
fn build_plain_encoded_byte_array_page_iterator(
443541
column_desc: ColumnDescPtr,
444542
null_density: f32,
@@ -1094,6 +1192,99 @@ fn bench_primitive<T>(
10941192
assert_eq!(count, EXPECTED_VALUE_COUNT);
10951193
});
10961194

1195+
// binary packed same value
1196+
let data = build_delta_encoded_incr_primitive_page_iterator::<T>(
1197+
mandatory_column_desc.clone(),
1198+
0.0,
1199+
0,
1200+
false,
1201+
);
1202+
group.bench_function("binary packed single value", |b| {
1203+
b.iter(|| {
1204+
let array_reader =
1205+
create_primitive_array_reader(data.clone(), mandatory_column_desc.clone());
1206+
count = bench_array_reader(array_reader);
1207+
});
1208+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1209+
});
1210+
1211+
let data = build_delta_encoded_incr_primitive_page_iterator::<T>(
1212+
mandatory_column_desc.clone(),
1213+
0.0,
1214+
0,
1215+
false,
1216+
);
1217+
group.bench_function("binary packed skip single value", |b| {
1218+
b.iter(|| {
1219+
let array_reader =
1220+
create_primitive_array_reader(data.clone(), mandatory_column_desc.clone());
1221+
count = bench_array_reader_skip(array_reader);
1222+
});
1223+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1224+
});
1225+
1226+
// binary packed monotonically increasing
1227+
let data = build_delta_encoded_incr_primitive_page_iterator::<T>(
1228+
mandatory_column_desc.clone(),
1229+
0.0,
1230+
1,
1231+
false,
1232+
);
1233+
group.bench_function("binary packed increasing value", |b| {
1234+
b.iter(|| {
1235+
let array_reader =
1236+
create_primitive_array_reader(data.clone(), mandatory_column_desc.clone());
1237+
count = bench_array_reader(array_reader);
1238+
});
1239+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1240+
});
1241+
1242+
let data = build_delta_encoded_incr_primitive_page_iterator::<T>(
1243+
mandatory_column_desc.clone(),
1244+
0.0,
1245+
1,
1246+
false,
1247+
);
1248+
group.bench_function("binary packed skip increasing value", |b| {
1249+
b.iter(|| {
1250+
let array_reader =
1251+
create_primitive_array_reader(data.clone(), mandatory_column_desc.clone());
1252+
count = bench_array_reader_skip(array_reader);
1253+
});
1254+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1255+
});
1256+
1257+
// binary packed increasing stepped
1258+
let data = build_delta_encoded_incr_primitive_page_iterator::<T>(
1259+
mandatory_column_desc.clone(),
1260+
0.0,
1261+
1,
1262+
true,
1263+
);
1264+
group.bench_function("binary packed stepped increasing value", |b| {
1265+
b.iter(|| {
1266+
let array_reader =
1267+
create_primitive_array_reader(data.clone(), mandatory_column_desc.clone());
1268+
count = bench_array_reader(array_reader);
1269+
});
1270+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1271+
});
1272+
1273+
let data = build_delta_encoded_incr_primitive_page_iterator::<T>(
1274+
mandatory_column_desc.clone(),
1275+
0.0,
1276+
1,
1277+
true,
1278+
);
1279+
group.bench_function("binary packed skip stepped increasing value", |b| {
1280+
b.iter(|| {
1281+
let array_reader =
1282+
create_primitive_array_reader(data.clone(), mandatory_column_desc.clone());
1283+
count = bench_array_reader_skip(array_reader);
1284+
});
1285+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1286+
});
1287+
10971288
// dictionary encoded, no NULLs
10981289
let data =
10991290
build_dictionary_encoded_primitive_page_iterator::<T>(mandatory_column_desc.clone(), 0.0);
@@ -1594,6 +1785,66 @@ fn add_benches(c: &mut Criterion) {
15941785
assert_eq!(count, EXPECTED_VALUE_COUNT);
15951786
});
15961787

1788+
// delta byte array with constant prefix and suffix lengths
1789+
let delta_string_const_prefix_no_null_data = build_constant_prefix_byte_array_page_iterator(
1790+
mandatory_string_column_desc.clone(),
1791+
0.0,
1792+
Encoding::DELTA_BYTE_ARRAY,
1793+
false,
1794+
);
1795+
group.bench_function(
1796+
"const prefix delta byte array encoded, mandatory, no NULLs",
1797+
|b| {
1798+
b.iter(|| {
1799+
let array_reader = create_byte_array_reader(
1800+
delta_string_const_prefix_no_null_data.clone(),
1801+
mandatory_string_column_desc.clone(),
1802+
);
1803+
count = bench_array_reader(array_reader);
1804+
});
1805+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1806+
},
1807+
);
1808+
1809+
// delta byte array with constant prefix and no suffix
1810+
let delta_string_const_no_null_data = build_constant_prefix_byte_array_page_iterator(
1811+
mandatory_string_column_desc.clone(),
1812+
0.0,
1813+
Encoding::DELTA_BYTE_ARRAY,
1814+
true,
1815+
);
1816+
group.bench_function("const delta byte array encoded, mandatory, no NULLs", |b| {
1817+
b.iter(|| {
1818+
let array_reader = create_byte_array_reader(
1819+
delta_string_const_no_null_data.clone(),
1820+
mandatory_string_column_desc.clone(),
1821+
);
1822+
count = bench_array_reader(array_reader);
1823+
});
1824+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1825+
});
1826+
1827+
// delta length byte array with constant lengths
1828+
let delta_string_const_no_null_data = build_constant_prefix_byte_array_page_iterator(
1829+
mandatory_string_column_desc.clone(),
1830+
0.0,
1831+
Encoding::DELTA_LENGTH_BYTE_ARRAY,
1832+
true,
1833+
);
1834+
group.bench_function(
1835+
"const delta length byte array encoded, mandatory, no NULLs",
1836+
|b| {
1837+
b.iter(|| {
1838+
let array_reader = create_byte_array_reader(
1839+
delta_string_const_no_null_data.clone(),
1840+
mandatory_string_column_desc.clone(),
1841+
);
1842+
count = bench_array_reader(array_reader);
1843+
});
1844+
assert_eq!(count, EXPECTED_VALUE_COUNT);
1845+
},
1846+
);
1847+
15971848
group.finish();
15981849

15991850
// binary benchmarks

0 commit comments

Comments
 (0)