Skip to content

Commit a3d144f

Browse files
authored
Add more benchmarks for Parquet thrift decoding (#8037)
# Which issue does this PR close?

- Part of #5854.

# Rationale for this change

Before embarking on radical changes to the thrift processing in the parquet crate, add a few more benchmarks to help evaluate the performance gains.

# What changes are included in this PR?

Adds a test originally written by @tustvold (https://github.com/tustvold/arrow-rs/tree/thrift-bench) and exposes a new public function in the thrift module rather than making the thrift parser public.

# Are these changes tested?

Benchmark code only, so no tests necessary.

# Are there any user-facing changes?

No, but adds a public function.
1 parent a9b6077 commit a3d144f

File tree

2 files changed

+151
-0
lines changed

2 files changed

+151
-0
lines changed

parquet/benches/metadata.rs

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,141 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use rand::Rng;
19+
use thrift::protocol::TCompactOutputProtocol;
20+
21+
use arrow::util::test_util::seedable_rng;
1822
use bytes::Bytes;
1923
use criterion::*;
2024
use parquet::file::reader::SerializedFileReader;
2125
use parquet::file::serialized_reader::ReadOptionsBuilder;
26+
use parquet::format::{
27+
ColumnChunk, ColumnMetaData, CompressionCodec, Encoding, FieldRepetitionType, FileMetaData,
28+
RowGroup, SchemaElement, Type,
29+
};
30+
use parquet::thrift::TSerializable;
31+
32+
const NUM_COLUMNS: usize = 10_000;
33+
const NUM_ROW_GROUPS: usize = 10;
34+
35+
fn encoded_meta() -> Vec<u8> {
36+
let mut rng = seedable_rng();
37+
38+
let mut schema = Vec::with_capacity(NUM_COLUMNS + 1);
39+
schema.push(SchemaElement {
40+
type_: None,
41+
type_length: None,
42+
repetition_type: None,
43+
name: Default::default(),
44+
num_children: Some(NUM_COLUMNS as _),
45+
converted_type: None,
46+
scale: None,
47+
precision: None,
48+
field_id: None,
49+
logical_type: None,
50+
});
51+
for i in 0..NUM_COLUMNS {
52+
schema.push(SchemaElement {
53+
type_: Some(Type::FLOAT),
54+
type_length: None,
55+
repetition_type: Some(FieldRepetitionType::REQUIRED),
56+
name: i.to_string(),
57+
num_children: None,
58+
converted_type: None,
59+
scale: None,
60+
precision: None,
61+
field_id: None,
62+
logical_type: None,
63+
})
64+
}
65+
66+
let stats = parquet::format::Statistics {
67+
min: None,
68+
max: None,
69+
null_count: Some(0),
70+
distinct_count: None,
71+
max_value: Some(vec![rng.random(); 8]),
72+
min_value: Some(vec![rng.random(); 8]),
73+
is_max_value_exact: Some(true),
74+
is_min_value_exact: Some(true),
75+
};
76+
77+
let row_groups = (0..NUM_ROW_GROUPS)
78+
.map(|i| {
79+
let columns = (0..NUM_COLUMNS)
80+
.map(|_| ColumnChunk {
81+
file_path: None,
82+
file_offset: 0,
83+
meta_data: Some(ColumnMetaData {
84+
type_: Type::FLOAT,
85+
encodings: vec![Encoding::PLAIN, Encoding::RLE_DICTIONARY],
86+
path_in_schema: vec![],
87+
codec: CompressionCodec::UNCOMPRESSED,
88+
num_values: rng.random(),
89+
total_uncompressed_size: rng.random(),
90+
total_compressed_size: rng.random(),
91+
key_value_metadata: None,
92+
data_page_offset: rng.random(),
93+
index_page_offset: Some(rng.random()),
94+
dictionary_page_offset: Some(rng.random()),
95+
statistics: Some(stats.clone()),
96+
encoding_stats: None,
97+
bloom_filter_offset: None,
98+
bloom_filter_length: None,
99+
size_statistics: None,
100+
geospatial_statistics: None,
101+
}),
102+
offset_index_offset: Some(rng.random()),
103+
offset_index_length: Some(rng.random()),
104+
column_index_offset: Some(rng.random()),
105+
column_index_length: Some(rng.random()),
106+
crypto_metadata: None,
107+
encrypted_column_metadata: None,
108+
})
109+
.collect();
110+
111+
RowGroup {
112+
columns,
113+
total_byte_size: rng.random(),
114+
num_rows: rng.random(),
115+
sorting_columns: None,
116+
file_offset: None,
117+
total_compressed_size: Some(rng.random()),
118+
ordinal: Some(i as _),
119+
}
120+
})
121+
.collect();
122+
123+
let file = FileMetaData {
124+
schema,
125+
row_groups,
126+
version: 1,
127+
num_rows: rng.random(),
128+
key_value_metadata: None,
129+
created_by: Some("parquet-rs".into()),
130+
column_orders: None,
131+
encryption_algorithm: None,
132+
footer_signing_key_metadata: None,
133+
};
134+
135+
let mut buf = Vec::with_capacity(1024);
136+
{
137+
let mut out = TCompactOutputProtocol::new(&mut buf);
138+
file.write_to_out_protocol(&mut out).unwrap();
139+
}
140+
buf
141+
}
142+
143+
/// Returns the slice of `data` that sits immediately before its final 8 bytes
/// and whose length is given by those final bytes.
///
/// The first 4 of the trailing 8 bytes are read as a little-endian `u32`
/// length; the remaining 4 bytes are not inspected here.
///
/// # Panics
///
/// Panics if `data` is shorter than 8 bytes, or if the encoded length does not
/// fit in front of the trailing 8 bytes.
fn get_footer_bytes(data: Bytes) -> Bytes {
    const FOOTER_SIZE: usize = 8;

    let footer_start = data.len() - FOOTER_SIZE;
    let footer = data.slice(footer_start..);
    let metadata_len =
        u32::from_le_bytes([footer[0], footer[1], footer[2], footer[3]]) as usize;

    let metadata_start = footer_start - metadata_len;
    data.slice(metadata_start..footer_start)
}
22153

23154
fn criterion_benchmark(c: &mut Criterion) {
24155
// Read file into memory to isolate filesystem performance
@@ -36,6 +167,20 @@ fn criterion_benchmark(c: &mut Criterion) {
36167
SerializedFileReader::new_with_options(data.clone(), options).unwrap()
37168
})
38169
});
170+
171+
let meta_data = get_footer_bytes(data);
172+
c.bench_function("decode file metadata", |b| {
173+
b.iter(|| {
174+
parquet::thrift::bench_file_metadata(&meta_data);
175+
})
176+
});
177+
178+
let buf = black_box(encoded_meta()).into();
179+
c.bench_function("decode file metadata (wide)", |b| {
180+
b.iter(|| {
181+
parquet::thrift::bench_file_metadata(&buf);
182+
})
183+
});
39184
}
40185

41186
criterion_group!(benches, criterion_benchmark);

parquet/src/thrift.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ pub trait TSerializable: Sized {
3333
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> thrift::Result<()>;
3434
}
3535

36+
/// Public function to aid benchmarking.
37+
pub fn bench_file_metadata(bytes: &bytes::Bytes) {
38+
let mut input = TCompactSliceInputProtocol::new(bytes);
39+
crate::format::FileMetaData::read_from_in_protocol(&mut input).unwrap();
40+
}
41+
3642
/// A more performant implementation of [`TCompactInputProtocol`] that reads a slice
3743
///
3844
/// [`TCompactInputProtocol`]: thrift::protocol::TCompactInputProtocol

0 commit comments

Comments
 (0)