// specific language governing permissions and limitations
// under the License.
+use rand::Rng;
+use thrift::protocol::TCompactOutputProtocol;
+
+use arrow::util::test_util::seedable_rng;
use bytes::Bytes;
use criterion::*;
use parquet::file::reader::SerializedFileReader;
use parquet::file::serialized_reader::ReadOptionsBuilder;
+use parquet::format::{
+    ColumnChunk, ColumnMetaData, CompressionCodec, Encoding, FieldRepetitionType, FileMetaData,
+    RowGroup, SchemaElement, Type,
+};
+use parquet::thrift::TSerializable;
+
+const NUM_COLUMNS: usize = 10_000;
+const NUM_ROW_GROUPS: usize = 10;
+
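+/// Builds a synthetic "wide" thrift `FileMetaData`: NUM_COLUMNS required FLOAT columns
+/// spread over NUM_ROW_GROUPS row groups, filled with random values, and returns it
+/// encoded with the thrift compact protocol.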
+fn encoded_meta() -> Vec<u8> {
+    let mut rng = seedable_rng();
+
+    let mut schema = Vec::with_capacity(NUM_COLUMNS + 1);
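+    // Root schema element: a group node whose children are the NUM_COLUMNS leaf columns.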
+    schema.push(SchemaElement {
+        type_: None,
+        type_length: None,
+        repetition_type: None,
+        name: Default::default(),
+        num_children: Some(NUM_COLUMNS as _),
+        converted_type: None,
+        scale: None,
+        precision: None,
+        field_id: None,
+        logical_type: None,
+    });
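+    // One leaf element per column, all required FLOATs.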
+    for i in 0..NUM_COLUMNS {
+        schema.push(SchemaElement {
+            type_: Some(Type::FLOAT),
+            type_length: None,
+            repetition_type: Some(FieldRepetitionType::REQUIRED),
+            name: i.to_string(),
+            num_children: None,
+            converted_type: None,
+            scale: None,
+            precision: None,
+            field_id: None,
+            logical_type: None,
+        })
+    }
+
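+    // Minimal statistics with random 8-byte min/max values, reused for every column chunk.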
+    let stats = parquet::format::Statistics {
+        min: None,
+        max: None,
+        null_count: Some(0),
+        distinct_count: None,
+        max_value: Some(vec![rng.random(); 8]),
+        min_value: Some(vec![rng.random(); 8]),
+        is_max_value_exact: Some(true),
+        is_min_value_exact: Some(true),
+    };
+
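+    // Each row group holds NUM_COLUMNS column chunks with random offsets and sizes.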
+    let row_groups = (0..NUM_ROW_GROUPS)
+        .map(|i| {
+            let columns = (0..NUM_COLUMNS)
+                .map(|_| ColumnChunk {
+                    file_path: None,
+                    file_offset: 0,
+                    meta_data: Some(ColumnMetaData {
+                        type_: Type::FLOAT,
+                        encodings: vec![Encoding::PLAIN, Encoding::RLE_DICTIONARY],
+                        path_in_schema: vec![],
+                        codec: CompressionCodec::UNCOMPRESSED,
+                        num_values: rng.random(),
+                        total_uncompressed_size: rng.random(),
+                        total_compressed_size: rng.random(),
+                        key_value_metadata: None,
+                        data_page_offset: rng.random(),
+                        index_page_offset: Some(rng.random()),
+                        dictionary_page_offset: Some(rng.random()),
+                        statistics: Some(stats.clone()),
+                        encoding_stats: None,
+                        bloom_filter_offset: None,
+                        bloom_filter_length: None,
+                        size_statistics: None,
+                        geospatial_statistics: None,
+                    }),
+                    offset_index_offset: Some(rng.random()),
+                    offset_index_length: Some(rng.random()),
+                    column_index_offset: Some(rng.random()),
+                    column_index_length: Some(rng.random()),
+                    crypto_metadata: None,
+                    encrypted_column_metadata: None,
+                })
+                .collect();
+
+            RowGroup {
+                columns,
+                total_byte_size: rng.random(),
+                num_rows: rng.random(),
+                sorting_columns: None,
+                file_offset: None,
+                total_compressed_size: Some(rng.random()),
+                ordinal: Some(i as _),
+            }
+        })
+        .collect();
+
+    let file = FileMetaData {
+        schema,
+        row_groups,
+        version: 1,
+        num_rows: rng.random(),
+        key_value_metadata: None,
+        created_by: Some("parquet-rs".into()),
+        column_orders: None,
+        encryption_algorithm: None,
+        footer_signing_key_metadata: None,
+    };
+
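+    // Serialize with the thrift compact protocol, the same encoding used in a real parquet footer.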
+    let mut buf = Vec::with_capacity(1024);
+    {
+        let mut out = TCompactOutputProtocol::new(&mut buf);
+        file.write_to_out_protocol(&mut out).unwrap();
+    }
+    buf
+}
+
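+/// Extracts the thrift-encoded footer metadata from a complete parquet file: the last
+/// 8 bytes are a 4-byte little-endian metadata length followed by the `PAR1` magic,
+/// and the metadata itself immediately precedes them.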
+fn get_footer_bytes(data: Bytes) -> Bytes {
+    let footer_bytes = data.slice(data.len() - 8..);
+    let footer_len = footer_bytes[0] as u32
+        | (footer_bytes[1] as u32) << 8
+        | (footer_bytes[2] as u32) << 16
+        | (footer_bytes[3] as u32) << 24;
+    let meta_start = data.len() - footer_len as usize - 8;
+    let meta_end = data.len() - 8;
+    data.slice(meta_start..meta_end)
+}
fn criterion_benchmark(c: &mut Criterion) {
    // Read file into memory to isolate filesystem performance
@@ -36,6 +167,20 @@ fn criterion_benchmark(c: &mut Criterion) {
            SerializedFileReader::new_with_options(data.clone(), options).unwrap()
        })
    });
+
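+    // Decode the real footer metadata sliced from the in-memory test file.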
+    let meta_data = get_footer_bytes(data);
+    c.bench_function("decode file metadata", |b| {
+        b.iter(|| {
+            parquet::thrift::bench_file_metadata(&meta_data);
+        })
+    });
+
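+    // Decode the synthetic wide metadata (NUM_COLUMNS columns x NUM_ROW_GROUPS row groups).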
+    let buf = black_box(encoded_meta()).into();
+    c.bench_function("decode file metadata (wide)", |b| {
+        b.iter(|| {
+            parquet::thrift::bench_file_metadata(&buf);
+        })
+    });
}

criterion_group!(benches, criterion_benchmark);