@@ -40,14 +40,14 @@ use crate::util::bit_util::FromBytes;
4040/// A macro to reduce verbosity of [`make_byte_array_dictionary_reader`]
4141macro_rules! make_reader {
4242 (
43- ( $pages: expr, $column_desc: expr, $data_type: expr) => match ( $k: expr, $v: expr) {
43+ ( $pages: expr, $column_desc: expr, $data_type: expr, $batch_size : expr ) => match ( $k: expr, $v: expr) {
4444 $( ( $key_arrow: pat, $value_arrow: pat) => ( $key_type: ty, $value_type: ty) , ) +
4545 }
4646 ) => {
4747 match ( ( $k, $v) ) {
4848 $(
4949 ( $key_arrow, $value_arrow) => {
50- let reader = GenericRecordReader :: new( $column_desc) ;
50+ let reader = GenericRecordReader :: new( $column_desc, $batch_size ) ;
5151 Ok ( Box :: new( ByteArrayDictionaryReader :: <$key_type, $value_type>:: new(
5252 $pages, $data_type, reader,
5353 ) ) )
@@ -73,10 +73,13 @@ macro_rules! make_reader {
7373/// It is therefore recommended that if `pages` contains data from multiple column chunks,
7474/// that the read batch size used is a divisor of the row group size
7575///
76+ /// `batch_size` is used to pre-allocate internal buffers,
77+ /// avoiding reallocations when reading the first batch of data.
7678pub fn make_byte_array_dictionary_reader (
7779 pages : Box < dyn PageIterator > ,
7880 column_desc : ColumnDescPtr ,
7981 arrow_type : Option < ArrowType > ,
82+ batch_size : usize ,
8083) -> Result < Box < dyn ArrayReader > > {
8184 // Check if Arrow type is specified, else create it from Parquet type
8285 let data_type = match arrow_type {
@@ -89,7 +92,7 @@ pub fn make_byte_array_dictionary_reader(
8992 match & data_type {
9093 ArrowType :: Dictionary ( key_type, value_type) => {
9194 make_reader ! {
92- ( pages, column_desc, data_type) => match ( key_type. as_ref( ) , value_type. as_ref( ) ) {
95+ ( pages, column_desc, data_type, batch_size ) => match ( key_type. as_ref( ) , value_type. as_ref( ) ) {
9396 ( ArrowType :: UInt8 , ArrowType :: Binary | ArrowType :: Utf8 | ArrowType :: FixedSizeBinary ( _) ) => ( u8 , i32 ) ,
9497 ( ArrowType :: UInt8 , ArrowType :: LargeBinary | ArrowType :: LargeUtf8 ) => ( u8 , i64 ) ,
9598 ( ArrowType :: Int8 , ArrowType :: Binary | ArrowType :: Utf8 | ArrowType :: FixedSizeBinary ( _) ) => ( i8 , i32 ) ,
@@ -273,7 +276,7 @@ where
273276 }
274277
275278 let len = num_values as usize ;
276- let mut buffer = OffsetBuffer :: < V > :: default ( ) ;
279+ let mut buffer = OffsetBuffer :: < V > :: with_capacity ( 0 ) ;
277280 let mut decoder = ByteArrayDecoderPlain :: new ( buf, len, Some ( len) , self . validate_utf8 ) ;
278281 decoder. read ( & mut buffer, usize:: MAX ) ?;
279282
@@ -426,7 +429,7 @@ mod tests {
426429 . set_data ( Encoding :: RLE_DICTIONARY , encoded, 14 , Some ( data. len ( ) ) )
427430 . unwrap ( ) ;
428431
429- let mut output = DictionaryBuffer :: < i32 , i32 > :: default ( ) ;
432+ let mut output = DictionaryBuffer :: < i32 , i32 > :: with_capacity ( 0 ) ;
430433 assert_eq ! ( decoder. read( & mut output, 3 ) . unwrap( ) , 3 ) ;
431434
432435 let mut valid = vec ! [ false , false , true , true , false , true ] ;
@@ -492,7 +495,7 @@ mod tests {
492495 . set_data ( Encoding :: RLE_DICTIONARY , encoded, 7 , Some ( data. len ( ) ) )
493496 . unwrap ( ) ;
494497
495- let mut output = DictionaryBuffer :: < i32 , i32 > :: default ( ) ;
498+ let mut output = DictionaryBuffer :: < i32 , i32 > :: with_capacity ( 0 ) ;
496499
497500 // read two skip one
498501 assert_eq ! ( decoder. read( & mut output, 2 ) . unwrap( ) , 2 ) ;
@@ -543,7 +546,7 @@ mod tests {
543546 . unwrap ( ) ;
544547
545548 // Read all pages into single buffer
546- let mut output = DictionaryBuffer :: < i32 , i32 > :: default ( ) ;
549+ let mut output = DictionaryBuffer :: < i32 , i32 > :: with_capacity ( 0 ) ;
547550
548551 for ( encoding, page) in pages {
549552 decoder. set_data ( encoding, page, 4 , Some ( 4 ) ) . unwrap ( ) ;
@@ -586,7 +589,7 @@ mod tests {
586589 . unwrap ( ) ;
587590
588591 // Read all pages into single buffer
589- let mut output = DictionaryBuffer :: < i32 , i32 > :: default ( ) ;
592+ let mut output = DictionaryBuffer :: < i32 , i32 > :: with_capacity ( 0 ) ;
590593
591594 for ( encoding, page) in pages {
592595 decoder. set_data ( encoding, page, 4 , Some ( 4 ) ) . unwrap ( ) ;
@@ -650,7 +653,7 @@ mod tests {
650653 . unwrap ( ) ;
651654
652655 for ( encoding, page) in pages. clone ( ) {
653- let mut output = DictionaryBuffer :: < i32 , i32 > :: default ( ) ;
656+ let mut output = DictionaryBuffer :: < i32 , i32 > :: with_capacity ( 0 ) ;
654657 decoder. set_data ( encoding, page, 8 , None ) . unwrap ( ) ;
655658 assert_eq ! ( decoder. read( & mut output, 1024 ) . unwrap( ) , 0 ) ;
656659
@@ -665,7 +668,7 @@ mod tests {
665668 }
666669
667670 for ( encoding, page) in pages {
668- let mut output = DictionaryBuffer :: < i32 , i32 > :: default ( ) ;
671+ let mut output = DictionaryBuffer :: < i32 , i32 > :: with_capacity ( 0 ) ;
669672 decoder. set_data ( encoding, page, 8 , None ) . unwrap ( ) ;
670673 assert_eq ! ( decoder. skip_values( 1024 ) . unwrap( ) , 0 ) ;
671674
0 commit comments