@@ -42,11 +42,11 @@ use arrow_array::{Array, ArrayRef};
4242use arrow_schema:: { DataType , Schema , SchemaRef , TimeUnit } ;
4343use datafusion_common:: hash_utils:: create_hashes;
4444use datafusion_common:: { not_impl_err, DataFusionError , Result } ;
45- use datafusion_execution:: memory_pool:: proxy:: { RawTableAllocExt , VecAllocExt } ;
45+ use datafusion_execution:: memory_pool:: proxy:: { HashTableAllocExt , VecAllocExt } ;
4646use datafusion_expr:: EmitTo ;
4747use datafusion_physical_expr:: binary_map:: OutputType ;
4848
49- use hashbrown:: raw :: RawTable ;
49+ use hashbrown:: hash_table :: HashTable ;
5050
5151const NON_INLINED_FLAG : u64 = 0x8000000000000000 ;
5252const VALUE_MASK : u64 = 0x7FFFFFFFFFFFFFFF ;
@@ -180,7 +180,7 @@ pub struct GroupValuesColumn<const STREAMING: bool> {
180180 /// And we use [`GroupIndexView`] to represent such `group indices` in table.
181181 ///
182182 ///
183- map : RawTable < ( u64 , GroupIndexView ) > ,
183+ map : HashTable < ( u64 , GroupIndexView ) > ,
184184
185185 /// The size of `map` in bytes
186186 map_size : usize ,
@@ -261,7 +261,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
261261
262262 /// Create a new instance of GroupValuesColumn if supported for the specified schema
263263 pub fn try_new ( schema : SchemaRef ) -> Result < Self > {
264- let map = RawTable :: with_capacity ( 0 ) ;
264+ let map = HashTable :: with_capacity ( 0 ) ;
265265 Ok ( Self {
266266 schema,
267267 map,
@@ -338,7 +338,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
338338 for ( row, & target_hash) in batch_hashes. iter ( ) . enumerate ( ) {
339339 let entry = self
340340 . map
341- . get_mut ( target_hash, |( exist_hash, group_idx_view) | {
341+ . find_mut ( target_hash, |( exist_hash, group_idx_view) | {
342342 // It is ensured to be inlined in `scalarized_intern`
343343 debug_assert ! ( !group_idx_view. is_non_inlined( ) ) ;
344344
@@ -506,7 +506,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
506506 for ( row, & target_hash) in batch_hashes. iter ( ) . enumerate ( ) {
507507 let entry = self
508508 . map
509- . get ( target_hash, |( exist_hash, _) | target_hash == * exist_hash) ;
509+ . find ( target_hash, |( exist_hash, _) | target_hash == * exist_hash) ;
510510
511511 let Some ( ( _, group_index_view) ) = entry else {
512512 // 1. Bucket not found case
@@ -733,7 +733,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
733733
734734 for & row in & self . vectorized_operation_buffers . remaining_row_indices {
735735 let target_hash = batch_hashes[ row] ;
736- let entry = map. get_mut ( target_hash, |( exist_hash, _) | {
736+ let entry = map. find_mut ( target_hash, |( exist_hash, _) | {
737737 // Somewhat surprisingly, this closure can be called even if the
738738 // hash doesn't match, so check the hash first with an integer
739739 // comparison first avoid the more expensive comparison with
@@ -852,7 +852,7 @@ impl<const STREAMING: bool> GroupValuesColumn<STREAMING> {
852852 /// Return group indices of the hash, also if its `group_index_view` is non-inlined
853853 #[ cfg( test) ]
854854 fn get_indices_by_hash ( & self , hash : u64 ) -> Option < ( Vec < usize > , GroupIndexView ) > {
855- let entry = self . map . get ( hash, |( exist_hash, _) | hash == * exist_hash) ;
855+ let entry = self . map . find ( hash, |( exist_hash, _) | hash == * exist_hash) ;
856856
857857 match entry {
858858 Some ( ( _, group_index_view) ) => {
@@ -1091,67 +1091,63 @@ impl<const STREAMING: bool> GroupValues for GroupValuesColumn<STREAMING> {
10911091 . collect :: < Vec < _ > > ( ) ;
10921092 let mut next_new_list_offset = 0 ;
10931093
1094- // SAFETY: self.map outlives iterator and is not modified concurrently
1095- unsafe {
1096- for bucket in self . map . iter ( ) {
1097- // In non-streaming case, we need to check if the `group index view`
1098- // is `inlined` or `non-inlined`
1099- if !STREAMING && bucket. as_ref ( ) . 1 . is_non_inlined ( ) {
1100- // Non-inlined case
1101- // We take `group_index_list` from `old_group_index_lists`
1102-
1103- // list_offset is incrementally
1104- self . emit_group_index_list_buffer . clear ( ) ;
1105- let list_offset = bucket. as_ref ( ) . 1 . value ( ) as usize ;
1106- for group_index in self . group_index_lists [ list_offset] . iter ( )
1107- {
1108- if let Some ( remaining) = group_index. checked_sub ( n) {
1109- self . emit_group_index_list_buffer . push ( remaining) ;
1110- }
1094+ self . map . retain ( |( _exist_hash, group_idx_view) | {
1095+ // In non-streaming case, we need to check if the `group index view`
1096+ // is `inlined` or `non-inlined`
1097+ if !STREAMING && group_idx_view. is_non_inlined ( ) {
1098+ // Non-inlined case
1099+ // We take `group_index_list` from `old_group_index_lists`
1100+
1101+ // list_offset is incrementally
1102+ self . emit_group_index_list_buffer . clear ( ) ;
1103+ let list_offset = group_idx_view. value ( ) as usize ;
1104+ for group_index in self . group_index_lists [ list_offset] . iter ( ) {
1105+ if let Some ( remaining) = group_index. checked_sub ( n) {
1106+ self . emit_group_index_list_buffer . push ( remaining) ;
11111107 }
1112-
1113- // The possible results:
1114- // - `new_group_index_list` is empty, we should erase this bucket
1115- // - only one value in `new_group_index_list`, switch the `view` to `inlined`
1116- // - still multiple values in `new_group_index_list`, build and set the new `unlined view`
1117- if self . emit_group_index_list_buffer . is_empty ( ) {
1118- self . map . erase ( bucket) ;
1119- } else if self . emit_group_index_list_buffer . len ( ) == 1 {
1120- let group_index =
1121- self . emit_group_index_list_buffer . first ( ) . unwrap ( ) ;
1122- bucket. as_mut ( ) . 1 =
1123- GroupIndexView :: new_inlined ( * group_index as u64 ) ;
1124- } else {
1125- let group_index_list =
1126- & mut self . group_index_lists [ next_new_list_offset] ;
1127- group_index_list. clear ( ) ;
1128- group_index_list
1129- . extend ( self . emit_group_index_list_buffer . iter ( ) ) ;
1130- bucket. as_mut ( ) . 1 = GroupIndexView :: new_non_inlined (
1131- next_new_list_offset as u64 ,
1132- ) ;
1133- next_new_list_offset += 1 ;
1134- }
1135-
1136- continue ;
11371108 }
11381109
1110+ // The possible results:
1111+ // - `new_group_index_list` is empty, we should erase this bucket
1112+ // - only one value in `new_group_index_list`, switch the `view` to `inlined`
1113+ // - still multiple values in `new_group_index_list`, build and set the new `unlined view`
1114+ if self . emit_group_index_list_buffer . is_empty ( ) {
1115+ false
1116+ } else if self . emit_group_index_list_buffer . len ( ) == 1 {
1117+ let group_index =
1118+ self . emit_group_index_list_buffer . first ( ) . unwrap ( ) ;
1119+ * group_idx_view =
1120+ GroupIndexView :: new_inlined ( * group_index as u64 ) ;
1121+ true
1122+ } else {
1123+ let group_index_list =
1124+ & mut self . group_index_lists [ next_new_list_offset] ;
1125+ group_index_list. clear ( ) ;
1126+ group_index_list
1127+ . extend ( self . emit_group_index_list_buffer . iter ( ) ) ;
1128+ * group_idx_view = GroupIndexView :: new_non_inlined (
1129+ next_new_list_offset as u64 ,
1130+ ) ;
1131+ next_new_list_offset += 1 ;
1132+ true
1133+ }
1134+ } else {
11391135 // In `streaming case`, the `group index view` is ensured to be `inlined`
1140- debug_assert ! ( !bucket . as_ref ( ) . 1 . is_non_inlined( ) ) ;
1136+ debug_assert ! ( !group_idx_view . is_non_inlined( ) ) ;
11411137
11421138 // Inlined case, we just decrement group index by n)
1143- let group_index = bucket . as_ref ( ) . 1 . value ( ) as usize ;
1139+ let group_index = group_idx_view . value ( ) as usize ;
11441140 match group_index. checked_sub ( n) {
11451141 // Group index was >= n, shift value down
11461142 Some ( sub) => {
1147- bucket . as_mut ( ) . 1 =
1148- GroupIndexView :: new_inlined ( sub as u64 )
1143+ * group_idx_view = GroupIndexView :: new_inlined ( sub as u64 ) ;
1144+ true
11491145 }
11501146 // Group index was < n, so remove from table
1151- None => self . map . erase ( bucket ) ,
1147+ None => false ,
11521148 }
11531149 }
1154- }
1150+ } ) ;
11551151
11561152 if !STREAMING {
11571153 self . group_index_lists . truncate ( next_new_list_offset) ;
@@ -1243,7 +1239,7 @@ mod tests {
12431239 use arrow:: { compute:: concat_batches, util:: pretty:: pretty_format_batches} ;
12441240 use arrow_array:: { ArrayRef , Int64Array , RecordBatch , StringArray , StringViewArray } ;
12451241 use arrow_schema:: { DataType , Field , Schema , SchemaRef } ;
1246- use datafusion_common:: utils:: proxy:: RawTableAllocExt ;
1242+ use datafusion_common:: utils:: proxy:: HashTableAllocExt ;
12471243 use datafusion_expr:: EmitTo ;
12481244
12491245 use crate :: aggregates:: group_values:: {
0 commit comments