@@ -109,7 +109,10 @@ impl VTable for DictVTable {
109109 let values = children. get ( 1 , dtype, metadata. values_len as usize ) ?;
110110 let all_values_referenced = metadata. all_values_referenced . unwrap_or ( false ) ;
111111
112- DictArray :: try_new_with_metadata ( codes, values, all_values_referenced)
112+ // SAFETY: We've validated the metadata and children
113+ Ok ( unsafe {
114+ DictArray :: new_unchecked ( codes, values) . set_all_values_referenced ( all_values_referenced)
115+ } )
113116 }
114117}
115118
@@ -135,11 +138,7 @@ impl DictArray {
135138 /// This should be called only when you can guarantee the invariants checked
136139 /// by the safe [`DictArray::try_new`] constructor are valid, for example when
137140 /// you are filtering or slicing an existing valid `DictArray`.
138- pub unsafe fn new_unchecked (
139- codes : ArrayRef ,
140- values : ArrayRef ,
141- all_values_referenced : bool ,
142- ) -> Self {
141+ pub unsafe fn new_unchecked ( codes : ArrayRef , values : ArrayRef ) -> Self {
143142 let dtype = values
144143 . dtype ( )
145144 . union_nullability ( codes. dtype ( ) . nullability ( ) ) ;
@@ -148,8 +147,35 @@ impl DictArray {
148147 values,
149148 stats_set : Default :: default ( ) ,
150149 dtype,
151- all_values_referenced,
150+ all_values_referenced : false ,
151+ }
152+ }
153+
154+ /// Set whether all dictionary values are definitely referenced.
155+ ///
156+ /// # Safety
157+ /// The caller must ensure that when setting `all_values_referenced = true`, ALL dictionary
158+ /// values are actually referenced by at least one valid code. Setting this incorrectly can
159+ /// lead to incorrect query results in operations like min/max.
160+ ///
161+ /// This is typically only set to `true` during dictionary encoding when we know for certain
162+ /// that all values are referenced.
163+ pub unsafe fn set_all_values_referenced ( mut self , all_values_referenced : bool ) -> Self {
164+ // In debug builds, verify the claim when setting to true
165+ #[ cfg( debug_assertions) ]
166+ if all_values_referenced {
167+ if let Ok ( unreferenced_mask) = self . compute_unreferenced_values_mask ( false ) {
168+ let has_unreferenced = unreferenced_mask. iter ( ) . any ( |b| b) ;
169+ debug_assert ! (
170+ !has_unreferenced,
171+ "set_all_values_referenced(true) called but {} unreferenced values found" ,
172+ unreferenced_mask. iter( ) . filter( |& b| b) . count( )
173+ ) ;
174+ }
152175 }
176+
177+ self . all_values_referenced = all_values_referenced;
178+ self
153179 }
154180
155181 /// Build a new `DictArray` from its components, `codes` and `values`.
@@ -189,7 +215,9 @@ impl DictArray {
189215 vortex_bail ! ( MismatchedTypes : "unsigned int" , codes. dtype( ) ) ;
190216 }
191217
192- Ok ( unsafe { Self :: new_unchecked ( codes, values, all_values_referenced) } )
218+ Ok ( unsafe {
219+ Self :: new_unchecked ( codes, values) . set_all_values_referenced ( all_values_referenced)
220+ } )
193221 }
194222
195223 #[ inline]
@@ -212,24 +240,56 @@ impl DictArray {
212240 self . all_values_referenced
213241 }
214242
243+ /// Validates that the `all_values_referenced` flag matches reality.
244+ ///
245+ /// Returns `Ok(())` if the flag is consistent with the actual referenced values,
246+ /// or an error describing the mismatch.
247+ ///
248+ /// This is primarily useful for testing and debugging.
249+ #[ cfg( debug_assertions) ]
250+ pub fn validate_all_values_referenced ( & self ) -> VortexResult < ( ) > {
251+ let unreferenced_mask = self . compute_unreferenced_values_mask ( false ) ?;
252+ let has_unreferenced = unreferenced_mask. iter ( ) . any ( |b| b) ;
253+ let actual_all_referenced = !has_unreferenced;
254+
255+ if self . all_values_referenced && !actual_all_referenced {
256+ let unreferenced_count = unreferenced_mask. iter ( ) . filter ( |& b| b) . count ( ) ;
257+ vortex_bail ! (
258+ "all_values_referenced=true but {} unreferenced values found" ,
259+ unreferenced_count
260+ ) ;
261+ }
262+
263+ Ok ( ( ) )
264+ }
265+
215266 /// Compute a mask indicating which values in the dictionary are referenced by at least one code.
216267 ///
217- /// Returns a `BitBuffer` where unset bits (false) correspond to values that are referenced
218- /// by at least one valid code, and set bits (true) correspond to unreferenced values.
268+ /// When `referenced = true`, returns a `BitBuffer` where set bits (true) correspond to
269+ /// referenced values, and unset bits (false) correspond to unreferenced values.
270+ ///
271+ /// When `referenced = false` (default for unreferenced values), returns the inverse:
272+ /// set bits (true) correspond to unreferenced values, and unset bits (false) correspond
273+ /// to referenced values.
219274 ///
220275 /// This is useful for operations like min/max that need to ignore unreferenced values.
221- pub fn compute_unreferenced_values_mask ( & self ) -> VortexResult < BitBuffer > {
276+ pub fn compute_unreferenced_values_mask ( & self , referenced : bool ) -> VortexResult < BitBuffer > {
222277 let codes_validity = self . codes ( ) . validity_mask ( ) ;
223278 let codes_primitive = self . codes ( ) . to_primitive ( ) ;
224279 let values_len = self . values ( ) . len ( ) ;
225280
226- let mut unreferenced_vec = vec ! [ true ; values_len] ;
281+ // Initialize with the starting value: false for referenced, true for unreferenced
282+ let init_value = !referenced;
283+ // Value to set when we find a referenced code: true for referenced, false for unreferenced
284+ let referenced_value = referenced;
285+
286+ let mut values_vec = vec ! [ init_value; values_len] ;
227287 match codes_validity. bit_buffer ( ) {
228288 AllOr :: All => {
229289 match_each_integer_ptype ! ( codes_primitive. ptype( ) , |P | {
230290 #[ allow( clippy:: cast_possible_truncation) ]
231291 for & code in codes_primitive. as_slice:: <P >( ) . iter( ) {
232- unreferenced_vec [ code as usize ] = false ;
292+ values_vec [ code as usize ] = referenced_value ;
233293 }
234294 } ) ;
235295 }
@@ -240,15 +300,13 @@ impl DictArray {
240300
241301 #[ allow( clippy:: cast_possible_truncation) ]
242302 buf. set_indices( ) . for_each( |idx| {
243- unreferenced_vec [ codes[ idx] as usize ] = false ;
303+ values_vec [ codes[ idx] as usize ] = referenced_value ;
244304 } )
245305 } ) ;
246306 }
247307 }
248308
249- Ok ( BitBuffer :: collect_bool ( values_len, |idx| {
250- unreferenced_vec[ idx]
251- } ) )
309+ Ok ( BitBuffer :: collect_bool ( values_len, |idx| values_vec[ idx] ) )
252310 }
253311}
254312
0 commit comments