@@ -32,6 +32,11 @@ pub struct DictMetadata {
3232 // nullable codes are optional since they were added after stabilisation
3333 #[ prost( optional, bool , tag = "3" ) ]
3434 pub ( super ) is_nullable_codes : Option < bool > ,
35+ // all_values_referenced is optional for backward compatibility
36+ // true = all dictionary values are definitely referenced by at least one code
37+ // false/None = unknown whether all values are referenced (conservative default)
38+ #[ prost( optional, bool , tag = "4" ) ]
39+ pub ( super ) all_values_referenced : Option < bool > ,
3540}
3641
3742impl VTable for DictVTable {
@@ -66,6 +71,7 @@ impl VTable for DictVTable {
6671 )
6772 } ) ?,
6873 is_nullable_codes : Some ( array. codes ( ) . dtype ( ) . is_nullable ( ) ) ,
74+ all_values_referenced : Some ( array. all_values_referenced ) ,
6975 } ) )
7076 }
7177
@@ -101,8 +107,9 @@ impl VTable for DictVTable {
101107 let codes_dtype = DType :: Primitive ( metadata. codes_ptype ( ) , codes_nullable) ;
102108 let codes = children. get ( 0 , & codes_dtype, len) ?;
103109 let values = children. get ( 1 , dtype, metadata. values_len as usize ) ?;
110+ let all_values_referenced = metadata. all_values_referenced . unwrap_or ( false ) ;
104111
105- DictArray :: try_new ( codes, values)
112+ DictArray :: try_new_with_metadata ( codes, values, all_values_referenced )
106113 }
107114}
108115
@@ -112,6 +119,10 @@ pub struct DictArray {
112119 values : ArrayRef ,
113120 stats_set : ArrayStats ,
114121 dtype : DType ,
122+ /// Indicates whether all dictionary values are definitely referenced by at least one code.
123+ /// `true` = all values are referenced (computed during encoding).
124+ /// `false` = unknown/might have unreferenced values (conservative default).
125+ all_values_referenced : bool ,
115126}
116127
117128#[ derive( Clone , Debug ) ]
@@ -124,7 +135,11 @@ impl DictArray {
124135 /// This should be called only when you can guarantee the invariants checked
125136 /// by the safe [`DictArray::try_new`] constructor are valid, for example when
126137 /// you are filtering or slicing an existing valid `DictArray`.
127- pub unsafe fn new_unchecked ( codes : ArrayRef , values : ArrayRef ) -> Self {
138+ pub unsafe fn new_unchecked (
139+ codes : ArrayRef ,
140+ values : ArrayRef ,
141+ all_values_referenced : bool ,
142+ ) -> Self {
128143 let dtype = values
129144 . dtype ( )
130145 . union_nullability ( codes. dtype ( ) . nullability ( ) ) ;
@@ -133,6 +148,7 @@ impl DictArray {
133148 values,
134149 stats_set : Default :: default ( ) ,
135150 dtype,
151+ all_values_referenced,
136152 }
137153 }
138154
@@ -156,11 +172,24 @@ impl DictArray {
156172 ///
157173 /// It is an error to provide a nullable `codes` with non-nullable `values`.
158174 pub fn try_new ( codes : ArrayRef , values : ArrayRef ) -> VortexResult < Self > {
175+ Self :: try_new_with_metadata ( codes, values, false ) // `false`: not known whether every value is referenced, so use the conservative default
176+ }
177+
178+ /// Build a new `DictArray` from its components with an explicit `all_values_referenced` flag.
179+ ///
180+ /// Same as [`DictArray::try_new`] (which passes `false`), but lets the caller state whether all values are referenced.
181+ /// Pass `true` only when it is known for certain that every dictionary value is referenced by at least
182+ /// one code (typically during dictionary encoding); the flag is forwarded as given and is not checked here.
183+ pub fn try_new_with_metadata (
184+ codes : ArrayRef ,
185+ values : ArrayRef ,
186+ all_values_referenced : bool ,
187+ ) -> VortexResult < Self > {
159188 if !codes. dtype ( ) . is_unsigned_int ( ) { // codes must have an unsigned integer dtype; anything else bails with `MismatchedTypes`
160189 vortex_bail ! ( MismatchedTypes : "unsigned int" , codes. dtype( ) ) ;
161190 }
162191
163- Ok ( unsafe { Self :: new_unchecked ( codes, values) } )
192+ Ok ( unsafe { Self :: new_unchecked ( codes, values, all_values_referenced ) } ) // NOTE(review): only the codes dtype is checked before this unchecked call; confirm that covers the contract of `new_unchecked`
164193 }
165194
166195 #[ inline]
@@ -173,6 +202,16 @@ impl DictArray {
173202 & self . values
174203 }
175204
205+ /// Returns `true` if every dictionary value is known to be referenced by at least one code.
206+ ///
207+ /// When `true`, operations like min/max can safely operate on all values without needing to
208+ /// compute which values are actually referenced. `false` does not mean that some value is
209+ /// unreferenced, only that this is unknown (the conservative default).
210+ #[ inline]
211+ pub fn has_all_values_referenced ( & self ) -> bool {
212+ self . all_values_referenced // plain read of the stored flag; nothing is computed from the codes here
213+ }
214+
176215 /// Compute a mask indicating which values in the dictionary are referenced by at least one code.
177216 ///
178217 /// Returns a `BitBuffer` where unset bits (false) correspond to values that are referenced
@@ -467,6 +506,7 @@ mod test {
467506 codes_ptype : PType :: U64 as i32 ,
468507 values_len : u32:: MAX ,
469508 is_nullable_codes : None ,
509+ all_values_referenced : None ,
470510 } ) ,
471511 ) ;
472512 }
0 commit comments