Skip to content

Commit 962d840

Browse files
committed
perf[dict]: all_values_referenced to allow pushdown optimisation.
Signed-off-by: Joe Isaacs <[email protected]>
1 parent 4968607 commit 962d840

File tree

18 files changed

+149
-21
lines changed

18 files changed

+149
-21
lines changed

vortex-array/src/arrays/dict/array.rs

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ pub struct DictMetadata {
3232
// nullable codes are optional since they were added after stabilisation
3333
#[prost(optional, bool, tag = "3")]
3434
pub(super) is_nullable_codes: Option<bool>,
35+
// all_values_referenced is optional for backward compatibility
36+
// true = all dictionary values are definitely referenced by at least one code
37+
// false/None = unknown whether all values are referenced (conservative default)
38+
#[prost(optional, bool, tag = "4")]
39+
pub(super) all_values_referenced: Option<bool>,
3540
}
3641

3742
impl VTable for DictVTable {
@@ -66,6 +71,7 @@ impl VTable for DictVTable {
6671
)
6772
})?,
6873
is_nullable_codes: Some(array.codes().dtype().is_nullable()),
74+
all_values_referenced: Some(array.all_values_referenced),
6975
}))
7076
}
7177

@@ -101,8 +107,9 @@ impl VTable for DictVTable {
101107
let codes_dtype = DType::Primitive(metadata.codes_ptype(), codes_nullable);
102108
let codes = children.get(0, &codes_dtype, len)?;
103109
let values = children.get(1, dtype, metadata.values_len as usize)?;
110+
let all_values_referenced = metadata.all_values_referenced.unwrap_or(false);
104111

105-
DictArray::try_new(codes, values)
112+
DictArray::try_new_with_metadata(codes, values, all_values_referenced)
106113
}
107114
}
108115

@@ -112,6 +119,10 @@ pub struct DictArray {
112119
values: ArrayRef,
113120
stats_set: ArrayStats,
114121
dtype: DType,
122+
/// Indicates whether all dictionary values are definitely referenced by at least one code.
123+
/// `true` = all values are referenced (computed during encoding).
124+
/// `false` = unknown/might have unreferenced values (conservative default).
125+
all_values_referenced: bool,
115126
}
116127

117128
#[derive(Clone, Debug)]
@@ -124,7 +135,11 @@ impl DictArray {
124135
/// This should be called only when you can guarantee the invariants checked
125136
/// by the safe [`DictArray::try_new`] constructor are valid, for example when
126137
/// you are filtering or slicing an existing valid `DictArray`.
127-
pub unsafe fn new_unchecked(codes: ArrayRef, values: ArrayRef) -> Self {
138+
pub unsafe fn new_unchecked(
139+
codes: ArrayRef,
140+
values: ArrayRef,
141+
all_values_referenced: bool,
142+
) -> Self {
128143
let dtype = values
129144
.dtype()
130145
.union_nullability(codes.dtype().nullability());
@@ -133,6 +148,7 @@ impl DictArray {
133148
values,
134149
stats_set: Default::default(),
135150
dtype,
151+
all_values_referenced,
136152
}
137153
}
138154

@@ -156,11 +172,24 @@ impl DictArray {
156172
///
157173
/// It is an error to provide a nullable `codes` with non-nullable `values`.
158174
pub fn try_new(codes: ArrayRef, values: ArrayRef) -> VortexResult<Self> {
175+
Self::try_new_with_metadata(codes, values, false)
176+
}
177+
178+
/// Build a new `DictArray` from its components with explicit metadata.
179+
///
180+
/// Same as [`DictArray::try_new`] but allows specifying whether all values are referenced.
181+
/// This is typically only set to `true` during dictionary encoding when we know for certain
182+
/// that all dictionary values are referenced by at least one code.
183+
pub fn try_new_with_metadata(
184+
codes: ArrayRef,
185+
values: ArrayRef,
186+
all_values_referenced: bool,
187+
) -> VortexResult<Self> {
159188
if !codes.dtype().is_unsigned_int() {
160189
vortex_bail!(MismatchedTypes: "unsigned int", codes.dtype());
161190
}
162191

163-
Ok(unsafe { Self::new_unchecked(codes, values) })
192+
Ok(unsafe { Self::new_unchecked(codes, values, all_values_referenced) })
164193
}
165194

166195
#[inline]
@@ -173,6 +202,16 @@ impl DictArray {
173202
&self.values
174203
}
175204

205+
/// Returns `true` if all dictionary values are definitely referenced by at least one code.
206+
///
207+
/// When `true`, operations like min/max can safely operate on all values without needing to
208+
/// compute which values are actually referenced. When `false`, it is unknown whether all
209+
/// values are referenced (conservative default).
210+
#[inline]
211+
pub fn has_all_values_referenced(&self) -> bool {
212+
self.all_values_referenced
213+
}
214+
176215
/// Compute a mask indicating which values in the dictionary are referenced by at least one code.
177216
///
178217
/// Returns a `BitBuffer` where unset bits (false) correspond to values that are referenced
@@ -467,6 +506,7 @@ mod test {
467506
codes_ptype: PType::U64 as i32,
468507
values_len: u32::MAX,
469508
is_nullable_codes: None,
509+
all_values_referenced: None,
470510
}),
471511
);
472512
}

vortex-array/src/arrays/dict/arrow.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ impl<K: ArrowDictionaryKeyType> FromArrowArray<&DictionaryArray<K>> for DictArra
1414
let keys = ArrayRef::from_arrow(keys, keys.is_nullable());
1515
let values = ArrayRef::from_arrow(array.values().as_ref(), nullable);
1616
// SAFETY: we assume that Arrow has checked the invariants on construction
17-
unsafe { DictArray::new_unchecked(keys, values) }
17+
// We conservatively set all_values_referenced to false since Arrow doesn't provide this info
18+
unsafe { DictArray::new_unchecked(keys, values, false) }
1819
}
1920
}

vortex-array/src/arrays/dict/compute/cast.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,14 @@ impl CastKernel for DictVTable {
2525

2626
// SAFETY: casting does not alter invariants of the codes
2727
Ok(Some(
28-
unsafe { DictArray::new_unchecked(casted_codes, casted_values) }.into_array(),
28+
unsafe {
29+
DictArray::new_unchecked(
30+
casted_codes,
31+
casted_values,
32+
array.has_all_values_referenced(),
33+
)
34+
}
35+
.into_array(),
2936
))
3037
}
3138
}

vortex-array/src/arrays/dict/compute/compare.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,12 @@ impl CompareKernel for DictVTable {
3030

3131
// SAFETY: values len preserved, codes all still point to valid values
3232
let result = unsafe {
33-
DictArray::new_unchecked(lhs.codes().clone(), compare_result).into_array()
33+
DictArray::new_unchecked(
34+
lhs.codes().clone(),
35+
compare_result,
36+
lhs.has_all_values_referenced(),
37+
)
38+
.into_array()
3439
};
3540

3641
// We canonicalize the result because dictionary-encoded bools is dumb.

vortex-array/src/arrays/dict/compute/fill_null.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,12 @@ impl FillNullKernel for DictVTable {
4242
let values = fill_null(array.values(), fill_value)?;
4343

4444
// SAFETY: invariants are still satisfied after patching nulls
45-
unsafe { Ok(DictArray::new_unchecked(codes, values).into_array()) }
45+
unsafe {
46+
Ok(
47+
DictArray::new_unchecked(codes, values, array.has_all_values_referenced())
48+
.into_array(),
49+
)
50+
}
4651
}
4752
}
4853

vortex-array/src/arrays/dict/compute/like.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,15 @@ impl LikeKernel for DictVTable {
2525

2626
// SAFETY: LIKE preserves the len of the values, so codes are still pointing at
2727
// valid positions.
28+
// Preserve all_values_referenced since codes are unchanged
2829
unsafe {
2930
Ok(Some(
30-
DictArray::new_unchecked(array.codes().clone(), values).into_array(),
31+
DictArray::new_unchecked(
32+
array.codes().clone(),
33+
values,
34+
array.has_all_values_referenced(),
35+
)
36+
.into_array(),
3137
))
3238
}
3339
} else {

vortex-array/src/arrays/dict/compute/min_max.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ impl MinMaxKernel for DictVTable {
1515
return Ok(None);
1616
}
1717

18+
// Fast path: if all values are referenced, directly compute min/max on values
19+
if array.has_all_values_referenced() {
20+
return min_max(array.values());
21+
}
22+
23+
// Slow path: compute which values are unreferenced and mask them out
1824
let unreferenced_mask = Mask::from_buffer(array.compute_unreferenced_values_mask()?);
1925
min_max(&mask(array.values(), &unreferenced_mask)?)
2026
}

vortex-array/src/arrays/dict/compute/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ impl TakeKernel for DictVTable {
2222
fn take(&self, array: &DictArray, indices: &dyn Array) -> VortexResult<ArrayRef> {
2323
let codes = take(array.codes(), indices)?;
2424
// SAFETY: selecting codes doesn't change the invariants of DictArray
25-
Ok(unsafe { DictArray::new_unchecked(codes, array.values().clone()) }.into_array())
25+
// We conservatively set all_values_referenced to false since taking may leave values unreferenced
26+
Ok(unsafe { DictArray::new_unchecked(codes, array.values().clone(), false) }.into_array())
2627
}
2728
}
2829

@@ -33,7 +34,8 @@ impl FilterKernel for DictVTable {
3334
let codes = filter(array.codes(), mask)?;
3435

3536
// SAFETY: filtering codes doesn't change invariants
36-
unsafe { Ok(DictArray::new_unchecked(codes, array.values().clone()).into_array()) }
37+
// We conservatively set all_values_referenced to false since filtering may leave values unreferenced
38+
unsafe { Ok(DictArray::new_unchecked(codes, array.values().clone(), false).into_array()) }
3739
}
3840
}
3941

vortex-array/src/arrays/dict/ops.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ impl OperationsVTable<DictVTable> for DictVTable {
2424
};
2525
}
2626
// SAFETY: slicing the codes preserves invariants
27-
unsafe { DictArray::new_unchecked(sliced_code, array.values().clone()).into_array() }
27+
unsafe { DictArray::new_unchecked(sliced_code, array.values().clone(), false).into_array() }
2828
}
2929

3030
fn scalar_at(array: &DictArray, index: usize) -> Scalar {

vortex-array/src/builders/dict/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ pub fn dict_encode_with_constraints(
5757
Ok(DictArray::new_unchecked(
5858
codes.into_array(),
5959
encoder.reset(),
60+
// All values in the dictionary are guaranteed to be referenced by at least one code
61+
// since we build the dictionary from the codes we observe during encoding
62+
true,
6063
))
6164
}
6265
}

0 commit comments

Comments
 (0)