Commit e5709ef

feat: teach ALPArray to store validity only in the encoded array
The patches are now always non-nullable. This required `PrimitiveArray::patch` to gracefully handle non-nullable patches when the array itself is nullable. I modified the benchmarks to include patch manipulation time, but note that the test data has no patches, so the benchmarks effectively measure the overhead of `is_valid`. If we had test data in which the invalid positions contained exceptional values, I would expect a modest improvement in both decompression and compression time.
1 parent a7876ca commit e5709ef
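
As a rough sketch of the invariant this commit establishes (not part of the commit; it only reuses APIs that appear in the diffs below, namely `alp_encode`, `PrimitiveArray::new`, `Validity::from_iter`, `ALPArray::patches`, and `logical_validity`; exact import paths may differ):

// Hedged sketch: validity lives only on the encoded child, and any patches
// hold only valid exceptional values, so the patches can stay non-nullable.
use vortex_alp::alp_encode;
use vortex_array::array::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::buffer;
use vortex_error::VortexResult;

fn check_patch_invariant() -> VortexResult<()> {
    // Position 2 is invalid; whatever float sits there is garbage.
    let values = buffer![1.234f64, 2.5, std::f64::consts::PI, 4.0];
    let array = PrimitiveArray::new(values, Validity::from_iter([true, true, false, true]));

    let encoded = alp_encode(&array)?;

    // Exceptional values at invalid positions are dropped during encoding, so
    // whatever patches remain contain no invalid entries.
    if let Some(patches) = encoded.patches() {
        assert_eq!(patches.values().logical_validity()?.false_count(), 0);
    }
    Ok(())
}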

9 files changed (+215 / -158 lines)


Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

docs/rust/quickstart.rst

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

 >>> cvtx = vortex.compress(vtx)
 >>> cvtx.nbytes
-15755
+15732
 >>> cvtx.nbytes / vtx.nbytes
 0.11...

encodings/alp/Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ readme = { workspace = true }
 workspace = true

 [dependencies]
+arrow-array = { workspace = true }
 itertools = { workspace = true }
 num-traits = { workspace = true }
 serde = { workspace = true, features = ["derive"] }
@@ -30,6 +31,7 @@ vortex-scalar = { workspace = true }

 [dev-dependencies]
 divan = { workspace = true }
+rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["test-harness"] }

encodings/alp/benches/alp_compress.rs

Lines changed: 44 additions & 11 deletions
@@ -1,27 +1,60 @@
 #![allow(clippy::unwrap_used)]

 use divan::Bencher;
-use vortex_alp::{ALPFloat, ALPRDFloat, Exponents, RDEncoder};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng as _};
+use vortex_alp::{alp_encode, ALPFloat, ALPRDFloat, RDEncoder};
 use vortex_array::array::PrimitiveArray;
 use vortex_array::validity::Validity;
 use vortex_array::IntoCanonical;
-use vortex_buffer::{buffer, Buffer};
+use vortex_buffer::buffer;
+use vortex_dtype::NativePType;

 fn main() {
     divan::main();
 }

-#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
-fn compress_alp<T: ALPFloat>(n: usize) -> (Exponents, Buffer<T::ALPInt>, Buffer<u64>, Buffer<T>) {
-    let values: Vec<T> = vec![T::from(1.234).unwrap(); n];
-    T::encode(values.as_slice(), None)
+#[divan::bench(types = [f32, f64], args = [
+    (100_000, 1.0),
+    (10_000_000, 1.0),
+    (100_000, 0.25),
+    (10_000_000, 0.25),
+    (100_000, 0.95),
+    (10_000_000, 0.95),
+])]
+fn compress_alp<T: ALPFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_valid) = args;
+    let mut rng = StdRng::seed_from_u64(0);
+    let values = buffer![T::from(1.234).unwrap(); n];
+    let validity = if fraction_valid < 1.0 {
+        Validity::from_iter((0..values.len()).map(|_| rng.gen_bool(fraction_valid)))
+    } else {
+        Validity::NonNullable
+    };
+    bencher.bench_local(move || {
+        alp_encode(&PrimitiveArray::new(values.clone(), validity.clone())).unwrap()
+    })
 }

-#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
-fn decompress_alp<T: ALPFloat>(bencher: Bencher, n: usize) {
-    let values: Vec<T> = vec![T::from(1.234).unwrap(); n];
-    let (exponents, encoded, ..) = T::encode(values.as_slice(), None);
-    bencher.bench_local(move || T::decode(&encoded, exponents));
+#[divan::bench(types = [f32, f64], args = [
+    (100_000, 1.0),
+    (10_000_000, 1.0),
+    (100_000, 0.25),
+    (10_000_000, 0.25),
+    (100_000, 0.95),
+    (10_000_000, 0.95),
+])]
+fn decompress_alp<T: ALPFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_valid) = args;
+    let mut rng = StdRng::seed_from_u64(0);
+    let values = buffer![T::from(1.234).unwrap(); n];
+    let validity = if fraction_valid < 1.0 {
+        Validity::from_iter((0..values.len()).map(|_| rng.gen_bool(fraction_valid)))
+    } else {
+        Validity::NonNullable
+    };
+    let array = alp_encode(&PrimitiveArray::new(values, validity)).unwrap();
+    bencher.bench_local(move || array.clone().into_canonical().unwrap());
 }

 #[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
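
Assuming the standard Cargo bench-target naming (the target name matching the file name), these divan benchmarks would typically be run with something like `cargo bench --bench alp_compress` from within `encodings/alp`.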

encodings/alp/src/alp/array.rs

Lines changed: 8 additions & 0 deletions
@@ -46,6 +46,14 @@ impl ALPArray {
         let mut children = Vec::with_capacity(2);
         children.push(encoded);
         if let Some(patches) = &patches {
+            if patches.dtype() != &dtype {
+                vortex_bail!(MismatchedTypes: dtype, patches.dtype());
+            }
+
+            if patches.values().logical_validity()?.false_count() != 0 {
+                vortex_bail!("ALPArray: patches must not contain invalid entries");
+            }
+
             children.push(patches.indices().clone());
             children.push(patches.values().clone());
         }

encodings/alp/src/alp/compress.rs

Lines changed: 98 additions & 28 deletions
@@ -1,9 +1,11 @@
 use vortex_array::array::PrimitiveArray;
 use vortex_array::patches::Patches;
+use vortex_array::validity::Validity;
 use vortex_array::variants::PrimitiveArrayTrait;
 use vortex_array::{Array, IntoArray, IntoArrayVariant};
+use vortex_buffer::Buffer;
 use vortex_dtype::{NativePType, PType};
-use vortex_error::{vortex_bail, VortexResult, VortexUnwrap};
+use vortex_error::{vortex_bail, VortexResult};
 use vortex_scalar::ScalarType;

 use crate::alp::{ALPArray, ALPFloat};
@@ -24,39 +26,74 @@ macro_rules! match_each_alp_float_ptype {
     })
 }

-pub fn alp_encode_components<T>(
+pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult<ALPArray> {
+    let (exponents, encoded, patches) = alp_encode_components(parray)?;
+    ALPArray::try_new(encoded, exponents, patches)
+}
+
+pub fn alp_encode_components(
+    parray: &PrimitiveArray,
+) -> VortexResult<(Exponents, Array, Option<Patches>)> {
+    match parray.ptype() {
+        PType::F32 => alp_encode_components_typed::<f32>(parray),
+        PType::F64 => alp_encode_components_typed::<f64>(parray),
+        _ => vortex_bail!("ALP can only encode f32 and f64"),
+    }
+}
+
+#[allow(clippy::cast_possible_truncation)]
+fn alp_encode_components_typed<T>(
     values: &PrimitiveArray,
-    exponents: Option<Exponents>,
-) -> (Exponents, Array, Option<Patches>)
+) -> VortexResult<(Exponents, Array, Option<Patches>)>
 where
     T: ALPFloat + NativePType,
     T::ALPInt: NativePType,
     T: ScalarType,
 {
-    let (exponents, encoded, exc_pos, exc) = T::encode(values.as_slice::<T>(), exponents);
-    let len = encoded.len();
-    (
-        exponents,
-        PrimitiveArray::new(encoded, values.validity()).into_array(),
-        (!exc.is_empty()).then(|| {
-            let position_arr = exc_pos.into_array();
-            let patch_validity = values.validity().take(&position_arr).vortex_unwrap();
-            Patches::new(
-                len,
-                position_arr,
-                PrimitiveArray::new(exc, patch_validity).into_array(),
-            )
-        }),
-    )
-}
+    let values_slice = values.as_slice::<T>();

-pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult<ALPArray> {
-    let (exponents, encoded, patches) = match parray.ptype() {
-        PType::F32 => alp_encode_components::<f32>(parray, None),
-        PType::F64 => alp_encode_components::<f64>(parray, None),
-        _ => vortex_bail!("ALP can only encode f32 and f64"),
+    let exponents = T::find_best_exponents(values_slice);
+    let (encoded, exceptional_positions) = T::encode_chunkwise(values.as_slice::<T>(), exponents);
+
+    let encoded_array = PrimitiveArray::new(encoded, values.validity()).into_array();
+
+    let validity = values.logical_validity()?;
+    let n_valid = validity.true_count();
+    // exceptional_positions may contain exceptions at invalid positions (which contain garbage
+    // data). We remove invalid exceptional positions in order to keep the Patches small.
+    let valid_exceptional_positions = if n_valid == 0 {
+        Buffer::empty()
+    } else if n_valid == values.len() {
+        exceptional_positions
+    } else {
+        exceptional_positions
+            .into_iter()
+            // index is a valid usize because it is an index into values.as_slice::<T>()
+            .filter(|index| validity.value(*index as usize))
+            .collect()
     };
-    ALPArray::try_new(encoded, exponents, patches)
+
+    let patches = if valid_exceptional_positions.is_empty() {
+        None
+    } else {
+        let patches_validity = if values.dtype().is_nullable() {
+            Validity::AllValid
+        } else {
+            Validity::NonNullable
+        };
+        let exceptional_values: Buffer<T> = valid_exceptional_positions
+            .iter()
+            .map(|index| values_slice[*index as usize])
+            .collect();
+        let exceptional_values =
+            PrimitiveArray::new(exceptional_values, patches_validity).into_array();
+        Some(Patches::new(
+            values_slice.len(),
+            valid_exceptional_positions.into_array(),
+            exceptional_values,
+        ))
+    };
+    Ok((exponents, encoded_array, patches))
 }

 pub fn decompress(array: ALPArray) -> VortexResult<PrimitiveArray> {
@@ -140,14 +177,47 @@ mod tests {
                 .into_primitive()
                 .unwrap()
                 .as_slice::<i64>(),
-            vec![1234i64, 2718, 1234, 4000] // fill forward
+            vec![1234i64, 2718, 3142, 4000]
         );
         assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });

         let decoded = decompress(encoded).unwrap();
         assert_eq!(values.as_slice(), decoded.as_slice::<f64>());
     }

+    #[test]
+    #[allow(clippy::approx_constant)] // ALP doesn't like E
+    fn test_compress_ignores_invalid_exceptional_values() {
+        let values = buffer![1.234f64, 2.718, f64::consts::PI, 4.0];
+        let array = PrimitiveArray::new(values, Validity::from_iter([true, true, false, true]));
+        let encoded = alp_encode(&array).unwrap();
+        assert!(encoded.patches().is_none());
+        assert_eq!(
+            encoded
+                .encoded()
+                .into_primitive()
+                .unwrap()
+                .as_slice::<i64>(),
+            vec![1234i64, 2718, 3142, 4000]
+        );
+        assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });
+
+        let decoded = decompress(encoded).unwrap();
+        assert_eq!(
+            scalar_at(&decoded, 0).unwrap(),
+            scalar_at(&array, 0).unwrap()
+        );
+        assert_eq!(
+            scalar_at(&decoded, 1).unwrap(),
+            scalar_at(&array, 1).unwrap()
+        );
+        assert!(!decoded.is_valid(2).unwrap());
+        assert_eq!(
+            scalar_at(&decoded, 3).unwrap(),
+            scalar_at(&array, 3).unwrap()
+        );
+    }
+
     #[test]
     #[allow(clippy::approx_constant)] // ALP doesn't like E
     fn test_nullable_patched_scalar_at() {
@@ -168,6 +238,7 @@
             assert!(s.is_valid());
         }

+        assert!(!encoded.is_valid(4).unwrap());
         let s = scalar_at(encoded.as_ref(), 4).unwrap();
         assert!(s.is_null());

@@ -190,7 +261,6 @@
         );
         let alp_arr = alp_encode(&original).unwrap();
         let decompressed = alp_arr.into_primitive().unwrap();
-        assert_eq!(original.as_slice::<f64>(), decompressed.as_slice::<f64>());
         assert_eq!(original.validity(), decompressed.validity());
     }
 }
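
The core of the new encoding path above is the validity-aware filtering of exceptional positions. A standalone restatement of just that step, using plain `Vec`/`bool` slices in place of the vortex `Buffer` and validity types (a sketch, not the crate's code):

// Exceptional positions that fall on invalid rows are dropped so the Patches
// stay small; the all-valid and none-valid cases short-circuit the filter.
fn filter_exceptional_positions(exceptional_positions: Vec<u64>, validity: &[bool]) -> Vec<u64> {
    let n_valid = validity.iter().filter(|v| **v).count();
    if n_valid == 0 {
        // Nothing is valid, so no patch is worth keeping.
        Vec::new()
    } else if n_valid == validity.len() {
        // Everything is valid; keep all exceptional positions as-is.
        exceptional_positions
    } else {
        exceptional_positions
            .into_iter()
            .filter(|&index| validity[index as usize])
            .collect()
    }
}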

encodings/alp/src/alp/compute/mod.rs

Lines changed: 5 additions & 1 deletion
@@ -36,9 +36,13 @@ impl ComputeVTable for ALPEncoding {

 impl ScalarAtFn<ALPArray> for ALPEncoding {
     fn scalar_at(&self, array: &ALPArray, index: usize) -> VortexResult<Scalar> {
+        if !array.encoded().is_valid(index)? {
+            return Ok(Scalar::null(array.dtype().clone()));
+        }
+
         if let Some(patches) = array.patches() {
             if let Some(patch) = patches.get_patched(index)? {
-                return Ok(patch);
+                return patch.cast(array.dtype());
             }
         }
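
The lookup order that `scalar_at` now follows can be summarized by the hypothetical helper below (a sketch with plain Rust types, not the crate's API): the validity of the encoded child is consulted first, then patches, then the decoded ALP value.

// Hypothetical helper, for illustration only: invalid positions are null no
// matter what the children hold; otherwise a patch (already cast to the
// array's dtype) wins over the value decoded from the encoded integers.
fn scalar_at_order<T>(is_valid: bool, patched: Option<T>, decoded: T) -> Option<T> {
    if !is_valid {
        return None;
    }
    Some(patched.unwrap_or(decoded))
}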
