Commit e5709ef

feat: teach ALPArray to store validity only in the encoded array
The patches are now always non-nullable. This required `PrimitiveArray::patch` to gracefully handle non-nullable patches when the array itself is nullable. I modified the benchmarks to include patch manipulation time, but note that the test data has no patches, so the benchmarks effectively measure the overhead of `is_valid`. If we had test data in which the invalid positions contained exceptional values, I would expect a modest improvement in both decompression and compression time.
1 parent a7876ca commit e5709ef
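
As a rough sketch of the invariant this commit establishes (not part of the commit; it only reuses APIs that appear in the diffs below, namely `alp_encode`, `PrimitiveArray::new`, `Validity::from_iter`, `ALPArray::patches`, and `logical_validity`; exact import paths may differ):

// Hedged sketch: validity lives only on the encoded child, and any patches
// hold only valid exceptional values, so the patches can stay non-nullable.
use vortex_alp::alp_encode;
use vortex_array::array::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::buffer;
use vortex_error::VortexResult;

fn check_patch_invariant() -> VortexResult<()> {
    // Position 2 is invalid; whatever float sits there is garbage.
    let values = buffer![1.234f64, 2.5, std::f64::consts::PI, 4.0];
    let array = PrimitiveArray::new(values, Validity::from_iter([true, true, false, true]));

    let encoded = alp_encode(&array)?;

    // Exceptional values at invalid positions are dropped during encoding, so
    // whatever patches remain contain no invalid entries.
    if let Some(patches) = encoded.patches() {
        assert_eq!(patches.values().logical_validity()?.false_count(), 0);
    }
    Ok(())
}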

9 files changed (+215 / -158 lines)


Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

docs/rust/quickstart.rst

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the

 >>> cvtx = vortex.compress(vtx)
 >>> cvtx.nbytes
-15755
+15732
 >>> cvtx.nbytes / vtx.nbytes
 0.11...

encodings/alp/Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ readme = { workspace = true }
 workspace = true

 [dependencies]
+arrow-array = { workspace = true }
 itertools = { workspace = true }
 num-traits = { workspace = true }
 serde = { workspace = true, features = ["derive"] }
@@ -30,6 +31,7 @@ vortex-scalar = { workspace = true }

 [dev-dependencies]
 divan = { workspace = true }
+rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["test-harness"] }

encodings/alp/benches/alp_compress.rs

Lines changed: 44 additions & 11 deletions
@@ -1,27 +1,60 @@
 #![allow(clippy::unwrap_used)]

 use divan::Bencher;
-use vortex_alp::{ALPFloat, ALPRDFloat, Exponents, RDEncoder};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng as _};
+use vortex_alp::{alp_encode, ALPFloat, ALPRDFloat, RDEncoder};
 use vortex_array::array::PrimitiveArray;
 use vortex_array::validity::Validity;
 use vortex_array::IntoCanonical;
-use vortex_buffer::{buffer, Buffer};
+use vortex_buffer::buffer;
+use vortex_dtype::NativePType;

 fn main() {
     divan::main();
 }

-#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
-fn compress_alp<T: ALPFloat>(n: usize) -> (Exponents, Buffer<T::ALPInt>, Buffer<u64>, Buffer<T>) {
-    let values: Vec<T> = vec![T::from(1.234).unwrap(); n];
-    T::encode(values.as_slice(), None)
+#[divan::bench(types = [f32, f64], args = [
+    (100_000, 1.0),
+    (10_000_000, 1.0),
+    (100_000, 0.25),
+    (10_000_000, 0.25),
+    (100_000, 0.95),
+    (10_000_000, 0.95),
+])]
+fn compress_alp<T: ALPFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_valid) = args;
+    let mut rng = StdRng::seed_from_u64(0);
+    let values = buffer![T::from(1.234).unwrap(); n];
+    let validity = if fraction_valid < 1.0 {
+        Validity::from_iter((0..values.len()).map(|_| rng.gen_bool(fraction_valid)))
+    } else {
+        Validity::NonNullable
+    };
+    bencher.bench_local(move || {
+        alp_encode(&PrimitiveArray::new(values.clone(), validity.clone())).unwrap()
+    })
 }

-#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
-fn decompress_alp<T: ALPFloat>(bencher: Bencher, n: usize) {
-    let values: Vec<T> = vec![T::from(1.234).unwrap(); n];
-    let (exponents, encoded, ..) = T::encode(values.as_slice(), None);
-    bencher.bench_local(move || T::decode(&encoded, exponents));
+#[divan::bench(types = [f32, f64], args = [
+    (100_000, 1.0),
+    (10_000_000, 1.0),
+    (100_000, 0.25),
+    (10_000_000, 0.25),
+    (100_000, 0.95),
+    (10_000_000, 0.95),
+])]
+fn decompress_alp<T: ALPFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_valid) = args;
+    let mut rng = StdRng::seed_from_u64(0);
+    let values = buffer![T::from(1.234).unwrap(); n];
+    let validity = if fraction_valid < 1.0 {
+        Validity::from_iter((0..values.len()).map(|_| rng.gen_bool(fraction_valid)))
+    } else {
+        Validity::NonNullable
+    };
+    let array = alp_encode(&PrimitiveArray::new(values, validity)).unwrap();
+    bencher.bench_local(move || array.clone().into_canonical().unwrap());
 }

 #[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
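
Assuming the standard Cargo bench-target naming (the target name matching the file name), these divan benchmarks would typically be run with something like `cargo bench --bench alp_compress` from within `encodings/alp`.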

encodings/alp/src/alp/array.rs

Lines changed: 8 additions & 0 deletions
@@ -46,6 +46,14 @@ impl ALPArray {
         let mut children = Vec::with_capacity(2);
         children.push(encoded);
         if let Some(patches) = &patches {
+            if patches.dtype() != &dtype {
+                vortex_bail!(MismatchedTypes: dtype, patches.dtype());
+            }
+
+            if patches.values().logical_validity()?.false_count() != 0 {
+                vortex_bail!("ALPArray: patches must not contain invalid entries");
+            }
+
             children.push(patches.indices().clone());
             children.push(patches.values().clone());
         }

encodings/alp/src/alp/compress.rs

Lines changed: 98 additions & 28 deletions
@@ -1,9 +1,11 @@
 use vortex_array::array::PrimitiveArray;
 use vortex_array::patches::Patches;
+use vortex_array::validity::Validity;
 use vortex_array::variants::PrimitiveArrayTrait;
 use vortex_array::{Array, IntoArray, IntoArrayVariant};
+use vortex_buffer::Buffer;
 use vortex_dtype::{NativePType, PType};
-use vortex_error::{vortex_bail, VortexResult, VortexUnwrap};
+use vortex_error::{vortex_bail, VortexResult};
 use vortex_scalar::ScalarType;

 use crate::alp::{ALPArray, ALPFloat};
@@ -24,39 +26,74 @@ macro_rules! match_each_alp_float_ptype {
     })
 }

-pub fn alp_encode_components<T>(
+pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult<ALPArray> {
+    let (exponents, encoded, patches) = alp_encode_components(parray)?;
+    ALPArray::try_new(encoded, exponents, patches)
+}
+
+pub fn alp_encode_components(
+    parray: &PrimitiveArray,
+) -> VortexResult<(Exponents, Array, Option<Patches>)> {
+    match parray.ptype() {
+        PType::F32 => alp_encode_components_typed::<f32>(parray),
+        PType::F64 => alp_encode_components_typed::<f64>(parray),
+        _ => vortex_bail!("ALP can only encode f32 and f64"),
+    }
+}
+
+#[allow(clippy::cast_possible_truncation)]
+fn alp_encode_components_typed<T>(
     values: &PrimitiveArray,
-    exponents: Option<Exponents>,
-) -> (Exponents, Array, Option<Patches>)
+) -> VortexResult<(Exponents, Array, Option<Patches>)>
 where
     T: ALPFloat + NativePType,
     T::ALPInt: NativePType,
     T: ScalarType,
 {
-    let (exponents, encoded, exc_pos, exc) = T::encode(values.as_slice::<T>(), exponents);
-    let len = encoded.len();
-    (
-        exponents,
-        PrimitiveArray::new(encoded, values.validity()).into_array(),
-        (!exc.is_empty()).then(|| {
-            let position_arr = exc_pos.into_array();
-            let patch_validity = values.validity().take(&position_arr).vortex_unwrap();
-            Patches::new(
-                len,
-                position_arr,
-                PrimitiveArray::new(exc, patch_validity).into_array(),
-            )
-        }),
-    )
-}
+    let values_slice = values.as_slice::<T>();

-pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult<ALPArray> {
-    let (exponents, encoded, patches) = match parray.ptype() {
-        PType::F32 => alp_encode_components::<f32>(parray, None),
-        PType::F64 => alp_encode_components::<f64>(parray, None),
-        _ => vortex_bail!("ALP can only encode f32 and f64"),
+    let exponents = T::find_best_exponents(values_slice);
+    let (encoded, exceptional_positions) = T::encode_chunkwise(values.as_slice::<T>(), exponents);
+
+    let encoded_array = PrimitiveArray::new(encoded, values.validity()).into_array();
+
+    let validity = values.logical_validity()?;
+    let n_valid = validity.true_count();
+    // exceptional_positions may contain exceptions at invalid positions (which contain garbage
+    // data). We remove invalid exceptional positions in order to keep the Patches small.
+    let valid_exceptional_positions = if n_valid == 0 {
+        Buffer::empty()
+    } else if n_valid == values.len() {
+        exceptional_positions
+    } else {
+        exceptional_positions
+            .into_iter()
+            // index is a valid usize because it is an index into values.as_slice::<T>()
+            .filter(|index| validity.value(*index as usize))
+            .collect()
     };
-    ALPArray::try_new(encoded, exponents, patches)
+
+    let patches = if valid_exceptional_positions.is_empty() {
+        None
+    } else {
+        let patches_validity = if values.dtype().is_nullable() {
+            Validity::AllValid
+        } else {
+            Validity::NonNullable
+        };
+        let exceptional_values: Buffer<T> = valid_exceptional_positions
+            .iter()
+            .map(|index| values_slice[*index as usize])
+            .collect();
+        let exceptional_values =
+            PrimitiveArray::new(exceptional_values, patches_validity).into_array();
+        Some(Patches::new(
+            values_slice.len(),
+            valid_exceptional_positions.into_array(),
+            exceptional_values,
+        ))
+    };
+    Ok((exponents, encoded_array, patches))
 }

 pub fn decompress(array: ALPArray) -> VortexResult<PrimitiveArray> {
@@ -140,14 +177,47 @@ mod tests {
                 .into_primitive()
                 .unwrap()
                 .as_slice::<i64>(),
-            vec![1234i64, 2718, 1234, 4000] // fill forward
+            vec![1234i64, 2718, 3142, 4000]
         );
         assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });

         let decoded = decompress(encoded).unwrap();
         assert_eq!(values.as_slice(), decoded.as_slice::<f64>());
     }

+    #[test]
+    #[allow(clippy::approx_constant)] // ALP doesn't like E
+    fn test_compress_ignores_invalid_exceptional_values() {
+        let values = buffer![1.234f64, 2.718, f64::consts::PI, 4.0];
+        let array = PrimitiveArray::new(values, Validity::from_iter([true, true, false, true]));
+        let encoded = alp_encode(&array).unwrap();
+        assert!(encoded.patches().is_none());
+        assert_eq!(
+            encoded
+                .encoded()
+                .into_primitive()
+                .unwrap()
+                .as_slice::<i64>(),
+            vec![1234i64, 2718, 3142, 4000]
+        );
+        assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });
+
+        let decoded = decompress(encoded).unwrap();
+        assert_eq!(
+            scalar_at(&decoded, 0).unwrap(),
+            scalar_at(&array, 0).unwrap()
+        );
+        assert_eq!(
+            scalar_at(&decoded, 1).unwrap(),
+            scalar_at(&array, 1).unwrap()
+        );
+        assert!(!decoded.is_valid(2).unwrap());
+        assert_eq!(
+            scalar_at(&decoded, 3).unwrap(),
+            scalar_at(&array, 3).unwrap()
+        );
+    }
+
     #[test]
     #[allow(clippy::approx_constant)] // ALP doesn't like E
     fn test_nullable_patched_scalar_at() {
@@ -168,6 +238,7 @@
             assert!(s.is_valid());
         }

+        assert!(!encoded.is_valid(4).unwrap());
         let s = scalar_at(encoded.as_ref(), 4).unwrap();
         assert!(s.is_null());

@@ -190,7 +261,6 @@
         );
         let alp_arr = alp_encode(&original).unwrap();
         let decompressed = alp_arr.into_primitive().unwrap();
-        assert_eq!(original.as_slice::<f64>(), decompressed.as_slice::<f64>());
         assert_eq!(original.validity(), decompressed.validity());
     }
 }
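
The core of the new encoding path above is the validity-aware filtering of exceptional positions. A standalone restatement of just that step, using plain `Vec`/`bool` slices in place of the vortex `Buffer` and validity types (a sketch, not the crate's code):

// Exceptional positions that fall on invalid rows are dropped so the Patches
// stay small; the all-valid and none-valid cases short-circuit the filter.
fn filter_exceptional_positions(exceptional_positions: Vec<u64>, validity: &[bool]) -> Vec<u64> {
    let n_valid = validity.iter().filter(|v| **v).count();
    if n_valid == 0 {
        // Nothing is valid, so no patch is worth keeping.
        Vec::new()
    } else if n_valid == validity.len() {
        // Everything is valid; keep all exceptional positions as-is.
        exceptional_positions
    } else {
        exceptional_positions
            .into_iter()
            .filter(|&index| validity[index as usize])
            .collect()
    }
}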

encodings/alp/src/alp/compute/mod.rs

Lines changed: 5 additions & 1 deletion
@@ -36,9 +36,13 @@ impl ComputeVTable for ALPEncoding {

 impl ScalarAtFn<ALPArray> for ALPEncoding {
     fn scalar_at(&self, array: &ALPArray, index: usize) -> VortexResult<Scalar> {
+        if !array.encoded().is_valid(index)? {
+            return Ok(Scalar::null(array.dtype().clone()));
+        }
+
         if let Some(patches) = array.patches() {
             if let Some(patch) = patches.get_patched(index)? {
-                return Ok(patch);
+                return patch.cast(array.dtype());
             }
         }
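
The lookup order that `scalar_at` now follows can be summarized by the hypothetical helper below (a sketch with plain Rust types, not the crate's API): the validity of the encoded child is consulted first, then patches, then the decoded ALP value.

// Hypothetical helper, for illustration only: invalid positions are null no
// matter what the children hold; otherwise a patch (already cast to the
// array's dtype) wins over the value decoded from the encoded integers.
fn scalar_at_order<T>(is_valid: bool, patched: Option<T>, decoded: T) -> Option<T> {
    if !is_valid {
        return None;
    }
    Some(patched.unwrap_or(decoded))
}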
