vortex-data · danking · Jan 14, 2025 · Jan 15, 2025 · Jan 15, 2025 · Jan 16, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -46,7 +46,7 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the
 
    >>> cvtx = vortex.compress(vtx)
    >>> cvtx.nbytes
-   16604
+   16596
    >>> cvtx.nbytes / vtx.nbytes
    0.11...
 

diff --git a/encodings/alp/Cargo.toml b/encodings/alp/Cargo.toml
@@ -29,6 +29,7 @@ vortex-scalar = { workspace = true }
 
 [dev-dependencies]
 divan = { workspace = true }
+rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { workspace = true, features = ["test-harness"] }
 

diff --git a/encodings/alp/benches/alp_compress.rs b/encodings/alp/benches/alp_compress.rs
@@ -1,27 +1,60 @@
 #![allow(clippy::unwrap_used)]
 
 use divan::Bencher;
-use vortex_alp::{ALPFloat, ALPRDFloat, Exponents, RDEncoder};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng as _};
+use vortex_alp::{alp_encode, ALPFloat, ALPRDFloat, RDEncoder};
 use vortex_array::array::PrimitiveArray;
 use vortex_array::validity::Validity;
 use vortex_array::IntoCanonical;
-use vortex_buffer::{buffer, Buffer};
+use vortex_buffer::buffer;
+use vortex_dtype::NativePType;
 
 fn main() {
     divan::main();
 }
 
-#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
-fn compress_alp<T: ALPFloat>(n: usize) -> (Exponents, Buffer<T::ALPInt>, Buffer<u64>, Buffer<T>) {
-    let values: Vec<T> = vec![T::from(1.234).unwrap(); n];
-    T::encode(values.as_slice(), None)
+#[divan::bench(types = [f32, f64], args = [
+    (100_000, 1.0),
+    (10_000_000, 1.0),
+    (100_000, 0.25),
+    (10_000_000, 0.25),
+    (100_000, 0.95),
+    (10_000_000, 0.95),
+])]
+fn compress_alp<T: ALPFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_valid) = args;
+    let mut rng = StdRng::seed_from_u64(0);
+    let values = buffer![T::from(1.234).unwrap(); n];
+    let validity = if fraction_valid < 1.0 {
+        Validity::from_iter((0..values.len()).map(|_| rng.gen_bool(fraction_valid)))
+    } else {
+        Validity::NonNullable
+    };
+    bencher.bench_local(move || {
+        alp_encode(&PrimitiveArray::new(values.clone(), validity.clone())).unwrap()
+    })
 }
 
-#[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]
-fn decompress_alp<T: ALPFloat>(bencher: Bencher, n: usize) {
-    let values: Vec<T> = vec![T::from(1.234).unwrap(); n];
-    let (exponents, encoded, ..) = T::encode(values.as_slice(), None);
-    bencher.bench_local(move || T::decode(&encoded, exponents));
+#[divan::bench(types = [f32, f64], args = [
+    (100_000, 1.0),
+    (10_000_000, 1.0),
+    (100_000, 0.25),
+    (10_000_000, 0.25),
+    (100_000, 0.95),
+    (10_000_000, 0.95),
+])]
+fn decompress_alp<T: ALPFloat + NativePType>(bencher: Bencher, args: (usize, f64)) {
+    let (n, fraction_valid) = args;
+    let mut rng = StdRng::seed_from_u64(0);
+    let values = buffer![T::from(1.234).unwrap(); n];
+    let validity = if fraction_valid < 1.0 {
+        Validity::from_iter((0..values.len()).map(|_| rng.gen_bool(fraction_valid)))
+    } else {
+        Validity::NonNullable
+    };
+    let array = alp_encode(&PrimitiveArray::new(values, validity)).unwrap();
+    bencher.bench_local(move || array.clone().into_canonical().unwrap());
 }
 
 #[divan::bench(types = [f32, f64], args = [100_000, 10_000_000])]

diff --git a/encodings/alp/src/alp/array.rs b/encodings/alp/src/alp/array.rs
@@ -48,13 +48,16 @@ impl ALPArray {
         let mut children = Vec::with_capacity(2);
         children.push(encoded);
         if let Some(patches) = &patches {
+            if patches.dtype().is_nullable() {
+                vortex_bail!(MismatchedTypes: "patches should be non-nullable", patches.dtype());
+            }
             children.push(patches.indices().clone());
             children.push(patches.values().clone());
         }
 
         let patches = patches
             .as_ref()
-            .map(|p| p.to_metadata(length, &dtype))
+            .map(|p| p.to_metadata(length, &dtype.as_nonnullable()))
             .transpose()?;
 
         Self::try_from_parts(
@@ -93,7 +96,7 @@ impl ALPArray {
                     .child(1, &p.indices_dtype(), p.len())
                     .vortex_expect("ALPArray: patch indices"),
                 self.as_ref()
-                    .child(2, self.dtype(), p.len())
+                    .child(2, &self.dtype().as_nonnullable(), p.len())
                     .vortex_expect("ALPArray: patch values"),
             )
         })

diff --git a/encodings/alp/src/alp/compress.rs b/encodings/alp/src/alp/compress.rs
@@ -3,7 +3,7 @@ use vortex_array::patches::Patches;
 use vortex_array::variants::PrimitiveArrayTrait;
 use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant};
 use vortex_dtype::{NativePType, PType};
-use vortex_error::{vortex_bail, VortexResult, VortexUnwrap};
+use vortex_error::{vortex_bail, VortexResult};
 use vortex_scalar::ScalarType;
 
 use crate::alp::{ALPArray, ALPFloat};
@@ -27,33 +27,29 @@ macro_rules! match_each_alp_float_ptype {
 pub fn alp_encode_components<T>(
     values: &PrimitiveArray,
     exponents: Option<Exponents>,
-) -> (Exponents, ArrayData, Option<Patches>)
+) -> VortexResult<(Exponents, ArrayData, Option<Patches>)>
 where
     T: ALPFloat + NativePType,
     T::ALPInt: NativePType,
     T: ScalarType,
 {
-    let (exponents, encoded, exc_pos, exc) = T::encode(values.as_slice::<T>(), exponents);
+    let (exponents, encoded, exc_pos, exc) =
+        T::encode(values.as_slice::<T>(), &values.validity(), exponents)?;
     let len = encoded.len();
-    (
+    Ok((
         exponents,
         PrimitiveArray::new(encoded, values.validity()).into_array(),
         (!exc.is_empty()).then(|| {
             let position_arr = exc_pos.into_array();
-            let patch_validity = values.validity().take(&position_arr).vortex_unwrap();
-            Patches::new(
-                len,
-                position_arr,
-                PrimitiveArray::new(exc, patch_validity).into_array(),
-            )
+            Patches::new(len, position_arr, exc.into_array())
         }),
-    )
+    ))
 }
 
 pub fn alp_encode(parray: &PrimitiveArray) -> VortexResult<ALPArray> {
     let (exponents, encoded, patches) = match parray.ptype() {
-        PType::F32 => alp_encode_components::<f32>(parray, None),
-        PType::F64 => alp_encode_components::<f64>(parray, None),
+        PType::F32 => alp_encode_components::<f32>(parray, None)?,
+        PType::F64 => alp_encode_components::<f64>(parray, None)?,
         _ => vortex_bail!("ALP can only encode f32 and f64"),
     };
     ALPArray::try_new(encoded, exponents, patches)
@@ -83,7 +79,7 @@ mod tests {
     use core::f64;
 
     use vortex_array::compute::scalar_at;
-    use vortex_array::validity::Validity;
+    use vortex_array::validity::{ArrayValidity as _, Validity};
     use vortex_buffer::{buffer, Buffer};
 
     use super::*;
@@ -148,6 +144,39 @@ mod tests {
         assert_eq!(values.as_slice(), decoded.as_slice::<f64>());
     }
 
+    #[test]
+    #[allow(clippy::approx_constant)] // ALP doesn't like E
+    fn test_compress_ignores_invalid_exceptional_values() {
+        let values = buffer![1.234f64, 2.718, f64::consts::PI, 4.0];
+        let array = PrimitiveArray::new(values, Validity::from_iter([true, true, false, true]));
+        let encoded = alp_encode(&array).unwrap();
+        assert!(encoded.patches().is_none());
+        assert_eq!(
+            encoded
+                .encoded()
+                .into_primitive()
+                .unwrap()
+                .as_slice::<i64>(),
+            vec![1234i64, 2718, 3142, 4000] // fill forward
+        );
+        assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });
+
+        let decoded = decompress(encoded).unwrap();
+        assert_eq!(
+            scalar_at(&decoded, 0).unwrap(),
+            scalar_at(&array, 0).unwrap()
+        );
+        assert_eq!(
+            scalar_at(&decoded, 1).unwrap(),
+            scalar_at(&array, 1).unwrap()
+        );
+        assert!(!decoded.is_valid(2));
+        assert_eq!(
+            scalar_at(&decoded, 3).unwrap(),
+            scalar_at(&array, 3).unwrap()
+        );
+    }
+
     #[test]
     #[allow(clippy::approx_constant)] // ALP doesn't like E
     fn test_nullable_patched_scalar_at() {
@@ -168,6 +197,7 @@ mod tests {
             assert!(s.is_valid());
         }
 
+        assert!(!encoded.is_valid(4));
         let s = scalar_at(encoded.as_ref(), 4).unwrap();
         assert!(s.is_null());
 
@@ -190,7 +220,6 @@ mod tests {
         );
         let alp_arr = alp_encode(&original).unwrap();
         let decompressed = alp_arr.into_primitive().unwrap();
-        assert_eq!(original.as_slice::<f64>(), decompressed.as_slice::<f64>());
         assert_eq!(original.validity(), decompressed.validity());
     }
 }
diff --git a/encodings/alp/src/alp/compute/mod.rs b/encodings/alp/src/alp/compute/mod.rs
@@ -2,6 +2,7 @@ use vortex_array::compute::{
     filter, scalar_at, slice, take, ComputeVTable, FilterFn, FilterMask, ScalarAtFn, SliceFn,
     TakeFn,
 };
+use vortex_array::validity::ArrayValidity as _;
 use vortex_array::variants::PrimitiveArrayTrait;
 use vortex_array::{ArrayDType, ArrayData, IntoArrayData};
 use vortex_error::VortexResult;
@@ -29,9 +30,13 @@ impl ComputeVTable for ALPEncoding {
 
 impl ScalarAtFn<ALPArray> for ALPEncoding {
     fn scalar_at(&self, array: &ALPArray, index: usize) -> VortexResult<Scalar> {
+        if !array.encoded().is_valid(index) {
+            return Ok(Scalar::null(array.dtype().clone()));
+        }
+
         if let Some(patches) = array.patches() {
             if let Some(patch) = patches.get_patched(index)? {
-                return Ok(patch);
+                return patch.cast(array.dtype());
             }
         }
-Original file line number
+Diff line change
@@ Expand Up @@
        >>> cvtx = vortex.compress(vtx)
        >>> cvtx.nbytes
        >>> cvtx.nbytes / vtx.nbytes
        0.11...
@@ Expand Down @@