Skip to content

Commit 8e0e25c

Browse files
a10y and lwwmanning authored
feat: BitPackedCompressor allows signed arrays (#1699)
Most of the work to support signed integers has been done in BitPackedArray. This PR removes some assertions and branches in the compressor to make it possible to bit-pack an array of signed ints. --------- Co-authored-by: Will Manning <will@willmanning.io>
1 parent 3555d87 commit 8e0e25c

File tree

8 files changed

+58
-19
lines changed

8 files changed

+58
-19
lines changed

docs/quickstart.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@ Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the
4646

4747
>>> cvtx = vortex.compress(vtx)
4848
>>> cvtx.nbytes
49-
16756
49+
16539
5050
>>> cvtx.nbytes / vtx.nbytes
51-
0.118...
51+
0.117...
5252

5353
Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
5454
cache and RAM.

encodings/fastlanes/src/bitpacking/compress.rs

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ pub unsafe fn bitpack_encode_unchecked(
7070
///
7171
/// On success, returns a [Buffer] containing the packed data.
7272
pub fn bitpack(parray: &PrimitiveArray, bit_width: u8) -> VortexResult<Buffer> {
73-
// We know the min is > 0, so it's safe to re-interpret signed integers as unsigned.
7473
let parray = parray.reinterpret_cast(parray.ptype().to_unsigned());
7574
let packed = match_each_unsigned_integer_ptype!(parray.ptype(), |$P| {
7675
bitpack_primitive(parray.maybe_null_slice::<$P>(), bit_width)
@@ -359,7 +358,7 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
359358
#[cfg(test)]
360359
#[allow(clippy::cast_possible_truncation)]
361360
mod test {
362-
use vortex_array::{IntoArrayVariant, ToArrayData};
361+
use vortex_array::{IntoArrayVariant, IntoCanonical, ToArrayData};
363362

364363
use super::*;
365364

@@ -431,12 +430,25 @@ mod test {
431430
}
432431

433432
#[test]
434-
#[should_panic(expected = "expected type: uint but instead got i64")]
435-
fn gh_issue_929() {
433+
fn compress_signed_roundtrip() {
436434
let values: Vec<i64> = (-500..500).collect();
437-
let array = PrimitiveArray::from_vec(values, Validity::AllValid);
435+
let array = PrimitiveArray::from_vec(values.clone(), Validity::AllValid);
438436
assert!(array.ptype().is_signed_int());
439437

440-
BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
438+
let bitpacked_array =
439+
BitPackedArray::encode(array.as_ref(), 1024u32.ilog2() as u8).unwrap();
440+
let num_patches = bitpacked_array
441+
.patches()
442+
.as_ref()
443+
.map(Patches::num_patches)
444+
.unwrap_or_default();
445+
assert_eq!(num_patches, 500);
446+
447+
let unpacked = bitpacked_array
448+
.into_canonical()
449+
.unwrap()
450+
.into_primitive()
451+
.unwrap();
452+
assert_eq!(unpacked.into_maybe_null_slice::<i64>(), values);
441453
}
442454
}

encodings/fastlanes/src/bitpacking/compute/filter.rs

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,21 @@ use crate::{BitPackedArray, BitPackedEncoding};
1313

1414
impl FilterFn<BitPackedArray> for BitPackedEncoding {
1515
fn filter(&self, array: &BitPackedArray, mask: FilterMask) -> VortexResult<ArrayData> {
16-
let primitive = match_each_unsigned_integer_ptype!(array.ptype(), |$I| {
16+
let primitive = match_each_unsigned_integer_ptype!(array.ptype().to_unsigned(), |$I| {
1717
filter_primitive::<$I>(array, mask)
1818
});
1919
Ok(primitive?.into_array())
2020
}
2121
}
2222

23+
/// Specialized filter kernel for primitive bit-packed arrays.
24+
///
25+
/// Because the FastLanes bit-packing kernels are only implemented for unsigned types, the provided
26+
/// `T` should be promoted to the unsigned variant for any target bit width.
27+
/// For example, if the array is bit-packed `i16`, this function should be called with `T = u16`.
28+
///
29+
/// All bit-packing operations will use the unsigned kernels, but the logical type of `array`
30+
/// dictates the final `PType` of the result.
2331
fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
2432
array: &BitPackedArray,
2533
mask: FilterMask,
@@ -49,7 +57,7 @@ fn filter_primitive<T: NativePType + BitPacking + ArrowNativeType>(
4957
FilterIter::SlicesIter(iter) => filter_slices(array, mask.true_count(), iter),
5058
};
5159

52-
let mut values = PrimitiveArray::from_vec(values, validity);
60+
let mut values = PrimitiveArray::from_vec(values, validity).reinterpret_cast(array.ptype());
5361
if let Some(patches) = patches {
5462
values = values.patch(patches)?;
5563
}
@@ -120,6 +128,7 @@ fn filter_slices<T: NativePType + BitPacking + ArrowNativeType>(
120128

121129
#[cfg(test)]
122130
mod test {
131+
use itertools::Itertools;
123132
use vortex_array::array::PrimitiveArray;
124133
use vortex_array::compute::{filter, slice, FilterMask};
125134
use vortex_array::{ArrayLen, IntoArrayVariant};
@@ -166,4 +175,24 @@ mod test {
166175
(0..1024).map(|i| (i % 63) as u8).collect::<Vec<_>>()
167176
);
168177
}
178+
179+
#[test]
180+
fn filter_bitpacked_signed() {
181+
// Elements 0..=499 are negative integers (patches)
182+
// Element 500 = 0 (packed)
183+
// Elements 501..=999 are positive integers (packed)
184+
let values: Vec<i64> = (-500..500).collect_vec();
185+
let unpacked = PrimitiveArray::from(values.clone());
186+
let bitpacked = BitPackedArray::encode(unpacked.as_ref(), 9).unwrap();
187+
let filtered = filter(
188+
bitpacked.as_ref(),
189+
FilterMask::from_indices(values.len(), 250..750),
190+
)
191+
.unwrap()
192+
.into_primitive()
193+
.unwrap()
194+
.into_maybe_null_slice::<i64>();
195+
196+
assert_eq!(filtered.as_slice(), &values[250..750]);
197+
}
169198
}

encodings/fastlanes/src/bitpacking/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,8 @@ impl BitPackedArray {
6363
offset: u16,
6464
) -> VortexResult<Self> {
6565
let dtype = DType::Primitive(ptype, validity.nullability());
66-
67-
if !dtype.is_unsigned_int() {
68-
vortex_bail!(MismatchedTypes: "uint", &dtype);
66+
if !dtype.is_int() {
67+
vortex_bail!(MismatchedTypes: "integer", dtype);
6968
}
7069

7170
if bit_width > u64::BITS as u8 {

pyvortex/src/compress.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use crate::array::PyArray;
2424
///
2525
/// >>> a = vortex.array(list(range(1000)))
2626
/// >>> str(vortex.compress(a))
27-
/// 'fastlanes.for(0x17)(i64, len=1000)'
27+
/// 'fastlanes.bitpacked(0x15)(i64, len=1000)'
2828
///
2929
/// Compress an array of increasing floating-point numbers and a few nulls:
3030
///

vortex-sampling-compressor/src/compressors/bitpacked.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ impl EncodingCompressor for BitPackedCompressor {
5757
// Only support primitive arrays
5858
let parray = PrimitiveArray::maybe_from(array)?;
5959

60-
// Only supports unsigned ints
61-
if !parray.ptype().is_unsigned_int() {
60+
if !parray.ptype().is_int() {
6261
return None;
6362
}
6463

vortex-sampling-compressor/src/compressors/for.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ impl EncodingCompressor for FoRCompressor {
4242
let shift = trailing_zeros(array);
4343
match_each_integer_ptype!(parray.ptype(), |$P| {
4444
let min: $P = parray.statistics().compute_min()?;
45-
if min == 0 && shift == 0 && parray.ptype().is_unsigned_int() {
45+
if min == 0 && shift == 0 {
4646
return None;
4747
}
4848
});

vortex-sampling-compressor/tests/smoketest.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ mod tests {
1818
use vortex_datetime_dtype::TimeUnit;
1919
use vortex_datetime_parts::DateTimePartsEncoding;
2020
use vortex_dict::DictEncoding;
21-
use vortex_fastlanes::FoREncoding;
21+
use vortex_fastlanes::BitPackedEncoding;
2222
use vortex_fsst::FSSTEncoding;
2323
use vortex_sampling_compressor::ALL_COMPRESSORS;
2424
use vortex_scalar::Scalar;
@@ -122,7 +122,7 @@ mod tests {
122122
.unwrap();
123123
println!("prim_col num chunks: {}", prim_col.nchunks());
124124
for chunk in prim_col.chunks() {
125-
assert_eq!(chunk.encoding().id(), FoREncoding::ID);
125+
assert_eq!(chunk.encoding().id(), BitPackedEncoding::ID);
126126
assert_eq!(
127127
chunk.statistics().get(Stat::UncompressedSizeInBytes),
128128
Some(Scalar::from((chunk.len() * 8) as u64 + 1))

0 commit comments

Comments
 (0)