Skip to content

Commit f7d0b79

Browse files
feat: run end bool compressor (#1355)
Co-authored-by: Robert Kruszewski <[email protected]>
1 parent 8339aaa commit f7d0b79

File tree

11 files changed

+173
-105
lines changed

11 files changed

+173
-105
lines changed

bench-vortex/benches/compressor_throughput.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@ use rand::distributions::Alphanumeric;
55
use rand::seq::SliceRandom as _;
66
use rand::{thread_rng, Rng, SeedableRng as _};
77
use vortex::aliases::hash_set::HashSet;
8-
use vortex::array::{PrimitiveArray, VarBinViewArray};
8+
use vortex::array::{ConstantArray, PrimitiveArray, VarBinViewArray};
99
use vortex::compute::unary::try_cast;
10+
use vortex::compute::{compare, Operator};
1011
use vortex::dict::{dict_encode_varbinview, DictArray};
1112
use vortex::dtype::PType;
1213
use vortex::fsst::{fsst_compress, fsst_train_compressor};
@@ -18,11 +19,14 @@ use vortex::sampling_compressor::compressors::bitpacked::{
1819
use vortex::sampling_compressor::compressors::delta::DeltaCompressor;
1920
use vortex::sampling_compressor::compressors::dict::DictCompressor;
2021
use vortex::sampling_compressor::compressors::r#for::FoRCompressor;
22+
use vortex::sampling_compressor::compressors::roaring_bool::RoaringBoolCompressor;
2123
use vortex::sampling_compressor::compressors::roaring_int::RoaringIntCompressor;
2224
use vortex::sampling_compressor::compressors::runend::DEFAULT_RUN_END_COMPRESSOR;
25+
use vortex::sampling_compressor::compressors::runend_bool::RunEndBoolCompressor;
2326
use vortex::sampling_compressor::compressors::zigzag::ZigZagCompressor;
2427
use vortex::sampling_compressor::compressors::CompressorRef;
2528
use vortex::sampling_compressor::SamplingCompressor;
29+
use vortex::scalar::Scalar;
2630
use vortex::validity::Validity;
2731
use vortex::{IntoArrayData as _, IntoCanonical};
2832

@@ -44,11 +48,20 @@ fn primitive(c: &mut Criterion) {
4448
)
4549
.into_array();
4650
let int_array = try_cast(uint_array.clone(), PType::I32.into()).unwrap();
51+
52+
let bool_array = compare(
53+
&uint_array,
54+
ConstantArray::new(Scalar::from(0u32), uint_array.len()),
55+
Operator::Eq,
56+
)
57+
.unwrap();
58+
4759
let index_array = PrimitiveArray::from_vec(
4860
(0..num_values).map(|i| (i * 2) as u32 + 42).collect_vec(),
4961
Validity::NonNullable,
5062
)
5163
.into_array();
64+
5265
let float_array = try_cast(uint_array.clone(), PType::F32.into()).unwrap();
5366

5467
let compressors_names_and_arrays = [
@@ -61,7 +74,9 @@ fn primitive(c: &mut Criterion) {
6174
(&DEFAULT_RUN_END_COMPRESSOR, "runend", &uint_array),
6275
(&DeltaCompressor, "delta", &uint_array),
6376
(&DictCompressor, "dict", &uint_array),
77+
(&RoaringBoolCompressor, "roaring_bool", &bool_array),
6478
(&RoaringIntCompressor, "roaring_int", &index_array),
79+
(&RunEndBoolCompressor, "runend_bool", &bool_array),
6580
(&FoRCompressor, "frame_of_reference", &int_array),
6681
(&ZigZagCompressor, "zigzag", &int_array),
6782
(&ALPCompressor, "alp", &float_array),

encodings/runend-bool/src/array.rs

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use std::fmt::{Debug, Display};
22

33
use serde::{Deserialize, Serialize};
44
use vortex_array::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
5+
use vortex_array::array::{BoolArray, PrimitiveArray};
56
use vortex_array::compute::unary::scalar_at;
67
use vortex_array::compute::{search_sorted, SearchSortedSide};
78
use vortex_array::encoding::ids;
@@ -12,11 +13,11 @@ use vortex_array::{
1213
impl_encoding, ArrayDType, ArrayData, ArrayTrait, Canonical, IntoArrayData, IntoArrayVariant,
1314
IntoCanonical,
1415
};
15-
use vortex_dtype::{match_each_unsigned_integer_ptype, DType, PType};
16+
use vortex_dtype::{match_each_integer_ptype, match_each_unsigned_integer_ptype, DType, PType};
1617
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
1718
use vortex_scalar::Scalar;
1819

19-
use crate::compress::runend_bool_decode;
20+
use crate::compress::{runend_bool_decode_slice, runend_bool_encode_slice};
2021

2122
impl_encoding!("vortex.runendbool", ids::RUN_END_BOOL, RunEndBool);
2223

@@ -146,6 +147,36 @@ impl RunEndBoolArray {
146147
}
147148
}
148149

150+
/// Run-end encodes a [`BoolArray`] into a [`RunEndBoolArray`].
///
/// The array's boolean buffer is handed to `runend_bool_encode_slice`, which
/// yields the run ends together with a `start` flag. The run ends are wrapped
/// in a `PrimitiveArray` and the input's validity is passed through as-is.
///
/// # Errors
///
/// Returns whatever error `RunEndBoolArray::try_new` reports for the
/// assembled parts.
pub fn encode_runend_bool(array: &BoolArray) -> VortexResult<RunEndBoolArray> {
151+
// `start` is the boolean returned alongside the run ends by the encoder.
let (ends, start) = runend_bool_encode_slice(&array.boolean_buffer());
152+
RunEndBoolArray::try_new(
153+
PrimitiveArray::from(ends).into_array(),
154+
start,
155+
array.validity(),
156+
)
157+
}
158+
159+
/// Decodes run ends plus a `start` flag back into a [`BoolArray`].
///
/// Dispatches on the primitive type of `run_ends` so that the typed slice can
/// be passed to `runend_bool_decode_slice`, then pairs the decoded bits with
/// the supplied `validity`.
///
/// `offset` and `length` are forwarded unchanged to
/// `runend_bool_decode_slice`; presumably they select the logical window of
/// elements to materialise — TODO confirm against that function.
///
/// # Errors
///
/// Propagates any error from `BoolArray::try_new`.
pub(crate) fn decode_runend_bool(
160+
run_ends: &PrimitiveArray,
161+
start: bool,
162+
validity: Validity,
163+
offset: usize,
164+
length: usize,
165+
) -> VortexResult<BoolArray> {
166+
// `$E` is bound by the macro to the Rust type matching `run_ends.ptype()`.
match_each_integer_ptype!(run_ends.ptype(), |$E| {
167+
let bools = runend_bool_decode_slice::<$E>(run_ends.maybe_null_slice(), start, offset, length);
168+
Ok(BoolArray::try_new(bools, validity)?)
169+
})
170+
}
}
171+
172+
/// Returns the boolean associated with position `idx` given the `start` flag.
///
/// Even positions yield `start` and odd positions yield `!start`, i.e. the
/// value flips with every step of `idx`.
///
/// NOTE(review): `idx` is presumably the index of a run (not of an element) —
/// confirm against the callers.
pub(crate) fn value_at_index(idx: usize, start: bool) -> bool {
173+
if idx % 2 == 0 {
174+
start
175+
} else {
176+
!start
177+
}
178+
}
179+
149180
impl BoolArrayTrait for RunEndBoolArray {
150181
fn invert(&self) -> VortexResult<ArrayData> {
151182
RunEndBoolArray::try_new(self.ends(), !self.start(), self.validity())
@@ -174,7 +205,7 @@ impl ArrayValidity for RunEndBoolArray {
174205
impl IntoCanonical for RunEndBoolArray {
175206
fn into_canonical(self) -> VortexResult<Canonical> {
176207
let pends = self.ends().into_primitive()?;
177-
runend_bool_decode(
208+
decode_runend_bool(
178209
&pends,
179210
self.start(),
180211
self.validity(),

encodings/runend-bool/src/compress.rs

Lines changed: 10 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,10 @@ use std::cmp::min;
33
use arrow_buffer::buffer::BooleanBuffer;
44
use arrow_buffer::BooleanBufferBuilder;
55
use num_traits::{AsPrimitive, FromPrimitive};
6-
use vortex_array::array::{BoolArray, PrimitiveArray};
7-
use vortex_array::validity::Validity;
8-
use vortex_array::variants::PrimitiveArrayTrait;
9-
use vortex_dtype::{match_each_integer_ptype, NativePType};
10-
use vortex_error::{vortex_panic, VortexExpect as _, VortexResult};
11-
12-
pub fn runend_bool_encode(elements: &BoolArray) -> (PrimitiveArray, bool) {
13-
let (arr, start) = runend_bool_encode_slice(&elements.boolean_buffer());
14-
(arr.into(), start)
15-
}
6+
use vortex_dtype::NativePType;
7+
use vortex_error::{vortex_panic, VortexExpect as _};
8+
9+
use crate::value_at_index;
1610

1711
pub fn runend_bool_encode_slice(elements: &BooleanBuffer) -> (Vec<u64>, bool) {
1812
let mut iter = elements.set_slices();
@@ -41,19 +35,6 @@ pub fn runend_bool_encode_slice(elements: &BooleanBuffer) -> (Vec<u64>, bool) {
4135
(ends, first_bool)
4236
}
4337

44-
pub fn runend_bool_decode(
45-
run_ends: &PrimitiveArray,
46-
start: bool,
47-
validity: Validity,
48-
offset: usize,
49-
length: usize,
50-
) -> VortexResult<BoolArray> {
51-
match_each_integer_ptype!(run_ends.ptype(), |$E| {
52-
let bools = runend_bool_decode_slice::<$E>(run_ends.maybe_null_slice(), start, offset, length);
53-
Ok(BoolArray::try_new(bools, validity)?)
54-
})
55-
}
56-
5738
pub fn runend_bool_decode_slice<E: NativePType + AsPrimitive<usize> + FromPrimitive + Ord>(
5839
run_ends: &[E],
5940
start: bool,
@@ -86,28 +67,19 @@ pub fn runend_bool_decode_slice<E: NativePType + AsPrimitive<usize> + FromPrimit
8667
BooleanBuffer::from(decoded)
8768
}
8869

89-
pub fn value_at_index(idx: usize, start: bool) -> bool {
90-
if idx % 2 == 0 {
91-
start
92-
} else {
93-
!start
94-
}
95-
}
96-
9770
#[cfg(test)]
9871
mod test {
9972
use arrow_buffer::BooleanBuffer;
10073
use itertools::Itertools;
10174
use rand::prelude::StdRng;
10275
use rand::{Rng, SeedableRng};
103-
use vortex_array::array::BoolArray;
76+
use vortex_array::array::{BoolArray, PrimitiveArray};
10477
use vortex_array::compute::SliceFn;
10578
use vortex_array::validity::Validity;
10679
use vortex_array::IntoArrayVariant;
10780

108-
use crate::compress::{
109-
runend_bool_decode, runend_bool_decode_slice, runend_bool_encode, runend_bool_encode_slice,
110-
};
81+
use crate::compress::{runend_bool_decode_slice, runend_bool_encode_slice};
82+
use crate::decode_runend_bool;
11183

11284
#[test]
11385
fn encode_bool() {
@@ -175,9 +147,10 @@ mod test {
175147
let input = (0..1024 * 8 - 61).map(|_x| rng.gen::<bool>()).collect_vec();
176148
let b = BoolArray::from_iter(input.clone());
177149
let b = b.slice(3, 1024 * 8 - 66).unwrap().into_bool().unwrap();
178-
let (ends, start) = runend_bool_encode(&b);
150+
let (ends, start) = runend_bool_encode_slice(&b.boolean_buffer());
151+
let ends = PrimitiveArray::from(ends);
179152

180-
let decoded = runend_bool_decode(&ends, start, Validity::NonNullable, 0, 1024 * 8 - 69)
153+
let decoded = decode_runend_bool(&ends, start, Validity::NonNullable, 0, 1024 * 8 - 69)
181154
.unwrap()
182155
.into_bool()
183156
.unwrap()

encodings/runend-bool/src/compute.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ use vortex_dtype::match_each_integer_ptype;
77
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
88
use vortex_scalar::Scalar;
99

10-
use crate::compress::value_at_index;
11-
use crate::RunEndBoolArray;
10+
use crate::{value_at_index, RunEndBoolArray};
1211

1312
impl ArrayCompute for RunEndBoolArray {
1413
fn scalar_at(&self) -> Option<&dyn ScalarAtFn> {

vortex-sampling-compressor/src/compressors/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ pub mod fsst;
2424
pub mod roaring_bool;
2525
pub mod roaring_int;
2626
pub mod runend;
27+
pub mod runend_bool;
2728
pub mod sparse;
2829
pub mod struct_;
2930
pub mod zigzag;
vortex-sampling-compressor/src/compressors/runend_bool.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
use vortex_array::aliases::hash_set::HashSet;
2+
use vortex_array::array::{Bool, PrimitiveArray};
3+
use vortex_array::encoding::EncodingRef;
4+
use vortex_array::stats::ArrayStatistics as _;
5+
use vortex_array::{ArrayData, ArrayDef, IntoArrayData, IntoArrayVariant};
6+
use vortex_error::VortexResult;
7+
use vortex_runend_bool::compress::runend_bool_encode_slice;
8+
use vortex_runend_bool::{RunEndBool, RunEndBoolArray, RunEndBoolEncoding};
9+
10+
use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
11+
use crate::{constants, SamplingCompressor};
12+
13+
/// Sampling-compressor strategy that run-end encodes boolean arrays into
/// [`RunEndBoolArray`]s.
#[derive(Debug)]
14+
pub struct RunEndBoolCompressor;
15+
16+
impl EncodingCompressor for RunEndBoolCompressor {
17+
/// Identifier of this compressor: the `RunEndBool` encoding ID as a string.
fn id(&self) -> &str {
18+
RunEndBool::ID.as_ref()
19+
}
20+
21+
/// Cost of this compressor, taken from `constants::RUN_END_BOOL_COST`.
fn cost(&self) -> u8 {
22+
constants::RUN_END_BOOL_COST
23+
}
24+
25+
/// Returns `Some(self)` only when `array` uses the `Bool` encoding,
/// otherwise `None`.
fn can_compress(&self, array: &ArrayData) -> Option<&dyn EncodingCompressor> {
26+
// Only support bool arrays
27+
if !array.is_encoding(Bool::ID) {
28+
return None;
29+
}
30+
31+
Some(self)
32+
}
33+
34+
/// Run-end encodes `array` and compresses the resulting run ends.
///
/// The input is converted to a bool array, its boolean buffer is run-end
/// encoded, and the run ends are compressed through the `"ends"` auxiliary
/// compressor. Child 0 of `like`, when present, is passed along to that
/// nested compression. The validity of the input is attached to the
/// result as-is.
///
/// # Errors
///
/// Propagates failures from `into_bool`, from compressing the run ends,
/// and from `RunEndBoolArray::try_new`.
fn compress<'a>(
35+
&'a self,
36+
array: &ArrayData,
37+
like: Option<CompressionTree<'a>>,
38+
ctx: SamplingCompressor<'a>,
39+
) -> VortexResult<CompressedArray<'a>> {
40+
let bool_array = array.clone().into_bool()?;
41+
let (ends, start) = runend_bool_encode_slice(&bool_array.boolean_buffer());
42+
let ends = PrimitiveArray::from(ends);
43+
44+
// The run ends are themselves compressed as the single child of this encoding.
let compressed_ends = ctx
45+
.auxiliary("ends")
46+
.compress(&ends.into_array(), like.as_ref().and_then(|l| l.child(0)))?;
47+
48+
// The returned tree records the path chosen for the run ends, and the
// input array's statistics are attached to the result.
Ok(CompressedArray::compressed(
49+
RunEndBoolArray::try_new(compressed_ends.array, start, bool_array.validity())?
50+
.into_array(),
51+
Some(CompressionTree::new(self, vec![compressed_ends.path])),
52+
Some(array.statistics()),
53+
))
54+
}
55+
56+
/// The set of encodings this compressor can emit: just `RunEndBoolEncoding`.
fn used_encodings(&self) -> HashSet<EncodingRef> {
57+
HashSet::from([&RunEndBoolEncoding as EncodingRef])
58+
}
59+
}

vortex-sampling-compressor/src/compressors/struct_.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ use vortex_array::compress::compute_precompression_stats;
55
use vortex_array::encoding::EncodingRef;
66
use vortex_array::stats::ArrayStatistics as _;
77
use vortex_array::variants::StructArrayTrait;
8-
use vortex_array::{ArrayData, ArrayDef, IntoArrayData};
8+
use vortex_array::{ArrayDType, ArrayData, ArrayDef, IntoArrayData};
9+
use vortex_dtype::DType;
910
use vortex_error::VortexResult;
1011

1112
use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
@@ -24,9 +25,8 @@ impl EncodingCompressor for StructCompressor {
2425
}
2526

2627
fn can_compress(&self, array: &ArrayData) -> Option<&dyn EncodingCompressor> {
27-
StructArray::try_from(array)
28-
.ok()
29-
.map(|_| self as &dyn EncodingCompressor)
28+
let is_struct = matches!(array.dtype(), DType::Struct(..)) && array.is_encoding(Struct::ID);
29+
is_struct.then_some(self)
3030
}
3131

3232
fn compress<'a>(

vortex-sampling-compressor/src/constants.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#![allow(dead_code)]
22

33
// structural pass-throughs have no cost
4-
pub const SPARSE_COST: u8 = 0;
54
pub const CHUNKED_COST: u8 = 0;
5+
pub const SPARSE_COST: u8 = 0;
66
pub const STRUCT_COST: u8 = 0;
77

88
// so fast that we can ignore the cost
@@ -20,6 +20,7 @@ pub const FOR_COST: u8 = 1;
2020
pub const FSST_COST: u8 = 1;
2121
pub const ROARING_BOOL_COST: u8 = 1;
2222
pub const ROARING_INT_COST: u8 = 1;
23+
pub const RUN_END_BOOL_COST: u8 = 1;
2324
pub const RUN_END_COST: u8 = 1;
2425

2526
// "expensive" encodings

0 commit comments

Comments
 (0)