Skip to content

Commit e7cbad6

Browse files
authored
fix: RunEndBool stats and slice accounts for offsets (#1428)
1 parent ca6e648 commit e7cbad6

File tree

4 files changed

+73
-33
lines changed

4 files changed

+73
-33
lines changed

encodings/runend-bool/benches/ree_bool_compress.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ use itertools::Itertools;
88
use rand::distributions::Open01;
99
use rand::prelude::StdRng;
1010
use rand::{Rng, SeedableRng};
11-
use vortex_runend_bool::compress::{runend_bool_decode_slice, runend_bool_encode_slice};
11+
use vortex_error::VortexExpect;
12+
use vortex_runend_bool::compress::{
13+
runend_bool_decode_slice, runend_bool_encode_slice, trimmed_ends_iter,
14+
};
1215

1316
fn compress_compare(c: &mut Criterion) {
1417
compress_compare_param(c, 0.);
@@ -38,8 +41,15 @@ fn compress_compare_param(c: &mut Criterion, sel_fac: f32) {
3841
});
3942

4043
let (ends, start) = runend_bool_encode_slice(&boolbuf);
44+
let length = ends.last().copied().vortex_expect("must have one elem") as usize;
4145
group.bench_function("ree bool decompress", |b| {
42-
b.iter(|| black_box(runend_bool_decode_slice(&ends, start, 0, ends.len())));
46+
b.iter(|| {
47+
black_box(runend_bool_decode_slice(
48+
trimmed_ends_iter(&ends, 0, length),
49+
start,
50+
length,
51+
))
52+
});
4353
});
4454
group.finish()
4555
}

encodings/runend-bool/src/array.rs

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use vortex_dtype::{match_each_integer_ptype, match_each_unsigned_integer_ptype,
1717
use vortex_error::{vortex_bail, VortexExpect as _, VortexResult};
1818
use vortex_scalar::Scalar;
1919

20-
use crate::compress::{runend_bool_decode_slice, runend_bool_encode_slice};
20+
use crate::compress::{runend_bool_decode_slice, runend_bool_encode_slice, trimmed_ends_iter};
2121

2222
impl_encoding!("vortex.runendbool", ids::RUN_END_BOOL, RunEndBool);
2323

@@ -164,7 +164,7 @@ pub(crate) fn decode_runend_bool(
164164
length: usize,
165165
) -> VortexResult<BoolArray> {
166166
match_each_integer_ptype!(run_ends.ptype(), |$E| {
167-
let bools = runend_bool_decode_slice::<$E>(run_ends.maybe_null_slice(), start, offset, length);
167+
let bools = runend_bool_decode_slice(trimmed_ends_iter(run_ends.maybe_null_slice::<$E>(), offset, length), start, length);
168168
Ok(BoolArray::try_new(bools, validity)?)
169169
})
170170
}
@@ -179,8 +179,14 @@ pub(crate) fn value_at_index(idx: usize, start: bool) -> bool {
179179

180180
impl BoolArrayTrait for RunEndBoolArray {
181181
fn invert(&self) -> VortexResult<ArrayData> {
182-
RunEndBoolArray::try_new(self.ends(), !self.start(), self.validity())
183-
.map(|a| a.into_array())
182+
RunEndBoolArray::with_offset_and_size(
183+
self.ends(),
184+
!self.start(),
185+
self.validity(),
186+
self.len(),
187+
self.offset(),
188+
)
189+
.map(|a| a.into_array())
184190
}
185191
}
186192

@@ -229,19 +235,19 @@ impl ArrayStatisticsCompute for RunEndBoolArray {
229235
Stat::NullCount => Some(self.validity().null_count(self.len())?.into()),
230236
Stat::TrueCount => {
231237
let pends = self.ends().into_primitive()?;
232-
let mut true_count: u64 = 0;
233-
let mut prev_end: u64 = 0;
238+
let mut true_count: usize = 0;
239+
let mut prev_end: usize = 0;
234240
let mut include = self.start();
235241
match_each_unsigned_integer_ptype!(pends.ptype(), |$P| {
236-
for end in pends.maybe_null_slice::<$P>() {
242+
for end in trimmed_ends_iter(pends.maybe_null_slice::<$P>(), self.offset(), self.len()) {
237243
if include {
238-
true_count += (*end as u64 - prev_end);
244+
true_count += end - prev_end;
239245
}
240246
include = !include;
241-
prev_end = *end as u64;
247+
prev_end = end;
242248
}
243249
});
244-
Some(true_count.into())
250+
Some((true_count as u64).into())
245251
}
246252
_ => None,
247253
};
@@ -259,7 +265,7 @@ mod test {
259265

260266
use itertools::Itertools as _;
261267
use rstest::rstest;
262-
use vortex_array::array::BoolArray;
268+
use vortex_array::array::{BoolArray, PrimitiveArray};
263269
use vortex_array::compute::unary::scalar_at;
264270
use vortex_array::compute::{slice, take, TakeOptions};
265271
use vortex_array::stats::{ArrayStatistics as _, ArrayStatisticsCompute};
@@ -408,4 +414,16 @@ mod test {
408414

409415
assert_eq!(arr.statistics().compute_run_count(), Some(ends_len));
410416
}
417+
418+
#[test]
419+
fn sliced_true_count() {
420+
let arr = RunEndBoolArray::try_new(
421+
PrimitiveArray::from(vec![5u32, 7, 10]).into_array(),
422+
true,
423+
Validity::NonNullable,
424+
)
425+
.unwrap();
426+
let sliced = slice(&arr, 4, 8).unwrap();
427+
assert_eq!(sliced.statistics().compute_true_count().unwrap(), 2);
428+
}
411429
}

encodings/runend-bool/src/compress.rs

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::cmp::min;
22

33
use arrow_buffer::buffer::BooleanBuffer;
44
use arrow_buffer::BooleanBufferBuilder;
5-
use num_traits::{AsPrimitive, FromPrimitive};
5+
use num_traits::AsPrimitive;
66
use vortex_dtype::NativePType;
77
use vortex_error::{vortex_panic, VortexExpect as _};
88

@@ -35,12 +35,12 @@ pub fn runend_bool_encode_slice(elements: &BooleanBuffer) -> (Vec<u64>, bool) {
3535
(ends, first_bool)
3636
}
3737

38-
pub fn runend_bool_decode_slice<E: NativePType + AsPrimitive<usize> + FromPrimitive + Ord>(
38+
#[inline]
39+
pub fn trimmed_ends_iter<E: NativePType + AsPrimitive<usize> + Ord>(
3940
run_ends: &[E],
40-
start: bool,
4141
offset: usize,
4242
length: usize,
43-
) -> BooleanBuffer {
43+
) -> impl Iterator<Item = usize> + use<'_, E> {
4444
let offset_e = E::from_usize(offset).unwrap_or_else(|| {
4545
vortex_panic!(
4646
"offset {} cannot be converted to {}",
@@ -55,14 +55,22 @@ pub fn runend_bool_decode_slice<E: NativePType + AsPrimitive<usize> + FromPrimit
5555
std::any::type_name::<E>()
5656
)
5757
});
58-
let trimmed_ends = run_ends
58+
run_ends
5959
.iter()
60-
.map(|v| *v - offset_e)
61-
.map(|v| min(v, length_e));
60+
.copied()
61+
.map(move |v| v - offset_e)
62+
.map(move |v| min(v, length_e))
63+
.map(|v| v.as_())
64+
}
6265

66+
pub fn runend_bool_decode_slice(
67+
run_ends_iter: impl Iterator<Item = usize>,
68+
start: bool,
69+
length: usize,
70+
) -> BooleanBuffer {
6371
let mut decoded = BooleanBufferBuilder::new(length);
64-
for (idx, end) in trimmed_ends.enumerate() {
65-
decoded.append_n(end.as_() - decoded.len(), value_at_index(idx, start));
72+
for (idx, end) in run_ends_iter.enumerate() {
73+
decoded.append_n(end - decoded.len(), value_at_index(idx, start));
6674
}
6775
BooleanBuffer::from(decoded)
6876
}
@@ -78,7 +86,7 @@ mod test {
7886
use vortex_array::validity::Validity;
7987
use vortex_array::IntoArrayVariant;
8088

81-
use crate::compress::{runend_bool_decode_slice, runend_bool_encode_slice};
89+
use crate::compress::{runend_bool_decode_slice, runend_bool_encode_slice, trimmed_ends_iter};
8290
use crate::decode_runend_bool;
8391

8492
#[test]
@@ -107,27 +115,30 @@ mod test {
107115
fn encode_decode_bool() {
108116
let input = [true, true, false, true, true, false];
109117
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));
118+
let ends_iter = trimmed_ends_iter(ends.as_slice(), 0, input.len());
110119

111-
let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
120+
let decoded = runend_bool_decode_slice(ends_iter, start, input.len());
112121
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
113122
}
114123

115124
#[test]
116125
fn encode_decode_bool_false_start() {
117126
let input = [false, false, true, true, false, true, true, false];
118127
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));
128+
let ends_iter = trimmed_ends_iter(ends.as_slice(), 0, input.len());
119129

120-
let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
130+
let decoded = runend_bool_decode_slice(ends_iter, start, input.len());
121131
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
122132
}
123133

124134
#[test]
125135
fn encode_decode_bool_false_start_long() {
126-
let input = vec![true; 1024];
127-
// input.extend([false, true, false, true].as_slice());
136+
let mut input = vec![true; 1024];
137+
input.extend([false, true, false, true].as_slice());
128138
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));
139+
let ends_iter = trimmed_ends_iter(ends.as_slice(), 0, input.len());
129140

130-
let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
141+
let decoded = runend_bool_decode_slice(ends_iter, start, input.len());
131142
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
132143
}
133144

@@ -136,8 +147,9 @@ mod test {
136147
let mut rng = StdRng::seed_from_u64(4352);
137148
let input = (0..1024 * 4).map(|_x| rng.gen::<bool>()).collect_vec();
138149
let (ends, start) = runend_bool_encode_slice(&BooleanBuffer::from(input.as_slice()));
150+
let ends_iter = trimmed_ends_iter(ends.as_slice(), 0, input.len());
139151

140-
let decoded = runend_bool_decode_slice(ends.as_slice(), start, 0, input.len());
152+
let decoded = runend_bool_decode_slice(ends_iter, start, input.len());
141153
assert_eq!(decoded, BooleanBuffer::from(input.as_slice()))
142154
}
143155

encodings/runend-bool/src/compute.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ impl TakeFn<RunEndBoolArray> for RunEndBoolEncoding {
4343
let primitive_indices = indices.clone().into_primitive()?;
4444
let physical_indices = match_each_integer_ptype!(primitive_indices.ptype(), |$P| {
4545
primitive_indices
46-
.maybe_null_slice::<$P>()
47-
.iter()
48-
.map(|idx| *idx as usize)
46+
.into_maybe_null_slice::<$P>()
47+
.into_iter()
48+
.map(|idx| idx as usize)
4949
.map(|idx| {
5050
if idx >= array.len() {
5151
vortex_bail!(OutOfBounds: idx, 0, array.len())
@@ -72,7 +72,7 @@ impl SliceFn<RunEndBoolArray> for RunEndBoolEncoding {
7272
value_at_index(slice_begin, array.start()),
7373
array.validity().slice(slice_begin, slice_end + 1)?,
7474
stop - start,
75-
start,
75+
start + array.offset(),
7676
)?
7777
.into_array())
7878
}

0 commit comments

Comments
 (0)