Skip to content

Commit d78ec6a

Browse files
committed
pull patch filter to alp_encode_components
1 parent 2c0af3b commit d78ec6a

File tree

3 files changed

+46
-71
lines changed

3 files changed

+46
-71
lines changed

encodings/alp/src/alp/compress.rs

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
use itertools::Itertools as _;
12
use vortex_array::array::PrimitiveArray;
23
use vortex_array::patches::Patches;
3-
use vortex_array::validity::Validity;
4+
use vortex_array::validity::{ArrayValidity as _, Validity};
45
use vortex_array::variants::PrimitiveArrayTrait;
56
use vortex_array::{ArrayDType, ArrayData, IntoArrayData, IntoArrayVariant};
7+
use vortex_buffer::BufferMut;
68
use vortex_dtype::{NativePType, PType};
79
use vortex_error::{vortex_bail, VortexResult};
810
use vortex_scalar::ScalarType;
@@ -25,6 +27,7 @@ macro_rules! match_each_alp_float_ptype {
2527
})
2628
}
2729

30+
#[allow(clippy::cast_possible_truncation)]
2831
pub fn alp_encode_components<T>(
2932
values: &PrimitiveArray,
3033
exponents: Option<Exponents>,
@@ -34,23 +37,33 @@ where
3437
T::ALPInt: NativePType,
3538
T: ScalarType,
3639
{
37-
let (exponents, encoded, exc_pos, exc) =
38-
T::encode(values.as_slice::<T>(), &values.validity(), exponents)?;
40+
let (exponents, encoded, exc_pos, exc) = T::encode(values.as_slice::<T>(), exponents);
3941
let len = encoded.len();
4042
let patches_validity = if values.dtype().is_nullable() {
4143
Validity::AllValid
4244
} else {
4345
Validity::NonNullable
4446
};
47+
48+
let mut exc_pos_valid_only =
49+
BufferMut::<u64>::with_capacity_aligned(exc_pos.len(), exc_pos.alignment());
50+
let mut exc_valid_only = BufferMut::<T>::with_capacity_aligned(exc.len(), exc.alignment());
51+
for (position, value) in exc_pos.into_iter().zip_eq(exc.into_iter()) {
52+
if values.is_valid(position as usize) {
53+
exc_pos_valid_only.push(position);
54+
exc_valid_only.push(value);
55+
}
56+
}
57+
4558
Ok((
4659
exponents,
4760
PrimitiveArray::new(encoded, values.validity()).into_array(),
48-
(!exc.is_empty()).then(|| {
49-
let position_arr = exc_pos.into_array();
61+
(!exc_valid_only.is_empty()).then(|| {
62+
let position_arr = exc_pos_valid_only.freeze().into_array();
5063
Patches::new(
5164
len,
5265
position_arr,
53-
PrimitiveArray::new(exc, patches_validity).into_array(),
66+
PrimitiveArray::new(exc_valid_only.freeze(), patches_validity).into_array(),
5467
)
5568
}),
5669
))
@@ -167,7 +180,7 @@ mod tests {
167180
.into_primitive()
168181
.unwrap()
169182
.as_slice::<i64>(),
170-
vec![1234i64, 2718, 3142, 4000] // fill forward
183+
vec![1234i64, 2718, 1234, 4000] // fill forward
171184
);
172185
assert_eq!(encoded.exponents(), Exponents { e: 16, f: 13 });
173186

encodings/alp/src/alp/mod.rs

Lines changed: 23 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@ mod compute;
1111

1212
pub use array::*;
1313
pub use compress::*;
14-
use vortex_array::array::PrimitiveArray;
15-
use vortex_array::validity::Validity;
16-
use vortex_array::IntoArrayData as _;
1714
use vortex_buffer::{Buffer, BufferMut};
18-
use vortex_error::VortexResult;
1915

2016
const SAMPLE_SIZE: usize = 32;
2117

@@ -59,47 +55,25 @@ pub trait ALPFloat: private::Sealed + Float + Display + 'static {
5955
/// Convert from the integer type back to the float type using `as`.
6056
fn from_int(n: Self::ALPInt) -> Self;
6157

62-
fn sampled_find_best_exponents(
63-
values: &[Self],
64-
validity: &Validity,
65-
) -> VortexResult<Exponents> {
66-
if values.len() <= SAMPLE_SIZE {
67-
Self::find_best_exponents(values, validity)
68-
} else {
69-
let validity = validity.take(
70-
&PrimitiveArray::from_iter(
71-
(0..values.len())
72-
.step_by(values.len() / SAMPLE_SIZE)
73-
.map(|x| x as u64)
74-
.take(SAMPLE_SIZE),
75-
)
76-
.into_array(),
77-
)?;
78-
let values = values
58+
fn find_best_exponents(values: &[Self]) -> Exponents {
59+
let mut best_exp = Exponents { e: 0, f: 0 };
60+
let mut best_nbytes: usize = usize::MAX;
61+
62+
let sample = (values.len() > SAMPLE_SIZE).then(|| {
63+
values
7964
.iter()
8065
.step_by(values.len() / SAMPLE_SIZE)
8166
.take(SAMPLE_SIZE)
8267
.cloned()
83-
.collect_vec();
84-
Self::find_best_exponents(&values, &validity)
85-
}
86-
}
87-
88-
fn find_best_exponents(values: &[Self], validity: &Validity) -> VortexResult<Exponents> {
89-
let mut best_exp = Exponents { e: 0, f: 0 };
90-
let mut best_nbytes: usize = usize::MAX;
91-
92-
assert!(
93-
values.len() <= SAMPLE_SIZE,
94-
"{} <= {}",
95-
values.len(),
96-
SAMPLE_SIZE
97-
);
68+
.collect_vec()
69+
});
9870

9971
for e in (0..Self::MAX_EXPONENT).rev() {
10072
for f in 0..e {
101-
let (_, encoded, _, exc_patches) =
102-
Self::encode(values, validity, Some(Exponents { e, f }))?;
73+
let (_, encoded, _, exc_patches) = Self::encode(
74+
sample.as_deref().unwrap_or(values),
75+
Some(Exponents { e, f }),
76+
);
10377

10478
let size = Self::estimate_encoded_size(&encoded, &exc_patches);
10579
if size < best_nbytes {
@@ -111,7 +85,7 @@ pub trait ALPFloat: private::Sealed + Float + Display + 'static {
11185
}
11286
}
11387

114-
Ok(best_exp)
88+
best_exp
11589
}
11690

11791
#[inline]
@@ -139,16 +113,11 @@ pub trait ALPFloat: private::Sealed + Float + Display + 'static {
139113
encoded_bytes + patch_bytes
140114
}
141115

142-
#[allow(clippy::type_complexity)]
143116
fn encode(
144117
values: &[Self],
145-
validity: &Validity,
146118
exponents: Option<Exponents>,
147-
) -> VortexResult<(Exponents, Buffer<Self::ALPInt>, Buffer<u64>, Buffer<Self>)> {
148-
let exponents = match exponents {
149-
Some(exponents) => exponents,
150-
None => Self::sampled_find_best_exponents(values, validity)?,
151-
};
119+
) -> (Exponents, Buffer<Self::ALPInt>, Buffer<u64>, Buffer<Self>) {
120+
let exp = exponents.unwrap_or_else(|| Self::find_best_exponents(values));
152121

153122
let mut encoded_output = BufferMut::<Self::ALPInt>::with_capacity(values.len());
154123
let mut patch_indices = BufferMut::<u64>::with_capacity(values.len());
@@ -161,21 +130,20 @@ pub trait ALPFloat: private::Sealed + Float + Display + 'static {
161130
for chunk in values.chunks(encode_chunk_size) {
162131
encode_chunk_unchecked(
163132
chunk,
164-
exponents,
133+
exp,
165134
&mut encoded_output,
166135
&mut patch_indices,
167136
&mut patch_values,
168137
&mut fill_value,
169-
validity,
170138
);
171139
}
172140

173-
Ok((
174-
exponents,
141+
(
142+
exp,
175143
encoded_output.freeze(),
176144
patch_indices.freeze(),
177145
patch_values.freeze(),
178-
))
146+
)
179147
}
180148

181149
#[inline]
@@ -224,7 +192,6 @@ fn encode_chunk_unchecked<T: ALPFloat>(
224192
patch_indices: &mut BufferMut<u64>,
225193
patch_values: &mut BufferMut<T>,
226194
fill_value: &mut Option<T::ALPInt>,
227-
validity: &Validity,
228195
) {
229196
let num_prev_encoded = encoded_output.len();
230197
let num_prev_patches = patch_indices.len();
@@ -258,13 +225,12 @@ fn encode_chunk_unchecked<T: ALPFloat>(
258225
// write() is only safe to call more than once because the values are primitive (i.e., Drop is a no-op)
259226
patch_indices_mut[chunk_patch_index].write(i as u64);
260227
patch_values_mut[chunk_patch_index].write(chunk[i - num_prev_encoded]);
261-
let is_valid_and_an_exception =
262-
(decoded != chunk[i - num_prev_encoded]) && validity.is_valid(i);
263-
chunk_patch_index += is_valid_and_an_exception as usize;
228+
chunk_patch_index += (decoded != chunk[i - num_prev_encoded]) as usize;
264229
}
230+
assert_eq!(chunk_patch_index, chunk_patch_count);
265231
unsafe {
266-
patch_indices.set_len(num_prev_patches + chunk_patch_index);
267-
patch_values.set_len(num_prev_patches + chunk_patch_index);
232+
patch_indices.set_len(num_prev_patches + chunk_patch_count);
233+
patch_values.set_len(num_prev_patches + chunk_patch_count);
268234
}
269235
}
270236

vortex-array/src/array/primitive/patch.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,9 @@ impl PrimitiveArray {
1515
let patch_indices = patch_indices.into_primitive()?;
1616
let patch_values = patch_values.into_primitive()?;
1717

18-
let patched_validity = match patch_values.validity() {
19-
Validity::NonNullable => self.validity(),
20-
patch_validity => {
21-
self.validity()
22-
.patch(self.len(), patch_indices.as_ref(), patch_validity)?
23-
}
24-
};
18+
let patched_validity =
19+
self.validity()
20+
.patch(self.len(), patch_indices.as_ref(), patch_values.validity())?;
2521

2622
match_each_integer_ptype!(patch_indices.ptype(), |$I| {
2723
match_each_native_ptype!(self.ptype(), |$T| {

0 commit comments

Comments
 (0)