Skip to content

Commit 3e7f585

Browse files
perf[fastlanes]: fused for bitpacking decode (#4955)
Signed-off-by: Joe Isaacs <[email protected]>
1 parent 978a947 commit 3e7f585

File tree

6 files changed

+245
-58
lines changed

6 files changed

+245
-58
lines changed

Cargo.lock

Lines changed: 1 addition & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ divan = { package = "codspeed-divan-compat", version = "4.0.4" }
111111
dyn-hash = "0.2.0"
112112
enum-iterator = "2.0.0"
113113
erased-serde = "0.4"
114-
fastlanes = "0.4.0"
114+
fastlanes = { rev = "3820eeb33433458fa035bea5a834dee0039612f5", git = "https://github.com/spiraldb/fastlanes.git" }
115115
flatbuffers = "25.2.10"
116116
fsst-rs = "0.5.2"
117117
futures = { version = "0.3.31", default-features = false }

encodings/fastlanes/src/bitpacking/compress.rs

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,15 @@ use vortex_array::vtable::ValidityHelper;
1212
use vortex_array::{IntoArray, ToCanonical};
1313
use vortex_buffer::{Buffer, BufferMut, ByteBuffer};
1414
use vortex_dtype::{
15-
IntegerPType, NativePType, PType, match_each_integer_ptype, match_each_unsigned_integer_ptype,
15+
IntegerPType, NativePType, PType, PhysicalPType, match_each_integer_ptype,
16+
match_each_unsigned_integer_ptype,
1617
};
1718
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
1819
use vortex_mask::{AllOr, Mask};
1920
use vortex_scalar::Scalar;
2021

2122
use crate::BitPackedArray;
22-
use crate::unpack_iter::BitPacked;
23+
use crate::unpack_iter::{BitPacked, UnpackStrategy};
2324

2425
pub fn bitpack_to_best_bit_width(array: &PrimitiveArray) -> VortexResult<BitPackedArray> {
2526
let bit_width_freq = bit_width_histogram(array)?;
@@ -284,6 +285,24 @@ where
284285
})
285286
}
286287

288+
/// BitPacking strategy - uses plain bitpacking without reference value
289+
pub struct BitPackingStrategy;
290+
291+
impl<T: PhysicalPType<Physical: BitPacking>> UnpackStrategy<T> for BitPackingStrategy {
292+
#[inline(always)]
293+
unsafe fn unpack_chunk(
294+
&self,
295+
bit_width: usize,
296+
chunk: &[T::Physical],
297+
dst: &mut [T::Physical],
298+
) {
299+
// SAFETY: Caller must ensure [`BitPacking::unchecked_unpack`] safety requirements hold.
300+
unsafe {
301+
BitPacking::unchecked_unpack(bit_width, chunk, dst);
302+
}
303+
}
304+
}
305+
287306
pub fn unpack(array: &BitPackedArray) -> PrimitiveArray {
288307
match_each_integer_ptype!(array.ptype(), |P| { unpack_primitive::<P>(array) })
289308
}
@@ -314,14 +333,7 @@ pub(crate) fn unpack_into<T: BitPacked>(
314333
}
315334

316335
let mut bit_packed_iter = array.unpacked_chunks();
317-
if let Some(header) = bit_packed_iter.initial() {
318-
uninit_range.copy_from_slice(0, header);
319-
}
320-
321-
let out_idx = bit_packed_iter.decode_full_chunks_into(&mut uninit_range);
322-
if let Some(trailer) = bit_packed_iter.trailer() {
323-
uninit_range.copy_from_slice(out_idx, trailer);
324-
}
336+
bit_packed_iter.decode_into(&mut uninit_range);
325337

326338
if let Some(patches) = array.patches() {
327339
apply_patches(&mut uninit_range, patches);
@@ -334,7 +346,15 @@ pub(crate) fn unpack_into<T: BitPacked>(
334346
}
335347
}
336348

337-
fn apply_patches<T: NativePType>(dst: &mut UninitRange<T>, patches: &Patches) {
349+
pub fn apply_patches<T: NativePType>(dst: &mut UninitRange<T>, patches: &Patches) {
350+
apply_patches_fn(dst, patches, |x| x)
351+
}
352+
353+
pub fn apply_patches_fn<T: NativePType, F: Fn(T) -> T>(
354+
dst: &mut UninitRange<T>,
355+
patches: &Patches,
356+
f: F,
357+
) {
338358
assert_eq!(patches.array_len(), dst.len());
339359

340360
let indices = patches.indices().to_primitive();
@@ -349,21 +369,23 @@ fn apply_patches<T: NativePType>(dst: &mut UninitRange<T>, patches: &Patches) {
349369
values,
350370
validity,
351371
patches.offset(),
372+
f,
352373
)
353374
});
354375
}
355376

356-
fn insert_values_and_validity_at_indices<T: NativePType, IndexT: IntegerPType>(
377+
fn insert_values_and_validity_at_indices<T: NativePType, IndexT: IntegerPType, F: Fn(T) -> T>(
357378
dst: &mut UninitRange<T>,
358379
indices: &[IndexT],
359380
values: &[T],
360381
values_validity: Mask,
361382
indices_offset: usize,
383+
f: F,
362384
) {
363385
match values_validity {
364386
Mask::AllTrue(_) => {
365387
for (index, &value) in indices.iter().zip_eq(values) {
366-
dst.set_value(index.as_() - indices_offset, value);
388+
dst.set_value(index.as_() - indices_offset, f(value));
367389
}
368390
}
369391
Mask::AllFalse(_) => {
@@ -374,7 +396,7 @@ fn insert_values_and_validity_at_indices<T: NativePType, IndexT: IntegerPType>(
374396
Mask::Values(vb) => {
375397
for (index, &value) in indices.iter().zip_eq(values) {
376398
let out_index = index.as_() - indices_offset;
377-
dst.set_value(out_index, value);
399+
dst.set_value(out_index, f(value));
378400
dst.set_validity_bit(out_index, vb.value(out_index));
379401
}
380402
}

encodings/fastlanes/src/bitpacking/unpack_iter.rs

Lines changed: 86 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,20 @@ use vortex_array::builders::UninitRange;
1313
use vortex_buffer::ByteBuffer;
1414
use vortex_dtype::PhysicalPType;
1515

16-
use crate::BitPackedArray;
16+
use crate::{BitPackedArray, BitPackingStrategy};
1717

1818
const CHUNK_SIZE: usize = 1024;
1919

20+
/// Strategy trait for fastlanes unpacking operations
21+
pub trait UnpackStrategy<T: PhysicalPType> {
22+
/// Unpack a chunk of packed data into the destination buffer
23+
///
24+
/// # Safety
25+
/// - `chunk` must contain exactly `elems_per_chunk` elements
26+
/// - `dst` must have exactly CHUNK_SIZE capacity
27+
unsafe fn unpack_chunk(&self, bit_width: usize, chunk: &[T::Physical], dst: &mut [T::Physical]);
28+
}
29+
2030
/// Accessor to unpacked chunks of bitpacked arrays
2131
///
2232
/// The usual pattern of usage should follow
@@ -47,7 +57,8 @@ const CHUNK_SIZE: usize = 1024;
4757
/// }
4858
/// ```
4959
///
50-
pub struct BitUnpackedChunks<T: BitPacked> {
60+
pub struct UnpackedChunks<T: PhysicalPType, S: UnpackStrategy<T>> {
61+
strategy: S,
5162
bit_width: usize,
5263
offset: usize,
5364
len: usize,
@@ -58,28 +69,60 @@ pub struct BitUnpackedChunks<T: BitPacked> {
5869
buffer: [MaybeUninit<T>; CHUNK_SIZE],
5970
}
6071

72+
pub type BitUnpackedChunks<T> = UnpackedChunks<T, BitPackingStrategy>;
73+
6174
impl<T: BitPacked> BitUnpackedChunks<T> {
6275
pub fn new(array: &BitPackedArray) -> Self {
63-
let offset = array.offset() as usize;
64-
let len = array.len();
65-
let bit_width = array.bit_width() as usize;
76+
Self::new_with_strategy(
77+
BitPackingStrategy,
78+
array.packed().clone(),
79+
array.bit_width() as usize,
80+
array.offset() as usize,
81+
array.len(),
82+
)
83+
}
84+
85+
pub fn full_chunks(&mut self) -> BitUnpackIterator<'_, T> {
86+
let elems_per_chunk = self.elems_per_chunk();
87+
let last_chunk_is_sliced = self.last_chunk_is_sliced() as usize;
88+
let first_chunk_is_sliced = self.first_chunk_is_sliced();
89+
BitUnpackIterator::new(
90+
buffer_as_slice(&self.packed),
91+
&mut self.buffer,
92+
self.bit_width,
93+
elems_per_chunk,
94+
self.num_chunks - last_chunk_is_sliced,
95+
first_chunk_is_sliced,
96+
)
97+
}
98+
}
99+
100+
impl<T: PhysicalPType, S: UnpackStrategy<T>> UnpackedChunks<T, S> {
101+
pub fn new_with_strategy(
102+
strategy: S,
103+
packed: ByteBuffer,
104+
bit_width: usize,
105+
offset: usize,
106+
len: usize,
107+
) -> Self {
66108
let elems_per_chunk = 128 * bit_width / size_of::<T>();
67109
let num_chunks = (offset + len).div_ceil(CHUNK_SIZE);
68110

69111
assert_eq!(
70-
array.packed().len() / size_of::<T>(),
112+
packed.len() / size_of::<T>(),
71113
num_chunks * elems_per_chunk,
72114
"Invalid packed length: got {}, expected {}",
73-
array.packed().len() / size_of::<T>(),
115+
packed.len() / size_of::<T>(),
74116
num_chunks * elems_per_chunk
75117
);
76118

77119
let last_chunk_length = (offset + len) % CHUNK_SIZE;
78120
Self {
121+
strategy,
79122
bit_width,
80123
offset,
81124
len,
82-
packed: array.packed().clone(),
125+
packed,
83126
buffer: [const { MaybeUninit::<T>::uninit() }; CHUNK_SIZE],
84127
num_chunks,
85128
last_chunk_length,
@@ -107,48 +150,52 @@ impl<T: BitPacked> BitUnpackedChunks<T> {
107150
// 1. chunk is elems_per_chunk.
108151
// 2. buffer is exactly CHUNK_SIZE.
109152
unsafe {
110-
BitPacking::unchecked_unpack(self.bit_width, chunk, dst);
153+
self.strategy.unpack_chunk(self.bit_width, chunk, dst);
111154
mem::transmute(&mut self.buffer[self.offset..][..header_end_slice])
112155
}
113156
})
114157
}
115158

116-
/// Iterator over complete chunks of this array
117-
pub fn full_chunks(&mut self) -> BitUnpackIterator<'_, T> {
118-
let elems_per_chunk = self.elems_per_chunk();
119-
let last_chunk_is_sliced = self.last_chunk_is_sliced() as usize;
120-
let first_chunk_is_sliced = self.first_chunk_is_sliced();
121-
BitUnpackIterator::new(
122-
buffer_as_slice(&self.packed),
123-
&mut self.buffer,
124-
self.bit_width,
125-
elems_per_chunk,
126-
self.num_chunks - last_chunk_is_sliced,
127-
first_chunk_is_sliced,
128-
)
159+
/// Decode all chunks (initial, full, and trailer) into the output range.
160+
/// This consolidates the logic for handling all three chunk types in one place.
161+
pub fn decode_into(&mut self, output: &mut UninitRange<T>) {
162+
let mut local_idx = 0;
163+
164+
// Handle initial partial chunk if present
165+
if let Some(initial) = self.initial() {
166+
output.copy_from_slice(0, initial);
167+
local_idx = initial.len();
168+
}
169+
170+
// Handle full chunks
171+
local_idx = self.decode_full_chunks_into_at(output, local_idx);
172+
173+
// Handle trailing partial chunk if present
174+
if let Some(trailer) = self.trailer() {
175+
output.copy_from_slice(local_idx, trailer);
176+
}
129177
}
130178

131-
/// Unpack full chunks into output range and return the next local index to write to.
132-
pub fn decode_full_chunks_into(&mut self, output: &mut UninitRange<T>) -> usize {
133-
let first_chunk_is_sliced = self.first_chunk_is_sliced();
134-
// If there's only one chunk and that chunk is sliced it has been handled already by
135-
// `header` method
136-
if first_chunk_is_sliced && self.num_chunks == 1 {
137-
// Return the length since the header already wrote everything.
138-
return CHUNK_SIZE - self.offset;
179+
/// Unpack full chunks into output range starting at the given index.
180+
/// Returns the next local index to write to.
181+
fn decode_full_chunks_into_at(
182+
&mut self,
183+
output: &mut UninitRange<T>,
184+
start_idx: usize,
185+
) -> usize {
186+
// If there's only one chunk it has been handled already by `initial` method
187+
if self.num_chunks == 1 {
188+
// Return the start_idx since initial already wrote everything.
189+
return start_idx;
139190
}
140191

192+
let first_chunk_is_sliced = self.first_chunk_is_sliced();
193+
141194
let last_chunk_is_sliced = self.last_chunk_is_sliced();
142195
let full_chunks_range =
143196
(first_chunk_is_sliced as usize)..(self.num_chunks - last_chunk_is_sliced as usize);
144197

145-
// Track position relative to the start of the UninitRange.
146-
let mut local_idx = if first_chunk_is_sliced {
147-
// The header already wrote from 0 to (CHUNK_SIZE - self.offset).
148-
CHUNK_SIZE - self.offset
149-
} else {
150-
0
151-
};
198+
let mut local_idx = start_idx;
152199

153200
let packed_slice: &[T::Physical] = buffer_as_slice(&self.packed);
154201
let elems_per_chunk = self.elems_per_chunk();
@@ -160,7 +207,7 @@ impl<T: BitPacked> BitUnpackedChunks<T> {
160207
let uninit_dst = output.slice_uninit_mut(local_idx, CHUNK_SIZE);
161208
// SAFETY: &[T] and &[MaybeUninit<T>] have the same layout
162209
let dst: &mut [T::Physical] = mem::transmute(uninit_dst);
163-
BitPacking::unchecked_unpack(self.bit_width, chunk, dst);
210+
self.strategy.unpack_chunk(self.bit_width, chunk, dst);
164211
}
165212
local_idx += CHUNK_SIZE;
166213
}
@@ -178,7 +225,7 @@ impl<T: BitPacked> BitUnpackedChunks<T> {
178225
// 1. chunk is elems_per_chunk.
179226
// 2. buffer is exactly CHUNK_SIZE.
180227
unsafe {
181-
BitPacking::unchecked_unpack(self.bit_width, chunk, dst);
228+
self.strategy.unpack_chunk(self.bit_width, chunk, dst);
182229
mem::transmute(&mut self.buffer[..self.last_chunk_length])
183230
}
184231
})

0 commit comments

Comments
 (0)