Skip to content

Commit ecd0496

Browse files
committed
optimize take_scalar
Signed-off-by: Connor Tsui <[email protected]>
1 parent 5c5f7d1 commit ecd0496

File tree

1 file changed

+33
-1
lines changed
  • vortex-compute/src/take/slice

1 file changed

+33
-1
lines changed

vortex-compute/src/take/slice/mod.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
//! Take function implementations on slices.
55
66
use vortex_buffer::Buffer;
7+
use vortex_buffer::BufferMut;
78
use vortex_dtype::UnsignedPType;
89

910
use crate::take::Take;
@@ -45,5 +46,36 @@ impl<T: Copy, I: UnsignedPType> Take<[I]> for &[T] {
4546
)]
4647
#[inline]
4748
fn take_scalar<T: Copy, I: UnsignedPType>(buffer: &[T], indices: &[I]) -> Buffer<T> {
48-
indices.iter().map(|idx| buffer[idx.as_()]).collect()
49+
// NB: The simpler `indices.iter().map(|idx| buffer[idx.as_()]).collect()` generates suboptimal
50+
// assembly where the buffer length is repeatedly loaded from the stack on each iteration.
51+
52+
let mut result = BufferMut::with_capacity(indices.len());
53+
let ptr = result.spare_capacity_mut().as_mut_ptr().cast::<T>();
54+
55+
// This explicit loop with pointer writes keeps the length in a register and avoids per-element
56+
// capacity checks from `push()`.
57+
for (i, idx) in indices.iter().enumerate() {
58+
// SAFETY: We reserved `indices.len()` capacity, so `ptr.add(i)` is valid.
59+
unsafe { ptr.add(i).write(buffer[idx.as_()]) };
60+
}
61+
62+
// SAFETY: We just wrote exactly `indices.len()` elements.
63+
unsafe { result.set_len(indices.len()) };
64+
result.freeze()
65+
}
66+
67+
/// This is to help with inspecting assembly with cargo-show-asm (without it, the code is inlined).
68+
///
69+
/// Use this command:
70+
///
71+
/// ```sh
72+
/// cargo asm -p vortex-compute --lib 'vortex_compute::take::slice::__take_show_asm'
73+
/// ```
74+
///
75+
/// You can play around with the different type combinations of buffer and indices type (you can
76+
/// even try `u128` as the buffer type)!
77+
#[doc(hidden)]
78+
#[inline(never)]
79+
pub fn __take_show_asm(buffer: &[u32], indices: &[u64]) -> Buffer<u32> {
80+
take_scalar(buffer, indices)
4981
}

0 commit comments

Comments
 (0)