|
4 | 4 | //! Take function implementations on slices. |
5 | 5 |
|
6 | 6 | use vortex_buffer::Buffer; |
| 7 | +use vortex_buffer::BufferMut; |
7 | 8 | use vortex_dtype::UnsignedPType; |
8 | 9 |
|
9 | 10 | use crate::take::Take; |
@@ -45,5 +46,36 @@ impl<T: Copy, I: UnsignedPType> Take<[I]> for &[T] { |
45 | 46 | )] |
46 | 47 | #[inline] |
47 | 48 | fn take_scalar<T: Copy, I: UnsignedPType>(buffer: &[T], indices: &[I]) -> Buffer<T> { |
48 | | - indices.iter().map(|idx| buffer[idx.as_()]).collect() |
 | 49 | + // NB: The simpler `indices.iter().map(|idx| buffer[idx.as_()]).collect()` generates suboptimal |
| 50 | + // assembly where the buffer length is repeatedly loaded from the stack on each iteration. |
| 51 | + |
| 52 | + let mut result = BufferMut::with_capacity(indices.len()); |
| 53 | + let ptr = result.spare_capacity_mut().as_mut_ptr().cast::<T>(); |
| 54 | + |
| 55 | + // This explicit loop with pointer writes keeps the length in a register and avoids per-element |
| 56 | + // capacity checks from `push()`. |
| 57 | + for (i, idx) in indices.iter().enumerate() { |
| 58 | + // SAFETY: We reserved `indices.len()` capacity, so `ptr.add(i)` is valid. |
| 59 | + unsafe { ptr.add(i).write(buffer[idx.as_()]) }; |
| 60 | + } |
| 61 | + |
| 62 | + // SAFETY: We just wrote exactly `indices.len()` elements. |
| 63 | + unsafe { result.set_len(indices.len()) }; |
| 64 | + result.freeze() |
| 65 | +} |
| 66 | + |
| 67 | +/// This is to help with inspecting assembly with cargo-show-asm (without it, the code is inlined). |
| 68 | +/// |
| 69 | +/// Use this command: |
| 70 | +/// |
| 71 | +/// ```sh |
| 72 | +/// cargo asm -p vortex-compute --lib 'vortex_compute::take::slice::__take_show_asm' |
| 73 | +/// ``` |
| 74 | +/// |
| 75 | +/// You can play around with the different type combinations of buffer and indices type (you can |
| 76 | +/// even try `u128` as the buffer type)! |
| 77 | +#[doc(hidden)] |
| 78 | +#[inline(never)] |
| 79 | +pub fn __take_show_asm(buffer: &[u32], indices: &[u64]) -> Buffer<u32> { |
| 80 | + take_scalar(buffer, indices) |
49 | 81 | } |
0 commit comments