Commit 13fb041
authored
perf: improve `GenericByteBuilder::append_array` to use SIMD for extending the offsets (#8388)
# Which issue does this PR close?
N/A
# Rationale for this change
Just making things faster
# What changes are included in this PR?
Explained below
# Are these changes tested?
Existing tests
# Are there any user-facing changes?
Nope
------------
Changing from:
```rust
let mut intermediate = Vec::with_capacity(offsets.len() - 1);
for &offset in &offsets[1..] {
intermediate.push(offset + shift)
}
self.offsets_builder.extend_from_slice(&intermediate);
```
to:
```rust
self.offsets_builder.extend(
offsets[..offsets.len() - 1]
.iter()
.map(|&offset| offset + shift),
);
```
When looking at the assembly
> Used rustc 1.89.0 and compiler flags `-C opt-level=2 -C
target-feature=+avx2 -C codegen-units=1` in
[Godbolt](https://godbolt.org/)
you see that for the old code:
```rust
let mut intermediate = Vec::<T>::with_capacity(offsets.len() - 1);
for &offset in &offsets[1..] {
intermediate.push(offset + shift)
}
```
the assembly for the loop is:
```asm
.LBB3_22:
mov rbx, qword ptr [r13 + 8*rbp + 8]
add rbx, r15
cmp rbp, qword ptr [rsp]
jne .LBB3_25
mov rdi, rsp
lea rsi, [rip + .Lanon.da681cffc384a5add117668a344b291b.6]
call qword ptr [rip + alloc::raw_vec::RawVec<T,A>::grow_one::ha1b398ade64b0727@GOTPCREL]
mov r14, qword ptr [rsp + 8]
jmp .LBB3_25
.LBB3_25:
mov qword ptr [r14 + 8*rbp], rbx
inc rbp
mov qword ptr [rsp + 16], rbp
add r12, -8
je .LBB3_9
```
and for the new code:
```rust
self.offsets_builder.extend(
offsets[..offsets.len() - 1]
.iter()
.map(|&offset| offset + shift),
);
```
the assembly for the loop is:
```asm
.LBB2_7:
vpaddq ymm1, ymm0, ymmword ptr [r14 + 8*r9]
vpaddq ymm2, ymm0, ymmword ptr [r14 + 8*r9 + 32]
vpaddq ymm3, ymm0, ymmword ptr [r14 + 8*r9 + 64]
vpaddq ymm4, ymm0, ymmword ptr [r14 + 8*r9 + 96]
vmovdqu ymmword ptr [r8 + 8*r9 - 96], ymm1
vmovdqu ymmword ptr [r8 + 8*r9 - 64], ymm2
vmovdqu ymmword ptr [r8 + 8*r9 - 32], ymm3
vmovdqu ymmword ptr [r8 + 8*r9], ymm4
add r9, 16
cmp rdx, r9
jne .LBB2_7
cmp rbx, rdx
je .LBB2_12
```
which uses SIMD instructions.
<details>
<summary>The code that I wrote in GodBolt:</summary>
For the old code:
```rust
#[inline(always)]
fn extend_offsets<T: std::ops::Add<Output = T> + Copy + Default>(output: &mut Vec<T>, offsets: &[T], next_offset: T) {
assert_ne!(offsets.len(), 0);
let shift: T = next_offset + offsets[0];
let mut intermediate = Vec::<T>::with_capacity(offsets.len() - 1);
// Make it easier to find the loop in the assembly
let mut dummy = 0u64;
unsafe {
std::arch::asm!(
"# VECTORIZED_START
mov {}, 1",
out(reg) dummy,
options(nostack)
);
}
for &offset in &offsets[1..] {
intermediate.push(offset + shift)
}
// Make it easier to find the loop in the assembly
unsafe {
std::arch::asm!(
"# VECTORIZED_END
mov {}, 2",
out(reg) dummy,
options(nostack)
);
}
std::hint::black_box(dummy);
output.extend_from_slice(&intermediate);
}
#[no_mangle]
pub fn extend_offsets_usize(output: &mut Vec<usize>, offsets: &[usize], next_offset: usize) {
extend_offsets(output, offsets, next_offset);
}
```
And for the new code:
```rust
#[inline(always)]
fn extend_offsets<T: std::ops::Add<Output = T> + Copy + Default>(output: &mut Vec<T>, offsets: &[T], next_offset: T) {
assert_ne!(offsets.len(), 0);
let shift: T = next_offset + offsets[0];
output.extend(offsets[..(offsets.len() - 1)]
.iter()
.map(|&offset| offset + shift));
}
#[no_mangle]
pub fn extend_offsets_usize(output: &mut Vec<usize>, offsets: &[usize], next_offset: usize) {
extend_offsets(output, offsets, next_offset);
}
```
</details>

perf: improve `GenericByteBuilder::append_array` to use SIMD for extending the offsets (#8388)

1 parent de84ff5 · commit 13fb041
1 file changed
+2
-10
lines changed

| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
157 | 157 | | |
158 | 158 | | |
159 | 159 | | |
160 | | - | |
161 | | - | |
162 | | - | |
163 | | - | |
164 | | - | |
165 | | - | |
166 | | - | |
167 | | - | |
168 | | - | |
169 | | - | |
| 160 | + | |
| 161 | + | |
170 | 162 | | |
171 | 163 | | |
172 | 164 | | |
| |||
0 commit comments