Skip to content

Commit ccf3a19

Browse files
committed
feat[fastlanes]: add AVX-512 VBMI transpose with 7.5x speedup
Add an AVX-512 VBMI-optimized transpose implementation using vpermi2b/vpermb for vectorized gather and scatter operations.

Performance improvements:
- VBMI: 13.6 cycles/call (7.5x faster than avx512_gfni at 102.6 cycles)
- VBMI: 240x faster than baseline (3276 cycles)

Key optimizations:
- Use vpermi2b to gather 8 bytes at stride-16 in parallel
- Use vpermb for 8x8 byte transpose during scatter phase
- Static permutation tables to avoid stack allocation

Also adds:
- Dual-block transpose_1024x2_avx512 for batch processing
- VBMI detection via has_vbmi() function
- Updated dispatch to prefer VBMI when available

Signed-off-by: Claude <noreply@anthropic.com>
1 parent 7282427 commit ccf3a19

File tree

2 files changed

+645
-0
lines changed

2 files changed

+645
-0
lines changed

encodings/fastlanes/examples/perf_transpose.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ fn run_all_benchmarks(input: &[u8; 128], output: &mut [u8; 128]) {
6363
"avx2",
6464
"avx2_gfni",
6565
"avx512_gfni",
66+
"avx512_vbmi",
67+
"avx512_dual",
6668
];
6769

6870
for mode in &modes {
@@ -119,6 +121,31 @@ fn run_benchmark(mode: &str, input: &[u8; 128], output: &mut [u8; 128]) {
119121
}
120122
}
121123
}
124+
#[cfg(target_arch = "x86_64")]
125+
"avx512_vbmi" => {
126+
use vortex_fastlanes::transpose::x86;
127+
if x86::has_vbmi() {
128+
unsafe {
129+
x86::transpose_1024_vbmi(black_box(input), black_box(output));
130+
}
131+
}
132+
}
133+
#[cfg(target_arch = "x86_64")]
134+
"avx512_dual" => {
135+
use vortex_fastlanes::transpose::x86;
136+
if x86::has_avx512() {
137+
let input2 = *input;
138+
let mut output2 = [0u8; 128];
139+
unsafe {
140+
x86::transpose_1024x2_avx512(
141+
black_box(input),
142+
black_box(&input2),
143+
black_box(output),
144+
black_box(&mut output2),
145+
);
146+
}
147+
}
148+
}
122149
_ => {}
123150
}
124151
}
@@ -198,6 +225,42 @@ fn run_benchmark(mode: &str, input: &[u8; 128], output: &mut [u8; 128]) {
198225
return;
199226
}
200227
}
228+
#[cfg(target_arch = "x86_64")]
229+
"avx512_vbmi" => {
230+
use vortex_fastlanes::transpose::x86;
231+
if x86::has_vbmi() {
232+
for _ in 0..MEASURE_ITERATIONS {
233+
unsafe {
234+
x86::transpose_1024_vbmi(black_box(input), black_box(output));
235+
}
236+
}
237+
} else {
238+
println!("{:15} AVX-512 VBMI not available", mode);
239+
return;
240+
}
241+
}
242+
#[cfg(target_arch = "x86_64")]
243+
"avx512_dual" => {
244+
use vortex_fastlanes::transpose::x86;
245+
if x86::has_avx512() {
246+
let input2 = *input;
247+
let mut output2 = [0u8; 128];
248+
// Note: we do MEASURE_ITERATIONS/2 since each call processes 2 blocks
249+
for _ in 0..MEASURE_ITERATIONS / 2 {
250+
unsafe {
251+
x86::transpose_1024x2_avx512(
252+
black_box(input),
253+
black_box(&input2),
254+
black_box(output),
255+
black_box(&mut output2),
256+
);
257+
}
258+
}
259+
} else {
260+
println!("{:15} AVX-512 not available", mode);
261+
return;
262+
}
263+
}
201264
_ => {
202265
println!("Unknown mode: {}", mode);
203266
return;

0 commit comments

Comments
 (0)