Skip to content

Commit 7424106

Browse files
committed
Optimize GFNI+AVX512VL implementation of SIMD GF2p8 arithmetic by instruction level parallelism (ILP)
Signed-off-by: Anjan Roy <[email protected]>
1 parent 300e9e9 commit 7424106

File tree

1 file changed

+40
-7
lines changed

1 file changed

+40
-7
lines changed

src/common/simd/x86/gfni/m256i.rs

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,53 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
2727

2828
#[target_feature(enable = "gfni", enable = "avx512vl")]
2929
pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
30-
let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
31-
let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER);
30+
let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * 2 * GF256_HALF_ORDER);
31+
let mut mul_vec_iter = mul_vec.chunks_exact(4 * 2 * GF256_HALF_ORDER);
3232

3333
unsafe {
3434
let scalar_simd = _mm256_set1_epi8(scalar as i8);
3535

3636
for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) {
37-
let mul_vec_chunk_simd = _mm256_loadu_si256(mul_vec_chunk.as_ptr().cast());
38-
let scaled_res = _mm256_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd);
37+
let (mul_vec_chunk0, mul_vec_chunk1, mul_vec_chunk2, mul_vec_chunk3) = {
38+
let (chunk0, rest) = mul_vec_chunk.split_at_unchecked(2 * GF256_HALF_ORDER);
39+
let (chunk1, rest) = rest.split_at_unchecked(2 * GF256_HALF_ORDER);
40+
let (chunk2, chunk3) = rest.split_at_unchecked(2 * GF256_HALF_ORDER);
3941

40-
let add_vec_chunk_simd = _mm256_loadu_si256(add_vec_chunk.as_ptr().cast());
41-
let accum_res = _mm256_xor_si256(add_vec_chunk_simd, scaled_res);
42+
(chunk0, chunk1, chunk2, chunk3)
43+
};
4244

43-
_mm256_storeu_si256(add_vec_chunk.as_mut_ptr().cast(), accum_res);
45+
let mul_vec_chunk0_simd = _mm256_loadu_si256(mul_vec_chunk0.as_ptr().cast());
46+
let mul_vec_chunk1_simd = _mm256_loadu_si256(mul_vec_chunk1.as_ptr().cast());
47+
let mul_vec_chunk2_simd = _mm256_loadu_si256(mul_vec_chunk2.as_ptr().cast());
48+
let mul_vec_chunk3_simd = _mm256_loadu_si256(mul_vec_chunk3.as_ptr().cast());
49+
50+
let scaled_res0 = _mm256_gf2p8mul_epi8(mul_vec_chunk0_simd, scalar_simd);
51+
let scaled_res1 = _mm256_gf2p8mul_epi8(mul_vec_chunk1_simd, scalar_simd);
52+
let scaled_res2 = _mm256_gf2p8mul_epi8(mul_vec_chunk2_simd, scalar_simd);
53+
let scaled_res3 = _mm256_gf2p8mul_epi8(mul_vec_chunk3_simd, scalar_simd);
54+
55+
let (add_vec_chunk0, add_vec_chunk1, add_vec_chunk2, add_vec_chunk3) = {
56+
let (chunk0, rest) = add_vec_chunk.split_at_mut_unchecked(2 * GF256_HALF_ORDER);
57+
let (chunk1, rest) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER);
58+
let (chunk2, chunk3) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER);
59+
60+
(chunk0, chunk1, chunk2, chunk3)
61+
};
62+
63+
let add_vec_chunk0_simd = _mm256_loadu_si256(add_vec_chunk0.as_ptr().cast());
64+
let add_vec_chunk1_simd = _mm256_loadu_si256(add_vec_chunk1.as_ptr().cast());
65+
let add_vec_chunk2_simd = _mm256_loadu_si256(add_vec_chunk2.as_ptr().cast());
66+
let add_vec_chunk3_simd = _mm256_loadu_si256(add_vec_chunk3.as_ptr().cast());
67+
68+
let accum_res0 = _mm256_xor_si256(add_vec_chunk0_simd, scaled_res0);
69+
let accum_res1 = _mm256_xor_si256(add_vec_chunk1_simd, scaled_res1);
70+
let accum_res2 = _mm256_xor_si256(add_vec_chunk2_simd, scaled_res2);
71+
let accum_res3 = _mm256_xor_si256(add_vec_chunk3_simd, scaled_res3);
72+
73+
_mm256_storeu_si256(add_vec_chunk0.as_mut_ptr().cast(), accum_res0);
74+
_mm256_storeu_si256(add_vec_chunk1.as_mut_ptr().cast(), accum_res1);
75+
_mm256_storeu_si256(add_vec_chunk2.as_mut_ptr().cast(), accum_res2);
76+
_mm256_storeu_si256(add_vec_chunk3.as_mut_ptr().cast(), accum_res3);
4477
}
4578
}
4679

0 commit comments

Comments
 (0)