@@ -27,20 +27,53 @@ pub unsafe fn mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
 
 #[target_feature(enable = "gfni", enable = "avx512vl")]
 pub unsafe fn mul_vec_by_scalar_then_add_into(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
-    let mut add_vec_iter = add_into_vec.chunks_exact_mut(2 * GF256_HALF_ORDER);
-    let mut mul_vec_iter = mul_vec.chunks_exact(2 * GF256_HALF_ORDER);
+    let mut add_vec_iter = add_into_vec.chunks_exact_mut(4 * 2 * GF256_HALF_ORDER);
+    let mut mul_vec_iter = mul_vec.chunks_exact(4 * 2 * GF256_HALF_ORDER);
 
     unsafe {
         let scalar_simd = _mm256_set1_epi8(scalar as i8);
 
         for (add_vec_chunk, mul_vec_chunk) in add_vec_iter.by_ref().zip(mul_vec_iter.by_ref()) {
-            let mul_vec_chunk_simd = _mm256_loadu_si256(mul_vec_chunk.as_ptr().cast());
-            let scaled_res = _mm256_gf2p8mul_epi8(mul_vec_chunk_simd, scalar_simd);
+            let (mul_vec_chunk0, mul_vec_chunk1, mul_vec_chunk2, mul_vec_chunk3) = {
+                let (chunk0, rest) = mul_vec_chunk.split_at_unchecked(2 * GF256_HALF_ORDER);
+                let (chunk1, rest) = rest.split_at_unchecked(2 * GF256_HALF_ORDER);
+                let (chunk2, chunk3) = rest.split_at_unchecked(2 * GF256_HALF_ORDER);
 
-            let add_vec_chunk_simd = _mm256_loadu_si256(add_vec_chunk.as_ptr().cast());
-            let accum_res = _mm256_xor_si256(add_vec_chunk_simd, scaled_res);
+                (chunk0, chunk1, chunk2, chunk3)
+            };
 
-            _mm256_storeu_si256(add_vec_chunk.as_mut_ptr().cast(), accum_res);
+            let mul_vec_chunk0_simd = _mm256_loadu_si256(mul_vec_chunk0.as_ptr().cast());
+            let mul_vec_chunk1_simd = _mm256_loadu_si256(mul_vec_chunk1.as_ptr().cast());
+            let mul_vec_chunk2_simd = _mm256_loadu_si256(mul_vec_chunk2.as_ptr().cast());
+            let mul_vec_chunk3_simd = _mm256_loadu_si256(mul_vec_chunk3.as_ptr().cast());
+
+            let scaled_res0 = _mm256_gf2p8mul_epi8(mul_vec_chunk0_simd, scalar_simd);
+            let scaled_res1 = _mm256_gf2p8mul_epi8(mul_vec_chunk1_simd, scalar_simd);
+            let scaled_res2 = _mm256_gf2p8mul_epi8(mul_vec_chunk2_simd, scalar_simd);
+            let scaled_res3 = _mm256_gf2p8mul_epi8(mul_vec_chunk3_simd, scalar_simd);
+
+            let (add_vec_chunk0, add_vec_chunk1, add_vec_chunk2, add_vec_chunk3) = {
+                let (chunk0, rest) = add_vec_chunk.split_at_mut_unchecked(2 * GF256_HALF_ORDER);
+                let (chunk1, rest) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER);
+                let (chunk2, chunk3) = rest.split_at_mut_unchecked(2 * GF256_HALF_ORDER);
+
+                (chunk0, chunk1, chunk2, chunk3)
+            };
+
+            let add_vec_chunk0_simd = _mm256_loadu_si256(add_vec_chunk0.as_ptr().cast());
+            let add_vec_chunk1_simd = _mm256_loadu_si256(add_vec_chunk1.as_ptr().cast());
+            let add_vec_chunk2_simd = _mm256_loadu_si256(add_vec_chunk2.as_ptr().cast());
+            let add_vec_chunk3_simd = _mm256_loadu_si256(add_vec_chunk3.as_ptr().cast());
+
+            let accum_res0 = _mm256_xor_si256(add_vec_chunk0_simd, scaled_res0);
+            let accum_res1 = _mm256_xor_si256(add_vec_chunk1_simd, scaled_res1);
+            let accum_res2 = _mm256_xor_si256(add_vec_chunk2_simd, scaled_res2);
+            let accum_res3 = _mm256_xor_si256(add_vec_chunk3_simd, scaled_res3);
+
+            _mm256_storeu_si256(add_vec_chunk0.as_mut_ptr().cast(), accum_res0);
+            _mm256_storeu_si256(add_vec_chunk1.as_mut_ptr().cast(), accum_res1);
+            _mm256_storeu_si256(add_vec_chunk2.as_mut_ptr().cast(), accum_res2);
+            _mm256_storeu_si256(add_vec_chunk3.as_mut_ptr().cast(), accum_res3);
         }
     }
 
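For reference, the unrolled loop computes the same GF(2^8) fused multiply-accumulate as before, just 128 bytes per iteration instead of 32: add_into_vec[i] ^= mul_vec[i] * scalar, where multiplication is carry-less multiplication reduced by the polynomial x^8 + x^4 + x^3 + x + 1 that GF2P8MULB uses, and XOR is addition in the field. A minimal scalar sketch of those semantics; gf256_mul and the _ref-suffixed function are hypothetical names for illustration, not part of this commit:

/// Scalar reference for what the SIMD loop computes:
/// add_into_vec[i] ^= mul_vec[i] * scalar in GF(2^8).
fn mul_vec_by_scalar_then_add_into_ref(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
    for (a, m) in add_into_vec.iter_mut().zip(mul_vec) {
        *a ^= gf256_mul(*m, scalar); // XOR is addition in GF(2^8)
    }
}

/// Bitwise Russian-peasant multiply in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1,
/// the same reduction polynomial the GFNI instruction applies.
fn gf256_mul(mut a: u8, mut b: u8) -> u8 {
    let mut acc = 0u8;
    while b != 0 {
        if b & 1 != 0 {
            acc ^= a;
        }
        let carry = a & 0x80;
        a <<= 1;
        if carry != 0 {
            a ^= 0x1B; // fold the overflow bit back in via 0x11B
        }
        b >>= 1;
    }
    acc
}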
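Because the function is unsafe and gated on #[target_feature], a caller is expected to prove CPU support before dispatching to it. A hypothetical call site, assuming a toolchain where is_x86_feature_detected! accepts the "gfni" and "avx512vl" strings, with the gf256_mul sketch above as the portable fallback:

fn accumulate_scaled(dst: &mut [u8], src: &[u8], scalar: u8) {
    assert_eq!(dst.len(), src.len());
    if is_x86_feature_detected!("gfni") && is_x86_feature_detected!("avx512vl") {
        // SAFETY: the required CPU features were verified at runtime above.
        unsafe { mul_vec_by_scalar_then_add_into(dst, src, scalar) };
    } else {
        // Portable scalar fallback with identical semantics.
        for (a, m) in dst.iter_mut().zip(src) {
            *a ^= gf256_mul(*m, scalar);
        }
    }
}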