@@ -48,16 +48,15 @@ const SEQUENCES: usize = 18 * 18 * 18 * 18;
4848
4949#[ inline( always) ]
5050unsafe fn vmod10 ( a : __m256i ) -> __m256i {
51- // Algo from LLVM
52- let prod02 = _mm256_mul_epu32 ( a, _mm256_set1_epi32 ( 3435973837u32 as i32 ) ) ;
53- let prod13 = _mm256_mul_epu32 (
54- _mm256_shuffle_epi32 :: < 0xf5 > ( a) ,
51+ let ab_hm = _mm256_mul_epu32 (
52+ _mm256_srli_epi64 :: < 32 > ( a) ,
5553 _mm256_set1_epi32 ( 3435973837u32 as i32 ) ,
5654 ) ;
57- let d = _mm256_unpackhi_epi64 (
58- _mm256_unpacklo_epi32 ( prod02, prod13) ,
59- _mm256_unpackhi_epi32 ( prod02, prod13) ,
60- ) ;
55+ let ab_hm = _mm256_and_si256 ( ab_hm, _mm256_set1_epi64x ( 0xFFFFFFFF00000000u64 as i64 ) ) ;
56+ let ab_lm =
57+ _mm256_srli_epi64 :: < 32 > ( _mm256_mul_epu32 ( a, _mm256_set1_epi32 ( 3435973837u32 as i32 ) ) ) ;
58+
59+ let d = _mm256_or_si256 ( ab_lm, ab_hm) ;
6160
6261 let d = _mm256_srli_epi32 :: < 3 > ( d) ;
6362 let c = _mm256_mullo_epi32 ( d, _mm256_set1_epi32 ( 10 ) ) ;
@@ -67,15 +66,14 @@ unsafe fn vmod10(a: __m256i) -> __m256i {
6766#[ inline( always) ]
6867unsafe fn vmod104976 ( a : __m256i ) -> __m256i {
6968 // Algo from LLVM
70- let prod02 = _mm256_mul_epu32 ( a, _mm256_set1_epi32 ( 2681326939u32 as i32 ) ) ;
71- let prod13 = _mm256_mul_epu32 (
72- _mm256_shuffle_epi32 :: < 0xf5 > ( a) ,
69+ let ab_hm = _mm256_mul_epu32 (
70+ _mm256_srli_epi64 :: < 32 > ( a) ,
7371 _mm256_set1_epi32 ( 2681326939u32 as i32 ) ,
7472 ) ;
75- let d = _mm256_unpackhi_epi64 (
76- _mm256_unpacklo_epi32 ( prod02 , prod13 ) ,
77- _mm256_unpackhi_epi32 ( prod02 , prod13 ) ,
78- ) ;
73+ let ab_hm = _mm256_and_si256 ( ab_hm , _mm256_set1_epi64x ( 0xFFFFFFFF00000000u64 as i64 ) ) ;
74+ let ab_lm =
75+ _mm256_srli_epi64 :: < 32 > ( _mm256_mul_epu32 ( a , _mm256_set1_epi32 ( 2681326939u32 as i32 ) ) ) ;
76+ let d = _mm256_or_si256 ( ab_lm , ab_hm ) ;
7977
8078 let d = _mm256_srli_epi32 :: < 16 > ( d) ;
8179 let c = _mm256_mullo_epi32 ( d, _mm256_set1_epi32 ( 104976 ) ) ;
0 commit comments