88#define posix_memalign (p , a , s ) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
99#endif
1010
11+
12+
// Shift a full 128-bit value left by one bit.
// SSE has no whole-register bit shift, so shift each 64-bit lane
// independently and carry the bit that falls off the top of the low
// lane into the bottom of the high lane.
static inline __m128i leftshift1(__m128i a) {
    __m128i lanes = _mm_slli_epi64(a, 1);                      // per-lane a << 1; each lane's top bit is dropped
    __m128i carry = _mm_slli_si128(_mm_srli_epi64(a, 63), 8);  // low lane's top bit, relocated to bit 64
    return _mm_or_si128(lanes, carry);
}
20+
// Shift a full 128-bit value left by two bits.
// Same lane-and-carry trick as leftshift1: shift both 64-bit lanes,
// then recover the two bits that dropped off the low lane and OR them
// into the bottom of the high lane.
static inline __m128i leftshift2(__m128i a) {
    __m128i lanes = _mm_slli_epi64(a, 2);                      // per-lane a << 2; top two bits of each lane lost
    __m128i carry = _mm_slli_si128(_mm_srli_epi64(a, 62), 8);  // low lane's top two bits, moved up one lane
    return _mm_or_si128(lanes, carry);
}
28+
1129//////////////////
1230// compute the "lazy" modulo with 2^127 + 2 + 1, actually we compute the
1331// modulo with (2^128 + 4 + 2) = 2 * (2^127 + 2 + 1) ,
@@ -31,38 +49,10 @@ static inline __m128i lazymod127(__m128i Alow, __m128i Ahigh) {
3149 // This is correct because the two highest bits of Ahigh are
3250 // assumed to be zero.
3351 ///////////////////////////////////////////////////
34- // We want to take Ahigh and compute ( Ahigh <<1 ) XOR ( Ahigh <<2 )
35- // Except that there is no way to shift an entire XMM register by 1 or 2 bits using a single instruction.
36- // So how do you compute Ahigh <<1 using as few instructions as possible?
37- //
38- // First you do _mm_slli_epi64(Ahigh,1). This is *almost* correct... except that
39- // the 64th bit is not shifted in 65th position.
40- // Well, ok, but we can compute Ahigh >> 8, this given by _mm_srli_si128(Ahigh,1)
41- // _mm_srli_si128(Ahigh,1) has the same bits as Ahigh (except that we lose the lowest 8)
42- // but at different positions.
43- // So let us shift left the result again...
44- // _mm_slli_epi64(_mm_srli_si128(Ahigh,1),1)
45- // If you keep track, this is "almost" equivalent to A >> 7, except that the 72th bit
46- // from A is lost.
47- // From something that is almost A >>7, we can get back something that is almost A << 1
48- // by shifting left by 8 bits...
49- // _mm_slli_si128(_mm_slli_epi64(_mm_srli_si128(Ahigh,1),1),1)
50- // So this is a shift left by 1, except that the 72th bit is lost along with the lowest 8 bits.
51- // We have that _mm_slli_epi64(Ahigh,1) is a shift let by 1 except that the 64th bit
52- // is lost. We can combine the two to get the desired result (just OR them).
53- // The algorithm below is just an optimized version of this where we do both shifts (by 1 and 2)
54- // at the same time and XOR the result.
55- //
56- __m128i shifteddownAhigh = _mm_srli_si128 (Ahigh ,1 );
57- __m128i s1 = _mm_slli_epi64 (Ahigh ,1 );
58- __m128i s2 = _mm_slli_epi64 (Ahigh ,2 );
59- __m128i sd1 = _mm_slli_si128 (_mm_slli_epi64 (shifteddownAhigh ,1 ),1 );
60- __m128i sd2 = _mm_slli_si128 (_mm_slli_epi64 (shifteddownAhigh ,2 ),1 );
61- s1 = _mm_or_si128 (s1 ,sd1 );
62- s2 = _mm_or_si128 (s2 ,sd2 );
63- __m128i reduced = _mm_xor_si128 (s1 ,s2 );
64- // combining results
65- __m128i final = _mm_xor_si128 (Alow ,reduced );
52+ // credit for simplified implementation : Jan Wassenberg
53+ __m128i shift1 = leftshift1 (Ahigh );
54+ __m128i shift2 = leftshift2 (Ahigh );
55+ __m128i final = _mm_xor_si128 (_mm_xor_si128 (Alow , shift1 ),shift2 );
6656 return final ;
6757}
6858
0 commit comments