Skip to content

Commit 5e6abcf

Browse files
committed
With cleaner lazymod127
1 parent 7b0d46b commit 5e6abcf

File tree

1 file changed

+22
-32
lines changed

1 file changed

+22
-32
lines changed

src/clhash.c

Lines changed: 22 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,24 @@
88
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
99
#endif
1010

11+
12+
13+
// computes a << 1
14+
static inline __m128i leftshift1(__m128i a) {
15+
const int x = 1;
16+
__m128i u64shift = _mm_slli_epi64(a,x);
17+
__m128i topbits = _mm_slli_si128(_mm_srli_epi64(a,64 - x),sizeof(uint64_t));
18+
return _mm_or_si128(u64shift, topbits);
19+
}
20+
21+
// computes a << 2
22+
static inline __m128i leftshift2(__m128i a) {
23+
const int x = 2;
24+
__m128i u64shift = _mm_slli_epi64(a,x);
25+
__m128i topbits = _mm_slli_si128(_mm_srli_epi64(a,64 - x),sizeof(uint64_t));
26+
return _mm_or_si128(u64shift, topbits);
27+
}
28+
1129
//////////////////
1230
// compute the "lazy" modulo with 2^127 + 2 + 1, actually we compute the
1331
// modulo with (2^128 + 4 + 2) = 2 * (2^127 + 2 + 1) ,
@@ -31,38 +49,10 @@ static inline __m128i lazymod127(__m128i Alow, __m128i Ahigh) {
3149
// This is correct because the two highest bits of Ahigh are
3250
// assumed to be zero.
3351
///////////////////////////////////////////////////
34-
// We want to take Ahigh and compute ( Ahigh <<1 ) XOR ( Ahigh <<2 )
35-
// Except that there is no way to shift an entire XMM register by 1 or 2 bits using a single instruction.
36-
// So how do you compute Ahigh <<1 using as few instructions as possible?
37-
//
38-
// First you do _mm_slli_epi64(Ahigh,1). This is *almost* correct... except that
39-
// the 64th bit is not shifted in 65th position.
40-
// Well, ok, but we can compute Ahigh >> 8, this is given by _mm_srli_si128(Ahigh,1)
41-
// _mm_srli_si128(Ahigh,1) has the same bits as Ahigh (except that we lose the lowest 8)
42-
// but at different positions.
43-
// So let us shift left the result again...
44-
// _mm_slli_epi64(_mm_srli_si128(Ahigh,1),1)
45-
// If you keep track, this is "almost" equivalent to A >> 7, except that the 72nd bit
46-
// from A is lost.
47-
// From something that is almost A >>7, we can get back something that is almost A << 1
48-
// by shifting left by 8 bits...
49-
// _mm_slli_si128(_mm_slli_epi64(_mm_srli_si128(Ahigh,1),1),1)
50-
// So this is a shift left by 1, except that the 72nd bit is lost along with the lowest 8 bits.
51-
// We have that _mm_slli_epi64(Ahigh,1) is a shift left by 1 except that the 64th bit
52-
// is lost. We can combine the two to get the desired result (just OR them).
53-
// The algorithm below is just an optimized version of this where we do both shifts (by 1 and 2)
54-
// at the same time and XOR the result.
55-
//
56-
__m128i shifteddownAhigh = _mm_srli_si128(Ahigh,1);
57-
__m128i s1 = _mm_slli_epi64(Ahigh,1);
58-
__m128i s2 = _mm_slli_epi64(Ahigh,2);
59-
__m128i sd1 = _mm_slli_si128(_mm_slli_epi64(shifteddownAhigh,1),1);
60-
__m128i sd2 = _mm_slli_si128(_mm_slli_epi64(shifteddownAhigh,2),1);
61-
s1 = _mm_or_si128(s1,sd1);
62-
s2 = _mm_or_si128(s2,sd2);
63-
__m128i reduced = _mm_xor_si128(s1,s2);
64-
// combining results
65-
__m128i final = _mm_xor_si128(Alow,reduced);
52+
// credit for simplified implementation : Jan Wassenberg
53+
__m128i shift1 = leftshift1(Ahigh);
54+
__m128i shift2 = leftshift2(Ahigh);
55+
__m128i final = _mm_xor_si128(_mm_xor_si128(Alow, shift1),shift2);
6656
return final;
6757
}
6858

0 commit comments

Comments
 (0)