Skip to content

Commit 45a5305

Browse files
floodyberryhackmod
authored and committed
use yuriy kaminskiy's trick for rotating 32 bits by 16
1 parent a6e85da commit 45a5305

File tree

1 file changed

+10
-18
lines changed

1 file changed

+10
-18
lines changed

src/scryptjane/scrypt-jane-mix_chacha-sse2.h

Lines changed: 10 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -52,10 +52,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
5252
a1(scrypt_chacha_sse2_loop: )
5353
a2(paddd xmm0,xmm1)
5454
a2(pxor xmm3,xmm0)
55-
a2(movdqa xmm6,xmm3)
56-
a2(pslld xmm3,16)
57-
a2(psrld xmm6,16)
58-
a2(pxor xmm3,xmm6)
55+
a3(pshuflw xmm3,xmm3,0xb1)
56+
a3(pshufhw xmm3,xmm3,0xb1)
5957
a2(paddd xmm2,xmm3)
6058
a2(pxor xmm1,xmm2)
6159
a2(movdqa xmm6,xmm1)
@@ -80,10 +78,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
8078
a2(sub eax,2)
8179
a2(paddd xmm0,xmm1)
8280
a2(pxor xmm3,xmm0)
83-
a2(movdqa xmm6,xmm3)
84-
a2(pslld xmm3,16)
85-
a2(psrld xmm6,16)
86-
a2(pxor xmm3,xmm6)
81+
a3(pshuflw xmm3,xmm3,0xb1)
82+
a3(pshufhw xmm3,xmm3,0xb1)
8783
a2(paddd xmm2,xmm3)
8884
a2(pxor xmm1,xmm2)
8985
a2(movdqa xmm6,xmm1)
@@ -180,10 +176,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
180176
a1(scrypt_chacha_sse2_loop: )
181177
a2(paddd xmm0,xmm1)
182178
a2(pxor xmm3,xmm0)
183-
a2(movdqa xmm6,xmm3)
184-
a2(pslld xmm3,16)
185-
a2(psrld xmm6,16)
186-
a2(pxor xmm3,xmm6)
179+
a3(pshuflw xmm3,xmm3,0xb1)
180+
a3(pshufhw xmm3,xmm3,0xb1)
187181
a2(paddd xmm2,xmm3)
188182
a2(pxor xmm1,xmm2)
189183
a2(movdqa xmm6,xmm1)
@@ -208,10 +202,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
208202
a2(sub rax,2)
209203
a2(paddd xmm0,xmm1)
210204
a2(pxor xmm3,xmm0)
211-
a2(movdqa xmm6,xmm3)
212-
a2(pslld xmm3,16)
213-
a2(psrld xmm6,16)
214-
a2(pxor xmm3,xmm6)
205+
a3(pshuflw xmm3,xmm3,0xb1)
206+
a3(pshufhw xmm3,xmm3,0xb1)
215207
a2(paddd xmm2,xmm3)
216208
a2(pxor xmm1,xmm2)
217209
a2(movdqa xmm6,xmm1)
@@ -308,7 +300,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
308300
x0 = _mm_add_epi32(x0, x1);
309301
x3 = _mm_xor_si128(x3, x0);
310302
x4 = x3;
311-
x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
303+
x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1);
312304
x2 = _mm_add_epi32(x2, x3);
313305
x1 = _mm_xor_si128(x1, x2);
314306
x4 = x1;
@@ -327,7 +319,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
327319
x0 = _mm_add_epi32(x0, x1);
328320
x3 = _mm_xor_si128(x3, x0);
329321
x4 = x3;
330-
x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
322+
x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1);
331323
x2 = _mm_add_epi32(x2, x3);
332324
x1 = _mm_xor_si128(x1, x2);
333325
x4 = x1;

0 commit comments

Comments
 (0)