@@ -52,10 +52,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
 	a1(scrypt_chacha_sse2_loop: )
 	a2(paddd xmm0,xmm1)
 	a2(pxor xmm3,xmm0)
-	a2(movdqa xmm6,xmm3)
-	a2(pslld xmm3,16)
-	a2(psrld xmm6,16)
-	a2(pxor xmm3,xmm6)
+	a3(pshuflw xmm3,xmm3,0xb1)
+	a3(pshufhw xmm3,xmm3,0xb1)
 	a2(paddd xmm2,xmm3)
 	a2(pxor xmm1,xmm2)
 	a2(movdqa xmm6,xmm1)
@@ -80,10 +78,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
 	a2(sub eax,2)
 	a2(paddd xmm0,xmm1)
 	a2(pxor xmm3,xmm0)
-	a2(movdqa xmm6,xmm3)
-	a2(pslld xmm3,16)
-	a2(psrld xmm6,16)
-	a2(pxor xmm3,xmm6)
+	a3(pshuflw xmm3,xmm3,0xb1)
+	a3(pshufhw xmm3,xmm3,0xb1)
 	a2(paddd xmm2,xmm3)
 	a2(pxor xmm1,xmm2)
 	a2(movdqa xmm6,xmm1)
@@ -180,10 +176,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
 	a1(scrypt_chacha_sse2_loop: )
 	a2(paddd xmm0,xmm1)
 	a2(pxor xmm3,xmm0)
-	a2(movdqa xmm6,xmm3)
-	a2(pslld xmm3,16)
-	a2(psrld xmm6,16)
-	a2(pxor xmm3,xmm6)
+	a3(pshuflw xmm3,xmm3,0xb1)
+	a3(pshufhw xmm3,xmm3,0xb1)
 	a2(paddd xmm2,xmm3)
 	a2(pxor xmm1,xmm2)
 	a2(movdqa xmm6,xmm1)
@@ -208,10 +202,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
 	a2(sub rax,2)
 	a2(paddd xmm0,xmm1)
 	a2(pxor xmm3,xmm0)
-	a2(movdqa xmm6,xmm3)
-	a2(pslld xmm3,16)
-	a2(psrld xmm6,16)
-	a2(pxor xmm3,xmm6)
+	a3(pshuflw xmm3,xmm3,0xb1)
+	a3(pshufhw xmm3,xmm3,0xb1)
 	a2(paddd xmm2,xmm3)
 	a2(pxor xmm1,xmm2)
 	a2(movdqa xmm6,xmm1)
@@ -308,7 +300,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
 		x0 = _mm_add_epi32(x0, x1);
 		x3 = _mm_xor_si128(x3, x0);
 		x4 = x3;
-		x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
+		x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1);
 		x2 = _mm_add_epi32(x2, x3);
 		x1 = _mm_xor_si128(x1, x2);
 		x4 = x1;
@@ -327,7 +319,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
 		x0 = _mm_add_epi32(x0, x1);
 		x3 = _mm_xor_si128(x3, x0);
 		x4 = x3;
-		x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
+		x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1);
 		x2 = _mm_add_epi32(x2, x3);
 		x1 = _mm_xor_si128(x1, x2);
 		x4 = x1;
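Note (not part of the patch): rotating each 32-bit lane left by 16 is the same as swapping the lane's two 16-bit halves, so the four-instruction movdqa/pslld/psrld/pxor sequence can be replaced by pshuflw + pshufhw with control 0xb1 (word order 1,0,3,2), needing no scratch register. A minimal standalone sketch, with illustrative helper names, that checks the old and new intrinsic forms agree on a sample vector:

/* compile with any SSE2-capable compiler, e.g. gcc -msse2 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* old form: (x << 16) | (x >> 16) per 32-bit lane */
static __m128i rot16_shift(__m128i x) {
	return _mm_or_si128(_mm_slli_epi32(x, 16), _mm_srli_epi32(x, 16));
}

/* new form: swap the two 16-bit halves of each 32-bit lane */
static __m128i rot16_shuffle(__m128i x) {
	return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xb1), 0xb1);
}

int main(void) {
	uint32_t in[4] = { 0x01234567u, 0x89abcdefu, 0xdeadbeefu, 0x00000001u };
	uint32_t a[4], b[4];
	__m128i x = _mm_loadu_si128((const __m128i *)in);
	_mm_storeu_si128((__m128i *)a, rot16_shift(x));
	_mm_storeu_si128((__m128i *)b, rot16_shuffle(x));
	printf("%s\n", memcmp(a, b, sizeof a) == 0 ? "match" : "mismatch");
	return 0;
}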