Skip to content

Commit 9a614fb

Browse files
authored
Merge pull request #92 from hackmod/scryptjane-update
update scryptjane
2 parents c8dc692 + bb622d8 commit 9a614fb

15 files changed

+1688
-165
lines changed

src/scryptjane/scrypt-jane-mix_chacha-avx.h

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* x86 */
2-
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
2+
#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
33

44
#define SCRYPT_CHACHA_AVX
55

@@ -20,13 +20,33 @@ asm_naked_fn(scrypt_ChunkMix_avx)
2020
a2(shl edx,6)
2121
a2(lea ecx,[edx-64])
2222
a2(and eax, eax)
23-
a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
24-
a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
23+
a2(mov ebx, 0x01000302)
24+
a2(vmovd xmm4, ebx)
25+
a2(mov ebx, 0x05040706)
26+
a2(vmovd xmm0, ebx)
27+
a2(mov ebx, 0x09080b0a)
28+
a2(vmovd xmm1, ebx)
29+
a2(mov ebx, 0x0d0c0f0e)
30+
a2(vmovd xmm2, ebx)
31+
a2(mov ebx, 0x02010003)
32+
a2(vmovd xmm5, ebx)
33+
a2(mov ebx, 0x06050407)
34+
a2(vmovd xmm3, ebx)
35+
a2(mov ebx, 0x0a09080b)
36+
a2(vmovd xmm6, ebx)
37+
a2(mov ebx, 0x0e0d0c0f)
38+
a2(vmovd xmm7, ebx)
39+
a3(vpunpckldq xmm4, xmm4, xmm0)
40+
a3(vpunpckldq xmm5, xmm5, xmm3)
41+
a3(vpunpckldq xmm1, xmm1, xmm2)
42+
a3(vpunpckldq xmm6, xmm6, xmm7)
43+
a3(vpunpcklqdq xmm4, xmm4, xmm1)
44+
a3(vpunpcklqdq xmm5, xmm5, xmm6)
2545
a2(vmovdqa xmm0,[ecx+esi+0])
2646
a2(vmovdqa xmm1,[ecx+esi+16])
2747
a2(vmovdqa xmm2,[ecx+esi+32])
2848
a2(vmovdqa xmm3,[ecx+esi+48])
29-
a1(jz scrypt_ChunkMix_avx_no_xor1)
49+
aj(jz scrypt_ChunkMix_avx_no_xor1)
3050
a3(vpxor xmm0,xmm0,[ecx+eax+0])
3151
a3(vpxor xmm1,xmm1,[ecx+eax+16])
3252
a3(vpxor xmm2,xmm2,[ecx+eax+32])
@@ -40,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
4060
a3(vpxor xmm1,xmm1,[esi+ecx+16])
4161
a3(vpxor xmm2,xmm2,[esi+ecx+32])
4262
a3(vpxor xmm3,xmm3,[esi+ecx+48])
43-
a1(jz scrypt_ChunkMix_avx_no_xor2)
63+
aj(jz scrypt_ChunkMix_avx_no_xor2)
4464
a3(vpxor xmm0,xmm0,[eax+ecx+0])
4565
a3(vpxor xmm1,xmm1,[eax+ecx+16])
4666
a3(vpxor xmm2,xmm2,[eax+ecx+32])
@@ -71,7 +91,6 @@ asm_naked_fn(scrypt_ChunkMix_avx)
7191
a3(vpsrld xmm6,xmm1,25)
7292
a3(vpslld xmm1,xmm1,7)
7393
a3(vpxor xmm1,xmm1,xmm6)
74-
a2(sub eax,2)
7594
a3(vpaddd xmm0,xmm0,xmm1)
7695
a3(vpxor xmm3,xmm3,xmm0)
7796
a3(vpshufb xmm3,xmm3,xmm4)
@@ -85,13 +104,14 @@ asm_naked_fn(scrypt_ChunkMix_avx)
85104
a3(vpshufb xmm3,xmm3,xmm5)
86105
a3(vpshufd xmm0,xmm0,0x39)
87106
a3(vpaddd xmm2,xmm2,xmm3)
88-
a3(pshufd xmm3,xmm3,0x4e)
107+
a3(vpshufd xmm3,xmm3,0x4e)
89108
a3(vpxor xmm1,xmm1,xmm2)
90-
a3(pshufd xmm2,xmm2,0x93)
109+
a3(vpshufd xmm2,xmm2,0x93)
91110
a3(vpsrld xmm6,xmm1,25)
92111
a3(vpslld xmm1,xmm1,7)
93112
a3(vpxor xmm1,xmm1,xmm6)
94-
a1(ja scrypt_chacha_avx_loop)
113+
a2(sub eax,2)
114+
aj(ja scrypt_chacha_avx_loop)
95115
a3(vpaddd xmm0,xmm0,[esp+0])
96116
a3(vpaddd xmm1,xmm1,[esp+16])
97117
a3(vpaddd xmm2,xmm2,[esp+32])
@@ -108,21 +128,21 @@ asm_naked_fn(scrypt_ChunkMix_avx)
108128
a2(vmovdqa [eax+32],xmm2)
109129
a2(vmovdqa [eax+48],xmm3)
110130
a2(mov eax,[ebp+28])
111-
a1(jne scrypt_ChunkMix_avx_loop)
131+
aj(jne scrypt_ChunkMix_avx_loop)
112132
a2(mov esp,ebp)
113133
a1(pop ebp)
114134
a1(pop esi)
115135
a1(pop edi)
116136
a1(pop ebx)
117-
a1(ret 16)
137+
aret(16)
118138
asm_naked_fn_end(scrypt_ChunkMix_avx)
119139

120140
#endif
121141

122142

123143

124144
/* x64 */
125-
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
145+
#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
126146

127147
#define SCRYPT_CHACHA_AVX
128148

@@ -134,13 +154,21 @@ asm_naked_fn(scrypt_ChunkMix_avx)
134154
a2(lea rax,[rsi+r9])
135155
a2(lea r9,[rdx+r9])
136156
a2(and rdx, rdx)
137-
a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
138-
a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
139157
a2(vmovdqa xmm0,[rax+0])
140158
a2(vmovdqa xmm1,[rax+16])
141159
a2(vmovdqa xmm2,[rax+32])
142160
a2(vmovdqa xmm3,[rax+48])
143-
a1(jz scrypt_ChunkMix_avx_no_xor1)
161+
a2(mov r8, 0x0504070601000302)
162+
a2(mov rax, 0x0d0c0f0e09080b0a)
163+
a2(movd xmm4, r8)
164+
a2(movd xmm6, rax)
165+
a2(mov r8, 0x0605040702010003)
166+
a2(mov rax, 0x0e0d0c0f0a09080b)
167+
a2(movd xmm5, r8)
168+
a2(movd xmm7, rax)
169+
a3(vpunpcklqdq xmm4, xmm4, xmm6)
170+
a3(vpunpcklqdq xmm5, xmm5, xmm7)
171+
aj(jz scrypt_ChunkMix_avx_no_xor1)
144172
a3(vpxor xmm0,xmm0,[r9+0])
145173
a3(vpxor xmm1,xmm1,[r9+16])
146174
a3(vpxor xmm2,xmm2,[r9+32])
@@ -154,7 +182,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
154182
a3(vpxor xmm1,xmm1,[rsi+r9+16])
155183
a3(vpxor xmm2,xmm2,[rsi+r9+32])
156184
a3(vpxor xmm3,xmm3,[rsi+r9+48])
157-
a1(jz scrypt_ChunkMix_avx_no_xor2)
185+
aj(jz scrypt_ChunkMix_avx_no_xor2)
158186
a3(vpxor xmm0,xmm0,[rdx+r9+0])
159187
a3(vpxor xmm1,xmm1,[rdx+r9+16])
160188
a3(vpxor xmm2,xmm2,[rdx+r9+32])
@@ -185,7 +213,6 @@ asm_naked_fn(scrypt_ChunkMix_avx)
185213
a3(vpsrld xmm12,xmm1,25)
186214
a3(vpslld xmm1,xmm1,7)
187215
a3(vpxor xmm1,xmm1,xmm12)
188-
a2(sub rax,2)
189216
a3(vpaddd xmm0,xmm0,xmm1)
190217
a3(vpxor xmm3,xmm3,xmm0)
191218
a3(vpshufb xmm3,xmm3,xmm4)
@@ -199,13 +226,14 @@ asm_naked_fn(scrypt_ChunkMix_avx)
199226
a3(vpshufb xmm3,xmm3,xmm5)
200227
a3(vpshufd xmm0,xmm0,0x39)
201228
a3(vpaddd xmm2,xmm2,xmm3)
202-
a3(pshufd xmm3,xmm3,0x4e)
229+
a3(vpshufd xmm3,xmm3,0x4e)
203230
a3(vpxor xmm1,xmm1,xmm2)
204-
a3(pshufd xmm2,xmm2,0x93)
231+
a3(vpshufd xmm2,xmm2,0x93)
205232
a3(vpsrld xmm12,xmm1,25)
206233
a3(vpslld xmm1,xmm1,7)
207234
a3(vpxor xmm1,xmm1,xmm12)
208-
a1(ja scrypt_chacha_avx_loop)
235+
a2(sub rax,2)
236+
aj(ja scrypt_chacha_avx_loop)
209237
a3(vpaddd xmm0,xmm0,xmm8)
210238
a3(vpaddd xmm1,xmm1,xmm9)
211239
a3(vpaddd xmm2,xmm2,xmm10)
@@ -221,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
221249
a2(vmovdqa [rax+16],xmm1)
222250
a2(vmovdqa [rax+32],xmm2)
223251
a2(vmovdqa [rax+48],xmm3)
224-
a1(jne scrypt_ChunkMix_avx_loop)
252+
aj(jne scrypt_ChunkMix_avx_loop)
225253
a1(ret)
226254
asm_naked_fn_end(scrypt_ChunkMix_avx)
227255

@@ -233,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx)
233261

234262
#define SCRYPT_CHACHA_AVX
235263

236-
static void NOINLINE
264+
static void asm_calling_convention NOINLINE
237265
scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
238266
uint32_t i, blocksPerChunk = r * 2, half = 0;
239267
xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;

src/scryptjane/scrypt-jane-mix_chacha-sse2.h

Lines changed: 22 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* x86 */
2-
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
2+
#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
33

44
#define SCRYPT_CHACHA_SSE2
55

@@ -24,7 +24,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
2424
a2(movdqa xmm1,[ecx+esi+16])
2525
a2(movdqa xmm2,[ecx+esi+32])
2626
a2(movdqa xmm3,[ecx+esi+48])
27-
a1(jz scrypt_ChunkMix_sse2_no_xor1)
27+
aj(jz scrypt_ChunkMix_sse2_no_xor1)
2828
a2(pxor xmm0,[ecx+eax+0])
2929
a2(pxor xmm1,[ecx+eax+16])
3030
a2(pxor xmm2,[ecx+eax+32])
@@ -38,7 +38,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
3838
a2(pxor xmm1,[esi+ecx+16])
3939
a2(pxor xmm2,[esi+ecx+32])
4040
a2(pxor xmm3,[esi+ecx+48])
41-
a1(jz scrypt_ChunkMix_sse2_no_xor2)
41+
aj(jz scrypt_ChunkMix_sse2_no_xor2)
4242
a2(pxor xmm0,[eax+ecx+0])
4343
a2(pxor xmm1,[eax+ecx+16])
4444
a2(pxor xmm2,[eax+ecx+32])
@@ -52,10 +52,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
5252
a1(scrypt_chacha_sse2_loop: )
5353
a2(paddd xmm0,xmm1)
5454
a2(pxor xmm3,xmm0)
55-
a2(movdqa xmm6,xmm3)
56-
a2(pslld xmm3,16)
57-
a2(psrld xmm6,16)
58-
a2(pxor xmm3,xmm6)
55+
a3(pshuflw xmm3,xmm3,0xb1)
56+
a3(pshufhw xmm3,xmm3,0xb1)
5957
a2(paddd xmm2,xmm3)
6058
a2(pxor xmm1,xmm2)
6159
a2(movdqa xmm6,xmm1)
@@ -80,10 +78,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
8078
a2(sub eax,2)
8179
a2(paddd xmm0,xmm1)
8280
a2(pxor xmm3,xmm0)
83-
a2(movdqa xmm6,xmm3)
84-
a2(pslld xmm3,16)
85-
a2(psrld xmm6,16)
86-
a2(pxor xmm3,xmm6)
81+
a3(pshuflw xmm3,xmm3,0xb1)
82+
a3(pshufhw xmm3,xmm3,0xb1)
8783
a2(paddd xmm2,xmm3)
8884
a2(pxor xmm1,xmm2)
8985
a2(movdqa xmm6,xmm1)
@@ -105,7 +101,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
105101
a2(pslld xmm1,7)
106102
a2(psrld xmm6,25)
107103
a2(pxor xmm1,xmm6)
108-
a1(ja scrypt_chacha_sse2_loop)
104+
aj(ja scrypt_chacha_sse2_loop)
109105
a2(paddd xmm0,[esp+0])
110106
a2(paddd xmm1,xmm4)
111107
a2(paddd xmm2,xmm5)
@@ -122,21 +118,21 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
122118
a2(movdqa [eax+32],xmm2)
123119
a2(movdqa [eax+48],xmm3)
124120
a2(mov eax,[ebp+28])
125-
a1(jne scrypt_ChunkMix_sse2_loop)
121+
aj(jne scrypt_ChunkMix_sse2_loop)
126122
a2(mov esp,ebp)
127123
a1(pop ebp)
128124
a1(pop esi)
129125
a1(pop edi)
130126
a1(pop ebx)
131-
a1(ret 16)
127+
aret(16)
132128
asm_naked_fn_end(scrypt_ChunkMix_sse2)
133129

134130
#endif
135131

136132

137133

138134
/* x64 */
139-
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
135+
#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
140136

141137
#define SCRYPT_CHACHA_SSE2
142138

@@ -152,7 +148,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
152148
a2(movdqa xmm1,[rax+16])
153149
a2(movdqa xmm2,[rax+32])
154150
a2(movdqa xmm3,[rax+48])
155-
a1(jz scrypt_ChunkMix_sse2_no_xor1)
151+
aj(jz scrypt_ChunkMix_sse2_no_xor1)
156152
a2(pxor xmm0,[r9+0])
157153
a2(pxor xmm1,[r9+16])
158154
a2(pxor xmm2,[r9+32])
@@ -166,7 +162,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
166162
a2(pxor xmm1,[rsi+r9+16])
167163
a2(pxor xmm2,[rsi+r9+32])
168164
a2(pxor xmm3,[rsi+r9+48])
169-
a1(jz scrypt_ChunkMix_sse2_no_xor2)
165+
aj(jz scrypt_ChunkMix_sse2_no_xor2)
170166
a2(pxor xmm0,[rdx+r9+0])
171167
a2(pxor xmm1,[rdx+r9+16])
172168
a2(pxor xmm2,[rdx+r9+32])
@@ -180,10 +176,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
180176
a1(scrypt_chacha_sse2_loop: )
181177
a2(paddd xmm0,xmm1)
182178
a2(pxor xmm3,xmm0)
183-
a2(movdqa xmm6,xmm3)
184-
a2(pslld xmm3,16)
185-
a2(psrld xmm6,16)
186-
a2(pxor xmm3,xmm6)
179+
a3(pshuflw xmm3,xmm3,0xb1)
180+
a3(pshufhw xmm3,xmm3,0xb1)
187181
a2(paddd xmm2,xmm3)
188182
a2(pxor xmm1,xmm2)
189183
a2(movdqa xmm6,xmm1)
@@ -208,10 +202,8 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
208202
a2(sub rax,2)
209203
a2(paddd xmm0,xmm1)
210204
a2(pxor xmm3,xmm0)
211-
a2(movdqa xmm6,xmm3)
212-
a2(pslld xmm3,16)
213-
a2(psrld xmm6,16)
214-
a2(pxor xmm3,xmm6)
205+
a3(pshuflw xmm3,xmm3,0xb1)
206+
a3(pshufhw xmm3,xmm3,0xb1)
215207
a2(paddd xmm2,xmm3)
216208
a2(pxor xmm1,xmm2)
217209
a2(movdqa xmm6,xmm1)
@@ -233,7 +225,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
233225
a2(pslld xmm1,7)
234226
a2(psrld xmm6,25)
235227
a2(pxor xmm1,xmm6)
236-
a1(ja scrypt_chacha_sse2_loop)
228+
aj(ja scrypt_chacha_sse2_loop)
237229
a2(paddd xmm0,xmm8)
238230
a2(paddd xmm1,xmm9)
239231
a2(paddd xmm2,xmm10)
@@ -249,7 +241,7 @@ asm_naked_fn(scrypt_ChunkMix_sse2)
249241
a2(movdqa [rax+16],xmm1)
250242
a2(movdqa [rax+32],xmm2)
251243
a2(movdqa [rax+48],xmm3)
252-
a1(jne scrypt_ChunkMix_sse2_loop)
244+
aj(jne scrypt_ChunkMix_sse2_loop)
253245
a1(ret)
254246
asm_naked_fn_end(scrypt_ChunkMix_sse2)
255247

@@ -261,7 +253,7 @@ asm_naked_fn_end(scrypt_ChunkMix_sse2)
261253

262254
#define SCRYPT_CHACHA_SSE2
263255

264-
static void NOINLINE
256+
static void NOINLINE asm_calling_convention
265257
scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
266258
uint32_t i, blocksPerChunk = r * 2, half = 0;
267259
xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3;
@@ -308,7 +300,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
308300
x0 = _mm_add_epi32(x0, x1);
309301
x3 = _mm_xor_si128(x3, x0);
310302
x4 = x3;
311-
x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
303+
x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1);
312304
x2 = _mm_add_epi32(x2, x3);
313305
x1 = _mm_xor_si128(x1, x2);
314306
x4 = x1;
@@ -327,7 +319,7 @@ scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]
327319
x0 = _mm_add_epi32(x0, x1);
328320
x3 = _mm_xor_si128(x3, x0);
329321
x4 = x3;
330-
x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16));
322+
x3 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(x3, 0xb1), 0xb1);
331323
x2 = _mm_add_epi32(x2, x3);
332324
x1 = _mm_xor_si128(x1, x2);
333325
x4 = x1;

0 commit comments

Comments
 (0)