 /* x86 */
-#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
 
 #define SCRYPT_CHACHA_AVX
 
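For orientation (not part of the commit): the extra `!defined(CPU_X86_FORCE_INTRINSICS)` term means the hand-written assembly is compiled only when that macro is absent, so a build can force the C/intrinsics implementation near the end of this file. A simplified sketch of the resulting selection logic; the guard names come from the diff, `CHACHA_AVX_IMPL` is a hypothetical helper for illustration, and the real guards also carry the SCRYPT_CHOOSE_COMPILETIME clause omitted here:

/* Illustration only: which scrypt_ChunkMix_avx definition gets compiled. */
#if defined(X86ASM_AVX) && !defined(CPU_X86_FORCE_INTRINSICS)
    #define CHACHA_AVX_IMPL "x86 assembly"
#elif defined(X86_64ASM_AVX) && !defined(CPU_X86_FORCE_INTRINSICS)
    #define CHACHA_AVX_IMPL "x86-64 assembly"
#else
    #define CHACHA_AVX_IMPL "intrinsics / portable fallback"
#endif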
@@ -20,13 +20,33 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a2(shl edx,6)
 a2(lea ecx,[edx-64])
 a2(and eax,eax)
-a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
-a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
+a2(mov ebx,0x01000302)
+a2(vmovd xmm4,ebx)
+a2(mov ebx,0x05040706)
+a2(vmovd xmm0,ebx)
+a2(mov ebx,0x09080b0a)
+a2(vmovd xmm1,ebx)
+a2(mov ebx,0x0d0c0f0e)
+a2(vmovd xmm2,ebx)
+a2(mov ebx,0x02010003)
+a2(vmovd xmm5,ebx)
+a2(mov ebx,0x06050407)
+a2(vmovd xmm3,ebx)
+a2(mov ebx,0x0a09080b)
+a2(vmovd xmm6,ebx)
+a2(mov ebx,0x0e0d0c0f)
+a2(vmovd xmm7,ebx)
+a3(vpunpckldq xmm4,xmm4,xmm0)
+a3(vpunpckldq xmm5,xmm5,xmm3)
+a3(vpunpckldq xmm1,xmm1,xmm2)
+a3(vpunpckldq xmm6,xmm6,xmm7)
+a3(vpunpcklqdq xmm4,xmm4,xmm1)
+a3(vpunpcklqdq xmm5,xmm5,xmm6)
 a2(vmovdqa xmm0,[ecx+esi+0])
 a2(vmovdqa xmm1,[ecx+esi+16])
 a2(vmovdqa xmm2,[ecx+esi+32])
 a2(vmovdqa xmm3,[ecx+esi+48])
-a1(jz scrypt_ChunkMix_avx_no_xor1)
+aj(jz scrypt_ChunkMix_avx_no_xor1)
 a3(vpxor xmm0,xmm0,[ecx+eax+0])
 a3(vpxor xmm1,xmm1,[ecx+eax+16])
 a3(vpxor xmm2,xmm2,[ecx+eax+32])
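The added `mov ebx` / `vmovd` / `vpunpck*` sequence assembles, entirely in registers, the two byte-shuffle constants that were previously loaded from memory (`ssse3_rotl16_32bit`, `ssse3_rotl8_32bit`): xmm4 becomes a `pshufb` mask that rotates every 32-bit lane left by 16 bits, xmm5 one that rotates by 8 bits. A standalone C sketch (not project code; test values arbitrary) of the same constants and their effect:

/* Demonstrates that vpshufb with the masks built above rotates each
 * 32-bit lane left by 16 / 8 bits -- the ChaCha rotations used below. */
#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>                      /* SSSE3: _mm_shuffle_epi8 */

int main(void) {
    /* same dword values as the mov ebx,... sequence, lowest lane first */
    const __m128i rotl16 = _mm_set_epi32(0x0d0c0f0e, 0x09080b0a, 0x05040706, 0x01000302);
    const __m128i rotl8  = _mm_set_epi32(0x0e0d0c0f, 0x0a09080b, 0x06050407, 0x02010003);
    uint32_t in[4] = { 0x12345678, 0x9abcdef0, 0x00000001, 0x80000000 }, out[4];
    __m128i x = _mm_loadu_si128((const __m128i *)in);

    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(x, rotl16));
    printf("%08x <<< 16 = %08x\n", in[0], out[0]);   /* 12345678 -> 56781234 */
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(x, rotl8));
    printf("%08x <<<  8 = %08x\n", in[0], out[0]);   /* 12345678 -> 34567812 */
    return 0;
}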
@@ -40,7 +60,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a3(vpxor xmm1,xmm1,[esi+ecx+16])
 a3(vpxor xmm2,xmm2,[esi+ecx+32])
 a3(vpxor xmm3,xmm3,[esi+ecx+48])
-a1(jz scrypt_ChunkMix_avx_no_xor2)
+aj(jz scrypt_ChunkMix_avx_no_xor2)
 a3(vpxor xmm0,xmm0,[eax+ecx+0])
 a3(vpxor xmm1,xmm1,[eax+ecx+16])
 a3(vpxor xmm2,xmm2,[eax+ecx+32])
@@ -71,7 +91,6 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a3(vpsrld xmm6,xmm1,25)
 a3(vpslld xmm1,xmm1,7)
 a3(vpxor xmm1,xmm1,xmm6)
-a2(sub eax,2)
 a3(vpaddd xmm0,xmm0,xmm1)
 a3(vpxor xmm3,xmm3,xmm0)
 a3(vpshufb xmm3,xmm3,xmm4)
@@ -85,13 +104,14 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a3(vpshufb xmm3,xmm3,xmm5)
 a3(vpshufd xmm0,xmm0,0x39)
 a3(vpaddd xmm2,xmm2,xmm3)
-a3(pshufd xmm3,xmm3,0x4e)
+a3(vpshufd xmm3,xmm3,0x4e)
 a3(vpxor xmm1,xmm1,xmm2)
-a3(pshufd xmm2,xmm2,0x93)
+a3(vpshufd xmm2,xmm2,0x93)
 a3(vpsrld xmm6,xmm1,25)
 a3(vpslld xmm1,xmm1,7)
 a3(vpxor xmm1,xmm1,xmm6)
-a1(ja scrypt_chacha_avx_loop)
+a2(sub eax,2)
+aj(ja scrypt_chacha_avx_loop)
 a3(vpaddd xmm0,xmm0,[esp+0])
 a3(vpaddd xmm1,xmm1,[esp+16])
 a3(vpaddd xmm2,xmm2,[esp+32])
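For reference, the rotations being implemented in these hunks are the standard ChaCha quarter-round amounts 16, 12, 8 and 7: the 16- and 8-bit rotations use `vpshufb` against the xmm4/xmm5 masks, while 7 (and 12, in a hunk not shown) use the `vpsrld`/`vpslld`/`vpxor` shift pair. A plain-C reference sketch (not project code) of one quarter-round:

/* Scalar ChaCha quarter-round, for comparison with the vector code above. */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int n) {
    return (x << n) | (x >> (32 - n));
}

static void chacha_quarter(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);  /* vpshufb with the rotl16 mask (xmm4) */
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);  /* shift/shift/xor (hunk not shown)    */
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);   /* vpshufb with the rotl8 mask (xmm5)  */
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);   /* vpsrld 25 / vpslld 7 / vpxor        */
}

The diff also moves `a2(sub eax,2)` down next to `aj(ja scrypt_chacha_avx_loop)`; the intervening vector instructions do not modify EFLAGS, so both orderings behave the same, and the move reads as keeping the flag-setting instruction adjacent to the branch that consumes it.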
@@ -108,21 +128,21 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a2(vmovdqa [eax+32],xmm2)
 a2(vmovdqa [eax+48],xmm3)
 a2(mov eax,[ebp+28])
-a1(jne scrypt_ChunkMix_avx_loop)
+aj(jne scrypt_ChunkMix_avx_loop)
 a2(mov esp,ebp)
 a1(pop ebp)
 a1(pop esi)
 a1(pop edi)
 a1(pop ebx)
-a1(ret 16)
+aret(16)
 asm_naked_fn_end(scrypt_ChunkMix_avx)
 
 #endif
 
 
 
 /* x64 */
-#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED))
+#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) && !defined(CPU_X86_FORCE_INTRINSICS)
 
 #define SCRYPT_CHACHA_AVX
 
@@ -134,13 +154,21 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a2(lea rax,[rsi+r9])
 a2(lea r9,[rdx+r9])
 a2(and rdx,rdx)
-a2(vmovdqa xmm4,[ssse3_rotl16_32bit])
-a2(vmovdqa xmm5,[ssse3_rotl8_32bit])
 a2(vmovdqa xmm0,[rax+0])
 a2(vmovdqa xmm1,[rax+16])
 a2(vmovdqa xmm2,[rax+32])
 a2(vmovdqa xmm3,[rax+48])
-a1(jz scrypt_ChunkMix_avx_no_xor1)
+a2(mov r8,0x0504070601000302)
+a2(mov rax,0x0d0c0f0e09080b0a)
+a2(movd xmm4,r8)
+a2(movd xmm6,rax)
+a2(mov r8,0x0605040702010003)
+a2(mov rax,0x0e0d0c0f0a09080b)
+a2(movd xmm5,r8)
+a2(movd xmm7,rax)
+a3(vpunpcklqdq xmm4,xmm4,xmm6)
+a3(vpunpcklqdq xmm5,xmm5,xmm7)
+aj(jz scrypt_ChunkMix_avx_no_xor1)
 a3(vpxor xmm0,xmm0,[r9+0])
 a3(vpxor xmm1,xmm1,[r9+16])
 a3(vpxor xmm2,xmm2,[r9+32])
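The x86-64 version builds the same two shuffle masks from two 64-bit immediates each (`mov` into r8/rax, `movd` into an xmm register, then `vpunpcklqdq`) instead of eight 32-bit pieces; the 64-bit constants are simply the dword masks of the 32-bit path concatenated. A short illustrative check in C intrinsics (not project code):

/* Confirms the 64-bit immediates reassemble the rotl16 pshufb mask exactly;
 * the rotl8 mask works out the same way. */
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <emmintrin.h>   /* SSE2: _mm_set_epi32 / _mm_set_epi64x */

int main(void) {
    __m128i from_dwords = _mm_set_epi32(0x0d0c0f0e, 0x09080b0a, 0x05040706, 0x01000302);
    __m128i from_qwords = _mm_set_epi64x((int64_t)0x0d0c0f0e09080b0aULL,
                                         (int64_t)0x0504070601000302ULL);
    assert(memcmp(&from_dwords, &from_qwords, sizeof(__m128i)) == 0);
    return 0;
}

Note that `a2(movd xmm4,r8)` with a 64-bit general register is the 64-bit (movq) form of the move in most assemblers' syntax.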
@@ -154,7 +182,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a3(vpxor xmm1,xmm1,[rsi+r9+16])
 a3(vpxor xmm2,xmm2,[rsi+r9+32])
 a3(vpxor xmm3,xmm3,[rsi+r9+48])
-a1(jz scrypt_ChunkMix_avx_no_xor2)
+aj(jz scrypt_ChunkMix_avx_no_xor2)
 a3(vpxor xmm0,xmm0,[rdx+r9+0])
 a3(vpxor xmm1,xmm1,[rdx+r9+16])
 a3(vpxor xmm2,xmm2,[rdx+r9+32])
@@ -185,7 +213,6 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a3(vpsrld xmm12,xmm1,25)
 a3(vpslld xmm1,xmm1,7)
 a3(vpxor xmm1,xmm1,xmm12)
-a2(sub rax,2)
 a3(vpaddd xmm0,xmm0,xmm1)
 a3(vpxor xmm3,xmm3,xmm0)
 a3(vpshufb xmm3,xmm3,xmm4)
@@ -199,13 +226,14 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a3(vpshufb xmm3,xmm3,xmm5)
 a3(vpshufd xmm0,xmm0,0x39)
 a3(vpaddd xmm2,xmm2,xmm3)
-a3(pshufd xmm3,xmm3,0x4e)
+a3(vpshufd xmm3,xmm3,0x4e)
 a3(vpxor xmm1,xmm1,xmm2)
-a3(pshufd xmm2,xmm2,0x93)
+a3(vpshufd xmm2,xmm2,0x93)
 a3(vpsrld xmm12,xmm1,25)
 a3(vpslld xmm1,xmm1,7)
 a3(vpxor xmm1,xmm1,xmm12)
-a1(ja scrypt_chacha_avx_loop)
+a2(sub rax,2)
+aj(ja scrypt_chacha_avx_loop)
 a3(vpaddd xmm0,xmm0,xmm8)
 a3(vpaddd xmm1,xmm1,xmm9)
 a3(vpaddd xmm2,xmm2,xmm10)
@@ -221,7 +249,7 @@ asm_naked_fn(scrypt_ChunkMix_avx)
 a2(vmovdqa [rax+16],xmm1)
 a2(vmovdqa [rax+32],xmm2)
 a2(vmovdqa [rax+48],xmm3)
-a1(jne scrypt_ChunkMix_avx_loop)
+aj(jne scrypt_ChunkMix_avx_loop)
 a1(ret)
 asm_naked_fn_end(scrypt_ChunkMix_avx)
 
@@ -233,7 +261,7 @@ asm_naked_fn_end(scrypt_ChunkMix_avx)
 
 #define SCRYPT_CHACHA_AVX
 
-static void NOINLINE
+static void asm_calling_convention NOINLINE
 scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) {
 	uint32_t i, blocksPerChunk = r * 2, half = 0;
 	xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3;
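The intrinsics fallback gains `asm_calling_convention` in its signature, presumably so that its calling convention matches the naked-asm versions (the x86 assembly above pops its four stack arguments itself via `aret(16)`). The macro's definition is not part of this diff; a purely hypothetical sketch of the kind of expansion it could have, with all guard names assumed for illustration:

/* Hypothetical sketch only -- not the project's actual definition. The idea:
 * pick a compiler-specific attribute so the C fallback and the hand-written
 * asm agree on argument passing and on who cleans the stack (ret 16 above). */
#if defined(COMPILER_MSVC) && defined(CPU_X86)
    #define asm_calling_convention __stdcall
#elif defined(COMPILER_GCC) && defined(CPU_X86)
    #define asm_calling_convention __attribute__((stdcall))
#else
    #define asm_calling_convention        /* x86-64 ABIs pass args in registers */
#endif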