@@ -1,15 +1,20 @@
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
    #define X86ASM
+
    /* gcc 2.95 royally screws up stack alignments on variables */
-   #if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
+   #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
        #define X86ASM_SSE
        #define X86ASM_SSE2
    #endif
-   #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
+   #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
        #define X86ASM_SSSE3
    #endif
-   #if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
+   #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
        #define X86ASM_AVX
+       #define X86ASM_XOP
+   #endif
+   #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
+       #define X86ASM_AVX2
    #endif
#endif

@@ -21,6 +26,10 @@
    #endif
    #if (COMPILER_GCC >= 40400)
        #define X86_64ASM_AVX
+       #define X86_64ASM_XOP
+   #endif
+   #if (COMPILER_GCC >= 40700)
+       #define X86_64ASM_AVX2
    #endif
#endif

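Note: the COMPILER_MSVC_VS* thresholds used above are symbolic version constants rather than raw _MSC_VER numbers. Their definitions are not part of this diff; a plausible sketch, assuming they wrap _MSC_FULL_VER so that VS2010 SP1 (the first MSVC able to emit AVX and XOP) can be told apart from plain VS2010, might be:

/* illustrative values only; the real header may use a different scheme */
#define COMPILER_MSVC_VS6PP     121000000   /* VS6 with the Processor Pack */
#define COMPILER_MSVC_VS2005    140050727
#define COMPILER_MSVC_VS2010SP1 160040219
#define COMPILER_MSVC_VS2012    170000000
#define COMPILER_MSVC           _MSC_FULL_VER

The GCC comparisons need no such indirection: COMPILER_GCC is already a single ordered number (40700 corresponds to gcc 4.7.0, the first release with AVX2 support).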
@@ -32,9 +41,16 @@
    #if defined(CPU_X86_64) || defined(X86ASM_SSE2)
        #define X86_INTRINSIC_SSE2
    #endif
-   #if (COMPILER_MSVC >= 1400)
+   #if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
        #define X86_INTRINSIC_SSSE3
    #endif
+   #if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
+       #define X86_INTRINSIC_AVX
+       #define X86_INTRINSIC_XOP
+   #endif
+   #if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
+       #define X86_INTRINSIC_AVX2
+   #endif
#endif

#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
@@ -51,30 +67,45 @@
    #if defined(__AVX__)
        #define X86_INTRINSIC_AVX
    #endif
+   #if defined(__XOP__)
+       #define X86_INTRINSIC_XOP
+   #endif
+   #if defined(__AVX2__)
+       #define X86_INTRINSIC_AVX2
+   #endif
#endif

/* only use simd on windows (or SSE2 on gcc)! */
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
    #if defined(X86_INTRINSIC_SSE)
-       #define X86_INTRINSIC
        #include <mmintrin.h>
        #include <xmmintrin.h>
        typedef __m64 qmm;
        typedef __m128 xmm;
        typedef __m128d xmmd;
    #endif
    #if defined(X86_INTRINSIC_SSE2)
-       #define X86_INTRINSIC_SSE2
        #include <emmintrin.h>
        typedef __m128i xmmi;
    #endif
    #if defined(X86_INTRINSIC_SSSE3)
-       #define X86_INTRINSIC_SSSE3
        #include <tmmintrin.h>
    #endif
+   #if defined(X86_INTRINSIC_AVX)
+       #include <immintrin.h>
+   #endif
+   #if defined(X86_INTRINSIC_XOP)
+       #if defined(COMPILER_MSVC)
+           #include <intrin.h>
+       #else
+           #include <x86intrin.h>
+       #endif
+   #endif
+   #if defined(X86_INTRINSIC_AVX2)
+       typedef __m256i ymmi;
+   #endif
#endif

-
#if defined(X86_INTRINSIC_SSE2)
    typedef union packedelem8_t {
        uint8_t u[16];
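Note: the X86_INTRINSIC_AVX2 branch above introduces ymmi as the 256-bit counterpart of xmmi, and the XOP branch pulls in <x86intrin.h> (or <intrin.h> under MSVC) for the AMD XOP intrinsics. Nothing in this hunk shows the new type in use yet, so the following is a minimal, hypothetical illustration; the helper names are mine, only the intrinsics are standard:

#if defined(X86_INTRINSIC_AVX2)
/* eight 32-bit lanes per operation instead of four */
static ymmi
add_lanes8(ymmi a, ymmi b) {
    return _mm256_add_epi32(a, b);
}
#endif

#if defined(X86_INTRINSIC_XOP)
/* XOP has a true per-lane rotate, so no shift/shift/or dance is needed */
static xmmi
rotl32_xop(xmmi a) {
    return _mm_roti_epi32(a, 8);
}
#endif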
@@ -108,8 +139,8 @@
#endif

#if defined(X86_INTRINSIC_SSSE3)
-   static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
-   static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
+   static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
+   static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
#endif
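Note: MM16 was the old fixed 16-byte alignment macro; this change switches to the parameterized ALIGN(16) spelling. The two constants themselves are SSSE3 pshufb masks that rotate each 32-bit lane left by 16 and 8 bits respectively. A usage sketch, assuming the folded-out packedelem8 union also exposes the bytes as an xmmi member named v (an assumption; the union body is elided from this diff):

/* rotate each 32-bit lane left by 16 via a byte shuffle */
static xmmi
ssse3_rotl16(xmmi x) {
    return _mm_shuffle_epi8(x, ssse3_rotl16_32bit.v);
}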
/*
@@ -190,7 +221,9 @@ typedef enum cpu_flags_x86_t {
    cpu_ssse3 = 1 << 4,
    cpu_sse4_1 = 1 << 5,
    cpu_sse4_2 = 1 << 6,
-   cpu_avx = 1 << 7
+   cpu_avx = 1 << 7,
+   cpu_xop = 1 << 8,
+   cpu_avx2 = 1 << 9
} cpu_flags_x86;
typedef enum cpu_vendors_x86_t {
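Note: the two new bits extend cpu_flags_x86 while keeping its "higher bit means more capable" ordering, which is what lets callers pick the best implementation with a simple descending test. A hedged sketch of that pattern; the kernel selector below is a placeholder, not from this codebase:

extern void use_kernel(const char *name);   /* hypothetical dispatch hook */

static void
choose_kernel(size_t flags) {
    if (flags & cpu_avx2)      use_kernel("avx2");
    else if (flags & cpu_xop)  use_kernel("xop");
    else if (flags & cpu_avx)  use_kernel("avx");
    else if (flags & cpu_sse2) use_kernel("sse2");
}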
@@ -237,6 +270,7 @@ get_cpuid(x86_regs *regs, uint32_t flags) {

    asm_gcc()
        a1(push cpuid_bx)
+       a2(xor ecx, ecx)
        a1(cpuid)
        a2(mov [%1 + 0], eax)
        a2(mov [%1 + 4], ebx)
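Note: the added xor ecx, ecx is load-bearing, not cosmetic. CPUID leaf 7, queried below for the AVX2 bit, is subleaf-indexed through ECX, and its feature bits are only defined for subleaf 0, so ECX must be zeroed before executing cpuid. For comparison only, the same query through the <cpuid.h> helper available in reasonably recent GCC (not what this code uses) looks like:

#include <cpuid.h>

/* returns nonzero when CPUID.7.0:EBX bit 5 (AVX2) is set */
static int
has_avx2(void) {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0;
    return (ebx >> 5) & 1;
}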
@@ -273,7 +307,7 @@ detect_cpu(void) {
    union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
    cpu_vendors_x86 vendor = cpu_nobody;
    x86_regs regs;
-   uint32_t max_level;
+   uint32_t max_level, max_ext_level;
    size_t cpu_flags = 0;
    #if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
        uint64_t xgetbv_flags;
@@ -319,7 +353,22 @@ detect_cpu(void) {
    if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
    if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
    if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
-
+
+   if (cpu_flags & cpu_avx) {
+       if (max_level >= 7) {
+           get_cpuid(&regs, 7);
+           if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
+       }
+
+       get_cpuid(&regs, 0x80000000);
+       max_ext_level = regs.eax;
+       if (max_ext_level >= 0x80000001) {
+           get_cpuid(&regs, 0x80000001);
+           if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
+       }
+   }
+
+
    #if defined(SCRYPT_TEST_SPEED)
        cpu_flags &= cpu_detect_mask;
    #endif
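Note: nesting both new checks inside if (cpu_flags & cpu_avx) is deliberate. AVX2 and XOP operate on the same YMM register state as AVX, so they are only worth reporting once the earlier xgetbv test has confirmed the OS saves that state, and cpu_avx is set only in that case. Leaf 7 EBX bit 5 advertises AVX2; extended leaf 0x80000001 ECX bit 11 advertises XOP, hence the separate max_ext_level probe. A standalone sketch of the OS-state check that gates all of this, assuming OSXSAVE is set and a GCC-style assembler new enough to accept the xgetbv mnemonic:

#include <stdint.h>

/* XCR0 bit 1 = SSE state, bit 2 = YMM state; both must be OS-enabled */
static int
os_saves_ymm(void) {
    uint32_t eax, edx;
    __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
    return (eax & 0x6) == 0x6;
}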
@@ -330,7 +379,9 @@ detect_cpu(void) {
#if defined(SCRYPT_TEST_SPEED)
static const char *
get_top_cpuflag_desc(size_t flag) {
-   if (flag & cpu_avx) return "AVX";
+   if (flag & cpu_avx2) return "AVX2";
+   else if (flag & cpu_xop) return "XOP";
+   else if (flag & cpu_avx) return "AVX";
    else if (flag & cpu_sse4_2) return "SSE4.2";
    else if (flag & cpu_sse4_1) return "SSE4.1";
    else if (flag & cpu_ssse3) return "SSSE3";
@@ -343,6 +394,16 @@ get_top_cpuflag_desc(size_t flag) {

/* enable the highest system-wide option */
#if defined(SCRYPT_CHOOSE_COMPILETIME)
+   #if !defined(__AVX2__)
+       #undef X86_64ASM_AVX2
+       #undef X86ASM_AVX2
+       #undef X86_INTRINSIC_AVX2
+   #endif
+   #if !defined(__XOP__)
+       #undef X86_64ASM_XOP
+       #undef X86ASM_XOP
+       #undef X86_INTRINSIC_XOP
+   #endif
    #if !defined(__AVX__)
        #undef X86_64ASM_AVX
        #undef X86ASM_AVX