Skip to content

Commit bb622d8

Browse files
floodyberryhackmod
authored andcommitted
add xop,avx2 detection
1 parent 3fa94e8 commit bb622d8

File tree

4 files changed

+108
-24
lines changed

4 files changed

+108
-24
lines changed

src/scryptjane/scrypt-jane-portable-x86.h

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC))
22
#define X86ASM
3+
34
/* gcc 2.95 royally screws up stack alignments on variables */
4-
#if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
5+
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS6PP)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000)))
56
#define X86ASM_SSE
67
#define X86ASM_SSE2
78
#endif
8-
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
9+
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2005)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102)))
910
#define X86ASM_SSSE3
1011
#endif
11-
#if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
12+
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40400)))
1213
#define X86ASM_AVX
14+
#define X86ASM_XOP
15+
#endif
16+
#if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= COMPILER_MSVC_VS2012)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40700)))
17+
#define X86ASM_AVX2
1318
#endif
1419
#endif
1520

@@ -21,6 +26,10 @@
2126
#endif
2227
#if (COMPILER_GCC >= 40400)
2328
#define X86_64ASM_AVX
29+
#define X86_64ASM_XOP
30+
#endif
31+
#if (COMPILER_GCC >= 40700)
32+
#define X86_64ASM_AVX2
2433
#endif
2534
#endif
2635

@@ -32,9 +41,16 @@
3241
#if defined(CPU_X86_64) || defined(X86ASM_SSE2)
3342
#define X86_INTRINSIC_SSE2
3443
#endif
35-
#if (COMPILER_MSVC >= 1400)
44+
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2005)
3645
#define X86_INTRINSIC_SSSE3
3746
#endif
47+
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2010SP1)
48+
#define X86_INTRINSIC_AVX
49+
#define X86_INTRINSIC_XOP
50+
#endif
51+
#if (COMPILER_MSVC >= COMPILER_MSVC_VS2012)
52+
#define X86_INTRINSIC_AVX2
53+
#endif
3854
#endif
3955

4056
#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS)
@@ -51,30 +67,45 @@
5167
#if defined(__AVX__)
5268
#define X86_INTRINSIC_AVX
5369
#endif
70+
#if defined(__XOP__)
71+
#define X86_INTRINSIC_XOP
72+
#endif
73+
#if defined(__AVX2__)
74+
#define X86_INTRINSIC_AVX2
75+
#endif
5476
#endif
5577

5678
/* only use simd on windows (or SSE2 on gcc)! */
5779
#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC)
5880
#if defined(X86_INTRINSIC_SSE)
59-
#define X86_INTRINSIC
6081
#include <mmintrin.h>
6182
#include <xmmintrin.h>
6283
typedef __m64 qmm;
6384
typedef __m128 xmm;
6485
typedef __m128d xmmd;
6586
#endif
6687
#if defined(X86_INTRINSIC_SSE2)
67-
#define X86_INTRINSIC_SSE2
6888
#include <emmintrin.h>
6989
typedef __m128i xmmi;
7090
#endif
7191
#if defined(X86_INTRINSIC_SSSE3)
72-
#define X86_INTRINSIC_SSSE3
7392
#include <tmmintrin.h>
7493
#endif
94+
#if defined(X86_INTRINSIC_AVX)
95+
#include <immintrin.h>
96+
#endif
97+
#if defined(X86_INTRINSIC_XOP)
98+
#if defined(COMPILER_MSVC)
99+
#include <intrin.h>
100+
#else
101+
#include <x86intrin.h>
102+
#endif
103+
#endif
104+
#if defined(X86_INTRINSIC_AVX2)
105+
typedef __m256i ymmi;
106+
#endif
75107
#endif
76108

77-
78109
#if defined(X86_INTRINSIC_SSE2)
79110
typedef union packedelem8_t {
80111
uint8_t u[16];
@@ -108,8 +139,8 @@
108139
#endif
109140

110141
#if defined(X86_INTRINSIC_SSSE3)
111-
static const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
112-
static const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
142+
static const packedelem8 ALIGN(16) ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}};
143+
static const packedelem8 ALIGN(16) ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}};
113144
#endif
114145

115146
/*
@@ -190,7 +221,9 @@ typedef enum cpu_flags_x86_t {
190221
cpu_ssse3 = 1 << 4,
191222
cpu_sse4_1 = 1 << 5,
192223
cpu_sse4_2 = 1 << 6,
193-
cpu_avx = 1 << 7
224+
cpu_avx = 1 << 7,
225+
cpu_xop = 1 << 8,
226+
cpu_avx2 = 1 << 9
194227
} cpu_flags_x86;
195228

196229
typedef enum cpu_vendors_x86_t {
@@ -237,6 +270,7 @@ get_cpuid(x86_regs *regs, uint32_t flags) {
237270

238271
asm_gcc()
239272
a1(push cpuid_bx)
273+
a2(xor ecx, ecx)
240274
a1(cpuid)
241275
a2(mov [%1 + 0], eax)
242276
a2(mov [%1 + 4], ebx)
@@ -273,7 +307,7 @@ detect_cpu(void) {
273307
union { uint8_t s[12]; uint32_t i[3]; } vendor_string;
274308
cpu_vendors_x86 vendor = cpu_nobody;
275309
x86_regs regs;
276-
uint32_t max_level;
310+
uint32_t max_level, max_ext_level;
277311
size_t cpu_flags = 0;
278312
#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX)
279313
uint64_t xgetbv_flags;
@@ -319,7 +353,22 @@ detect_cpu(void) {
319353
if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2;
320354
if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse;
321355
if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx;
322-
356+
357+
if (cpu_flags & cpu_avx) {
358+
if (max_level >= 7) {
359+
get_cpuid(&regs, 7);
360+
if (regs.ebx & (1 << 5)) cpu_flags |= cpu_avx2;
361+
}
362+
363+
get_cpuid(&regs, 0x80000000);
364+
max_ext_level = regs.eax;
365+
if (max_ext_level >= 0x80000001) {
366+
get_cpuid(&regs, 0x80000001);
367+
if (regs.ecx & (1 << 11)) cpu_flags |= cpu_xop;
368+
}
369+
}
370+
371+
323372
#if defined(SCRYPT_TEST_SPEED)
324373
cpu_flags &= cpu_detect_mask;
325374
#endif
@@ -330,7 +379,9 @@ detect_cpu(void) {
330379
#if defined(SCRYPT_TEST_SPEED)
331380
static const char *
332381
get_top_cpuflag_desc(size_t flag) {
333-
if (flag & cpu_avx) return "AVX";
382+
if (flag & cpu_avx2) return "AVX2";
383+
else if (flag & cpu_xop) return "XOP";
384+
else if (flag & cpu_avx) return "AVX";
334385
else if (flag & cpu_sse4_2) return "SSE4.2";
335386
else if (flag & cpu_sse4_1) return "SSE4.1";
336387
else if (flag & cpu_ssse3) return "SSSE3";
@@ -343,6 +394,16 @@ get_top_cpuflag_desc(size_t flag) {
343394

344395
/* enable the highest system-wide option */
345396
#if defined(SCRYPT_CHOOSE_COMPILETIME)
397+
#if !defined(__AVX2__)
398+
#undef X86_64ASM_AVX2
399+
#undef X86ASM_AVX2
400+
#undef X86_INTRINSIC_AVX2
401+
#endif
402+
#if !defined(__XOP__)
403+
#undef X86_64ASM_XOP
404+
#undef X86ASM_XOP
405+
#undef X86_INTRINSIC_XOP
406+
#endif
346407
#if !defined(__AVX__)
347408
#undef X86_64ASM_AVX
348409
#undef X86ASM_AVX

src/scryptjane/scrypt-jane-portable.h

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,29 @@
3636

3737
/* determine compiler */
3838
#if defined(_MSC_VER)
39-
#define COMPILER_MSVC _MSC_VER
40-
#if ((COMPILER_MSVC > 1200) || defined(_mm_free))
41-
#define COMPILER_MSVC6PP_AND_LATER
39+
#define COMPILER_MSVC_VS6 120000000
40+
#define COMPILER_MSVC_VS6PP 121000000
41+
#define COMPILER_MSVC_VS2002 130000000
42+
#define COMPILER_MSVC_VS2003 131000000
43+
#define COMPILER_MSVC_VS2005 140050727
44+
#define COMPILER_MSVC_VS2008 150000000
45+
#define COMPILER_MSVC_VS2008SP1 150030729
46+
#define COMPILER_MSVC_VS2010 160000000
47+
#define COMPILER_MSVC_VS2010SP1 160040219
48+
#define COMPILER_MSVC_VS2012RC 170000000
49+
#define COMPILER_MSVC_VS2012 170050727
50+
51+
#if _MSC_FULL_VER > 100000000
52+
#define COMPILER_MSVC (_MSC_FULL_VER)
53+
#else
54+
#define COMPILER_MSVC (_MSC_FULL_VER * 10)
4255
#endif
43-
#if (COMPILER_MSVC >= 1500)
44-
#define COMPILER_HAS_TMMINTRIN
56+
57+
#if ((_MSC_VER == 1200) && defined(_mm_free))
58+
#undef COMPILER_MSVC
59+
#define COMPILER_MSVC COMPILER_MSVC_VS6PP
4560
#endif
46-
61+
4762
#pragma warning(disable : 4127) /* conditional expression is constant */
4863
#pragma warning(disable : 4100) /* unreferenced formal parameter */
4964

@@ -75,7 +90,7 @@
7590
#define STDCALL __stdcall
7691
#undef NAKED
7792
#define NAKED __declspec(naked)
78-
#define MM16 __declspec(align(16))
93+
#define ALIGN(n) __declspec(align(n))
7994
#endif
8095
#if defined(__ICC)
8196
#define COMPILER_INTEL
@@ -113,7 +128,7 @@
113128
#define CDECL __attribute__((cdecl))
114129
#undef STDCALL
115130
#define STDCALL __attribute__((stdcall))
116-
#define MM16 __attribute__((aligned(16)))
131+
#define ALIGN(n) __attribute__((aligned(n)))
117132
#include <stdint.h>
118133
#endif
119134
#if defined(__MINGW32__) || defined(__MINGW64__)

src/scryptjane/scrypt-jane-romix-basic.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,11 @@ static int
3434
scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) {
3535
/* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */
3636
const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS;
37-
scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
37+
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
38+
scrypt_mix_word_t ALIGN(32) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
39+
#else
40+
scrypt_mix_word_t ALIGN(16) chunk[2][4 * SCRYPT_BLOCK_WORDS], v;
41+
#endif
3842
uint8_t final[16];
3943
size_t i;
4044

src/scryptjane/scrypt-jane-romix-template.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@
1919
*/
2020
static void asm_calling_convention
2121
SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) {
22-
scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block;
22+
#if (defined(X86ASM_AVX2) || defined(X86_64ASM_AVX2) || defined(X86_INTRINSIC_AVX2))
23+
scrypt_mix_word_t ALIGN(32) X[SCRYPT_BLOCK_WORDS], *block;
24+
#else
25+
scrypt_mix_word_t ALIGN(16) X[SCRYPT_BLOCK_WORDS], *block;
26+
#endif
2327
uint32_t i, j, blocksPerChunk = r * 2, half = 0;
2428

2529
/* 1: X = B_{2r - 1} */

0 commit comments

Comments
 (0)