Skip to content

Commit 3a3eabe

Browse files
committed
Merge #13386: SHA256 implementations based on Intel SHA Extensions
66b2cf1 Use immintrin.h everywhere for intrinsics (Pieter Wuille) 4c935e2 Add SHA256 implementation using using Intel SHA intrinsics (Pieter Wuille) 268400d [Refactor] CPU feature detection logic for SHA256 (Pieter Wuille) Pull request description: Based on #13191. This adds SHA256 implementations that use Intel's SHA Extension instructions (using intrinsics). This needs GCC 4.9 or Clang 3.4. In addition to #13191, two extra implementations are provided: * (a) A variable-length SHA256 implementation using SHA extensions. * (b) A 2-way 64-byte input double-SHA256 implementation using SHA extensions. Benchmarks for 9001-element Merkle tree root computation on an AMD Ryzen 1800X system: * Using generic C++ code (pre-#10821): 6.1ms * Using SSE4 (master, #10821): 4.6ms * Using 4-way SSE4 specialized for 64-byte inputs (#13191): 2.8ms * Using 8-way AVX2 specialized for 64-byte inputs (#13191): 2.1ms * Using 2-way SHA-NI specialized for 64-byte inputs (this PR): 0.56ms Benchmarks for 32-byte SHA256 on the same system: * Using SSE4 (master, #10821): 190ns * Using SHA-NI (this PR): 53ns Benchmarks for 1000000-byte SHA256 on the same system: * Using SSE4 (master, #10821): 2.5ms * Using SHA-NI (this PR): 0.51ms Tree-SHA512: 2b319e33b22579f815d91f9daf7994a5e1e799c4f73c13e15070dd54ba71f3f6438ccf77ae9cbd1ce76f972d9cbeb5f0edfea3d86f101bbc1055db70e42743b7
2 parents 7e74c54 + 66b2cf1 commit 3a3eabe

File tree

7 files changed

+464
-32
lines changed

7 files changed

+464
-32
lines changed

configure.ac

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ fi
320320
AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
321321
AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
322322
AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])
323+
AX_CHECK_COMPILE_FLAG([-msse4 -msha],[[SHANI_CXXFLAGS="-msse4 -msha"]],,[[$CXXFLAG_WERROR]])
323324

324325
TEMP_CXXFLAGS="$CXXFLAGS"
325326
CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS"
@@ -348,11 +349,7 @@ CXXFLAGS="$CXXFLAGS $SSE41_CXXFLAGS"
348349
AC_MSG_CHECKING(for SSE4.1 intrinsics)
349350
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
350351
#include <stdint.h>
351-
#if defined(_MSC_VER)
352352
#include <immintrin.h>
353-
#elif defined(__GNUC__)
354-
#include <x86intrin.h>
355-
#endif
356353
]],[[
357354
__m128i l = _mm_set1_epi32(0);
358355
return _mm_extract_epi32(l, 3);
@@ -367,11 +364,7 @@ CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS"
367364
AC_MSG_CHECKING(for AVX2 intrinsics)
368365
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
369366
#include <stdint.h>
370-
#if defined(_MSC_VER)
371367
#include <immintrin.h>
372-
#elif defined(__GNUC__) && defined(__AVX2__)
373-
#include <x86intrin.h>
374-
#endif
375368
]],[[
376369
__m256i l = _mm256_set1_epi32(0);
377370
return _mm256_extract_epi32(l, 7);
@@ -381,6 +374,23 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
381374
)
382375
CXXFLAGS="$TEMP_CXXFLAGS"
383376

377+
TEMP_CXXFLAGS="$CXXFLAGS"
378+
CXXFLAGS="$CXXFLAGS $SHANI_CXXFLAGS"
379+
AC_MSG_CHECKING(for SHA-NI intrinsics)
380+
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
381+
#include <stdint.h>
382+
#include <immintrin.h>
383+
]],[[
384+
__m128i i = _mm_set1_epi32(0);
385+
__m128i j = _mm_set1_epi32(1);
386+
__m128i k = _mm_set1_epi32(2);
387+
return _mm_extract_epi32(_mm_sha256rnds2_epu32(i, i, k), 0);
388+
]])],
389+
[ AC_MSG_RESULT(yes); enable_shani=yes; AC_DEFINE(ENABLE_SHANI, 1, [Define this symbol to build code that uses SHA-NI intrinsics]) ],
390+
[ AC_MSG_RESULT(no)]
391+
)
392+
CXXFLAGS="$TEMP_CXXFLAGS"
393+
384394
CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS"
385395

386396
AC_ARG_WITH([utils],
@@ -1309,6 +1319,7 @@ AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
13091319
AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
13101320
AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
13111321
AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
1322+
AM_CONDITIONAL([ENABLE_SHANI],[test x$enable_shani = xyes])
13121323
AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])
13131324

13141325
AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version])
@@ -1353,6 +1364,7 @@ AC_SUBST(SANITIZER_LDFLAGS)
13531364
AC_SUBST(SSE42_CXXFLAGS)
13541365
AC_SUBST(SSE41_CXXFLAGS)
13551366
AC_SUBST(AVX2_CXXFLAGS)
1367+
AC_SUBST(SHANI_CXXFLAGS)
13561368
AC_SUBST(LIBTOOL_APP_LDFLAGS)
13571369
AC_SUBST(USE_UPNP)
13581370
AC_SUBST(USE_QRCODE)

src/Makefile.am

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ if ENABLE_AVX2
5252
LIBBITCOIN_CRYPTO_AVX2 = crypto/libbitcoin_crypto_avx2.a
5353
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_AVX2)
5454
endif
55+
if ENABLE_SHANI
56+
LIBBITCOIN_CRYPTO_SHANI = crypto/libbitcoin_crypto_shani.a
57+
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_SHANI)
58+
endif
5559

5660
$(LIBSECP256K1): $(wildcard secp256k1/src/*.h) $(wildcard secp256k1/src/*.c) $(wildcard secp256k1/include/*)
5761
$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C $(@D) $(@F)
@@ -318,6 +322,12 @@ crypto_libbitcoin_crypto_avx2_a_CXXFLAGS += $(AVX2_CXXFLAGS)
318322
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS += -DENABLE_AVX2
319323
crypto_libbitcoin_crypto_avx2_a_SOURCES = crypto/sha256_avx2.cpp
320324

325+
crypto_libbitcoin_crypto_shani_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
326+
crypto_libbitcoin_crypto_shani_a_CPPFLAGS = $(AM_CPPFLAGS)
327+
crypto_libbitcoin_crypto_shani_a_CXXFLAGS += $(SHANI_CXXFLAGS)
328+
crypto_libbitcoin_crypto_shani_a_CPPFLAGS += -DENABLE_SHANI
329+
crypto_libbitcoin_crypto_shani_a_SOURCES = crypto/sha256_shani.cpp
330+
321331
# consensus: shared between all executables that validate any consensus rules.
322332
libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
323333
libbitcoin_consensus_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)

src/Makefile.test.include

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ test_test_bitcoin_fuzzy_LDADD = \
137137
$(LIBBITCOIN_CRYPTO) \
138138
$(LIBBITCOIN_CRYPTO_SSE41) \
139139
$(LIBBITCOIN_CRYPTO_AVX2) \
140+
$(LIBBITCOIN_CRYPTO_SHANI) \
140141
$(LIBSECP256K1)
141142

142143
test_test_bitcoin_fuzzy_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)

src/crypto/sha256.cpp

Lines changed: 74 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ namespace sha256d64_avx2
2929
void Transform_8way(unsigned char* out, const unsigned char* in);
3030
}
3131

32+
namespace sha256d64_shani
33+
{
34+
void Transform_2way(unsigned char* out, const unsigned char* in);
35+
}
36+
37+
namespace sha256_shani
38+
{
39+
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks);
40+
}
41+
3242
// Internal implementation code.
3343
namespace
3444
{
@@ -448,6 +458,7 @@ void TransformD64Wrapper(unsigned char* out, const unsigned char* in)
448458

449459
TransformType Transform = sha256::Transform;
450460
TransformD64Type TransformD64 = sha256::TransformD64;
461+
TransformD64Type TransformD64_2way = nullptr;
451462
TransformD64Type TransformD64_4way = nullptr;
452463
TransformD64Type TransformD64_8way = nullptr;
453464

@@ -512,6 +523,13 @@ bool SelfTest() {
512523
TransformD64(out, data + 1);
513524
if (!std::equal(out, out + 32, result_d64)) return false;
514525

526+
// Test TransformD64_2way, if available.
527+
if (TransformD64_2way) {
528+
unsigned char out[64];
529+
TransformD64_2way(out, data + 1);
530+
if (!std::equal(out, out + 64, result_d64)) return false;
531+
}
532+
515533
// Test TransformD64_4way, if available.
516534
if (TransformD64_4way) {
517535
unsigned char out[128];
@@ -556,32 +574,64 @@ std::string SHA256AutoDetect()
556574
{
557575
std::string ret = "standard";
558576
#if defined(USE_ASM) && (defined(__x86_64__) || defined(__amd64__) || defined(__i386__))
559-
(void)AVXEnabled; // Silence unused warning (in case ENABLE_AVX2 is not defined)
577+
bool have_sse4 = false;
578+
bool have_xsave = false;
579+
bool have_avx = false;
580+
bool have_avx2 = false;
581+
bool have_shani = false;
582+
bool enabled_avx = false;
583+
584+
(void)AVXEnabled;
585+
(void)have_sse4;
586+
(void)have_avx;
587+
(void)have_xsave;
588+
(void)have_avx2;
589+
(void)have_shani;
590+
(void)enabled_avx;
591+
560592
uint32_t eax, ebx, ecx, edx;
561593
cpuid(1, 0, eax, ebx, ecx, edx);
562-
if ((ecx >> 19) & 1) {
594+
have_sse4 = (ecx >> 19) & 1;
595+
have_xsave = (ecx >> 27) & 1;
596+
have_avx = (ecx >> 28) & 1;
597+
if (have_xsave && have_avx) {
598+
enabled_avx = AVXEnabled();
599+
}
600+
if (have_sse4) {
601+
cpuid(7, 0, eax, ebx, ecx, edx);
602+
have_avx2 = (ebx >> 5) & 1;
603+
have_shani = (ebx >> 29) & 1;
604+
}
605+
606+
#if defined(ENABLE_SHANI) && !defined(BUILD_BITCOIN_INTERNAL)
607+
if (have_shani) {
608+
Transform = sha256_shani::Transform;
609+
TransformD64 = TransformD64Wrapper<sha256_shani::Transform>;
610+
TransformD64_2way = sha256d64_shani::Transform_2way;
611+
ret = "shani(1way,2way)";
612+
have_sse4 = false; // Disable SSE4/AVX2;
613+
have_avx2 = false;
614+
}
615+
#endif
616+
617+
if (have_sse4) {
563618
#if defined(__x86_64__) || defined(__amd64__)
564619
Transform = sha256_sse4::Transform;
565620
TransformD64 = TransformD64Wrapper<sha256_sse4::Transform>;
621+
ret = "sse4(1way)";
566622
#endif
567623
#if defined(ENABLE_SSE41) && !defined(BUILD_BITCOIN_INTERNAL)
568624
TransformD64_4way = sha256d64_sse41::Transform_4way;
569-
ret = "sse4(1way+4way)";
570-
#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
571-
if (((ecx >> 27) & 1) && ((ecx >> 28) & 1)) { // XSAVE and AVX
572-
cpuid(7, 0, eax, ebx, ecx, edx);
573-
if ((ebx >> 5) & 1) { // AVX2 flag
574-
if (AVXEnabled()) { // OS has enabled AVX registers
575-
TransformD64_8way = sha256d64_avx2::Transform_8way;
576-
ret += ",avx2(8way)";
577-
}
578-
}
579-
}
580-
#endif
581-
#else
582-
ret = "sse4";
625+
ret += ",sse41(4way)";
583626
#endif
584627
}
628+
629+
#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
630+
if (have_avx2 && have_avx && enabled_avx) {
631+
TransformD64_8way = sha256d64_avx2::Transform_8way;
632+
ret += ",avx2(8way)";
633+
}
634+
#endif
585635
#endif
586636

587637
assert(SelfTest());
@@ -663,6 +713,14 @@ void SHA256D64(unsigned char* out, const unsigned char* in, size_t blocks)
663713
blocks -= 4;
664714
}
665715
}
716+
if (TransformD64_2way) {
717+
while (blocks >= 2) {
718+
TransformD64_2way(out, in);
719+
out += 64;
720+
in += 128;
721+
blocks -= 2;
722+
}
723+
}
666724
while (blocks) {
667725
TransformD64(out, in);
668726
out += 32;

src/crypto/sha256_avx2.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
11
#ifdef ENABLE_AVX2
22

33
#include <stdint.h>
4-
#if defined(_MSC_VER)
54
#include <immintrin.h>
6-
#elif defined(__GNUC__)
7-
#include <x86intrin.h>
8-
#endif
95

106
#include <crypto/sha256.h>
117
#include <crypto/common.h>

0 commit comments

Comments
 (0)