Skip to content

Commit 0de7cc8

Browse files
committed
Merge #13191: Specialized double-SHA256 with 64 byte inputs with SSE4.1 and AVX2
4defdfa [MOVEONLY] Move unused Merkle branch code to tests (Pieter Wuille) 4437d6e 8-way AVX2 implementation for double SHA256 on 64-byte inputs (Pieter Wuille) 230294b 4-way SSE4.1 implementation for double SHA256 on 64-byte inputs (Pieter Wuille) 1f0e7ca Use SHA256D64 in Merkle root computation (Pieter Wuille) d0c9632 Specialized double sha256 for 64 byte inputs (Pieter Wuille) 57f3463 Refactor SHA256 code (Pieter Wuille) 0df0178 Benchmark Merkle root computation (Pieter Wuille) Pull request description: This introduces a framework for specialized double-SHA256 with 64 byte inputs. 4 different implementations are provided: * Generic C++ (reusing the normal SHA256 code) * Specialized C++ for 64-byte inputs, but no special instructions * 4-way using SSE4.1 intrinsics * 8-way using AVX2 intrinsics On my own system (AVX2 capable), I get these benchmarks for computing the Merkle root of 9001 leaves (supported lengths / special instructions / parallelism): * 7.2 ms with varsize/naive/1way (master, non-SSE4 hardware) * 5.8 ms with size64/naive/1way (this PR, non-SSE4 capable systems) * 4.8 ms with varsize/SSE4/1way (master, SSE4 hardware) * 2.9 ms with size64/SSE4/4way (this PR, SSE4 hardware) * 1.1 ms with size64/AVX2/8way (this PR, AVX2 hardware) Tree-SHA512: efa32d48b32820d9ce788ead4eb583949265be8c2e5f538c94bc914e92d131a57f8c1ee26c6f998e81fb0e30675d4e2eddc3360bcf632676249036018cff343e
2 parents 2722a1f + 4defdfa commit 0de7cc8

16 files changed

+1347
-204
lines changed

configure.ac

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,8 @@ fi
312312
# be compiled with them, rather that specific objects/libs may use them after checking for runtime
313313
# compatibility.
314314
AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
315+
AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
316+
AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])
315317

316318
TEMP_CXXFLAGS="$CXXFLAGS"
317319
CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS"
@@ -335,6 +337,44 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
335337
)
336338
CXXFLAGS="$TEMP_CXXFLAGS"
337339

340+
TEMP_CXXFLAGS="$CXXFLAGS"
341+
CXXFLAGS="$CXXFLAGS $SSE41_CXXFLAGS"
342+
AC_MSG_CHECKING(for SSE4.1 intrinsics)
343+
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
344+
#include <stdint.h>
345+
#if defined(_MSC_VER)
346+
#include <immintrin.h>
347+
#elif defined(__GNUC__)
348+
#include <x86intrin.h>
349+
#endif
350+
]],[[
351+
__m128i l = _mm_set1_epi32(0);
352+
return _mm_extract_epi32(l, 3);
353+
]])],
354+
[ AC_MSG_RESULT(yes); enable_sse41=yes; AC_DEFINE(ENABLE_SSE41, 1, [Define this symbol to build code that uses SSE4.1 intrinsics]) ],
355+
[ AC_MSG_RESULT(no)]
356+
)
357+
CXXFLAGS="$TEMP_CXXFLAGS"
358+
359+
TEMP_CXXFLAGS="$CXXFLAGS"
360+
CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS"
361+
AC_MSG_CHECKING(for AVX2 intrinsics)
362+
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
363+
#include <stdint.h>
364+
#if defined(_MSC_VER)
365+
#include <immintrin.h>
366+
#elif defined(__GNUC__) && defined(__AVX2__)
367+
#include <x86intrin.h>
368+
#endif
369+
]],[[
370+
__m256i l = _mm256_set1_epi32(0);
371+
return _mm256_extract_epi32(l, 7);
372+
]])],
373+
[ AC_MSG_RESULT(yes); enable_avx2=yes; AC_DEFINE(ENABLE_AVX2, 1, [Define this symbol to build code that uses AVX2 intrinsics]) ],
374+
[ AC_MSG_RESULT(no)]
375+
)
376+
CXXFLAGS="$TEMP_CXXFLAGS"
377+
338378
CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS"
339379

340380
AC_ARG_WITH([utils],
@@ -1253,6 +1293,8 @@ AM_CONDITIONAL([USE_LCOV],[test x$use_lcov = xyes])
12531293
AM_CONDITIONAL([GLIBC_BACK_COMPAT],[test x$use_glibc_compat = xyes])
12541294
AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
12551295
AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
1296+
AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
1297+
AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
12561298
AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])
12571299

12581300
AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version])
@@ -1295,6 +1337,8 @@ AC_SUBST(PIE_FLAGS)
12951337
AC_SUBST(SANITIZER_CXXFLAGS)
12961338
AC_SUBST(SANITIZER_LDFLAGS)
12971339
AC_SUBST(SSE42_CXXFLAGS)
1340+
AC_SUBST(SSE41_CXXFLAGS)
1341+
AC_SUBST(AVX2_CXXFLAGS)
12981342
AC_SUBST(LIBTOOL_APP_LDFLAGS)
12991343
AC_SUBST(USE_UPNP)
13001344
AC_SUBST(USE_QRCODE)

src/Makefile.am

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ LIBBITCOIN_CONSENSUS=libbitcoin_consensus.a
3030
LIBBITCOIN_CLI=libbitcoin_cli.a
3131
LIBBITCOIN_UTIL=libbitcoin_util.a
3232
LIBBITCOIN_CRYPTO=crypto/libbitcoin_crypto.a
33+
LIBBITCOIN_CRYPTO_SSE41=crypto/libbitcoin_crypto_sse41.a
34+
LIBBITCOIN_CRYPTO_AVX2=crypto/libbitcoin_crypto_avx2.a
3335
LIBBITCOINQT=qt/libbitcoinqt.a
3436
LIBSECP256K1=secp256k1/libsecp256k1.la
3537

@@ -50,6 +52,8 @@ $(LIBSECP256K1): $(wildcard secp256k1/src/*) $(wildcard secp256k1/include/*)
5052
# But to build the less dependent modules first, we manually select their order here:
5153
EXTRA_LIBRARIES += \
5254
$(LIBBITCOIN_CRYPTO) \
55+
$(LIBBITCOIN_CRYPTO_SSE41) \
56+
$(LIBBITCOIN_CRYPTO_AVX2) \
5357
$(LIBBITCOIN_UTIL) \
5458
$(LIBBITCOIN_COMMON) \
5559
$(LIBBITCOIN_CONSENSUS) \
@@ -289,6 +293,22 @@ if USE_ASM
289293
crypto_libbitcoin_crypto_a_SOURCES += crypto/sha256_sse4.cpp
290294
endif
291295

296+
crypto_libbitcoin_crypto_sse41_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
297+
crypto_libbitcoin_crypto_sse41_a_CPPFLAGS = $(AM_CPPFLAGS)
298+
if ENABLE_SSE41
299+
crypto_libbitcoin_crypto_sse41_a_CXXFLAGS += $(SSE41_CXXFLAGS)
300+
crypto_libbitcoin_crypto_sse41_a_CPPFLAGS += -DENABLE_SSE41
301+
endif
302+
crypto_libbitcoin_crypto_sse41_a_SOURCES = crypto/sha256_sse41.cpp
303+
304+
crypto_libbitcoin_crypto_avx2_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
305+
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS = $(AM_CPPFLAGS)
306+
if ENABLE_AVX2
307+
crypto_libbitcoin_crypto_avx2_a_CXXFLAGS += $(AVX2_CXXFLAGS)
308+
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS += -DENABLE_AVX2
309+
endif
310+
crypto_libbitcoin_crypto_avx2_a_SOURCES = crypto/sha256_avx2.cpp
311+
292312
# consensus: shared between all executables that validate any consensus rules.
293313
libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
294314
libbitcoin_consensus_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
@@ -411,6 +431,8 @@ bitcoind_LDADD = \
411431
$(LIBBITCOIN_ZMQ) \
412432
$(LIBBITCOIN_CONSENSUS) \
413433
$(LIBBITCOIN_CRYPTO) \
434+
$(LIBBITCOIN_CRYPTO_SSE41) \
435+
$(LIBBITCOIN_CRYPTO_AVX2) \
414436
$(LIBLEVELDB) \
415437
$(LIBLEVELDB_SSE42) \
416438
$(LIBMEMENV) \
@@ -432,7 +454,9 @@ bitcoin_cli_LDADD = \
432454
$(LIBBITCOIN_CLI) \
433455
$(LIBUNIVALUE) \
434456
$(LIBBITCOIN_UTIL) \
435-
$(LIBBITCOIN_CRYPTO)
457+
$(LIBBITCOIN_CRYPTO) \
458+
$(LIBBITCOIN_CRYPTO_SSE41) \
459+
$(LIBBITCOIN_CRYPTO_AVX2)
436460

437461
bitcoin_cli_LDADD += $(BOOST_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(EVENT_LIBS)
438462
#
@@ -453,6 +477,8 @@ bitcoin_tx_LDADD = \
453477
$(LIBBITCOIN_UTIL) \
454478
$(LIBBITCOIN_CONSENSUS) \
455479
$(LIBBITCOIN_CRYPTO) \
480+
$(LIBBITCOIN_CRYPTO_SSE41) \
481+
$(LIBBITCOIN_CRYPTO_AVX2) \
456482
$(LIBSECP256K1)
457483

458484
bitcoin_tx_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)

src/Makefile.bench.include

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ bench_bench_bitcoin_SOURCES = \
2121
bench/rollingbloom.cpp \
2222
bench/crypto_hash.cpp \
2323
bench/ccoins_caching.cpp \
24+
bench/merkle_root.cpp \
2425
bench/mempool_eviction.cpp \
2526
bench/verify_script.cpp \
2627
bench/base58.cpp \
@@ -38,6 +39,8 @@ bench_bench_bitcoin_LDADD = \
3839
$(LIBBITCOIN_UTIL) \
3940
$(LIBBITCOIN_CONSENSUS) \
4041
$(LIBBITCOIN_CRYPTO) \
42+
$(LIBBITCOIN_CRYPTO_SSE41) \
43+
$(LIBBITCOIN_CRYPTO_AVX2) \
4144
$(LIBLEVELDB) \
4245
$(LIBLEVELDB_SSE42) \
4346
$(LIBMEMENV) \

src/Makefile.qt.include

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ endif
408408
if ENABLE_ZMQ
409409
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
410410
endif
411-
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
411+
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
412412
$(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
413413
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
414414
qt_bitcoin_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)

src/Makefile.qttest.include

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ endif
6262
if ENABLE_ZMQ
6363
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
6464
endif
65-
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
65+
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) $(LIBLEVELDB) \
6666
$(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
6767
$(QR_LIBS) $(PROTOBUF_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
6868
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)

src/Makefile.test.include

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,8 @@ test_test_bitcoin_LDADD =
109109
if ENABLE_WALLET
110110
test_test_bitcoin_LDADD += $(LIBBITCOIN_WALLET)
111111
endif
112-
test_test_bitcoin_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
112+
113+
test_test_bitcoin_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) \
113114
$(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
114115
test_test_bitcoin_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
115116

@@ -134,6 +135,8 @@ test_test_bitcoin_fuzzy_LDADD = \
134135
$(LIBBITCOIN_UTIL) \
135136
$(LIBBITCOIN_CONSENSUS) \
136137
$(LIBBITCOIN_CRYPTO) \
138+
$(LIBBITCOIN_CRYPTO_SSE41) \
139+
$(LIBBITCOIN_CRYPTO_AVX2) \
137140
$(LIBSECP256K1)
138141

139142
test_test_bitcoin_fuzzy_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)

src/bench/crypto_hash.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@ static void SHA256_32b(benchmark::State& state)
5252
}
5353
}
5454

55+
// Benchmark for SHA256D64: repeatedly invokes it on a zero-filled buffer of
// 64 * 1024 bytes for as long as the benchmark state asks to keep running.
// The same buffer is passed as both the first and second pointer argument.
// NOTE(review): presumably the third argument (1024) is the number of 64-byte
// inputs, and writing results into the input buffer is safe because each
// result is shorter than the input it replaces -- confirm against the
// SHA256D64 declaration, which is not visible here.
static void SHA256D64_1024(benchmark::State& state)
56+
{
57+
std::vector<uint8_t> in(64 * 1024, 0);
58+
while (state.KeepRunning()) {
59+
SHA256D64(in.data(), in.data(), 1024);
60+
}
61+
}
62+
5563
static void SHA512(benchmark::State& state)
5664
{
5765
uint8_t hash[CSHA512::OUTPUT_SIZE];
@@ -94,5 +102,6 @@ BENCHMARK(SHA512, 330);
94102

95103
BENCHMARK(SHA256_32b, 4700 * 1000);
96104
BENCHMARK(SipHash_32b, 40 * 1000 * 1000);
105+
BENCHMARK(SHA256D64_1024, 7400);
97106
BENCHMARK(FastRandom_32bit, 110 * 1000 * 1000);
98107
BENCHMARK(FastRandom_1bit, 440 * 1000 * 1000);

src/bench/merkle_root.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Copyright (c) 2016 The Bitcoin Core developers
2+
// Distributed under the MIT software license, see the accompanying
3+
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4+
5+
#include "bench.h"
6+
7+
#include "uint256.h"
8+
#include "random.h"
9+
#include "consensus/merkle.h"
10+
11+
// Benchmark for ComputeMerkleRoot over 9001 leaves, each filled once up front
// with a value drawn from a FastRandomContext.
// NOTE(review): the `true` constructor argument presumably requests a
// deterministic seed so runs are reproducible -- confirm in random.h.
static void MerkleRoot(benchmark::State& state)
12+
{
13+
FastRandomContext rng(true);
14+
std::vector<uint256> leaves;
15+
leaves.resize(9001);
16+
for (auto& item : leaves) {
17+
item = rng.rand256();
18+
}
19+
while (state.KeepRunning()) {
20+
bool mutation = false;
// A fresh copy of the leaves is handed to ComputeMerkleRoot on every
// iteration, so the cost of that copy is part of what is measured.
21+
uint256 hash = ComputeMerkleRoot(std::vector<uint256>(leaves), &mutation);
// `mutation` is a bool, so this stores the root into leaves[0] or leaves[1].
// NOTE(review): presumably done so the result is observably used and each
// iteration hashes slightly different input -- confirm intent.
22+
leaves[mutation] = hash;
23+
}
24+
}
25+
26+
BENCHMARK(MerkleRoot, 800);

0 commit comments

Comments
 (0)