Skip to content

Commit dab18f0

Browse files
author
MacroFake
committed
Merge bitcoin/bitcoin#24946: Unroll the ChaCha20 inner loop for performance
81c09ee Unroll the ChaCha20 inner loop for performance (Pieter Wuille)

Pull request description:
  Unrolling the inner ChaCha20 loop gives a ~15% speedup for me in the CHACHA20_* benchmarks. It's a simple change; the performance gain helps with RNG generation and will matter more for BIP324.

ACKs for top commit:
  martinus: tested ACK 81c09ee with clang++ 13.0.1, test `CHACHA20_1MB`:
  MarcoFalke: ACK 81c09ee 🍟

Tree-SHA512: 108bd0ba573bb08de92d611e7be7c09a2c2700f9655f44129b87f9b71f7e101dfc6bd345783e7b4b9b40f0b003913cf59187f422da8cdb5b20887f7855b2611a
2 parents 8abe79a + 81c09ee commit dab18f0

File tree

1 file changed

+28
-20
lines changed

1 file changed

+28
-20
lines changed

src/crypto/chacha20.cpp

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ constexpr static inline uint32_t rotl32(uint32_t v, int c) { return (v << c) | (
1818
a += b; d = rotl32(d ^ a, 8); \
1919
c += d; b = rotl32(b ^ c, 7);
2020

21+
/** Paste the statement sequence `body` ten times back to back.
 *  Every copy is wrapped in its own brace scope, so locals declared inside
 *  `body` do not collide between copies. The surrounding do/while(0) makes the
 *  whole expansion act as one statement that takes a trailing semicolon. */
#define REPEAT10(body) \
    do { \
        {body}; {body}; {body}; {body}; {body}; \
        {body}; {body}; {body}; {body}; {body}; \
    } while(0)
22+
2123
// 16-character constant string (plus terminating NUL). Presumably the ChaCha20
// state constant used with 32-byte keys; its use is not visible in this excerpt
// -- confirm against the key setup code.
static const unsigned char sigma[] = "expand 32-byte k";
2224
// 16-character constant string (plus terminating NUL). Presumably the ChaCha20
// state constant used with 16-byte keys; its use is not visible in this excerpt
// -- confirm against the key setup code.
static const unsigned char tau[] = "expand 16-byte k";
2325

@@ -119,16 +121,19 @@ void ChaCha20::Keystream(unsigned char* c, size_t bytes)
119121
x13 = j13;
120122
x14 = j14;
121123
x15 = j15;
122-
for (i = 20;i > 0;i -= 2) {
123-
QUARTERROUND( x0, x4, x8,x12)
124-
QUARTERROUND( x1, x5, x9,x13)
125-
QUARTERROUND( x2, x6,x10,x14)
126-
QUARTERROUND( x3, x7,x11,x15)
127-
QUARTERROUND( x0, x5,x10,x15)
128-
QUARTERROUND( x1, x6,x11,x12)
129-
QUARTERROUND( x2, x7, x8,x13)
130-
QUARTERROUND( x3, x4, x9,x14)
131-
}
124+
125+
// The 20 inner ChaCha20 rounds are unrolled here for performance.
126+
REPEAT10(
127+
QUARTERROUND( x0, x4, x8,x12);
128+
QUARTERROUND( x1, x5, x9,x13);
129+
QUARTERROUND( x2, x6,x10,x14);
130+
QUARTERROUND( x3, x7,x11,x15);
131+
QUARTERROUND( x0, x5,x10,x15);
132+
QUARTERROUND( x1, x6,x11,x12);
133+
QUARTERROUND( x2, x7, x8,x13);
134+
QUARTERROUND( x3, x4, x9,x14);
135+
);
136+
132137
x0 += j0;
133138
x1 += j1;
134139
x2 += j2;
@@ -231,16 +236,19 @@ void ChaCha20::Crypt(const unsigned char* m, unsigned char* c, size_t bytes)
231236
x13 = j13;
232237
x14 = j14;
233238
x15 = j15;
234-
for (i = 20;i > 0;i -= 2) {
235-
QUARTERROUND( x0, x4, x8,x12)
236-
QUARTERROUND( x1, x5, x9,x13)
237-
QUARTERROUND( x2, x6,x10,x14)
238-
QUARTERROUND( x3, x7,x11,x15)
239-
QUARTERROUND( x0, x5,x10,x15)
240-
QUARTERROUND( x1, x6,x11,x12)
241-
QUARTERROUND( x2, x7, x8,x13)
242-
QUARTERROUND( x3, x4, x9,x14)
243-
}
239+
240+
// The 20 inner ChaCha20 rounds are unrolled here for performance.
241+
REPEAT10(
242+
QUARTERROUND( x0, x4, x8,x12);
243+
QUARTERROUND( x1, x5, x9,x13);
244+
QUARTERROUND( x2, x6,x10,x14);
245+
QUARTERROUND( x3, x7,x11,x15);
246+
QUARTERROUND( x0, x5,x10,x15);
247+
QUARTERROUND( x1, x6,x11,x12);
248+
QUARTERROUND( x2, x7, x8,x13);
249+
QUARTERROUND( x3, x4, x9,x14);
250+
);
251+
244252
x0 += j0;
245253
x1 += j1;
246254
x2 += j2;

0 commit comments

Comments
 (0)