Skip to content

Commit fe06298

Browse files
prusnak, rollmeister, sipa
committed
Implement sha256_arm_shani::Transform
Co-Authored-By: Rauli Kumpulainen <[email protected]>
Co-Authored-By: Pieter Wuille <[email protected]>
1 parent 48a72fa commit fe06298

File tree

1 file changed

+177
-0
lines changed

1 file changed

+177
-0
lines changed

src/crypto/sha256_arm_shani.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,190 @@
99

1010
#ifdef ENABLE_ARM_SHANI
1111

12+
#include <array>
1213
#include <cstdint>
1314
#include <cstddef>
15+
#include <arm_acle.h>
16+
#include <arm_neon.h>
17+
18+
namespace {
19+
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
20+
{
21+
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
22+
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
23+
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
24+
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
25+
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
26+
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
27+
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
28+
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
29+
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
30+
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
31+
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
32+
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33+
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
34+
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
35+
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
36+
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
37+
};
38+
}
1439

1540
namespace sha256_arm_shani {
/** SHA-256 compression function using the ARMv8 SHA2 crypto extensions
 *  (vsha256hq_u32 / vsha256h2q_u32 for the rounds, vsha256su0q_u32 /
 *  vsha256su1q_u32 for the message schedule).
 *
 *  s      - pointer to the 8-word (uint32_t[8]) SHA-256 state; read and
 *           updated in place.
 *  chunk  - pointer to `blocks` consecutive 64-byte message blocks.
 *  blocks - number of 64-byte blocks to compress.
 */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    // STATE0 holds state words s[0..3], STATE1 holds s[4..7] (see the loads
    // below). NOTE(review): the ABEF/CDGH names appear to mirror the x86
    // SHA-NI variant's register layout; here each half is simply four
    // consecutive state words - confirm against the x86 implementation.
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP2;

    // Load state
    STATE0 = vld1q_u32(&s[0]);
    STATE1 = vld1q_u32(&s[4]);

    while (blocks--)
    {
        // Save state, so it can be added back in after the 64 rounds below.
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Load and convert input chunk to Big Endian
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        // Original implementation preloaded message and constant addition which was 1-3% slower.
        // Now included as first step in quad round code saving one Q Neon register
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"
        //
        // Each quad-round group below follows the same shape: add four round
        // constants to four message words, stash STATE0 in TMP2 (vsha256h2q
        // needs the pre-update value), perform the two round instructions,
        // and extend the message schedule for a later group. The last four
        // groups omit the schedule update since no further words are needed.

        // Rounds 1-4
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 5-8
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 9-12
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 13-16
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 17-20
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 21-24
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 25-28
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 29-32
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 33-36
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 37-40
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 41-44
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 45-48
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 49-52
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 53-56
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 57-60
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 61-64
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Update state: add the saved input state back in (feed-forward).
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
    }

    // Save final state
    vst1q_u32(&s[0], STATE0);
    vst1q_u32(&s[4], STATE1);
}
}
21198

0 commit comments

Comments
 (0)