Skip to content

Commit 247c98d

Browse files
committed
Implement SHA256 acceleration on ARM64 platforms using CPU instructions
1 parent 5ff256a commit 247c98d

File tree

12 files changed

+267
-0
lines changed

12 files changed

+267
-0
lines changed

src/Crypto/Crypto.vcxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,10 @@
255255
<ClCompile Include="SerpentFast.c" />
256256
<ClCompile Include="SerpentFast_simd.cpp" />
257257
<ClCompile Include="Sha2.c" />
258+
<ClCompile Include="sha256_armv8.c">
259+
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
260+
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
261+
</ClCompile>
258262
<ClCompile Include="Sha2Intel.c" />
259263
<ClCompile Include="Streebog.c" />
260264
<ClCompile Include="t1ha2.c" />

src/Crypto/Crypto.vcxproj.filters

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@
9696
<ClCompile Include="Aes_hw_armv8.c">
9797
<Filter>Source Files</Filter>
9898
</ClCompile>
99+
<ClCompile Include="sha256_armv8.c">
100+
<Filter>Source Files</Filter>
101+
</ClCompile>
99102
</ItemGroup>
100103
<ItemGroup>
101104
<ClInclude Include="Aes.h">

src/Crypto/Sha2.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,10 @@ extern "C"
315315
void VC_CDECL sha256_compress_nayuki(uint_32t state[8], const uint_8t block[64]);
316316
#endif
317317

318+
#if CRYPTOPP_ARM_SHA2_AVAILABLE
319+
void sha256_compress_digest_armv8(const void* input_data, uint_32t digest[8], uint_64t num_blks);
320+
#endif
321+
318322
#if defined(__cplusplus)
319323
}
320324
#endif
@@ -757,6 +761,13 @@ void SSE2Sha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
757761
}
758762
#endif
759763

764+
#if CRYPTOPP_ARM_SHA2_AVAILABLE
765+
void ArmSha256Transform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
766+
{
767+
sha256_compress_digest_armv8(mp, ctx->hash, num_blks);
768+
}
769+
#endif
770+
760771
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
761772
void Sha256AsmTransform(sha256_ctx* ctx, void* mp, uint_64t num_blks)
762773
{
@@ -805,6 +816,12 @@ void sha256_begin(sha256_ctx* ctx)
805816
else
806817
#endif
807818

819+
#if CRYPTOPP_ARM_SHA2_AVAILABLE
820+
if (HasSHA256())
821+
sha256transfunc = ArmSha256Transform;
822+
else
823+
#endif
824+
808825
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32
809826
sha256transfunc = Sha256AsmTransform;
810827
#else

src/Crypto/config.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,19 @@
240240
# endif // Platforms
241241
#endif
242242

243+
// ARMv8 and SHA-1, SHA-256. -march=armv8-a+crypto or above must be present
244+
// Requires GCC 4.8, Clang 3.3 or Visual Studio 2017
245+
#if !defined(CRYPTOPP_ARM_SHA_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ARM_SHA)
246+
# if defined(__aarch32__) || defined(__aarch64__) || defined(_M_ARM64)
247+
# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_GCC_VERSION >= 40800) || \
248+
(CRYPTOPP_LLVM_CLANG_VERSION >= 30300) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40300) || \
249+
(CRYPTOPP_MSC_VERSION >= 1916)
250+
# define CRYPTOPP_ARM_SHA1_AVAILABLE 1
251+
# define CRYPTOPP_ARM_SHA2_AVAILABLE 1
252+
# endif // Compilers
253+
# endif // Platforms
254+
#endif
255+
243256
// Undo the ASM and Intrinsic related defines due to X32.
244257
#if CRYPTOPP_BOOL_X32
245258
# undef CRYPTOPP_BOOL_X64

src/Crypto/cpu.c

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,9 +475,13 @@ void DisableCPUExtendedFeatures ()
475475
#ifndef HWCAP_AES
476476
# define HWCAP_AES (1 << 3)
477477
#endif
478+
#ifndef HWCAP_SHA2
479+
# define HWCAP_SHA2 (1 << 6)
480+
#endif
478481
#endif
479482

480483
volatile int g_hasAESARM = 0;
484+
volatile int g_hasSHA256ARM = 0;
481485

482486
inline int CPU_QueryAES()
483487
{
@@ -503,9 +507,34 @@ inline int CPU_QueryAES()
503507
#endif
504508
}
505509

510+
inline int CPU_QuerySHA2()
511+
{
512+
#if defined(CRYPTOPP_ARM_SHA2_AVAILABLE)
513+
#if defined(__linux__) && defined(__aarch64__)
514+
if ((getauxval(AT_HWCAP) & HWCAP_SHA2) != 0)
515+
return 1;
516+
#elif defined(__APPLE__) && defined(__aarch64__)
517+
// Apple Sillcon (M1) and later
518+
return 1;
519+
#elif defined(_WIN32) && defined(_M_ARM64)
520+
#ifdef TC_WINDOWS_DRIVER
521+
if (ExIsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0)
522+
return 1;
523+
#else
524+
if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0)
525+
return 1;
526+
#endif
527+
#endif
528+
return 0;
529+
#else
530+
return 0;
531+
#endif
532+
}
533+
506534
void DetectArmFeatures()
507535
{
508536
g_hasAESARM = CPU_QueryAES();
537+
g_hasSHA256ARM = CPU_QuerySHA2();
509538
}
510539

511540
#endif

src/Crypto/cpu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,9 +298,11 @@ extern "C" {
298298
#endif
299299

300300
extern volatile int g_hasAESARM;
301+
extern volatile int g_hasSHA256ARM;
301302
void DetectArmFeatures();
302303

303304
#define HasAESNI() g_hasAESARM
305+
#define HasSHA256() g_hasSHA256ARM
304306

305307
#if defined(__cplusplus)
306308
}

src/Crypto/sha256_armv8.c

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
/*
2+
* SHA-256 using CPU instructions in ARMv8
3+
*
4+
* Contributed by Jeffrey Walton. Based on public domain code by
5+
* Johannes Schneiders, Skip Hovsmith and Barry O'Rourke.
6+
*
7+
* Further changes (C) 2020 Jack Lloyd
8+
*
9+
* Botan is released under the Simplified BSD License (see license.txt)
10+
*/
11+
12+
/* Modified and adapted for VeraCrypt */
13+
14+
#include "Common/Tcdefs.h"
15+
#if !defined(_UEFI)
16+
#include <memory.h>
17+
#include <stdlib.h>
18+
#endif
19+
#include "cpu.h"
20+
#include "misc.h"
21+
22+
#if CRYPTOPP_ARM_SHA2_AVAILABLE
23+
24+
#include <arm_neon.h>
25+
26+
CRYPTOPP_ALIGN_DATA(64) static const uint32 K[] = {
27+
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
28+
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
29+
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
30+
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
31+
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
32+
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
33+
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
34+
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
35+
};
36+
37+
void sha256_compress_digest_armv8(void* input_data, uint32 digest[8], uint64 num_blks) {
38+
39+
40+
// Load initial values
41+
uint32x4_t STATE0 = vld1q_u32(&digest[0]);
42+
uint32x4_t STATE1 = vld1q_u32(&digest[4]);
43+
44+
// Intermediate void* cast due to https://llvm.org/bugs/show_bug.cgi?id=20670
45+
const uint32* input32 = (const uint32*)(const void*)input_data;
46+
47+
while (num_blks > 0) {
48+
// Save current state
49+
const uint32x4_t ABCD_SAVE = STATE0;
50+
const uint32x4_t EFGH_SAVE = STATE1;
51+
52+
uint32x4_t MSG0 = vld1q_u32(input32 + 0);
53+
uint32x4_t MSG1 = vld1q_u32(input32 + 4);
54+
uint32x4_t MSG2 = vld1q_u32(input32 + 8);
55+
uint32x4_t MSG3 = vld1q_u32(input32 + 12);
56+
57+
MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
58+
MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
59+
MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
60+
MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
61+
62+
uint32x4_t MSG_K, TSTATE;
63+
64+
// Rounds 0-3
65+
MSG_K = vaddq_u32(MSG0, vld1q_u32(&K[4 * 0]));
66+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
67+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
68+
STATE0 = TSTATE;
69+
MSG0 = vsha256su1q_u32(vsha256su0q_u32(MSG0, MSG1), MSG2, MSG3);
70+
71+
// Rounds 4-7
72+
MSG_K = vaddq_u32(MSG1, vld1q_u32(&K[4 * 1]));
73+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
74+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
75+
STATE0 = TSTATE;
76+
MSG1 = vsha256su1q_u32(vsha256su0q_u32(MSG1, MSG2), MSG3, MSG0);
77+
78+
// Rounds 8-11
79+
MSG_K = vaddq_u32(MSG2, vld1q_u32(&K[4 * 2]));
80+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
81+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
82+
STATE0 = TSTATE;
83+
MSG2 = vsha256su1q_u32(vsha256su0q_u32(MSG2, MSG3), MSG0, MSG1);
84+
85+
// Rounds 12-15
86+
MSG_K = vaddq_u32(MSG3, vld1q_u32(&K[4 * 3]));
87+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
88+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
89+
STATE0 = TSTATE;
90+
MSG3 = vsha256su1q_u32(vsha256su0q_u32(MSG3, MSG0), MSG1, MSG2);
91+
92+
// Rounds 16-19
93+
MSG_K = vaddq_u32(MSG0, vld1q_u32(&K[4 * 4]));
94+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
95+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
96+
STATE0 = TSTATE;
97+
MSG0 = vsha256su1q_u32(vsha256su0q_u32(MSG0, MSG1), MSG2, MSG3);
98+
99+
// Rounds 20-23
100+
MSG_K = vaddq_u32(MSG1, vld1q_u32(&K[4 * 5]));
101+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
102+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
103+
STATE0 = TSTATE;
104+
MSG1 = vsha256su1q_u32(vsha256su0q_u32(MSG1, MSG2), MSG3, MSG0);
105+
106+
// Rounds 24-27
107+
MSG_K = vaddq_u32(MSG2, vld1q_u32(&K[4 * 6]));
108+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
109+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
110+
STATE0 = TSTATE;
111+
MSG2 = vsha256su1q_u32(vsha256su0q_u32(MSG2, MSG3), MSG0, MSG1);
112+
113+
// Rounds 28-31
114+
MSG_K = vaddq_u32(MSG3, vld1q_u32(&K[4 * 7]));
115+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
116+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
117+
STATE0 = TSTATE;
118+
MSG3 = vsha256su1q_u32(vsha256su0q_u32(MSG3, MSG0), MSG1, MSG2);
119+
120+
// Rounds 32-35
121+
MSG_K = vaddq_u32(MSG0, vld1q_u32(&K[4 * 8]));
122+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
123+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
124+
STATE0 = TSTATE;
125+
MSG0 = vsha256su1q_u32(vsha256su0q_u32(MSG0, MSG1), MSG2, MSG3);
126+
127+
// Rounds 36-39
128+
MSG_K = vaddq_u32(MSG1, vld1q_u32(&K[4 * 9]));
129+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
130+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
131+
STATE0 = TSTATE;
132+
MSG1 = vsha256su1q_u32(vsha256su0q_u32(MSG1, MSG2), MSG3, MSG0);
133+
134+
// Rounds 40-43
135+
MSG_K = vaddq_u32(MSG2, vld1q_u32(&K[4 * 10]));
136+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
137+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
138+
STATE0 = TSTATE;
139+
MSG2 = vsha256su1q_u32(vsha256su0q_u32(MSG2, MSG3), MSG0, MSG1);
140+
141+
// Rounds 44-47
142+
MSG_K = vaddq_u32(MSG3, vld1q_u32(&K[4 * 11]));
143+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
144+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
145+
STATE0 = TSTATE;
146+
MSG3 = vsha256su1q_u32(vsha256su0q_u32(MSG3, MSG0), MSG1, MSG2);
147+
148+
// Rounds 48-51
149+
MSG_K = vaddq_u32(MSG0, vld1q_u32(&K[4 * 12]));
150+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
151+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
152+
STATE0 = TSTATE;
153+
154+
// Rounds 52-55
155+
MSG_K = vaddq_u32(MSG1, vld1q_u32(&K[4 * 13]));
156+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
157+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
158+
STATE0 = TSTATE;
159+
160+
// Rounds 56-59
161+
MSG_K = vaddq_u32(MSG2, vld1q_u32(&K[4 * 14]));
162+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
163+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
164+
STATE0 = TSTATE;
165+
166+
// Rounds 60-63
167+
MSG_K = vaddq_u32(MSG3, vld1q_u32(&K[4 * 15]));
168+
TSTATE = vsha256hq_u32(STATE0, STATE1, MSG_K);
169+
STATE1 = vsha256h2q_u32(STATE1, STATE0, MSG_K);
170+
STATE0 = TSTATE;
171+
172+
// Add back to state
173+
STATE0 = vaddq_u32(STATE0, ABCD_SAVE);
174+
STATE1 = vaddq_u32(STATE1, EFGH_SAVE);
175+
176+
input32 += 64 / 4;
177+
num_blks--;
178+
}
179+
180+
// Save state
181+
vst1q_u32(&digest[0], STATE0);
182+
vst1q_u32(&digest[4], STATE1);
183+
}
184+
#endif

src/Driver/Driver.vcxproj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,10 @@ copy $(OutDir)veracrypt.inf "$(SolutionDir)Debug\Setup Files\veracrypt.inf"</Com
272272
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
273273
</ClCompile>
274274
<ClCompile Include="..\Crypto\Sha2.c" />
275+
<ClCompile Include="..\Crypto\sha256_armv8.c">
276+
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
277+
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
278+
</ClCompile>
275279
<ClCompile Include="..\Crypto\Sha2Intel.c" />
276280
<ClCompile Include="..\Crypto\Streebog.c" />
277281
<ClCompile Include="..\Crypto\t1ha2.c" />

src/Driver/Driver.vcxproj.filters

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,9 @@
168168
<ClCompile Include="..\Crypto\Aes_hw_armv8.c">
169169
<Filter>Crypto\Source Files</Filter>
170170
</ClCompile>
171+
<ClCompile Include="..\Crypto\sha256_armv8.c">
172+
<Filter>Crypto\Source Files</Filter>
173+
</ClCompile>
171174
</ItemGroup>
172175
<ItemGroup>
173176
<ClInclude Include="..\Common\Tcdefs.h">
-9.65 KB
Binary file not shown.

0 commit comments

Comments
 (0)