Skip to content

Commit b028517

Browse files
Nicoshevfacebook-github-bot
authored andcommitted
Add NEON fast-crc32 implementation
Summary: Folly has a fast crc32c NEON implementation, but lacks an equivalent for crc32. This diff adds a fast crc32 NEON-based algorithm. New implementation is almost 3 times faster for long inputs: before: crc32_2048 84.50ns 11.83M crc32_4096 166.60ns 6.00M crc32_8192 324.88ns 3.08M crc32_16384 641.44ns 1.56M crc32_32768 1.27us 784.50K crc32_65536 3.20us 312.04K crc32_131072 5.75us 173.80K crc32_262144 10.80us 92.61K crc32_524288 21.06us 47.48K after: crc32_2048 79.40ns 12.59M crc32_4096 120.85ns 8.27M crc32_8192 190.63ns 5.25M crc32_16384 314.20ns 3.18M crc32_32768 561.82ns 1.78M crc32_65536 1.04us 962.04K crc32_131072 1.99us 502.21K crc32_262144 3.88us 257.48K crc32_524288 7.69us 130.11K Reviewed By: Gownta Differential Revision: D70103378 fbshipit-source-id: 84c49edd6dc7b2ef5af48003f37613ae0e495dbd
1 parent b110c81 commit b028517

File tree

6 files changed

+259
-5
lines changed

6 files changed

+259
-5
lines changed

folly/external/fast-crc32/BUCK

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,20 @@ cpp_library(
6262
headers = [
6363
"neon_eor3_crc32c_v8s2x4_s3.h",
6464
],
65-
arch_preprocessor_flags = {
66-
"aarch64": [
67-
"-march=armv8.2-a+crypto+crc+sha3",
68-
],
69-
},
65+
deps = [
66+
"//folly:portability",
67+
"//folly/system:aux_vector",
68+
],
69+
)
70+
71+
cpp_library(
72+
name = "neon_eor3_crc32_v9s3x2e_s3",
73+
srcs = [
74+
"neon_eor3_crc32_v9s3x2e_s3.cpp",
75+
],
76+
headers = [
77+
"neon_eor3_crc32_v9s3x2e_s3.h",
78+
],
7079
deps = [
7180
"//folly:portability",
7281
"//folly/system:aux_vector",
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
/* @Generated by https://github.com/corsix/fast-crc32/ using: */
2+
/* ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 */
3+
/* MIT licensed */
4+
5+
#include "folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.h"
6+
#include <folly/system/AuxVector.h> // @manual
7+
#include <folly/Portability.h>
8+
9+
#include <stddef.h>
10+
#include <stdint.h>
11+
12+
#define CRC_EXPORT extern
13+
14+
#if !(FOLLY_AARCH64 && FOLLY_NEON && FOLLY_ARM_FEATURE_CRYPTO && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_SHA3)
15+
#include <stdlib.h>
16+
namespace folly::detail {
17+
CRC_EXPORT uint32_t neon_eor3_crc32_v9s3x2e_s3(const uint8_t*, size_t, uint32_t) {
18+
abort(); // not implemented on this platform
19+
}
20+
21+
CRC_EXPORT bool has_neon_eor3_crc32_v9s3x2e_s3() {
22+
return false;
23+
}
24+
}
25+
#else
26+
#include <arm_acle.h>
27+
#include <arm_neon.h>
28+
29+
#if defined(_MSC_VER)
30+
#define CRC_AINLINE static __forceinline
31+
#define CRC_ALIGN(n) __declspec(align(n))
32+
#else
33+
#define CRC_AINLINE static __inline __attribute__((always_inline))
34+
#define CRC_ALIGN(n) __attribute__((aligned(n)))
35+
#endif
36+
37+
namespace folly::detail {
38+
CRC_AINLINE uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) {
39+
return vreinterpretq_u64_p128(vmull_p64(a[0], b[0]));
40+
}
41+
42+
CRC_AINLINE uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) {
43+
return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b)));
44+
}
45+
46+
CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) {
47+
return vreinterpretq_u64_p128(vmull_p64(a, b));
48+
}
49+
50+
static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ {
51+
uint64_t stack = ~(uint64_t)1;
52+
uint32_t acc, low;
53+
for (; n > 191; n = (n >> 1) - 16) {
54+
stack = (stack << 1) + (n & 1);
55+
}
56+
stack = ~stack;
57+
acc = ((uint32_t)0x80000000) >> (n & 31);
58+
for (n >>= 5; n; --n) {
59+
acc = __crc32w(acc, 0);
60+
}
61+
while ((low = stack & 1), stack >>= 1) {
62+
poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc));
63+
uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0);
64+
acc = __crc32d(0, y << low);
65+
}
66+
return acc;
67+
}
68+
69+
CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
70+
return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
71+
}
72+
73+
FOLLY_TARGET_ATTRIBUTE("+crc")
74+
CRC_EXPORT bool has_neon_eor3_crc32_v9s3x2e_s3() {
75+
static ElfHwCaps caps;
76+
77+
return caps.aarch64_fp() && caps.aarch64_asimd() && caps.aarch64_pmull() &&
78+
caps.aarch64_crc32() && caps.aarch64_sha3();
79+
}
80+
81+
CRC_EXPORT uint32_t neon_eor3_crc32_v9s3x2e_s3(const uint8_t* buf, size_t len, uint32_t crc0) {
82+
for (; len && ((uintptr_t)buf & 7); --len) {
83+
crc0 = __crc32b(crc0, *buf++);
84+
}
85+
if (((uintptr_t)buf & 8) && len >= 8) {
86+
crc0 = __crc32d(crc0, *(const uint64_t*)buf);
87+
buf += 8;
88+
len -= 8;
89+
}
90+
if (len >= 192) {
91+
const uint8_t* end = buf + len;
92+
size_t blk = (len - 0) / 192;
93+
size_t klen = blk * 16;
94+
const uint8_t* buf2 = buf + klen * 3;
95+
const uint8_t* limit = buf + klen - 32;
96+
uint32_t crc1 = 0;
97+
uint32_t crc2 = 0;
98+
uint64x2_t vc0;
99+
uint64x2_t vc1;
100+
uint64x2_t vc2;
101+
uint64_t vc;
102+
/* First vector chunk. */
103+
uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0;
104+
uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1;
105+
uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2;
106+
uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3;
107+
uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4;
108+
uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5;
109+
uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6;
110+
uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7;
111+
uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8;
112+
uint64x2_t k;
113+
{ static const uint64_t CRC_ALIGN(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); }
114+
buf2 += 144;
115+
/* Main loop. */
116+
while (buf <= limit) {
117+
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
118+
y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
119+
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
120+
y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
121+
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
122+
y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
123+
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
124+
y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
125+
y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
126+
x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2));
127+
x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16)));
128+
x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32)));
129+
x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48)));
130+
x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64)));
131+
x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80)));
132+
x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96)));
133+
x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112)));
134+
x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128)));
135+
crc0 = __crc32d(crc0, *(const uint64_t*)buf);
136+
crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
137+
crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
138+
crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
139+
crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
140+
crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
141+
buf += 16;
142+
buf2 += 144;
143+
}
144+
/* Reduce x0 ... x8 to just x0. */
145+
{ static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
146+
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
147+
x0 = veor3q_u64(x0, y0, x1);
148+
x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
149+
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
150+
y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
151+
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
152+
y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
153+
x0 = veor3q_u64(x0, y0, x1);
154+
x2 = veor3q_u64(x2, y2, x3);
155+
x4 = veor3q_u64(x4, y4, x5);
156+
x6 = veor3q_u64(x6, y6, x7);
157+
{ static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); }
158+
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
159+
y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
160+
x0 = veor3q_u64(x0, y0, x2);
161+
x4 = veor3q_u64(x4, y4, x6);
162+
{ static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); }
163+
y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
164+
x0 = veor3q_u64(x0, y0, x4);
165+
/* Final scalar chunk. */
166+
crc0 = __crc32d(crc0, *(const uint64_t*)buf);
167+
crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
168+
crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
169+
crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
170+
crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
171+
crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
172+
vc0 = crc_shift(crc0, klen * 2 + blk * 144);
173+
vc1 = crc_shift(crc1, klen + blk * 144);
174+
vc2 = crc_shift(crc2, 0 + blk * 144);
175+
vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);
176+
/* Reduce 128 bits to 32 bits, and multiply by x^32. */
177+
crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
178+
crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
179+
buf = buf2;
180+
len = end - buf;
181+
}
182+
if (len >= 32) {
183+
size_t klen = ((len - 8) / 24) * 8;
184+
uint32_t crc1 = 0;
185+
uint32_t crc2 = 0;
186+
uint64x2_t vc0;
187+
uint64x2_t vc1;
188+
uint64_t vc;
189+
/* Main loop. */
190+
do {
191+
crc0 = __crc32d(crc0, *(const uint64_t*)buf);
192+
crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
193+
crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
194+
buf += 8;
195+
len -= 24;
196+
} while (len >= 32);
197+
vc0 = crc_shift(crc0, klen * 2 + 8);
198+
vc1 = crc_shift(crc1, klen + 8);
199+
vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
200+
/* Final 8 bytes. */
201+
buf += klen * 2;
202+
crc0 = crc2;
203+
crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc), buf += 8;
204+
len -= 8;
205+
}
206+
for (; len >= 8; buf += 8, len -= 8) {
207+
crc0 = __crc32d(crc0, *(const uint64_t*)buf);
208+
}
209+
for (; len; --len) {
210+
crc0 = __crc32b(crc0, *buf++);
211+
}
212+
return crc0;
213+
}
214+
} // namespace folly::detail
215+
#endif
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#pragma once
2+
#include <cstddef>
3+
#include <cstdint>
4+
5+
namespace folly::detail {
6+
uint32_t neon_eor3_crc32_v9s3x2e_s3(const uint8_t* buf, size_t len, uint32_t crc0);
7+
bool has_neon_eor3_crc32_v9s3x2e_s3();
8+
}

folly/hash/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ cpp_library(
1111
"//folly/detail:traponavx512",
1212
"//folly/external/fast-crc32:avx512_crc32c_v8s3x4", # @manual
1313
"//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2", # @manual
14+
"//folly/external/fast-crc32:neon_eor3_crc32_v9s3x2e_s3", # @manual
1415
"//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3", # @manual
1516
"//folly/external/fast-crc32:sse_crc32c_v8s3x3", # @manual
1617
"//folly/hash/detail:checksum_detail",

folly/hash/Checksum.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <folly/detail/TrapOnAvx512.h>
2626
#include <folly/external/fast-crc32/avx512_crc32c_v8s3x4.h> // @manual
2727
#include <folly/external/fast-crc32/neon_crc32c_v3s4x2e_v2.h> // @manual
28+
#include <folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.h> // @manual
2829
#include <folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4_s3.h> // @manual
2930
#include <folly/external/fast-crc32/sse_crc32c_v8s3x3.h> // @manual
3031
#include <folly/hash/detail/ChecksumDetail.h>
@@ -100,6 +101,10 @@ bool crc32c_hw_supported_neon_eor3_sha3() {
100101
return false;
101102
}
102103

104+
bool crc32_hw_supported_neon_eor3_sha3() {
105+
return false;
106+
}
107+
103108
#elif FOLLY_ARM_FEATURE_CRC32
104109

105110
// crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp
@@ -121,6 +126,11 @@ bool crc32c_hw_supported_neon() {
121126
return has_neon;
122127
}
123128

129+
bool crc32_hw_supported_neon_eor3_sha3() {
130+
static bool has_neon_eor3 = has_neon_eor3_crc32_v9s3x2e_s3();
131+
return has_neon_eor3;
132+
}
133+
124134
bool crc32c_hw_supported_neon_eor3_sha3() {
125135
static bool has_neon_eor3 = has_neon_eor3_crc32c_v8s2x4_s3();
126136
return has_neon_eor3;
@@ -159,6 +169,10 @@ bool crc32c_hw_supported_neon() {
159169
return false;
160170
}
161171

172+
bool crc32_hw_supported_neon_eor3_sha3() {
173+
return false;
174+
}
175+
162176
bool crc32c_hw_supported_neon_eor3_sha3() {
163177
return false;
164178
}
@@ -230,6 +244,12 @@ uint32_t crc32c(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
230244
}
231245

232246
uint32_t crc32(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
247+
#if FOLLY_AARCH64
248+
if (nbytes >= 2048 && detail::crc32_hw_supported_neon_eor3_sha3()) {
249+
return detail::neon_eor3_crc32_v9s3x2e_s3(data, nbytes, startingChecksum);
250+
}
251+
#endif
252+
233253
if (detail::crc32_hw_supported()) {
234254
return detail::crc32_hw(data, nbytes, startingChecksum);
235255
} else {

folly/hash/test/BUCK

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ cpp_unittest(
1313
"//folly:random",
1414
"//folly/external/fast-crc32:avx512_crc32c_v8s3x4",
1515
"//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2",
16+
"//folly/external/fast-crc32:neon_eor3_crc32_v9s3x2e_s3",
1617
"//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3",
1718
"//folly/external/fast-crc32:sse_crc32c_v8s3x3",
1819
"//folly/hash:checksum",

0 commit comments

Comments
 (0)