Add NEON fast-crc32 implementation

Nicoshev · facebook-github-bot · commit b028517fc58f · 2025-02-25T06:34:11.000-08:00
Summary:
Folly has a fast crc32c NEON implementation, but lacks an equivalent for crc32.
This diff adds a fast crc32 NEON-based algorithm.

New implementation is almost 3 times faster for long inputs:

before:

crc32_2048                                                 84.50ns    11.83M
crc32_4096                                                166.60ns     6.00M
crc32_8192                                                324.88ns     3.08M
crc32_16384                                               641.44ns     1.56M
crc32_32768                                                 1.27us   784.50K
crc32_65536                                                 3.20us   312.04K
crc32_131072                                                5.75us   173.80K
crc32_262144                                               10.80us    92.61K
crc32_524288                                               21.06us    47.48K

after:

crc32_2048                                                 79.40ns    12.59M
crc32_4096                                                120.85ns     8.27M
crc32_8192                                                190.63ns     5.25M
crc32_16384                                               314.20ns     3.18M
crc32_32768                                               561.82ns     1.78M
crc32_65536                                                 1.04us   962.04K
crc32_131072                                                1.99us   502.21K
crc32_262144                                                3.88us   257.48K
crc32_524288                                                7.69us   130.11K

Reviewed By: Gownta

Differential Revision: D70103378

fbshipit-source-id: 84c49edd6dc7b2ef5af48003f37613ae0e495dbd
diff --git a/folly/external/fast-crc32/BUCK b/folly/external/fast-crc32/BUCK
@@ -62,11 +62,20 @@ cpp_library(
     headers = [
         "neon_eor3_crc32c_v8s2x4_s3.h",
     ],
-    arch_preprocessor_flags = {
-        "aarch64": [
-            "-march=armv8.2-a+crypto+crc+sha3",
-        ],
-    },
+    deps = [
+        "//folly:portability",
+        "//folly/system:aux_vector",
+    ],
+)
+
+cpp_library(
+    name = "neon_eor3_crc32_v9s3x2e_s3",
+    srcs = [
+        "neon_eor3_crc32_v9s3x2e_s3.cpp",
+    ],
+    headers = [
+        "neon_eor3_crc32_v9s3x2e_s3.h",
+    ],
     deps = [
         "//folly:portability",
         "//folly/system:aux_vector",
diff --git a/folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.cpp b/folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.cpp
@@ -0,0 +1,215 @@
+/* @Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon_eor3 -p crc32 -a v9s3x2e_s3 */
+/* MIT licensed */
+
+#include "folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.h"
+#include <folly/system/AuxVector.h> // @manual
+#include <folly/Portability.h>
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRC_EXPORT extern
+
+#if !(FOLLY_AARCH64 && FOLLY_NEON && FOLLY_ARM_FEATURE_CRYPTO && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_SHA3)
+#include <stdlib.h>
+namespace folly::detail {
+CRC_EXPORT uint32_t neon_eor3_crc32_v9s3x2e_s3(const uint8_t*, size_t, uint32_t) {
+  abort(); // not implemented on this platform
+}
+
+CRC_EXPORT bool has_neon_eor3_crc32_v9s3x2e_s3() {
+  return false;
+}
+}
+#else
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+#if defined(_MSC_VER)
+#define CRC_AINLINE static __forceinline
+#define CRC_ALIGN(n) __declspec(align(n))
+#else
+#define CRC_AINLINE static __inline __attribute__((always_inline))
+#define CRC_ALIGN(n) __attribute__((aligned(n)))
+#endif
+
+namespace folly::detail {
+CRC_AINLINE uint64x2_t clmul_lo(uint64x2_t a, uint64x2_t b) {
+  return vreinterpretq_u64_p128(vmull_p64(a[0], b[0]));
+}
+
+CRC_AINLINE uint64x2_t clmul_hi(uint64x2_t a, uint64x2_t b) {
+  return vreinterpretq_u64_p128(vmull_high_p64(vreinterpretq_p64_u64(a), vreinterpretq_p64_u64(b)));
+}
+
+CRC_AINLINE uint64x2_t clmul_scalar(uint32_t a, uint32_t b) {
+  return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+
+static uint32_t xnmodp(uint64_t n) /* x^n mod P, in log(n) time */ {
+  uint64_t stack = ~(uint64_t)1;
+  uint32_t acc, low;
+  for (; n > 191; n = (n >> 1) - 16) {
+    stack = (stack << 1) + (n & 1);
+  }
+  stack = ~stack;
+  acc = ((uint32_t)0x80000000) >> (n & 31);
+  for (n >>= 5; n; --n) {
+    acc = __crc32w(acc, 0);
+  }
+  while ((low = stack & 1), stack >>= 1) {
+    poly8x8_t x = vreinterpret_p8_u64(vmov_n_u64(acc));
+    uint64_t y = vgetq_lane_u64(vreinterpretq_u64_p16(vmull_p8(x, x)), 0);
+    acc = __crc32d(0, y << low);
+  }
+  return acc;
+}
+
+CRC_AINLINE uint64x2_t crc_shift(uint32_t crc, size_t nbytes) {
+  return clmul_scalar(crc, xnmodp(nbytes * 8 - 33));
+}
+
+FOLLY_TARGET_ATTRIBUTE("+crc")
+CRC_EXPORT bool has_neon_eor3_crc32_v9s3x2e_s3() {
+  static ElfHwCaps caps;
+
+  return caps.aarch64_fp() && caps.aarch64_asimd() && caps.aarch64_pmull() &&
+      caps.aarch64_crc32() && caps.aarch64_sha3();
+}
+
+CRC_EXPORT uint32_t neon_eor3_crc32_v9s3x2e_s3(const uint8_t* buf, size_t len, uint32_t crc0) {
+  for (; len && ((uintptr_t)buf & 7); --len) {
+    crc0 = __crc32b(crc0, *buf++);
+  }
+  if (((uintptr_t)buf & 8) && len >= 8) {
+    crc0 = __crc32d(crc0, *(const uint64_t*)buf);
+    buf += 8;
+    len -= 8;
+  }
+  if (len >= 192) {
+    const uint8_t* end = buf + len;
+    size_t blk = (len - 0) / 192;
+    size_t klen = blk * 16;
+    const uint8_t* buf2 = buf + klen * 3;
+    const uint8_t* limit = buf + klen - 32;
+    uint32_t crc1 = 0;
+    uint32_t crc2 = 0;
+    uint64x2_t vc0;
+    uint64x2_t vc1;
+    uint64x2_t vc2;
+    uint64_t vc;
+    /* First vector chunk. */
+    uint64x2_t x0 = vld1q_u64((const uint64_t*)buf2), y0;
+    uint64x2_t x1 = vld1q_u64((const uint64_t*)(buf2 + 16)), y1;
+    uint64x2_t x2 = vld1q_u64((const uint64_t*)(buf2 + 32)), y2;
+    uint64x2_t x3 = vld1q_u64((const uint64_t*)(buf2 + 48)), y3;
+    uint64x2_t x4 = vld1q_u64((const uint64_t*)(buf2 + 64)), y4;
+    uint64x2_t x5 = vld1q_u64((const uint64_t*)(buf2 + 80)), y5;
+    uint64x2_t x6 = vld1q_u64((const uint64_t*)(buf2 + 96)), y6;
+    uint64x2_t x7 = vld1q_u64((const uint64_t*)(buf2 + 112)), y7;
+    uint64x2_t x8 = vld1q_u64((const uint64_t*)(buf2 + 128)), y8;
+    uint64x2_t k;
+    { static const uint64_t CRC_ALIGN(16) k_[] = {0x26b70c3d, 0x3f41287a}; k = vld1q_u64(k_); }
+    buf2 += 144;
+    /* Main loop. */
+    while (buf <= limit) {
+      y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+      y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+      y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+      y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+      y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+      y5 = clmul_lo(x5, k), x5 = clmul_hi(x5, k);
+      y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+      y7 = clmul_lo(x7, k), x7 = clmul_hi(x7, k);
+      y8 = clmul_lo(x8, k), x8 = clmul_hi(x8, k);
+      x0 = veor3q_u64(x0, y0, vld1q_u64((const uint64_t*)buf2));
+      x1 = veor3q_u64(x1, y1, vld1q_u64((const uint64_t*)(buf2 + 16)));
+      x2 = veor3q_u64(x2, y2, vld1q_u64((const uint64_t*)(buf2 + 32)));
+      x3 = veor3q_u64(x3, y3, vld1q_u64((const uint64_t*)(buf2 + 48)));
+      x4 = veor3q_u64(x4, y4, vld1q_u64((const uint64_t*)(buf2 + 64)));
+      x5 = veor3q_u64(x5, y5, vld1q_u64((const uint64_t*)(buf2 + 80)));
+      x6 = veor3q_u64(x6, y6, vld1q_u64((const uint64_t*)(buf2 + 96)));
+      x7 = veor3q_u64(x7, y7, vld1q_u64((const uint64_t*)(buf2 + 112)));
+      x8 = veor3q_u64(x8, y8, vld1q_u64((const uint64_t*)(buf2 + 128)));
+      crc0 = __crc32d(crc0, *(const uint64_t*)buf);
+      crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
+      crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
+      crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
+      crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
+      crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
+      buf += 16;
+      buf2 += 144;
+    }
+    /* Reduce x0 ... x8 to just x0. */
+    { static const uint64_t CRC_ALIGN(16) k_[] = {0xae689191, 0xccaa009e}; k = vld1q_u64(k_); }
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    x0 = veor3q_u64(x0, y0, x1);
+    x1 = x2, x2 = x3, x3 = x4, x4 = x5, x5 = x6, x6 = x7, x7 = x8;
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+    y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+    y6 = clmul_lo(x6, k), x6 = clmul_hi(x6, k);
+    x0 = veor3q_u64(x0, y0, x1);
+    x2 = veor3q_u64(x2, y2, x3);
+    x4 = veor3q_u64(x4, y4, x5);
+    x6 = veor3q_u64(x6, y6, x7);
+    { static const uint64_t CRC_ALIGN(16) k_[] = {0xf1da05aa, 0x81256527}; k = vld1q_u64(k_); }
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    y4 = clmul_lo(x4, k), x4 = clmul_hi(x4, k);
+    x0 = veor3q_u64(x0, y0, x2);
+    x4 = veor3q_u64(x4, y4, x6);
+    { static const uint64_t CRC_ALIGN(16) k_[] = {0x8f352d95, 0x1d9513d7}; k = vld1q_u64(k_); }
+    y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+    x0 = veor3q_u64(x0, y0, x4);
+    /* Final scalar chunk. */
+    crc0 = __crc32d(crc0, *(const uint64_t*)buf);
+    crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
+    crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
+    crc0 = __crc32d(crc0, *(const uint64_t*)(buf + 8));
+    crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen + 8));
+    crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2 + 8));
+    vc0 = crc_shift(crc0, klen * 2 + blk * 144);
+    vc1 = crc_shift(crc1, klen + blk * 144);
+    vc2 = crc_shift(crc2, 0 + blk * 144);
+    vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);
+    /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+    crc0 = __crc32d(0, vgetq_lane_u64(x0, 0));
+    crc0 = __crc32d(crc0, vc ^ vgetq_lane_u64(x0, 1));
+    buf = buf2;
+    len = end - buf;
+  }
+  if (len >= 32) {
+    size_t klen = ((len - 8) / 24) * 8;
+    uint32_t crc1 = 0;
+    uint32_t crc2 = 0;
+    uint64x2_t vc0;
+    uint64x2_t vc1;
+    uint64_t vc;
+    /* Main loop. */
+    do {
+      crc0 = __crc32d(crc0, *(const uint64_t*)buf);
+      crc1 = __crc32d(crc1, *(const uint64_t*)(buf + klen));
+      crc2 = __crc32d(crc2, *(const uint64_t*)(buf + klen * 2));
+      buf += 8;
+      len -= 24;
+    } while (len >= 32);
+    vc0 = crc_shift(crc0, klen * 2 + 8);
+    vc1 = crc_shift(crc1, klen + 8);
+    vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);
+    /* Final 8 bytes. */
+    buf += klen * 2;
+    crc0 = crc2;
+    crc0 = __crc32d(crc0, *(const uint64_t*)buf ^ vc), buf += 8;
+    len -= 8;
+  }
+  for (; len >= 8; buf += 8, len -= 8) {
+    crc0 = __crc32d(crc0, *(const uint64_t*)buf);
+  }
+  for (; len; --len) {
+    crc0 = __crc32b(crc0, *buf++);
+  }
+  return crc0;
+}
+} // namespace folly::detail
+#endif
diff --git a/folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.h b/folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.h
@@ -0,0 +1,8 @@
+#pragma once
+#include <cstddef>
+#include <cstdint>
+
+namespace folly::detail {
+uint32_t neon_eor3_crc32_v9s3x2e_s3(const uint8_t* buf, size_t len, uint32_t crc0);
+bool has_neon_eor3_crc32_v9s3x2e_s3();
+}
diff --git a/folly/hash/BUCK b/folly/hash/BUCK
@@ -11,6 +11,7 @@ cpp_library(
         "//folly/detail:traponavx512",
         "//folly/external/fast-crc32:avx512_crc32c_v8s3x4",  # @manual
         "//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2",  # @manual
+        "//folly/external/fast-crc32:neon_eor3_crc32_v9s3x2e_s3",  # @manual
         "//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3",  # @manual
         "//folly/external/fast-crc32:sse_crc32c_v8s3x3",  # @manual
         "//folly/hash/detail:checksum_detail",
diff --git a/folly/hash/Checksum.cpp b/folly/hash/Checksum.cpp
@@ -25,6 +25,7 @@
 #include <folly/detail/TrapOnAvx512.h>
 #include <folly/external/fast-crc32/avx512_crc32c_v8s3x4.h> // @manual
 #include <folly/external/fast-crc32/neon_crc32c_v3s4x2e_v2.h> // @manual
+#include <folly/external/fast-crc32/neon_eor3_crc32_v9s3x2e_s3.h> // @manual
 #include <folly/external/fast-crc32/neon_eor3_crc32c_v8s2x4_s3.h> // @manual
 #include <folly/external/fast-crc32/sse_crc32c_v8s3x3.h> // @manual
 #include <folly/hash/detail/ChecksumDetail.h>
@@ -100,6 +101,10 @@ bool crc32c_hw_supported_neon_eor3_sha3() {
   return false;
 }
 
+bool crc32_hw_supported_neon_eor3_sha3() {
+  return false;
+}
+
 #elif FOLLY_ARM_FEATURE_CRC32
 
 // crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp
@@ -121,6 +126,11 @@ bool crc32c_hw_supported_neon() {
   return has_neon;
 }
 
+bool crc32_hw_supported_neon_eor3_sha3() {
+  static bool has_neon_eor3 = has_neon_eor3_crc32_v9s3x2e_s3();
+  return has_neon_eor3;
+}
+
 bool crc32c_hw_supported_neon_eor3_sha3() {
   static bool has_neon_eor3 = has_neon_eor3_crc32c_v8s2x4_s3();
   return has_neon_eor3;
@@ -159,6 +169,10 @@ bool crc32c_hw_supported_neon() {
   return false;
 }
 
+bool crc32_hw_supported_neon_eor3_sha3() {
+  return false;
+}
+
 bool crc32c_hw_supported_neon_eor3_sha3() {
   return false;
 }
@@ -230,6 +244,12 @@ uint32_t crc32c(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
 }
 
 uint32_t crc32(const uint8_t* data, size_t nbytes, uint32_t startingChecksum) {
+#if FOLLY_AARCH64
+  if (nbytes >= 2048 && detail::crc32_hw_supported_neon_eor3_sha3()) {
+    return detail::neon_eor3_crc32_v9s3x2e_s3(data, nbytes, startingChecksum);
+  }
+#endif
+
   if (detail::crc32_hw_supported()) {
     return detail::crc32_hw(data, nbytes, startingChecksum);
   } else {
diff --git a/folly/hash/test/BUCK b/folly/hash/test/BUCK
@@ -13,6 +13,7 @@ cpp_unittest(
         "//folly:random",
         "//folly/external/fast-crc32:avx512_crc32c_v8s3x4",
         "//folly/external/fast-crc32:neon_crc32c_v3s4x2e_v2",
+        "//folly/external/fast-crc32:neon_eor3_crc32_v9s3x2e_s3",
         "//folly/external/fast-crc32:neon_eor3_crc32c_v8s2x4_s3",
         "//folly/external/fast-crc32:sse_crc32c_v8s3x3",
         "//folly/hash:checksum",