Skip to content

Commit 9927576

Browse files
Abseil Team authored and copybara-github committed
Use even faster reduction algorithm in FinalizePclmulStream()
My previous CL optimized the Barrett reduction. But since this is CRC32C and scalar instructions for it are available, there is actually no need for Barrett reduction at all. Just use two 64-bit CRC32C instructions to reduce fullCRC. This improves CRC32C performance on 2048-byte messages on Skylake by another 2% or so.

PiperOrigin-RevId: 739977426
Change-Id: I4611af88cd32ed7a995e772a13c30e3bdcec8de9
1 parent d79e680 commit 9927576

File tree

2 files changed

+5
-61
lines changed

2 files changed

+5
-61
lines changed

absl/crc/internal/crc32_x86_arm_combined_simd.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -99,19 +99,12 @@ V128 V128_PMul10(const V128 l, const V128 r);
9999
// Produces a XOR operation of |l| and |r|.
100100
V128 V128_Xor(const V128 l, const V128 r);
101101

102-
// Produces an AND operation of |l| and |r|.
103-
V128 V128_And(const V128 l, const V128 r);
104-
105102
// Sets the lower half of a 128 bit register to the given 64-bit value and
106103
// zeroes the upper half.
107104
// dst[63:0] := |r|
108105
// dst[127:64] := |0|
109106
V128 V128_From64WithZeroFill(const uint64_t r);
110107

111-
// Shift |l| right by |imm| bytes while shifting in zeros.
112-
template <int imm>
113-
V128 V128_ShiftRight(const V128 l);
114-
115108
// Extracts a 32-bit integer from |l|, selected with |imm|.
116109
template <int imm>
117110
int V128_Extract32(const V128 l);
@@ -170,17 +163,10 @@ inline V128 V128_PMul10(const V128 l, const V128 r) {
170163

171164
inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
172165

173-
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
174-
175166
inline V128 V128_From64WithZeroFill(const uint64_t r) {
176167
return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
177168
}
178169

179-
template <int imm>
180-
inline V128 V128_ShiftRight(const V128 l) {
181-
return _mm_srli_si128(l, imm);
182-
}
183-
184170
template <int imm>
185171
inline int V128_Extract32(const V128 l) {
186172
return _mm_extract_epi32(l, imm);
@@ -261,20 +247,12 @@ inline V128 V128_PMul10(const V128 l, const V128 r) {
261247

262248
inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
263249

264-
inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
265-
266250
inline V128 V128_From64WithZeroFill(const uint64_t r){
267251
constexpr uint64x2_t kZero = {0, 0};
268252
return vsetq_lane_u64(r, kZero, 0);
269253
}
270254

271255

272-
template <int imm>
273-
inline V128 V128_ShiftRight(const V128 l) {
274-
return vreinterpretq_u64_s8(
275-
vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
276-
}
277-
278256
template <int imm>
279257
inline int V128_Extract32(const V128 l) {
280258
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);

absl/crc/internal/crc_x86_arm_combined.cc

Lines changed: 5 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -288,40 +288,11 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
288288
V128 fullCRC = V128_Xor(low, high);
289289
fullCRC = V128_Xor(fullCRC, partialCRC2);
290290

291-
// Reduce the 128-bit polynomial fullCRC into a 32-bit scalar value.
292-
293-
// Multiply fullCRC by x^32 (as required for CRC32C) and reduce it to 96
294-
// bits. Call that result t0. Store 'x^32 * t0' back into fullCRC.
295-
//
296-
// More concretely, multiply the high 64 terms by '(x^95 mod G) * x^32 * x'
297-
// and the low 64 terms by x^64, then add the two products together.
298-
//
299-
// Note that CRC32C is a least-significant-bit-first CRC. Therefore, the
300-
// mapping between bits and polynomial coefficients is reflected, and each
301-
// carryless multiplication implicitly introduces an extra factor of x.
302-
fullCRC = V128_Xor(V128_PMul10(fullCRC, reductionMultiplicands),
303-
V128_ShiftRight<8>(fullCRC));
304-
305-
// Calculate floor(t0 / G), i.e. the value by which G needs to be multiplied
306-
// to cancel out the x^32 and higher terms of t0. To do this, first do:
307-
//
308-
// t1 := floor(t0 / x^32) * floor(x^95 / G) * x
309-
//
310-
// Then, floor(t1 / x^64) contains the desired value floor(t0 / G).
311-
reductionMultiplicands =
312-
V128_Load(reinterpret_cast<const V128*>(kBarrettReduction));
313-
V128 t1 = V128_PMulLow(fullCRC, reductionMultiplicands);
314-
315-
// Cancel out the x^32 and higher terms of t0 by subtracting the needed
316-
// multiple of G. But since fullCRC contains x^32 * t0, it is more
317-
// convenient to use the same multiple of G * x^32. Also, the x^32 term of G
318-
// makes no difference in the result if mod x^32 is applied (as
319-
// V128_Extract32 does). Thus, the calculation is as follows:
320-
//
321-
// fullCRC := fullCRC - (floor(t0 / G) * ((G - x^32) * x^31) * x)
322-
// crc := (fullCRC / x^32) mod x^32
323-
fullCRC = V128_Xor(fullCRC, V128_PMul10(t1, reductionMultiplicands));
324-
return static_cast<uint64_t>(V128_Extract32<2>(fullCRC));
291+
// Reduce fullCRC into scalar value.
292+
uint32_t crc = 0;
293+
crc = CRC32_u64(crc, V128_Extract64<0>(fullCRC));
294+
crc = CRC32_u64(crc, V128_Extract64<1>(fullCRC));
295+
return crc;
325296
}
326297

327298
// Update crc with 64 bytes of data from p.
@@ -352,11 +323,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
352323
0x00000000f20c0dfe,
353324
// (x^95 mod G) * x^32
354325
0x00000000493c7d27};
355-
alignas(16) static constexpr uint64_t kBarrettReduction[2] = {
356-
// floor(x^95 / G)
357-
0x4869ec38dea713f1,
358-
// (G - x^32) * x^31
359-
0x0000000105ec76f0};
360326

361327
// Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
362328
// size. Each group is CRCed in parallel then combined at the end of the

0 commit comments

Comments
 (0)