
Commit 5ad0bfb

nafi3000 authored and copybara-github committed
Optimize CRC32AcceleratedX86ARMCombinedMultipleStreams::Extend by interleaving the CRC32_u64 calls at a lower level.

`CRC32_u64` generates the `CRC32` x86 instruction, which has a 3-cycle latency. Because of that, the `crc` variable below causes a loop-carried dependency of 3 cycles per iteration:

```
for (int i = 0; i < 8; i++) {
  crc = CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
  p += 8;
}
```

Total latency for a 64-byte block is 29 cycles (codegen: https://godbolt.org/z/zxsrGMEPs, llvm-mca: https://godbolt.org/z/xrTMhhd1E).

So it is more efficient to interleave the `CRC32_u64` calls at a lower level (up to 3 calls, because of the 3-cycle latency). Even when interleaving 3 streams, the total latency for three 64-byte blocks is 33 cycles (codegen: https://godbolt.org/z/5ojzPdj3h, llvm-mca: https://godbolt.org/z/5cEPxvddW). And this is without considering any inlining.

PiperOrigin-RevId: 799757460
Change-Id: I80118d5c1736ae31d69e5624c94cc0a6513ef28f
1 parent 5d51d83 commit 5ad0bfb
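
To make the commit message concrete: the eight `CRC32_u64` calls in the original loop form a single dependency chain, so eight dependent 3-cycle `CRC32` instructions give roughly 24 cycles of unavoidable chain latency per 64-byte block, consistent with the ~29 cycles measured above once loads and loop overhead are included. With three independent chains those latencies overlap, and three blocks finish in nearly the time one block used to take. Below is a minimal standalone sketch of the before/after loop shape; it is illustrative only, not the Abseil implementation: it uses the raw SSE4.2 intrinsic `_mm_crc32_u64` and `memcpy` loads in place of Abseil's `CRC32_u64` and `absl::little_endian::Load64`, and the names `Crc64Bytes` / `Crc64Bytes3Streams` are hypothetical.

```cpp
// Sketch only: assumes an x86-64 target with SSE4.2 (compile with -msse4.2).
#include <nmmintrin.h>  // _mm_crc32_u64

#include <cstdint>
#include <cstring>

// "Before" shape: one 64-byte block. Every _mm_crc32_u64 result feeds the
// next call, so the 3-cycle latencies are fully serialized.
inline uint64_t Crc64Bytes(uint64_t crc, const uint8_t* p) {
  for (int i = 0; i < 8; i++) {
    uint64_t v;
    std::memcpy(&v, p, sizeof(v));  // unaligned 8-byte load (little-endian on x86)
    crc = _mm_crc32_u64(static_cast<uint32_t>(crc), v);
    p += 8;
  }
  return crc;
}

// "After" shape: three independent 64-byte blocks. The three CRC chains do
// not depend on each other, so the CPU can overlap their latencies.
inline void Crc64Bytes3Streams(const uint8_t* p0, const uint8_t* p1,
                               const uint8_t* p2, uint64_t crc[3]) {
  uint64_t c0 = crc[0], c1 = crc[1], c2 = crc[2];
  for (int i = 0; i < 8; i++) {
    uint64_t v0, v1, v2;
    std::memcpy(&v0, p0, sizeof(v0));
    std::memcpy(&v1, p1, sizeof(v1));
    std::memcpy(&v2, p2, sizeof(v2));
    c0 = _mm_crc32_u64(static_cast<uint32_t>(c0), v0);
    c1 = _mm_crc32_u64(static_cast<uint32_t>(c1), v1);
    c2 = _mm_crc32_u64(static_cast<uint32_t>(c2), v2);
    p0 += 8;
    p1 += 8;
    p2 += 8;
  }
  crc[0] = c0;
  crc[1] = c1;
  crc[2] = c2;
}
```

Three streams are the sweet spot here because, as the commit message notes, the instruction's latency is 3 cycles: with one `CRC32` typically issuing per cycle, three independent chains are enough to keep the unit busy, so a fourth chain would not hide any additional latency.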

File tree

1 file changed: +67 −13 lines changed

absl/crc/internal/crc_x86_arm_combined.cc

Lines changed: 67 additions & 13 deletions
@@ -317,6 +317,46 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
     return crc;
   }
 
+  // Same as Process64BytesCRC, but just interleaved for 2 streams.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesCRC2Streams(
+      const uint8_t* p0, const uint8_t* p1, uint64_t* crc) const {
+    uint64_t crc0 = crc[0];
+    uint64_t crc1 = crc[1];
+    for (int i = 0; i < 8; i++) {
+      crc0 = CRC32_u64(static_cast<uint32_t>(crc0),
+                       absl::little_endian::Load64(p0));
+      crc1 = CRC32_u64(static_cast<uint32_t>(crc1),
+                       absl::little_endian::Load64(p1));
+      p0 += 8;
+      p1 += 8;
+    }
+    crc[0] = crc0;
+    crc[1] = crc1;
+  }
+
+  // Same as Process64BytesCRC, but just interleaved for 3 streams.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesCRC3Streams(
+      const uint8_t* p0, const uint8_t* p1, const uint8_t* p2,
+      uint64_t* crc) const {
+    uint64_t crc0 = crc[0];
+    uint64_t crc1 = crc[1];
+    uint64_t crc2 = crc[2];
+    for (int i = 0; i < 8; i++) {
+      crc0 = CRC32_u64(static_cast<uint32_t>(crc0),
+                       absl::little_endian::Load64(p0));
+      crc1 = CRC32_u64(static_cast<uint32_t>(crc1),
+                       absl::little_endian::Load64(p1));
+      crc2 = CRC32_u64(static_cast<uint32_t>(crc2),
+                       absl::little_endian::Load64(p2));
+      p0 += 8;
+      p1 += 8;
+      p2 += 8;
+    }
+    crc[0] = crc0;
+    crc[1] = crc1;
+    crc[2] = crc2;
+  }
+
   // Constants generated by './scripts/gen-crc-consts.py x86_pclmul
   // crc32_lsb_0x82f63b78' from the Linux kernel.
   alignas(16) static constexpr uint64_t kFoldAcross512Bits[2] = {
@@ -452,9 +492,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     uint64_t l64_pclmul[kMaxStreams] = {0};
 
     // Peel first iteration, because PCLMULQDQ stream, needs setup.
-    for (size_t i = 0; i < num_crc_streams; i++) {
-      l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
-      crc_streams[i] += 16 * 4;
+    if (num_crc_streams == 1) {
+      l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
+      crc_streams[0] += 16 * 4;
+    } else if (num_crc_streams == 2) {
+      Process64BytesCRC2Streams(crc_streams[0], crc_streams[1], l64_crc);
+      crc_streams[0] += 16 * 4;
+      crc_streams[1] += 16 * 4;
+    } else {
+      Process64BytesCRC3Streams(crc_streams[0], crc_streams[1],
+                                crc_streams[2], l64_crc);
+      crc_streams[0] += 16 * 4;
+      crc_streams[1] += 16 * 4;
+      crc_streams[2] += 16 * 4;
     }
 
     V128 partialCRC[kMaxStreams][4];
@@ -492,24 +542,28 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
       // }
       // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
      // gives ~2% performance boost.
-      l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
-      crc_streams[0] += 16 * 4;
+      if (num_crc_streams == 1) {
+        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
+        crc_streams[0] += 16 * 4;
+      } else if (num_crc_streams == 2) {
+        Process64BytesCRC2Streams(crc_streams[0], crc_streams[1], l64_crc);
+        crc_streams[0] += 16 * 4;
+        crc_streams[1] += 16 * 4;
+      } else {
+        Process64BytesCRC3Streams(crc_streams[0], crc_streams[1],
+                                  crc_streams[2], l64_crc);
+        crc_streams[0] += 16 * 4;
+        crc_streams[1] += 16 * 4;
+        crc_streams[2] += 16 * 4;
+      }
       if (num_pclmul_streams > 0) {
        Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
        pclmul_streams[0] += 16 * 4;
      }
-      if (num_crc_streams > 1) {
-        l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
-        crc_streams[1] += 16 * 4;
-      }
      if (num_pclmul_streams > 1) {
        Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
        pclmul_streams[1] += 16 * 4;
      }
-      if (num_crc_streams > 2) {
-        l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
-        crc_streams[2] += 16 * 4;
-      }
      if (num_pclmul_streams > 2) {
        Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
        pclmul_streams[2] += 16 * 4;