
Commit 5ad0bfb

nafi3000 authored and copybara-github committed
Optimize CRC32AcceleratedX86ARMCombinedMultipleStreams::Extend by interleaving the CRC32_u64 calls at a lower level.

`CRC32_u64` generates the `CRC32` x86 instruction, which has a 3-cycle latency. Because of that, the `crc` variable below causes a loop-carried dependency of 3 cycles per iteration:

```
for (int i = 0; i < 8; i++) {
  crc = CRC32_u64(static_cast<uint32_t>(crc), absl::little_endian::Load64(p));
  p += 8;
}
```

Total latency for a 64-byte block is 29 cycles (codegen: https://godbolt.org/z/zxsrGMEPs, llvm-mca: https://godbolt.org/z/xrTMhhd1E).

So it is more efficient to interleave the `CRC32_u64` calls at a lower level (up to 3 calls, because of the 3-cycle latency). Even when interleaving 3 streams, the total latency for three 64-byte blocks is 33 cycles (codegen: https://godbolt.org/z/5ojzPdj3h, llvm-mca: https://godbolt.org/z/5cEPxvddW). And this is without considering any inlining.

PiperOrigin-RevId: 799757460
Change-Id: I80118d5c1736ae31d69e5624c94cc0a6513ef28f
1 parent 5d51d83 commit 5ad0bfb
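
To make the commit message concrete: the eight `CRC32_u64` calls in the original loop form a single dependency chain, so eight dependent 3-cycle `CRC32` instructions give roughly 24 cycles of unavoidable chain latency per 64-byte block, consistent with the ~29 cycles measured above once loads and loop overhead are included. With three independent chains those latencies overlap, and three blocks finish in nearly the time one block used to take. Below is a minimal standalone sketch of the before/after loop shape; it is illustrative only, not the Abseil implementation: it uses the raw SSE4.2 intrinsic `_mm_crc32_u64` and `memcpy` loads in place of Abseil's `CRC32_u64` and `absl::little_endian::Load64`, and the names `Crc64Bytes` / `Crc64Bytes3Streams` are hypothetical.

```cpp
// Sketch only: assumes an x86-64 target with SSE4.2 (compile with -msse4.2).
#include <nmmintrin.h>  // _mm_crc32_u64

#include <cstdint>
#include <cstring>

// "Before" shape: one 64-byte block. Every _mm_crc32_u64 result feeds the
// next call, so the 3-cycle latencies are fully serialized.
inline uint64_t Crc64Bytes(uint64_t crc, const uint8_t* p) {
  for (int i = 0; i < 8; i++) {
    uint64_t v;
    std::memcpy(&v, p, sizeof(v));  // unaligned 8-byte load (little-endian on x86)
    crc = _mm_crc32_u64(static_cast<uint32_t>(crc), v);
    p += 8;
  }
  return crc;
}

// "After" shape: three independent 64-byte blocks. The three CRC chains do
// not depend on each other, so the CPU can overlap their latencies.
inline void Crc64Bytes3Streams(const uint8_t* p0, const uint8_t* p1,
                               const uint8_t* p2, uint64_t crc[3]) {
  uint64_t c0 = crc[0], c1 = crc[1], c2 = crc[2];
  for (int i = 0; i < 8; i++) {
    uint64_t v0, v1, v2;
    std::memcpy(&v0, p0, sizeof(v0));
    std::memcpy(&v1, p1, sizeof(v1));
    std::memcpy(&v2, p2, sizeof(v2));
    c0 = _mm_crc32_u64(static_cast<uint32_t>(c0), v0);
    c1 = _mm_crc32_u64(static_cast<uint32_t>(c1), v1);
    c2 = _mm_crc32_u64(static_cast<uint32_t>(c2), v2);
    p0 += 8;
    p1 += 8;
    p2 += 8;
  }
  crc[0] = c0;
  crc[1] = c1;
  crc[2] = c2;
}
```

Three streams are the sweet spot here because, as the commit message notes, the instruction's latency is 3 cycles: with one `CRC32` typically issuing per cycle, three independent chains are enough to keep the unit busy, so a fourth chain would not hide any additional latency.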

File tree

1 file changed: +67 −13 lines changed

absl/crc/internal/crc_x86_arm_combined.cc

Lines changed: 67 additions & 13 deletions
@@ -317,6 +317,46 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
     return crc;
   }
 
+  // Same as Process64BytesCRC, but just interleaved for 2 streams.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesCRC2Streams(
+      const uint8_t* p0, const uint8_t* p1, uint64_t* crc) const {
+    uint64_t crc0 = crc[0];
+    uint64_t crc1 = crc[1];
+    for (int i = 0; i < 8; i++) {
+      crc0 = CRC32_u64(static_cast<uint32_t>(crc0),
+                       absl::little_endian::Load64(p0));
+      crc1 = CRC32_u64(static_cast<uint32_t>(crc1),
+                       absl::little_endian::Load64(p1));
+      p0 += 8;
+      p1 += 8;
+    }
+    crc[0] = crc0;
+    crc[1] = crc1;
+  }
+
+  // Same as Process64BytesCRC, but just interleaved for 3 streams.
+  ABSL_ATTRIBUTE_ALWAYS_INLINE void Process64BytesCRC3Streams(
+      const uint8_t* p0, const uint8_t* p1, const uint8_t* p2,
+      uint64_t* crc) const {
+    uint64_t crc0 = crc[0];
+    uint64_t crc1 = crc[1];
+    uint64_t crc2 = crc[2];
+    for (int i = 0; i < 8; i++) {
+      crc0 = CRC32_u64(static_cast<uint32_t>(crc0),
+                       absl::little_endian::Load64(p0));
+      crc1 = CRC32_u64(static_cast<uint32_t>(crc1),
+                       absl::little_endian::Load64(p1));
+      crc2 = CRC32_u64(static_cast<uint32_t>(crc2),
+                       absl::little_endian::Load64(p2));
+      p0 += 8;
+      p1 += 8;
+      p2 += 8;
+    }
+    crc[0] = crc0;
+    crc[1] = crc1;
+    crc[2] = crc2;
+  }
+
   // Constants generated by './scripts/gen-crc-consts.py x86_pclmul
   // crc32_lsb_0x82f63b78' from the Linux kernel.
   alignas(16) static constexpr uint64_t kFoldAcross512Bits[2] = {
@@ -452,9 +492,19 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
     uint64_t l64_pclmul[kMaxStreams] = {0};
 
     // Peel first iteration, because PCLMULQDQ stream, needs setup.
-    for (size_t i = 0; i < num_crc_streams; i++) {
-      l64_crc[i] = Process64BytesCRC(crc_streams[i], l64_crc[i]);
-      crc_streams[i] += 16 * 4;
+    if (num_crc_streams == 1) {
+      l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
+      crc_streams[0] += 16 * 4;
+    } else if (num_crc_streams == 2) {
+      Process64BytesCRC2Streams(crc_streams[0], crc_streams[1], l64_crc);
+      crc_streams[0] += 16 * 4;
+      crc_streams[1] += 16 * 4;
+    } else {
+      Process64BytesCRC3Streams(crc_streams[0], crc_streams[1],
+                                crc_streams[2], l64_crc);
+      crc_streams[0] += 16 * 4;
+      crc_streams[1] += 16 * 4;
+      crc_streams[2] += 16 * 4;
     }
 
     V128 partialCRC[kMaxStreams][4];
@@ -492,24 +542,28 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
       // }
       // But unrolling and interleaving PCLMULQDQ and CRC blocks manually
      // gives ~2% performance boost.
-      l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
-      crc_streams[0] += 16 * 4;
+      if (num_crc_streams == 1) {
+        l64_crc[0] = Process64BytesCRC(crc_streams[0], l64_crc[0]);
+        crc_streams[0] += 16 * 4;
+      } else if (num_crc_streams == 2) {
+        Process64BytesCRC2Streams(crc_streams[0], crc_streams[1], l64_crc);
+        crc_streams[0] += 16 * 4;
+        crc_streams[1] += 16 * 4;
+      } else {
+        Process64BytesCRC3Streams(crc_streams[0], crc_streams[1],
+                                  crc_streams[2], l64_crc);
+        crc_streams[0] += 16 * 4;
+        crc_streams[1] += 16 * 4;
+        crc_streams[2] += 16 * 4;
+      }
       if (num_pclmul_streams > 0) {
        Process64BytesPclmul(pclmul_streams[0], partialCRC[0]);
        pclmul_streams[0] += 16 * 4;
      }
-      if (num_crc_streams > 1) {
-        l64_crc[1] = Process64BytesCRC(crc_streams[1], l64_crc[1]);
-        crc_streams[1] += 16 * 4;
-      }
      if (num_pclmul_streams > 1) {
        Process64BytesPclmul(pclmul_streams[1], partialCRC[1]);
        pclmul_streams[1] += 16 * 4;
      }
-      if (num_crc_streams > 2) {
-        l64_crc[2] = Process64BytesCRC(crc_streams[2], l64_crc[2]);
-        crc_streams[2] += 16 * 4;
-      }
      if (num_pclmul_streams > 2) {
        Process64BytesPclmul(pclmul_streams[2], partialCRC[2]);
        pclmul_streams[2] += 16 * 4;