Skip to content

Commit 274c813

Browse files
conNULLcopybara-github
authored and committed
Optimize crc32 Extend by removing obsolete length alignment.
Currently, at the start of the Extend() call, we process some number of bytes to align the length to a multiple of 16. However, for large inputs we then process another small number of bytes to align the next load address to 8 bytes, undoing the length alignment. At the end of the call, we process the remaining bytes anyway. The initial length alignment is therefore not useful, since it is undone. We never return early here for small inputs, since this function is only used for lengths > 64. Removing this reduces the amount of time we spend processing only a small number of bytes at a time. Also, we can optimize processing the remaining bytes at the end by leveraging the CRC32 instructions for 2, 4, and 8 bytes. This looks to be about 2-5% faster on various platforms for typical input sizes. PiperOrigin-RevId: 793697720 Change-Id: Ibe71a51c851863ad40acef7d334694a9ac930f4d
1 parent bd05c09 commit 274c813

File tree

1 file changed

+24
-43
lines changed

1 file changed

+24
-43
lines changed

absl/crc/internal/crc_x86_arm_combined.cc

Lines changed: 24 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,8 @@ template <size_t num_crc_streams, size_t num_pclmul_streams,
350350
class CRC32AcceleratedX86ARMCombinedMultipleStreams
351351
: public CRC32AcceleratedX86ARMCombinedMultipleStreamsBase {
352352
ABSL_ATTRIBUTE_HOT
353-
void Extend(uint32_t* crc, const void* bytes, size_t length) const override {
353+
void Extend(uint32_t* crc, const void* bytes,
354+
const size_t length) const override {
354355
static_assert(num_crc_streams >= 1 && num_crc_streams <= kMaxStreams,
355356
"Invalid number of crc streams");
356357
static_assert(num_pclmul_streams >= 0 && num_pclmul_streams <= kMaxStreams,
@@ -360,47 +361,15 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
360361
uint32_t l = *crc;
361362
uint64_t l64;
362363

363-
// We have dedicated instruction for 1,2,4 and 8 bytes.
364-
if (length & 8) {
365-
ABSL_INTERNAL_STEP8(l, p);
366-
length &= ~size_t{8};
367-
}
368-
if (length & 4) {
369-
ABSL_INTERNAL_STEP4(l, p);
370-
length &= ~size_t{4};
371-
}
372-
if (length & 2) {
373-
ABSL_INTERNAL_STEP2(l, p);
374-
length &= ~size_t{2};
375-
}
376-
if (length & 1) {
377-
ABSL_INTERNAL_STEP1(l, p);
378-
length &= ~size_t{1};
379-
}
380-
if (length == 0) {
381-
*crc = l;
382-
return;
383-
}
384-
// length is now multiple of 16.
385-
386364
// For small blocks just run simple loop, because cost of combining multiple
387365
// streams is significant.
388-
if (strategy != CutoffStrategy::Unroll64CRC) {
389-
if (length < kSmallCutoff) {
390-
while (length >= 16) {
391-
ABSL_INTERNAL_STEP8(l, p);
392-
ABSL_INTERNAL_STEP8(l, p);
393-
length -= 16;
394-
}
395-
*crc = l;
396-
return;
397-
}
398-
}
399-
400-
// For medium blocks we run 3 crc streams and combine them as described in
401-
// Intel paper above. Running 4th stream doesn't help, because crc
402-
// instruction has latency 3 and throughput 1.
403-
if (length < kMediumCutoff) {
366+
if (strategy != CutoffStrategy::Unroll64CRC && (length < kSmallCutoff)) {
367+
// fallthrough; Use the same strategy as we do for processing the
368+
// remaining bytes after any other strategy.
369+
} else if (length < kMediumCutoff) {
370+
// For medium blocks we run 3 crc streams and combine them as described in
371+
// Intel paper above. Running 4th stream doesn't help, because crc
372+
// instruction has latency 3 and throughput 1.
404373
l64 = l;
405374
if (strategy == CutoffStrategy::Fold3) {
406375
uint64_t l641 = 0;
@@ -449,6 +418,7 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
449418
p += 64;
450419
}
451420
}
421+
l = static_cast<uint32_t>(l64);
452422
} else {
453423
// There is a lot of data, we can ignore combine costs and run all
454424
// requested streams (num_crc_streams + num_pclmul_streams),
@@ -571,15 +541,26 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
571541
} else {
572542
p = crc_streams[num_crc_streams - 1];
573543
}
544+
l = static_cast<uint32_t>(l64);
574545
}
575-
l = static_cast<uint32_t>(l64);
576546

547+
uint64_t remaining_bytes = static_cast<uint64_t>(e - p);
548+
// Process the remaining bytes.
577549
while ((e - p) >= 16) {
578550
ABSL_INTERNAL_STEP8(l, p);
579551
ABSL_INTERNAL_STEP8(l, p);
580552
}
581-
// Process the last few bytes
582-
while (p != e) {
553+
554+
if (remaining_bytes & 8) {
555+
ABSL_INTERNAL_STEP8(l, p);
556+
}
557+
if (remaining_bytes & 4) {
558+
ABSL_INTERNAL_STEP4(l, p);
559+
}
560+
if (remaining_bytes & 2) {
561+
ABSL_INTERNAL_STEP2(l, p);
562+
}
563+
if (remaining_bytes & 1) {
583564
ABSL_INTERNAL_STEP1(l, p);
584565
}
585566

0 commit comments

Comments
 (0)