Skip to content

Commit 9927576

Browse files
Abseil Team authored and copybara-github committed
Use even faster reduction algorithm in FinalizePclmulStream()
My previous CL optimized the Barrett reduction. But since this is CRC32C and scalar instructions for it are available, there is actually no need for Barrett reduction at all. Just use two 64-bit CRC32C instructions to reduce fullCRC. This improves CRC32C performance on 2048-byte messages on Skylake by another 2% or so.

PiperOrigin-RevId: 739977426
Change-Id: I4611af88cd32ed7a995e772a13c30e3bdcec8de9
1 parent d79e680 commit 9927576

File tree

2 files changed

+5
-61
lines changed

2 files changed

+5
-61
lines changed

absl/crc/internal/crc32_x86_arm_combined_simd.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -99,19 +99,12 @@ V128 V128_PMul10(const V128 l, const V128 r);
9999
// Produces a XOR operation of |l| and |r|.
100100
V128 V128_Xor(const V128 l, const V128 r);
101101

102-
// Produces an AND operation of |l| and |r|.
103-
V128 V128_And(const V128 l, const V128 r);
104-
105102
// Sets the lower half of a 128 bit register to the given 64-bit value and
106103
// zeroes the upper half.
107104
// dst[63:0] := |r|
108105
// dst[127:64] := |0|
109106
V128 V128_From64WithZeroFill(const uint64_t r);
110107

111-
// Shift |l| right by |imm| bytes while shifting in zeros.
112-
template <int imm>
113-
V128 V128_ShiftRight(const V128 l);
114-
115108
// Extracts a 32-bit integer from |l|, selected with |imm|.
116109
template <int imm>
117110
int V128_Extract32(const V128 l);
@@ -170,17 +163,10 @@ inline V128 V128_PMul10(const V128 l, const V128 r) {
170163

171164
inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }
172165

173-
inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }
174-
175166
inline V128 V128_From64WithZeroFill(const uint64_t r) {
176167
return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
177168
}
178169

179-
template <int imm>
180-
inline V128 V128_ShiftRight(const V128 l) {
181-
return _mm_srli_si128(l, imm);
182-
}
183-
184170
template <int imm>
185171
inline int V128_Extract32(const V128 l) {
186172
return _mm_extract_epi32(l, imm);
@@ -261,20 +247,12 @@ inline V128 V128_PMul10(const V128 l, const V128 r) {
261247

262248
inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }
263249

264-
inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }
265-
266250
inline V128 V128_From64WithZeroFill(const uint64_t r){
267251
constexpr uint64x2_t kZero = {0, 0};
268252
return vsetq_lane_u64(r, kZero, 0);
269253
}
270254

271255

272-
template <int imm>
273-
inline V128 V128_ShiftRight(const V128 l) {
274-
return vreinterpretq_u64_s8(
275-
vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
276-
}
277-
278256
template <int imm>
279257
inline int V128_Extract32(const V128 l) {
280258
return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);

absl/crc/internal/crc_x86_arm_combined.cc

Lines changed: 5 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -288,40 +288,11 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
288288
V128 fullCRC = V128_Xor(low, high);
289289
fullCRC = V128_Xor(fullCRC, partialCRC2);
290290

291-
// Reduce the 128-bit polynomial fullCRC into a 32-bit scalar value.
292-
293-
// Multiply fullCRC by x^32 (as required for CRC32C) and reduce it to 96
294-
// bits. Call that result t0. Store 'x^32 * t0' back into fullCRC.
295-
//
296-
// More concretely, multiply the high 64 terms by '(x^95 mod G) * x^32 * x'
297-
// and the low 64 terms by x^64, then add the two products together.
298-
//
299-
// Note that CRC32C is a least-significant-bit-first CRC. Therefore, the
300-
// mapping between bits and polynomial coefficients is reflected, and each
301-
// carryless multiplication implicitly introduces an extra factor of x.
302-
fullCRC = V128_Xor(V128_PMul10(fullCRC, reductionMultiplicands),
303-
V128_ShiftRight<8>(fullCRC));
304-
305-
// Calculate floor(t0 / G), i.e. the value by which G needs to be multiplied
306-
// to cancel out the x^32 and higher terms of t0. To do this, first do:
307-
//
308-
// t1 := floor(t0 / x^32) * floor(x^95 / G) * x
309-
//
310-
// Then, floor(t1 / x^64) contains the desired value floor(t0 / G).
311-
reductionMultiplicands =
312-
V128_Load(reinterpret_cast<const V128*>(kBarrettReduction));
313-
V128 t1 = V128_PMulLow(fullCRC, reductionMultiplicands);
314-
315-
// Cancel out the x^32 and higher terms of t0 by subtracting the needed
316-
// multiple of G. But since fullCRC contains x^32 * t0, it is more
317-
// convenient to use the same multiple of G * x^32. Also, the x^32 term of G
318-
// makes no difference in the result if mod x^32 is applied (as
319-
// V128_Extract32 does). Thus, the calculation is as follows:
320-
//
321-
// fullCRC := fullCRC - (floor(t0 / G) * ((G - x^32) * x^31) * x)
322-
// crc := (fullCRC / x^32) mod x^32
323-
fullCRC = V128_Xor(fullCRC, V128_PMul10(t1, reductionMultiplicands));
324-
return static_cast<uint64_t>(V128_Extract32<2>(fullCRC));
291+
// Reduce fullCRC into scalar value.
292+
uint32_t crc = 0;
293+
crc = CRC32_u64(crc, V128_Extract64<0>(fullCRC));
294+
crc = CRC32_u64(crc, V128_Extract64<1>(fullCRC));
295+
return crc;
325296
}
326297

327298
// Update crc with 64 bytes of data from p.
@@ -352,11 +323,6 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreamsBase
352323
0x00000000f20c0dfe,
353324
// (x^95 mod G) * x^32
354325
0x00000000493c7d27};
355-
alignas(16) static constexpr uint64_t kBarrettReduction[2] = {
356-
// floor(x^95 / G)
357-
0x4869ec38dea713f1,
358-
// (G - x^32) * x^31
359-
0x0000000105ec76f0};
360326

361327
// Medium runs of bytes are broken into groups of kGroupsSmall blocks of same
362328
// size. Each group is CRCed in parallel then combined at the end of the

0 commit comments

Comments
 (0)