
Commit a03fda9

x86/crc32: implement crc32_le using new template
Instantiate crc-pclmul-template.S for crc32_le, and delete the original
PCLMULQDQ optimized implementation.  This has the following advantages:

- Less CRC-variant-specific code.
- VPCLMULQDQ support, greatly improving performance on sufficiently long
  messages on newer CPUs.
- A faster reduction from 128 bits to the final CRC.
- Support for lengths not a multiple of 16 bytes, improving performance
  for such lengths.
- Support for misaligned buffers, improving performance in such cases.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

    Length     Before         After
    ------     ------         -----
         1     427 MB/s       605 MB/s
        16     710 MB/s      3631 MB/s
        64     704 MB/s      7615 MB/s
       127    3610 MB/s      9710 MB/s
       128    8759 MB/s     12702 MB/s
       200    7083 MB/s     15343 MB/s
       256   17284 MB/s     22904 MB/s
       511   10919 MB/s     27309 MB/s
       512   19849 MB/s     48900 MB/s
      1024   21216 MB/s     62630 MB/s
      3173   22150 MB/s     72437 MB/s
      4096   22496 MB/s     79593 MB/s
     16384   22018 MB/s     85106 MB/s

Acked-by: Ard Biesheuvel <[email protected]>
Acked-by: Keith Busch <[email protected]>
Reviewed-by: "Martin K. Petersen" <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Eric Biggers <[email protected]>
1 parent 8d2d3e7 commit a03fda9
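
For orientation only: crc32_le_arch() is the x86 backend behind the kernel's generic crc32_le() library function, which is what ordinary callers use. The sketch below is a hypothetical caller (checksum_buffer() is an illustrative name, not from this commit); it uses the standard CRC-32 convention of seeding with all-ones and inverting the result, and whether crc32_le() reaches crc32_le_arch() depends on the kernel configuration.

```c
#include <linux/crc32.h>
#include <linux/types.h>

/*
 * Illustrative helper (hypothetical name): standard CRC-32 of a buffer,
 * i.e. seed with all-ones and invert the result, as zlib/ethernet do.
 * crc32_le() dispatches to crc32_le_arch() when an arch backend is built in.
 */
static u32 checksum_buffer(const u8 *buf, size_t len)
{
        return crc32_le(~0U, buf, len) ^ ~0U;
}
```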

3 files changed, 65 insertions(+), 244 deletions(-)


arch/x86/lib/crc-pclmul-consts.h

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * CRC constants generated by:
+ *
+ *      ./scripts/gen-crc-consts.py x86_pclmul crc32_lsb_0xedb88320
+ *
+ * Do not edit manually.
+ */
+
+/*
+ * CRC folding constants generated for least-significant-bit-first CRC-32 using
+ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +
+ *        x^5 + x^4 + x^2 + x^1 + x^0
+ */
+static const struct {
+        u64 fold_across_2048_bits_consts[2];
+        u64 fold_across_1024_bits_consts[2];
+        u64 fold_across_512_bits_consts[2];
+        u64 fold_across_256_bits_consts[2];
+        u64 fold_across_128_bits_consts[2];
+        u8 shuf_table[48];
+        u64 barrett_reduction_consts[2];
+} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = {
+        .fold_across_2048_bits_consts = {
+                0x00000000ce3371cb,     /* HI64_TERMS: (x^2079 mod G) * x^32 */
+                0x00000000e95c1271,     /* LO64_TERMS: (x^2015 mod G) * x^32 */
+        },
+        .fold_across_1024_bits_consts = {
+                0x0000000033fff533,     /* HI64_TERMS: (x^1055 mod G) * x^32 */
+                0x00000000910eeec1,     /* LO64_TERMS: (x^991 mod G) * x^32 */
+        },
+        .fold_across_512_bits_consts = {
+                0x000000008f352d95,     /* HI64_TERMS: (x^543 mod G) * x^32 */
+                0x000000001d9513d7,     /* LO64_TERMS: (x^479 mod G) * x^32 */
+        },
+        .fold_across_256_bits_consts = {
+                0x00000000f1da05aa,     /* HI64_TERMS: (x^287 mod G) * x^32 */
+                0x0000000081256527,     /* LO64_TERMS: (x^223 mod G) * x^32 */
+        },
+        .fold_across_128_bits_consts = {
+                0x00000000ae689191,     /* HI64_TERMS: (x^159 mod G) * x^32 */
+                0x00000000ccaa009e,     /* LO64_TERMS: (x^95 mod G) * x^32 */
+        },
+        .shuf_table = {
+                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        },
+        .barrett_reduction_consts = {
+                0xb4e5b025f7011641,     /* HI64_TERMS: floor(x^95 / G) */
+                0x00000001db710640,     /* LO64_TERMS: (G - x^32) * x^31 */
+        },
+};
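
The fold constants above can be sanity-checked outside the kernel. The standalone program below is a hypothetical sketch, not the kernel's gen-crc-consts.py; it assumes the lsb-first convention that each 64-bit constant is the bit-reversal of the natural polynomial representation, an inference consistent with the (G - x^32) * x^31 Barrett constant in the table. Under that assumption it re-derives the LO64_TERMS entry of fold_across_128_bits_consts, (x^95 mod G) * x^32.

```c
/*
 * Hypothetical standalone sketch, NOT the kernel's gen-crc-consts.py.
 * Assumes: 64-bit constant = bit-reversed natural representation of the
 * polynomial, inferred from the comments in the table above.
 */
#include <stdio.h>
#include <stdint.h>

#define CRC32_G 0x104C11DB7ULL  /* G(x) for CRC-32, including the x^32 term */

/* x^exp mod G(x) over GF(2); bit i of the result = coefficient of x^i */
static uint64_t x_pow_mod_g(unsigned int exp)
{
        uint64_t r = 1;                         /* start from x^0 */

        while (exp--) {
                r <<= 1;                        /* multiply by x */
                if (r & (1ULL << 32))
                        r ^= CRC32_G;           /* reduce mod G(x) */
        }
        return r;
}

/* reverse the bit order of a 64-bit value */
static uint64_t reflect64(uint64_t v)
{
        uint64_t r = 0;

        for (int i = 0; i < 64; i++)
                if (v & (1ULL << i))
                        r |= 1ULL << (63 - i);
        return r;
}

int main(void)
{
        /* LO64_TERMS of fold_across_128_bits_consts: (x^95 mod G) * x^32 */
        uint64_t c = reflect64(x_pow_mod_g(95) << 32);

        printf("0x%016llx\n", (unsigned long long)c);
        return 0;
}
```

If the assumed convention matches the script's, this prints 0x00000000ccaa009e, the table entry above (and the value the deleted assembly called CONSTANT_R4).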

arch/x86/lib/crc32-glue.c

Lines changed: 8 additions & 29 deletions
@@ -7,43 +7,20 @@
  * Copyright 2024 Google LLC
  */
 
-#include <asm/cpufeatures.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
 #include <linux/crc32.h>
-#include <linux/linkage.h>
 #include <linux/module.h>
-
-/* minimum size of buffer for crc32_pclmul_le_16 */
-#define CRC32_PCLMUL_MIN_LEN    64
+#include "crc-pclmul-template.h"
 
 static DEFINE_STATIC_KEY_FALSE(have_crc32);
 static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
 
-u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
+DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
 
 u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 {
-        if (len >= CRC32_PCLMUL_MIN_LEN + 15 &&
-            static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) {
-                size_t n = -(uintptr_t)p & 15;
-
-                /* align p to 16-byte boundary */
-                if (n) {
-                        crc = crc32_le_base(crc, p, n);
-                        p += n;
-                        len -= n;
-                }
-                n = round_down(len, 16);
-                kernel_fpu_begin();
-                crc = crc32_pclmul_le_16(crc, p, n);
-                kernel_fpu_end();
-                p += n;
-                len -= n;
-        }
-        if (len)
-                crc = crc32_le_base(crc, p, len);
-        return crc;
+        CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
+                   have_pclmulqdq);
+        return crc32_le_base(crc, p, len);
 }
 EXPORT_SYMBOL(crc32_le_arch);
 
@@ -97,8 +74,10 @@ static int __init crc32_x86_init(void)
 {
         if (boot_cpu_has(X86_FEATURE_XMM4_2))
                 static_branch_enable(&have_crc32);
-        if (boot_cpu_has(X86_FEATURE_PCLMULQDQ))
+        if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
                 static_branch_enable(&have_pclmulqdq);
+                INIT_CRC_PCLMUL(crc32_lsb);
+        }
         return 0;
 }
 arch_initcall(crc32_x86_init);
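
The CRC_PCLMUL() and INIT_CRC_PCLMUL() macros come from crc-pclmul-template.h, which is not part of this diff. Purely for orientation, the sketch below shows the general shape such a dispatch could take, inferred from the call sites above and from the open-coded path being deleted; the 16-byte threshold, the prefix##_impl stand-in, and the function-selection mechanism are assumptions, not the real template.

```c
/*
 * Hypothetical sketch only -- not the real crc-pclmul-template.h.
 * Inferred from the crc32-glue.c call sites shown in this commit.
 */
#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq)        \
do {                                                                   \
        /* 16 bytes is an assumed cutover, not the template's value */ \
        if ((len) >= 16 &&                                             \
            static_branch_likely(&(have_pclmulqdq)) &&                 \
            crypto_simd_usable()) {                                    \
                kernel_fpu_begin();                                    \
                /* prefix##_impl is a stand-in for whatever routine    \
                 * INIT_CRC_PCLMUL() selected at boot (PCLMULQDQ or    \
                 * VPCLMULQDQ variant) */                              \
                (crc) = prefix##_impl((crc), (p), (len), &(consts));   \
                kernel_fpu_end();                                      \
                return (crc);                                          \
        }                                                              \
        /* else: the caller falls through to crc32_le_base() */        \
} while (0)
```

Whatever the exact expansion, the visible contract is the one the glue code relies on: handle the buffer entirely in the accelerated path and return early, or do nothing and let the generic table-based code run.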

arch/x86/lib/crc32-pclmul.S

Lines changed: 4 additions & 215 deletions
@@ -1,217 +1,6 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2012 Xyratex Technology Limited
- *
- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
- * calculation.
- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
- * at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2B: Instruction Set Reference, N-Z
- *
- * Authors: Gregory Prestas <[email protected]>
- *          Alexander Boyko <[email protected]>
- */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+// Copyright 2025 Google LLC
 
-#include <linux/linkage.h>
+#include "crc-pclmul-template.S"
 
-
-.section .rodata
-.align 16
-/*
- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
- * #define CONSTANT_R1 0x154442bd4LL
- *
- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
- * #define CONSTANT_R2 0x1c6e41596LL
- */
-.Lconstant_R2R1:
-        .octa 0x00000001c6e415960000000154442bd4
-/*
- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
- * #define CONSTANT_R3 0x1751997d0LL
- *
- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
- * #define CONSTANT_R4 0x0ccaa009eLL
- */
-.Lconstant_R4R3:
-        .octa 0x00000000ccaa009e00000001751997d0
-/*
- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
- * #define CONSTANT_R5 0x163cd6124LL
- */
-.Lconstant_R5:
-        .octa 0x00000000000000000000000163cd6124
-.Lconstant_mask32:
-        .octa 0x000000000000000000000000FFFFFFFF
-/*
- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
- *
- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
- * #define CONSTANT_RU 0x1F7011641LL
- */
-.Lconstant_RUpoly:
-        .octa 0x00000001F701164100000001DB710641
-
-#define CONSTANT %xmm0
-
-#ifdef __x86_64__
-#define CRC %edi
-#define BUF %rsi
-#define LEN %rdx
-#else
-#define CRC %eax
-#define BUF %edx
-#define LEN %ecx
-#endif
-
-
-
-.text
-/**
- * Calculate crc32
- * CRC - initial crc32
- * BUF - buffer (16 bytes aligned)
- * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
- * return %eax crc32
- * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
- */
-
-SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
-        movdqa  (BUF), %xmm1
-        movdqa  0x10(BUF), %xmm2
-        movdqa  0x20(BUF), %xmm3
-        movdqa  0x30(BUF), %xmm4
-        movd    CRC, CONSTANT
-        pxor    CONSTANT, %xmm1
-        sub     $0x40, LEN
-        add     $0x40, BUF
-        cmp     $0x40, LEN
-        jb      .Lless_64
-
-#ifdef __x86_64__
-        movdqa  .Lconstant_R2R1(%rip), CONSTANT
-#else
-        movdqa  .Lconstant_R2R1, CONSTANT
-#endif
-
-.Lloop_64:/* 64 bytes Full cache line folding */
-        prefetchnta 0x40(BUF)
-        movdqa  %xmm1, %xmm5
-        movdqa  %xmm2, %xmm6
-        movdqa  %xmm3, %xmm7
-#ifdef __x86_64__
-        movdqa  %xmm4, %xmm8
-#endif
-        pclmulqdq $0x00, CONSTANT, %xmm1
-        pclmulqdq $0x00, CONSTANT, %xmm2
-        pclmulqdq $0x00, CONSTANT, %xmm3
-#ifdef __x86_64__
-        pclmulqdq $0x00, CONSTANT, %xmm4
-#endif
-        pclmulqdq $0x11, CONSTANT, %xmm5
-        pclmulqdq $0x11, CONSTANT, %xmm6
-        pclmulqdq $0x11, CONSTANT, %xmm7
-#ifdef __x86_64__
-        pclmulqdq $0x11, CONSTANT, %xmm8
-#endif
-        pxor    %xmm5, %xmm1
-        pxor    %xmm6, %xmm2
-        pxor    %xmm7, %xmm3
-#ifdef __x86_64__
-        pxor    %xmm8, %xmm4
-#else
-        /* xmm8 unsupported for x32 */
-        movdqa  %xmm4, %xmm5
-        pclmulqdq $0x00, CONSTANT, %xmm4
-        pclmulqdq $0x11, CONSTANT, %xmm5
-        pxor    %xmm5, %xmm4
-#endif
-
-        pxor    (BUF), %xmm1
-        pxor    0x10(BUF), %xmm2
-        pxor    0x20(BUF), %xmm3
-        pxor    0x30(BUF), %xmm4
-
-        sub     $0x40, LEN
-        add     $0x40, BUF
-        cmp     $0x40, LEN
-        jge     .Lloop_64
-.Lless_64:/* Folding cache line into 128bit */
-#ifdef __x86_64__
-        movdqa  .Lconstant_R4R3(%rip), CONSTANT
-#else
-        movdqa  .Lconstant_R4R3, CONSTANT
-#endif
-        prefetchnta (BUF)
-
-        movdqa  %xmm1, %xmm5
-        pclmulqdq $0x00, CONSTANT, %xmm1
-        pclmulqdq $0x11, CONSTANT, %xmm5
-        pxor    %xmm5, %xmm1
-        pxor    %xmm2, %xmm1
-
-        movdqa  %xmm1, %xmm5
-        pclmulqdq $0x00, CONSTANT, %xmm1
-        pclmulqdq $0x11, CONSTANT, %xmm5
-        pxor    %xmm5, %xmm1
-        pxor    %xmm3, %xmm1
-
-        movdqa  %xmm1, %xmm5
-        pclmulqdq $0x00, CONSTANT, %xmm1
-        pclmulqdq $0x11, CONSTANT, %xmm5
-        pxor    %xmm5, %xmm1
-        pxor    %xmm4, %xmm1
-
-        cmp     $0x10, LEN
-        jb      .Lfold_64
-.Lloop_16:/* Folding rest buffer into 128bit */
-        movdqa  %xmm1, %xmm5
-        pclmulqdq $0x00, CONSTANT, %xmm1
-        pclmulqdq $0x11, CONSTANT, %xmm5
-        pxor    %xmm5, %xmm1
-        pxor    (BUF), %xmm1
-        sub     $0x10, LEN
-        add     $0x10, BUF
-        cmp     $0x10, LEN
-        jge     .Lloop_16
-
-.Lfold_64:
-        /* perform the last 64 bit fold, also adds 32 zeroes
-         * to the input stream */
-        pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
-        psrldq  $0x08, %xmm1
-        pxor    CONSTANT, %xmm1
-
-        /* final 32-bit fold */
-        movdqa  %xmm1, %xmm2
-#ifdef __x86_64__
-        movdqa  .Lconstant_R5(%rip), CONSTANT
-        movdqa  .Lconstant_mask32(%rip), %xmm3
-#else
-        movdqa  .Lconstant_R5, CONSTANT
-        movdqa  .Lconstant_mask32, %xmm3
-#endif
-        psrldq  $0x04, %xmm2
-        pand    %xmm3, %xmm1
-        pclmulqdq $0x00, CONSTANT, %xmm1
-        pxor    %xmm2, %xmm1
-
-        /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
-#ifdef __x86_64__
-        movdqa  .Lconstant_RUpoly(%rip), CONSTANT
-#else
-        movdqa  .Lconstant_RUpoly, CONSTANT
-#endif
-        movdqa  %xmm1, %xmm2
-        pand    %xmm3, %xmm1
-        pclmulqdq $0x10, CONSTANT, %xmm1
-        pand    %xmm3, %xmm1
-        pclmulqdq $0x00, CONSTANT, %xmm1
-        pxor    %xmm2, %xmm1
-        pextrd  $0x01, %xmm1, %eax
-
-        RET
-SYM_FUNC_END(crc32_pclmul_le_16)
+DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
