Skip to content

Commit dbdda1f

Browse files
committed
x86/crc-t10dif: implement crc_t10dif using new template
Instantiate crc-pclmul-template.S for crc_t10dif and delete the original PCLMULQDQ optimized implementation. This has the following advantages:

- Less CRC-variant-specific code.
- VPCLMULQDQ support, greatly improving performance on sufficiently long messages on newer CPUs.
- A faster reduction from 128 bits to the final CRC.
- Support for i386.

Benchmark results on AMD Ryzen 9 9950X (Zen 5) using crc_kunit:

    Length     Before         After
    ------     ------         -----
         1      440 MB/s       386 MB/s
        16     1865 MB/s      2008 MB/s
        64     4343 MB/s      6917 MB/s
       127     5440 MB/s      8909 MB/s
       128     5533 MB/s     12150 MB/s
       200     5908 MB/s     14423 MB/s
       256    15870 MB/s     21288 MB/s
       511    14219 MB/s     25840 MB/s
       512    18361 MB/s     37797 MB/s
      1024    19941 MB/s     61374 MB/s
      3173    20461 MB/s     74909 MB/s
      4096    21310 MB/s     78919 MB/s
     16384    21663 MB/s     85012 MB/s

Acked-by: Ard Biesheuvel <[email protected]>
Acked-by: Keith Busch <[email protected]>
Reviewed-by: "Martin K. Petersen" <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Eric Biggers <[email protected]>
1 parent a03fda9 commit dbdda1f

File tree

6 files changed

+64
-349
lines changed

6 files changed

+64
-349
lines changed

arch/x86/Kconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ config X86
7777
select ARCH_HAS_CPU_FINALIZE_INIT
7878
select ARCH_HAS_CPU_PASID if IOMMU_SVA
7979
select ARCH_HAS_CRC32
80-
select ARCH_HAS_CRC_T10DIF if X86_64
80+
select ARCH_HAS_CRC_T10DIF
8181
select ARCH_HAS_CURRENT_STACK_POINTER
8282
select ARCH_HAS_DEBUG_VIRTUAL
8383
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE

arch/x86/lib/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ crc32-x86-y := crc32-glue.o crc32-pclmul.o
4343
crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o
4444

4545
obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o
46-
crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o
46+
crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o
4747

4848
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
4949
obj-y += iomem.o

arch/x86/lib/crc-pclmul-consts.h

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,57 @@
22
/*
33
* CRC constants generated by:
44
*
5-
* ./scripts/gen-crc-consts.py x86_pclmul crc32_lsb_0xedb88320
5+
* ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320
66
*
77
* Do not edit manually.
88
*/
99

10+
/*
11+
* CRC folding constants generated for most-significant-bit-first CRC-16 using
12+
* G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
13+
*/
14+
static const struct {
15+
u8 bswap_mask[16];
16+
u64 fold_across_2048_bits_consts[2];
17+
u64 fold_across_1024_bits_consts[2];
18+
u64 fold_across_512_bits_consts[2];
19+
u64 fold_across_256_bits_consts[2];
20+
u64 fold_across_128_bits_consts[2];
21+
u8 shuf_table[48];
22+
u64 barrett_reduction_consts[2];
23+
} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = {
24+
.bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
25+
.fold_across_2048_bits_consts = {
26+
0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */
27+
0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */
28+
},
29+
.fold_across_1024_bits_consts = {
30+
0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */
31+
0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */
32+
},
33+
.fold_across_512_bits_consts = {
34+
0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */
35+
0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */
36+
},
37+
.fold_across_256_bits_consts = {
38+
0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */
39+
0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */
40+
},
41+
.fold_across_128_bits_consts = {
42+
0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */
43+
0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */
44+
},
45+
.shuf_table = {
46+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
47+
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
48+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
49+
},
50+
.barrett_reduction_consts = {
51+
0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */
52+
0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */
53+
},
54+
};
55+
1056
/*
1157
* CRC folding constants generated for least-significant-bit-first CRC-32 using
1258
* G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 +

arch/x86/lib/crc-t10dif-glue.c

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,32 @@
11
// SPDX-License-Identifier: GPL-2.0-or-later
22
/*
3-
* CRC-T10DIF using PCLMULQDQ instructions
3+
* CRC-T10DIF using [V]PCLMULQDQ instructions
44
*
55
* Copyright 2024 Google LLC
66
*/
77

8-
#include <asm/cpufeatures.h>
9-
#include <asm/simd.h>
10-
#include <crypto/internal/simd.h>
118
#include <linux/crc-t10dif.h>
129
#include <linux/module.h>
10+
#include "crc-pclmul-template.h"
1311

1412
static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
1513

16-
asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
14+
DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16);
1715

1816
u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len)
1917
{
20-
if (len >= 16 &&
21-
static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) {
22-
kernel_fpu_begin();
23-
crc = crc_t10dif_pcl(crc, p, len);
24-
kernel_fpu_end();
25-
return crc;
26-
}
18+
CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts,
19+
have_pclmulqdq);
2720
return crc_t10dif_generic(crc, p, len);
2821
}
2922
EXPORT_SYMBOL(crc_t10dif_arch);
3023

3124
static int __init crc_t10dif_x86_init(void)
3225
{
33-
if (boot_cpu_has(X86_FEATURE_PCLMULQDQ))
26+
if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
3427
static_branch_enable(&have_pclmulqdq);
28+
INIT_CRC_PCLMUL(crc16_msb);
29+
}
3530
return 0;
3631
}
3732
arch_initcall(crc_t10dif_x86_init);
@@ -41,5 +36,5 @@ static void __exit crc_t10dif_x86_exit(void)
4136
}
4237
module_exit(crc_t10dif_x86_exit);
4338

44-
MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions");
39+
MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions");
4540
MODULE_LICENSE("GPL");

arch/x86/lib/crc16-msb-pclmul.S

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/* SPDX-License-Identifier: GPL-2.0-or-later */
2+
// Copyright 2025 Google LLC
3+
4+
#include "crc-pclmul-template.S"
5+
6+
DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0)

0 commit comments

Comments
 (0)