Commit bbe2610
riscv/crc: add "template" for Zbc optimized CRC functions
Add a "template" crc-clmul-template.h that can generate RISC-V Zbc
optimized CRC functions.  Each generated CRC function is parameterized
by CRC length and bit order, and it accepts a pointer to the constants
struct required for the specific CRC polynomial desired.  Update
gen-crc-consts.py to support generating the needed constants structs.
This makes it possible to easily wire up a Zbc optimized implementation
of almost any CRC.

The design generally follows what I did for x86, but it is simplified
by using RISC-V's scalar carryless multiplication Zbc, which has no
equivalent on x86.  RISC-V's clmulr instruction is also helpful.

A potential switch to Zvbc (or support for Zvbc alongside Zbc) is left
for future work.  For long messages Zvbc should be fastest, but it
would need to be shown to be worthwhile over just using Zbc, which is
significantly more convenient to use, especially in the kernel context.

Compared to the existing Zbc-optimized CRC32 code and the earlier
proposed Zbc-optimized CRC-T10DIF code
(https://lore.kernel.org/r/[email protected]), this submission
deduplicates the code among CRC variants and is significantly more
optimized.  It uses "folding" to take better advantage of
instruction-level parallelism (to a more limited extent than x86 for
now, but it could be extended to more), it reworks the Barrett
reduction to eliminate unnecessary instructions, and it documents all
the math used and makes all the constants reproducible.

Tested-by: Björn Töpel <[email protected]>
Acked-by: Alexandre Ghiti <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Eric Biggers <[email protected]>
1 parent a0bd462 commit bbe2610

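The two-step Barrett reduction this commit documents (see the long
comments in crc-clmul-template.h below) can be checked independently.
What follows is a minimal pure-Python model of the msb-first case; it
is an illustrative sketch, not part of the commit.  The helper names
are invented here, and the width W = 64 and the CRC-T10DIF polynomial
0x8bb7 are borrowed from the examples in gen-crc-consts.py.

import random

def clmul(a, b):
    """Carryless multiplication: product of two GF(2)[x] polynomials,
    each represented as an int with bit i = coefficient of x^i."""
    res = 0
    while b:
        if b & 1:
            res ^= a
        a <<= 1
        b >>= 1
    return res

def polydiv(a, b):
    """Floored division in GF(2)[x]: return (quotient, remainder)."""
    q = 0
    while a.bit_length() >= b.bit_length():
        shift = a.bit_length() - b.bit_length()
        q ^= 1 << shift
        a ^= b << shift
    return q, a

W, n = 64, 16                             # BITS_PER_LONG and CRC_BITS
G = (1 << n) | 0x8bb7                     # generator, x^16 term included

# The two constants gen-crc-consts.py derives for the msb-first case:
const1 = polydiv(1 << (W - 1 + n), G)[0]  # floor(x^(W-1+n) / G)
const2 = G ^ (1 << n)                     # G - x^n

def crc_of_long(msgpoly):
    """CRC of one W-bit message polynomial, i.e. (msgpoly * x^n) mod G,
    using two carryless multiplications as crc_clmul_long() does."""
    q = clmul(msgpoly, const1) >> (W - 1)     # what the clmulr step yields
    return clmul(q, const2) & ((1 << n) - 1)  # the clmul step + crc_t cast

# Cross-check against direct polynomial reduction:
m = random.getrandbits(W)
assert crc_of_long(m) == polydiv(m << n, G)[1]
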
File tree: 2 files changed (+319, -1 lines)

arch/riscv/lib/crc-clmul-template.h

Lines changed: 265 additions & 0 deletions
@@ -0,0 +1,265 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2025 Google LLC */
+
+/*
+ * This file is a "template" that generates a CRC function optimized using the
+ * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of this
+ * file must define the following parameters to specify the type of CRC:
+ *
+ *    crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
+ *    LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
+ *               mapping between bits and polynomial coefficients
+ *             1 for a lsb (least-significant-bit) first CRC, i.e. reflected
+ *               mapping between bits and polynomial coefficients
+ */
+
+#include <asm/byteorder.h>
+#include <linux/minmax.h>
+
+#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */
+
+static inline unsigned long clmul(unsigned long a, unsigned long b)
+{
+        unsigned long res;
+
+        asm(".option push\n"
+            ".option arch,+zbc\n"
+            "clmul %0, %1, %2\n"
+            ".option pop\n"
+            : "=r" (res) : "r" (a), "r" (b));
+        return res;
+}
+
+static inline unsigned long clmulh(unsigned long a, unsigned long b)
+{
+        unsigned long res;
+
+        asm(".option push\n"
+            ".option arch,+zbc\n"
+            "clmulh %0, %1, %2\n"
+            ".option pop\n"
+            : "=r" (res) : "r" (a), "r" (b));
+        return res;
+}
+
+static inline unsigned long clmulr(unsigned long a, unsigned long b)
+{
+        unsigned long res;
+
+        asm(".option push\n"
+            ".option arch,+zbc\n"
+            "clmulr %0, %1, %2\n"
+            ".option pop\n"
+            : "=r" (res) : "r" (a), "r" (b));
+        return res;
+}
+
+/*
+ * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
+ * polynomial whose bit order matches the CRC's bit order.
+ */
+#ifdef CONFIG_64BIT
+# if LSB_CRC
+#  define crc_load_long(x) le64_to_cpup(x)
+# else
+#  define crc_load_long(x) be64_to_cpup(x)
+# endif
+#else
+# if LSB_CRC
+#  define crc_load_long(x) le32_to_cpup(x)
+# else
+#  define crc_load_long(x) be32_to_cpup(x)
+# endif
+#endif
+
+/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
+static inline unsigned long
+crc_clmul_prep(crc_t crc, unsigned long msgpoly)
+{
+#if LSB_CRC
+        return msgpoly ^ crc;
+#else
+        return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
+#endif
+}
+
+/*
+ * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
+ * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
+ */
+static inline crc_t
+crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
+{
+        unsigned long tmp;
+
+        /*
+         * First step of Barrett reduction with integrated multiplication by
+         * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
+         * which G needs to be multiplied to cancel out the x^n and higher terms
+         * of msgpoly * x^n.  Do it using the following formula:
+         *
+         * msb-first:
+         *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
+         * lsb-first:
+         *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
+         *
+         * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
+         * which fits a long exactly.  Using any lower power of x there would
+         * not carry enough precision through the calculation, while using any
+         * higher power of x would require extra instructions to handle a wider
+         * multiplication.  In the msb-first case, using this power of x results
+         * in needing a floored division by x^(BITS_PER_LONG-1), which matches
+         * what clmulr produces.  In the lsb-first case, a factor of x gets
+         * implicitly introduced by each carryless multiplication (shown as
+         * '* x' above), and the floored division instead needs to be by
+         * x^BITS_PER_LONG which matches what clmul produces.
+         */
+#if LSB_CRC
+        tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
+#else
+        tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
+#endif
+
+        /*
+         * Second step of Barrett reduction:
+         *
+         *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
+         *
+         * This reduces (msgpoly * x^n) modulo G by adding the appropriate
+         * multiple of G to it.  The result uses only the x^0..x^(n-1) terms.
+         * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
+         * terms in the first place, it is more efficient to do the equivalent:
+         *
+         *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
+         *
+         * In the lsb-first case further modify it to the following which avoids
+         * a shift, as the crc ends up in the physically low n bits from clmulr:
+         *
+         *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
+         *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
+         *
+         * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
+         * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
+         * cast of the result to crc_t is essential, as it applies the mod x^n!
+         */
+#if LSB_CRC
+        return clmulr(tmp, consts->barrett_reduction_const_2);
+#else
+        return clmul(tmp, consts->barrett_reduction_const_2);
+#endif
+}
+
+/* Update @crc with the data from @msgpoly. */
+static inline crc_t
+crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
+                      const struct crc_clmul_consts *consts)
+{
+        return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
+}
+
+/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
+static inline crc_t
+crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
+                         const struct crc_clmul_consts *consts)
+{
+        unsigned long msgpoly;
+        size_t i;
+
+#if LSB_CRC
+        msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
+        for (i = 1; i < len; i++)
+                msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
+#else
+        msgpoly = p[0];
+        for (i = 1; i < len; i++)
+                msgpoly = (msgpoly << 8) ^ p[i];
+#endif
+
+        if (len >= sizeof(crc_t)) {
+#if LSB_CRC
+                msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+#else
+                msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
+#endif
+                return crc_clmul_long(msgpoly, consts);
+        }
+#if LSB_CRC
+        msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
+        return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
+#else
+        msgpoly ^= crc >> (CRC_BITS - 8*len);
+        return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
+#endif
+}
+
+static inline crc_t
+crc_clmul(crc_t crc, const void *p, size_t len,
+          const struct crc_clmul_consts *consts)
+{
+        size_t align;
+
+        /* This implementation assumes that the CRC fits in an unsigned long. */
+        BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));
+
+        /* If the buffer is not long-aligned, align it. */
+        align = (unsigned long)p % sizeof(unsigned long);
+        if (align && len) {
+                align = min(sizeof(unsigned long) - align, len);
+                crc = crc_clmul_update_partial(crc, p, align, consts);
+                p += align;
+                len -= align;
+        }
+
+        if (len >= 4 * sizeof(unsigned long)) {
+                unsigned long m0, m1;
+
+                m0 = crc_clmul_prep(crc, crc_load_long(p));
+                m1 = crc_load_long(p + sizeof(unsigned long));
+                p += 2 * sizeof(unsigned long);
+                len -= 2 * sizeof(unsigned long);
+                /*
+                 * Main loop.  Each iteration starts with a message polynomial
+                 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
+                 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
+                 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
+                 * "folds" that back into a congruent (modulo G) value that uses
+                 * just m0 and m1 again.  This is done by multiplying m0 by the
+                 * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
+                 * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
+                 * adding the results to m2 and m3 as appropriate.  Each such
+                 * multiplication produces a result twice the length of a long,
+                 * which in RISC-V is two instructions clmul and clmulh.
+                 *
+                 * This could be changed to fold across more than 2 longs at a
+                 * time if there is a CPU that can take advantage of it.
+                 */
+                do {
+                        unsigned long p0, p1, p2, p3;
+
+                        p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
+                        p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
+                        p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
+                        p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
+                        m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
+                        m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
+                             crc_load_long(p + sizeof(unsigned long));
+
+                        p += 2 * sizeof(unsigned long);
+                        len -= 2 * sizeof(unsigned long);
+                } while (len >= 2 * sizeof(unsigned long));
+
+                crc = crc_clmul_long(m0, consts);
+                crc = crc_clmul_update_long(crc, m1, consts);
+        }
+
+        while (len >= sizeof(unsigned long)) {
+                crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
+                p += sizeof(unsigned long);
+                len -= sizeof(unsigned long);
+        }
+
+        if (len)
+                crc = crc_clmul_update_partial(crc, p, len, consts);
+
+        return crc;
+}
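
The "folding" step in crc_clmul()'s main loop can be modeled the same
way.  The sketch below (again pure Python, msb-first only, with the
same invented helpers and example parameters as the sketch near the top
of this page; not part of the commit) checks that folding two longs
into the accumulator preserves the value modulo G:

import random

def clmul(a, b):                         # carryless multiply in GF(2)[x]
    res = 0
    while b:
        if b & 1:
            res ^= a
        a <<= 1
        b >>= 1
    return res

def polymod(a, g):                       # a mod g in GF(2)[x]
    while a.bit_length() >= g.bit_length():
        a ^= g << (a.bit_length() - g.bit_length())
    return a

W, n = 64, 16
G = (1 << n) | 0x8bb7                    # CRC-T10DIF generator again
MASK = (1 << W) - 1

# fold_across_2_longs_const_{hi,lo} for the msb-first case:
k_hi = polymod(1 << (3 * W), G)          # x^(3*W) mod G
k_lo = polymod(1 << (2 * W), G)          # x^(2*W) mod G

def fold(m0, m1, m2, m3):
    """Fold x^(3W)*m0 + x^(2W)*m1 + x^W*m2 + m3 back into two longs.
    Splitting each product at bit W mirrors the clmulh/clmul pairs."""
    t = clmul(m0, k_hi) ^ clmul(m1, k_lo)
    return (t >> W) ^ m2, (t & MASK) ^ m3

m0, m1, m2, m3 = (random.getrandbits(W) for _ in range(4))
before = (m0 << 3 * W) ^ (m1 << 2 * W) ^ (m2 << W) ^ m3
n0, n1 = fold(m0, m1, m2, m3)
assert polymod(before, G) == polymod((n0 << W) ^ n1, G)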

scripts/gen-crc-consts.py

Lines changed: 54 additions & 1 deletion
@@ -105,6 +105,57 @@ def gen_slicebyN_tables(variants, n):
         print(f'\t{s}')
     print('};')
 
+def print_riscv_const(v, bits_per_long, name, val, desc):
+    print(f'\t.{name} = {fmt_poly(v, val, bits_per_long)}, /* {desc} */')
+
+def do_gen_riscv_clmul_consts(v, bits_per_long):
+    (G, n, lsb) = (v.G, v.bits, v.lsb)
+
+    pow_of_x = 3 * bits_per_long - (1 if lsb else 0)
+    print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_hi',
+                      reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G')
+    pow_of_x = 2 * bits_per_long - (1 if lsb else 0)
+    print_riscv_const(v, bits_per_long, 'fold_across_2_longs_const_lo',
+                      reduce(1 << pow_of_x, G), f'x^{pow_of_x} mod G')
+
+    pow_of_x = bits_per_long - 1 + n
+    print_riscv_const(v, bits_per_long, 'barrett_reduction_const_1',
+                      div(1 << pow_of_x, G), f'floor(x^{pow_of_x} / G)')
+
+    val = G - (1 << n)
+    desc = f'G - x^{n}'
+    if lsb:
+        val <<= bits_per_long - n
+        desc = f'({desc}) * x^{bits_per_long - n}'
+    print_riscv_const(v, bits_per_long, 'barrett_reduction_const_2', val, desc)
+
+def gen_riscv_clmul_consts(variants):
+    print('')
+    print('struct crc_clmul_consts {');
+    print('\tunsigned long fold_across_2_longs_const_hi;');
+    print('\tunsigned long fold_across_2_longs_const_lo;');
+    print('\tunsigned long barrett_reduction_const_1;');
+    print('\tunsigned long barrett_reduction_const_2;');
+    print('};');
+    for v in variants:
+        print('');
+        if v.bits > 32:
+            print_header(v, 'Constants')
+            print('#ifdef CONFIG_64BIT')
+            print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{')
+            do_gen_riscv_clmul_consts(v, 64)
+            print('};')
+            print('#endif')
+        else:
+            print_header(v, 'Constants')
+            print(f'static const struct crc_clmul_consts {v.name}_consts __maybe_unused = {{')
+            print('#ifdef CONFIG_64BIT')
+            do_gen_riscv_clmul_consts(v, 64)
+            print('#else')
+            do_gen_riscv_clmul_consts(v, 32)
+            print('#endif')
+            print('};')
+
 # Generate constants for carryless multiplication based CRC computation.
 def gen_x86_pclmul_consts(variants):
     # These are the distances, in bits, to generate folding constants for.
@@ -213,7 +264,7 @@ def parse_crc_variants(vars_string):
 
 if len(sys.argv) != 3:
     sys.stderr.write(f'Usage: {sys.argv[0]} CONSTS_TYPE[,CONSTS_TYPE]... CRC_VARIANT[,CRC_VARIANT]...\n')
-    sys.stderr.write('  CONSTS_TYPE can be sliceby[1-8] or x86_pclmul\n')
+    sys.stderr.write('  CONSTS_TYPE can be sliceby[1-8], riscv_clmul, or x86_pclmul\n')
     sys.stderr.write('  CRC_VARIANT is crc${num_bits}_${bit_order}_${generator_poly_as_hex}\n')
     sys.stderr.write('  E.g. crc16_msb_0x8bb7 or crc32_lsb_0xedb88320\n')
     sys.stderr.write('  Polynomial must use the given bit_order and exclude x^{num_bits}\n')
@@ -232,6 +283,8 @@ def parse_crc_variants(vars_string):
 for consts_type in consts_types:
     if consts_type.startswith('sliceby'):
         gen_slicebyN_tables(variants, int(consts_type.removeprefix('sliceby')))
+    elif consts_type == 'riscv_clmul':
+        gen_riscv_clmul_consts(variants)
     elif consts_type == 'x86_pclmul':
         gen_x86_pclmul_consts(variants)
     else: