/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Copyright 2025 Google LLC */

/*
 * This file is a "template" that generates a CRC function optimized using the
 * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of
 * this file must define the following parameters to specify the type of CRC:
 *
 *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
 *	LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
 *		   mapping between bits and polynomial coefficients
 *		 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
 *		   mapping between bits and polynomial coefficients
 */
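
/*
 * A minimal usage sketch (illustrative only): an includer generating a
 * reflected 32-bit CRC would do
 *
 *	#define crc_t	u32
 *	#define LSB_CRC	1
 *	#include "crc-clmul-template.h"
 *
 * and then call the crc_clmul() function defined below, passing a
 * struct crc_clmul_consts holding the precomputed constants for its
 * generator polynomial.  Inferred from the uses in this file, that struct
 * is expected to provide at least:
 *
 *	struct crc_clmul_consts {
 *		unsigned long fold_across_2_longs_const_hi; // x^(3*BITS_PER_LONG) mod G
 *		unsigned long fold_across_2_longs_const_lo; // x^(2*BITS_PER_LONG) mod G
 *		unsigned long barrett_reduction_const_1;    // floor(x^(BITS_PER_LONG-1+n) / G)
 *		unsigned long barrett_reduction_const_2;    // (G - x^n), pre-shifted if lsb-first
 *	};
 */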

#include <asm/byteorder.h>
#include <linux/build_bug.h>	/* for BUILD_BUG_ON() */
#include <linux/minmax.h>
#include <linux/types.h>	/* for u8 and the fixed-width types */

#define CRC_BITS	(8 * sizeof(crc_t)) /* a.k.a. 'n' */
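
/*
 * Wrappers for the Zbc carryless-multiplication instructions.  The
 * ".option arch,+zbc" directives below let this file build even when the
 * kernel's base -march string does not include Zbc; it is then the caller's
 * responsibility to ensure (e.g. via a runtime CPU-feature check) that these
 * functions only run on hardware that implements Zbc.
 */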
static inline unsigned long clmul(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmul %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulh %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulr %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}
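
/*
 * For intuition: carryless multiplication is polynomial multiplication over
 * GF(2), i.e. grade-school multiplication with XOR in place of addition.
 * For example, clmul(0b101, 0b11) = 0b101 ^ (0b101 << 1) = 0b1111.  Of the
 * 2*XLEN-bit product, clmul returns the low XLEN bits, clmulh the high XLEN
 * bits, and clmulr bits 2*XLEN-2 through XLEN-1 (the product shifted right
 * by XLEN-1).
 */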

/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order.
 */
#ifdef CONFIG_64BIT
# if LSB_CRC
#  define crc_load_long(x)	le64_to_cpup(x)
# else
#  define crc_load_long(x)	be64_to_cpup(x)
# endif
#else
# if LSB_CRC
#  define crc_load_long(x)	le32_to_cpup(x)
# else
#  define crc_load_long(x)	be32_to_cpup(x)
# endif
#endif
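
/*
 * For example, on little-endian RV64 with an msb-first CRC, crc_load_long()
 * byte-swaps the loaded word so that the first (lowest-addressed) message
 * byte lands in the most significant byte, matching the CRC's natural bit
 * order.  With an lsb-first CRC the load is used as-is, since the reflected
 * bit order already places the first byte at the low-order end.
 */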

/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
static inline unsigned long
crc_clmul_prep(crc_t crc, unsigned long msgpoly)
{
#if LSB_CRC
	return msgpoly ^ crc;
#else
	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
#endif
}
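
/*
 * Concretely, for a 32-bit CRC on a 64-bit machine: msb-first, the CRC is
 * shifted into the top 32 bits, lining it up with the first four message
 * bytes; lsb-first, the physically low bits hold the highest-order
 * polynomial terms (and the first message bytes), so the CRC is XORed in
 * unshifted.
 */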

/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
 */
static inline crc_t
crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
{
	unsigned long tmp;

	/*
	 * First step of Barrett reduction with integrated multiplication by
	 * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
	 * which G needs to be multiplied to cancel out the x^n and higher
	 * terms of msgpoly * x^n.  Do it using the following formula:
	 *
	 * msb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	 * lsb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	 *
	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	 * which fits a long exactly.  Using any lower power of x there would
	 * not carry enough precision through the calculation, while using any
	 * higher power of x would require extra instructions to handle a wider
	 * multiplication.  In the msb-first case, using this power of x results
	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
	 * what clmulr produces.  In the lsb-first case, a factor of x gets
	 * implicitly introduced by each carryless multiplication (shown as
	 * '* x' above), and the floored division instead needs to be by
	 * x^BITS_PER_LONG, which matches what clmul produces.
	 */
#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

	/*
	 * Second step of Barrett reduction:
	 *
	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	 *
	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
	 * multiple of G to it.  (In GF(2), addition and subtraction are both
	 * XOR, so adding that multiple cancels the high terms.)  The result
	 * uses only the x^0..x^(n-1) terms.  HOWEVER, since the unreduced
	 * value (msgpoly * x^n) is zero in those terms in the first place, it
	 * is more efficient to do the equivalent:
	 *
	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	 *
	 * In the lsb-first case, further modify it to the following which
	 * avoids a shift, as the crc ends up in the physically low n bits from
	 * clmulr:
	 *
	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	 *
	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
	 * implicit conversion of the return value to crc_t is essential, as it
	 * applies the mod x^n!
	 */
#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
#else
	return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}
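
/*
 * A worked toy example of the two Barrett steps (illustrative only; it
 * pretends BITS_PER_LONG == 8 so the numbers stay small): take an msb-first
 * 4-bit CRC with G = x^4 + x + 1.  Then barrett_reduction_const_1 =
 * floor(x^11 / G) = 0x9a, which has degree 7 and so fits the 8-bit "long"
 * exactly, and barrett_reduction_const_2 = G - x^4 = 0x3.  For msgpoly =
 * 0x80 (= x^7), step 1 computes floor((0x80 * 0x9a) / x^7) = 0x9a =
 * floor(x^11 / G), and step 2 computes (0x9a clmul 0x3) mod x^4 =
 * 0x1ae mod x^4 = 0xe = x^3 + x^2 + x, which indeed equals x^11 mod G.
 */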

/* Update @crc with the data from @msgpoly. */
static inline crc_t
crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
		      const struct crc_clmul_consts *consts)
{
	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
}

/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
			 const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^
			  ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
	#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
	#endif
		return crc_clmul_long(msgpoly, consts);
	}
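	/*
	 * Here len < sizeof(crc_t), so only part of the CRC lines up with the
	 * message bytes.  The remaining lower-order CRC terms do not interact
	 * with the message at all; they simply shift over, which is what the
	 * final XOR below accounts for.
	 */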
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}

static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
	  const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
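	/*
	 * E.g., on a 64-bit machine with p % 8 == 3, the first min(5, len)
	 * bytes are handled by crc_clmul_update_partial() so that all loads
	 * in the loops below are long-aligned.
	 */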
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop.  Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + (x^BITS_PER_LONG)*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that
		 * uses just m0 and m1 again.  This is done by multiplying m0
		 * by the precomputed constant (x^(3*BITS_PER_LONG) mod G) and
		 * m1 by the precomputed constant (x^(2*BITS_PER_LONG) mod G),
		 * then adding the results to m2 and m3 as appropriate.  Each
		 * such multiplication produces a result twice the length of a
		 * long, which on RISC-V takes two instructions, clmul and
		 * clmulh.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
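		/*
		 * In each pair below, (clmulh, clmul) together form the
		 * double-long carryless product; which physical half holds
		 * the polynomially-higher long depends on the bit order,
		 * hence the LSB_CRC selection when combining them.
		 */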
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}
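
/*
 * A hypothetical caller (a sketch; the wrapper, constant-table, and fallback
 * names below are illustrative, not defined by this file) would gate the
 * Zbc path on a runtime feature check:
 *
 *	u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 *	{
 *		if (riscv_has_extension_likely(RISCV_ISA_EXT_ZBC))
 *			return crc_clmul(crc, p, len, &crc32_msgpoly_consts);
 *		return crc32_le_base(crc, p, len);
 *	}
 */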