// SPDX-License-Identifier: GPL-2.0-only
/*
 * Accelerated CRC32 implementation with Zbc extension.
 *
 * Copyright (C) 2024 Intel Corporation
 */

#include <asm/hwcap.h>
#include <asm/alternative-macros.h>
#include <asm/byteorder.h>

#include <linux/types.h>
#include <linux/minmax.h>
#include <linux/crc32poly.h>
#include <linux/crc32.h>
#include <linux/byteorder/generic.h>

/*
 * Refer to https://www.corsix.org/content/barrett-reduction-polynomials for
 * a better understanding of how this math works.
 *
 * let "+" denote polynomial add (XOR)
 * let "-" denote polynomial sub (XOR)
 * let "*" denote polynomial multiplication
 * let "/" denote polynomial floor division
 * let "S" denote source data, XLEN bits wide
 * let "P" denote the CRC32 polynomial
 * let "T" denote 2^(XLEN+32)
 * let "QT" denote the quotient of T/P, with the bit for 2^XLEN being implicit
 *
 * crc32(S, P)
 * => S * (2^32) - S * (2^32) / P * P
 * => lowest 32 bits of: S * (2^32) / P * P
 *    (the low 32 bits of S * (2^32) are all zero, so only the second term
 *     contributes to them)
 * => lowest 32 bits of: S * (2^32) * (T / P) / T * P
 * => lowest 32 bits of: S * (2^32) * quotient / T * P
 * => lowest 32 bits of: S * quotient / 2^XLEN * P
 * => lowest 32 bits of: (clmul_high_part(S, QT) + S) * P
 * => clmul_low_part(clmul_high_part(S, QT) + S, P)
 *
 * Of the implementations below, the BE case is the more intuitive one, since
 * the higher-order bits sit at the more significant positions.
 */
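
/*
 * For reference, the bitwise semantics of the Zbc instructions used below
 * (an editorial sketch, not part of the build; "product" is the
 * (2*XLEN-1)-bit carry-less product of a and b):
 *
 *	clmul(a, b)  = product[XLEN-1:0]
 *	clmulh(a, b) = product[2*XLEN-1:XLEN]
 *	clmulr(a, b) = product[2*XLEN-2:XLEN-1]
 *
 * or, as plain C for the low half:
 *
 *	unsigned long clmul_ref(unsigned long a, unsigned long b)
 *	{
 *		unsigned long r = 0;
 *
 *		for (int i = 0; i < __riscv_xlen; i++)
 *			if ((b >> i) & 1)
 *				r ^= a << i;
 *		return r;
 *	}
 */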

#if __riscv_xlen == 64
/* Slide by XLEN bits per iteration */
# define STEP_ORDER 3

/* Each polynomial quotient below has an implicit bit for 2^XLEN */

/* Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in LE format */
# define CRC32_POLY_QT_LE 0x5a72d812fb808b20

/* Polynomial quotient of (2^(XLEN+32))/CRC32C_POLY, in LE format */
# define CRC32C_POLY_QT_LE 0xa434f61c6f5389f8

/*
 * Polynomial quotient of (2^(XLEN+32))/CRC32_POLY, in BE format. It should
 * be the same as the bit-reversed version of CRC32_POLY_QT_LE.
 */
# define CRC32_POLY_QT_BE 0x04d101df481b4e5a

/* XOR the running CRC into the low 32 bits of the next data word (LE). */
static inline u64 crc32_le_prep(u32 crc, unsigned long const *ptr)
{
	return (u64)crc ^ (__force u64)__cpu_to_le64(*ptr);
}

static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
{
	u32 crc;

	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
	asm volatile (".option push\n"
		      ".option arch,+zbc\n"
		      "clmul %0, %1, %2\n"
		      "slli %0, %0, 1\n"
		      "xor %0, %0, %1\n"
		      "clmulr %0, %0, %3\n"
		      "srli %0, %0, 32\n"
		      ".option pop\n"
		      : "=&r" (crc)
		      : "r" (s),
			"r" (poly_qt),
			"r" ((u64)poly << 32)
		      :);
	return crc;
}
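
/*
 * A sketch of how the sequence above maps to the header derivation in the
 * bit-reflected (LE) domain: clmul plus the slli-by-1 stands in for the
 * missing "clmulrh" (reversed carry-less multiply, high part); the xor
 * with %1 adds S itself, accounting for QT's implicit 2^XLEN bit (the high
 * part of S * 2^XLEN is exactly S); and clmulr by (poly << 32) followed by
 * srli 32 performs the final multiply by P and extracts the 32 result bits.
 */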

/* XOR the running CRC into the high 32 bits of the next data word (BE). */
static inline u64 crc32_be_prep(u32 crc, unsigned long const *ptr)
{
	return ((u64)crc << 32) ^ (__force u64)__cpu_to_be64(*ptr);
}

#elif __riscv_xlen == 32
# define STEP_ORDER 2
/* Each quotient should match the upper half of its analog in RV64 */
# define CRC32_POLY_QT_LE 0xfb808b20
# define CRC32C_POLY_QT_LE 0x6f5389f8
# define CRC32_POLY_QT_BE 0x04d101df

/* XOR the running CRC into the next data word (LE). */
static inline u32 crc32_le_prep(u32 crc, unsigned long const *ptr)
{
	return crc ^ (__force u32)__cpu_to_le32(*ptr);
}

static inline u32 crc32_le_zbc(unsigned long s, u32 poly, unsigned long poly_qt)
{
	u32 crc;

	/* We don't have a "clmulrh" insn, so use clmul + slli instead. */
	asm volatile (".option push\n"
		      ".option arch,+zbc\n"
		      "clmul %0, %1, %2\n"
		      "slli %0, %0, 1\n"
		      "xor %0, %0, %1\n"
		      "clmulr %0, %0, %3\n"
		      ".option pop\n"
		      : "=&r" (crc)
		      : "r" (s),
			"r" (poly_qt),
			"r" (poly)
		      :);
	return crc;
}

/* XOR the running CRC into the next data word (BE). */
static inline u32 crc32_be_prep(u32 crc, unsigned long const *ptr)
{
	return crc ^ (__force u32)__cpu_to_be32(*ptr);
}

#else
# error "Unexpected __riscv_xlen"
#endif

static inline u32 crc32_be_zbc(unsigned long s)
{
	u32 crc;

	asm volatile (".option push\n"
		      ".option arch,+zbc\n"
		      "clmulh %0, %1, %2\n"
		      "xor %0, %0, %1\n"
		      "clmul %0, %0, %3\n"
		      ".option pop\n"
		      : "=&r" (crc)
		      : "r" (s),
			"r" (CRC32_POLY_QT_BE),
			"r" (CRC32_POLY_BE)
		      :);
	return crc;
}
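
/*
 * The BE flavour follows the header derivation directly (editorial note):
 * clmulh(S, QT) computes clmul_high_part(S, QT); the xor adds S, which
 * accounts for QT's implicit 2^XLEN bit (the high part of S * 2^XLEN is
 * exactly S); and the final clmul by P yields the product whose lowest
 * 32 bits, kept by the u32 result, are the CRC.
 */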

#define STEP (1 << STEP_ORDER)
#define OFFSET_MASK (STEP - 1)

typedef u32 (*fallback)(u32 crc, unsigned char const *p, size_t len);

static inline u32 crc32_le_unaligned(u32 crc, unsigned char const *p,
				     size_t len, u32 poly,
				     unsigned long poly_qt)
{
	size_t bits = len * 8;
	unsigned long s = 0;
	u32 crc_low = 0;

	for (int i = 0; i < len; i++)
		s = ((unsigned long)*p++ << (__riscv_xlen - 8)) | (s >> 8);

	s ^= (unsigned long)crc << (__riscv_xlen - bits);
	if (__riscv_xlen == 32 || len < sizeof(u32))
		crc_low = crc >> bits;

	crc = crc32_le_zbc(s, poly, poly_qt);
	crc ^= crc_low;

	return crc;
}
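
/*
 * Worked example for the packing above (editorial; RV64, len == 2, bytes
 * {0xAB, 0xCD}): the loop left-aligns the bytes in little-endian order,
 * giving s == 0xCDAB000000000000; the xor folds the low 16 bits of the
 * incoming CRC into those data bits, while crc_low holds the remaining
 * upper 16 CRC bits to be xored back in after the reduction.
 */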

static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
					  size_t len, u32 poly,
					  unsigned long poly_qt,
					  fallback crc_fb)
{
	size_t offset, head_len, tail_len;
	unsigned long const *p_ul;
	unsigned long s;

	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
			     RISCV_ISA_EXT_ZBC, 1)
		 : : : : legacy);

	/* Handle the unaligned head. */
	offset = (unsigned long)p & OFFSET_MASK;
	if (offset && len) {
		head_len = min(STEP - offset, len);
		crc = crc32_le_unaligned(crc, p, head_len, poly, poly_qt);
		p += head_len;
		len -= head_len;
	}

	tail_len = len & OFFSET_MASK;
	len = len >> STEP_ORDER;
	p_ul = (unsigned long const *)p;

	for (int i = 0; i < len; i++) {
		s = crc32_le_prep(crc, p_ul);
		crc = crc32_le_zbc(s, poly, poly_qt);
		p_ul++;
	}

	/* Handle the tail bytes. */
	p = (unsigned char const *)p_ul;
	if (tail_len)
		crc = crc32_le_unaligned(crc, p, tail_len, poly, poly_qt);

	return crc;

legacy:
	return crc_fb(crc, p, len);
}
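
/*
 * Example of the decomposition above (editorial): on RV64 (STEP == 8), a
 * 13-byte buffer whose start address has (p & OFFSET_MASK) == 5 is handled
 * as a 3-byte unaligned head, one aligned XLEN-bit step, and a 2-byte tail.
 */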

u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_le_generic(crc, p, len, CRC32_POLY_LE, CRC32_POLY_QT_LE,
				crc32_le_base);
}

u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_le_generic(crc, p, len, CRC32C_POLY_LE,
				CRC32C_POLY_QT_LE, __crc32c_le_base);
}
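
/*
 * Note for callers (general CRC convention, not specific to this file):
 * the standard IEEE CRC-32 of a buffer is crc32_le(~0U, buf, len) ^ ~0U,
 * i.e. an all-ones seed with the final inversion applied by the caller.
 */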

static inline u32 crc32_be_unaligned(u32 crc, unsigned char const *p,
				     size_t len)
{
	size_t bits = len * 8;
	unsigned long s = 0;
	u32 crc_low = 0;

	for (int i = 0; i < len; i++)
		s = *p++ | (s << 8);

	if (__riscv_xlen == 32 || len < sizeof(u32)) {
		s ^= crc >> (32 - bits);
		crc_low = crc << bits;
	} else {
		s ^= (unsigned long)crc << (bits - 32);
	}

	crc = crc32_be_zbc(s);
	crc ^= crc_low;

	return crc;
}
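
/*
 * BE counterpart of the earlier worked example (editorial; len == 2, bytes
 * {0xAB, 0xCD}): the loop accumulates s == 0xABCD, right-aligned in
 * big-endian order; the top 16 bits of the incoming CRC are folded into
 * the data, and its low 16 bits are carried in crc_low until after the
 * reduction.
 */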

u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
{
	size_t offset, head_len, tail_len;
	unsigned long const *p_ul;
	unsigned long s;

	asm goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
			     RISCV_ISA_EXT_ZBC, 1)
		 : : : : legacy);

	/* Handle the unaligned head. */
	offset = (unsigned long)p & OFFSET_MASK;
	if (offset && len) {
		head_len = min(STEP - offset, len);
		crc = crc32_be_unaligned(crc, p, head_len);
		p += head_len;
		len -= head_len;
	}

	tail_len = len & OFFSET_MASK;
	len = len >> STEP_ORDER;
	p_ul = (unsigned long const *)p;

	for (int i = 0; i < len; i++) {
		s = crc32_be_prep(crc, p_ul);
		crc = crc32_be_zbc(s);
		p_ul++;
	}

	/* Handle the tail bytes. */
	p = (unsigned char const *)p_ul;
	if (tail_len)
		crc = crc32_be_unaligned(crc, p, tail_len);

	return crc;

legacy:
	return crc32_be_base(crc, p, len);
}