@@ -1,8 +1,11 @@
 //
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
-// Copyright (C) 2016 Linaro Ltd <[email protected]>
-// Copyright (C) 2019 Google LLC <[email protected]>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <[email protected]>
+//          Eric Biggers <[email protected]>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
@@ -122,6 +125,13 @@
         sli     perm2.2d, perm1.2d, #56
         sli     perm3.2d, perm1.2d, #48
         sli     perm4.2d, perm1.2d, #40
+
+        // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+        movi    bd1.4h, #8, lsl #8
+        orr     bd1.2s, #1, lsl #16
+        orr     bd1.2s, #1, lsl #24
+        zip1    bd1.16b, bd1.16b, bd1.16b
+        zip1    bd1.16b, bd1.16b, bd1.16b
         .endm

         .macro  __pmull_pre_p8, bd
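The movi/orr/zip1 sequence added above composes a tbl index vector whose bytes are { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }, so the tbl in pmull16x64_p8 further down can replicate bytes 0, 8, 1 and 9 of the 16-bit operand vector, i.e. produce { w0,w0,w0,w0, y0,y0,y0,y0, w1,w1,w1,w1, y1,y1,y1,y1 }. The following is a minimal C sketch of those lane operations, not part of the patch: the helper name and test harness are invented, and a little-endian host is assumed for the lane-to-byte mapping. It only checks that the composed byte pattern matches the comment.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* zip1 Vd.16b, Vn.16b, Vm.16b: interleave the low eight bytes of n and m */
static void zip1_16b(uint8_t d[16], const uint8_t n[16], const uint8_t m[16])
{
        uint8_t t[16];

        for (int i = 0; i < 8; i++) {
                t[2 * i]     = n[i];
                t[2 * i + 1] = m[i];
        }
        memcpy(d, t, 16);
}

int main(void)
{
        uint8_t bd1[16] = { 0 };
        const uint8_t expect[16] = { 0, 0, 0, 0, 8, 8, 8, 8,
                                     1, 1, 1, 1, 9, 9, 9, 9 };

        /* movi bd1.4h, #8, lsl #8: every 16-bit lane of the low half = 0x0800 */
        for (int i = 0; i < 4; i++) {
                uint16_t h = 8 << 8;
                memcpy(&bd1[2 * i], &h, sizeof(h));
        }

        /* orr bd1.2s, #1, lsl #16 and #1, lsl #24 on each 32-bit lane */
        for (int i = 0; i < 2; i++) {
                uint32_t s;

                memcpy(&s, &bd1[4 * i], sizeof(s));
                s |= (1u << 16) | (1u << 24);
                memcpy(&bd1[4 * i], &s, sizeof(s));
        }

        /* two zip1 steps replicate each of the first four bytes four times */
        zip1_16b(bd1, bd1, bd1);
        zip1_16b(bd1, bd1, bd1);

        assert(memcmp(bd1, expect, sizeof(expect)) == 0);
        puts("bd1 == { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }");
        return 0;
}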
@@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core)
         ret
 SYM_FUNC_END(__pmull_p8_core)

+        .macro  pmull16x64_p64, a16, b64, c64
+        pmull2  \c64\().1q, \a16\().2d, \b64\().2d
+        pmull   \b64\().1q, \a16\().1d, \b64\().1d
+        .endm
+
+        /*
+         * Pairwise long polynomial multiplication of two 16-bit values
+         *
+         *   { w0, w1 }, { y0, y1 }
+         *
+         * by two 64-bit values
+         *
+         *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+         *
+         * where each vector element is a byte, ordered from least to most
+         * significant.
+         *
+         * This can be implemented using 8x8 long polynomial multiplication, by
+         * reorganizing the input so that each pairwise 8x8 multiplication
+         * produces one of the terms from the decomposition below, and
+         * combining the results of each rank and shifting them into place.
+         *
+         * Rank
+         *  0            w0*x0 ^              |        y0*z0 ^
+         *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+         *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+         *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+         *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+         *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+         *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+         *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+         *  8            w1*x7 << 64          |        y1*z7 << 64
+         *
+         * The inputs can be reorganized into
+         *
+         *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+         *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+         *
+         * and after performing 8x8->16 bit long polynomial multiplication of
+         * each of the halves of the first vector with those of the second one,
+         * we obtain the following four vectors of 16-bit elements:
+         *
+         *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+         *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+         *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+         *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+         *
+         * Results b and c can be XORed together, as the vector elements have
+         * matching ranks. Then, the final XOR (*) can be pulled forward, and
+         * applied between the halves of each of the remaining three vectors,
+         * which are then shifted into place, and combined to produce two
+         * 80-bit results.
+         *
+         * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+         * to the 64x64 bit one above, but XOR'ing the outputs together will
+         * produce the expected result, and this is sufficient in the context of
+         * this algorithm.
+         */
+        .macro  pmull16x64_p8, a16, b64, c64
+        ext     t7.16b, \b64\().16b, \b64\().16b, #1
+        tbl     t5.16b, {\a16\().16b}, bd1.16b
+        uzp1    t7.16b, \b64\().16b, t7.16b
+        bl      __pmull_p8_16x64
+        ext     \b64\().16b, t4.16b, t4.16b, #15
+        eor     \c64\().16b, t8.16b, t5.16b
+        .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+        ext     t6.16b, t5.16b, t5.16b, #8
+
+        pmull   t3.8h, t7.8b, t5.8b
+        pmull   t4.8h, t7.8b, t6.8b
+        pmull2  t5.8h, t7.16b, t5.16b
+        pmull2  t6.8h, t7.16b, t6.16b
+
+        ext     t8.16b, t3.16b, t3.16b, #8
+        eor     t4.16b, t4.16b, t6.16b
+        ext     t7.16b, t5.16b, t5.16b, #8
+        ext     t6.16b, t4.16b, t4.16b, #8
+        eor     t8.8b, t8.8b, t3.8b
+        eor     t5.8b, t5.8b, t7.8b
+        eor     t4.8b, t4.8b, t6.8b
+        ext     t5.16b, t5.16b, t5.16b, #14
+        ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
         .macro  __pmull_p8, rq, ad, bd, i
         .ifnc   \bd, fold_consts
         .err
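To see why combining the 8x8 products rank by rank reproduces the 16x64 carryless product, the decomposition in the comment above can be checked with a small standalone C model. This is an illustration only, not kernel code: the clmul helper and test operands are invented, and unsigned __int128 is a GCC/Clang extension. It multiplies one 16-bit value by one 64-bit value over GF(2) directly, then again by XOR-combining the per-rank terms from the table, and asserts that the two 80-bit results agree.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* carryless (GF(2)) multiply; operands up to 64 bits, product up to 127 bits */
static unsigned __int128 clmul(uint64_t a, uint64_t b)
{
        unsigned __int128 r = 0;

        for (int i = 0; i < 64; i++)
                if (a & (1ULL << i))
                        r ^= (unsigned __int128)b << i;
        return r;
}

int main(void)
{
        const uint16_t w = 0x8bb7;                      /* arbitrary 16-bit operand */
        const uint64_t x = 0x0123456789abcdefULL;       /* arbitrary 64-bit operand */
        const uint8_t w0 = w & 0xff, w1 = w >> 8;
        unsigned __int128 by_rank = 0;

        /* rank r combines w0*x[r] and w1*x[r-1], shifted into place by 8*r bits */
        for (int rank = 0; rank <= 8; rank++) {
                uint8_t xl = rank < 8 ? (x >> (8 * rank)) & 0xff : 0;
                uint8_t xh = rank > 0 ? (x >> (8 * (rank - 1))) & 0xff : 0;
                unsigned __int128 term = clmul(w0, xl) ^ clmul(w1, xh);

                by_rank ^= term << (8 * rank);
        }

        assert(by_rank == clmul(w, x));         /* 80-bit results match */
        puts("per-rank 8x8 decomposition == direct 16x64 carryless multiply");
        return 0;
}

The NEON code performs two such multiplications in parallel, one per 64-bit lane, which is why the w/x and y/z columns sit side by side in the rank table above.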
@@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core)
         .macro  fold_32_bytes, p, reg1, reg2
         ldp     q11, q12, [buf], #0x20

-        __pmull_\p      v8, \reg1, fold_consts, 2
-        __pmull_\p      \reg1, \reg1, fold_consts
+        pmull16x64_\p   fold_consts, \reg1, v8

 CPU_LE( rev64   v11.16b, v11.16b )
 CPU_LE( rev64   v12.16b, v12.16b )

-        __pmull_\p      v9, \reg2, fold_consts, 2
-        __pmull_\p      \reg2, \reg2, fold_consts
+        pmull16x64_\p   fold_consts, \reg2, v9

 CPU_LE( ext     v11.16b, v11.16b, v11.16b, #8 )
 CPU_LE( ext     v12.16b, v12.16b, v12.16b, #8 )
@@ -238,11 +332,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )

 // Fold src_reg into dst_reg, optionally loading the next fold constants
         .macro  fold_16_bytes, p, src_reg, dst_reg, load_next_consts
-        __pmull_\p      v8, \src_reg, fold_consts
-        __pmull_\p      \src_reg, \src_reg, fold_consts, 2
+        pmull16x64_\p   fold_consts, \src_reg, v8
         .ifnb   \load_next_consts
         ld1     {fold_consts.2d}, [fold_consts_ptr], #16
-        __pmull_pre_\p  fold_consts
         .endif
         eor     \dst_reg\().16b, \dst_reg\().16b, v8.16b
         eor     \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
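With this change, each 16-byte fold step amounts to multiplying the two 64-bit halves of the accumulator by two fold constants that fit in 16 bits (which is what makes a 16x64 multiply sufficient) and XOR'ing both products into the next block of data. The following is a hedged scalar C model of that data flow, not kernel code: the clmul helper, the function name and the constant values are invented, and unsigned __int128 is a GCC/Clang extension.

#include <stdint.h>
#include <stdio.h>

/* carryless (GF(2)) multiply; operands up to 64 bits wide */
static unsigned __int128 clmul(uint64_t a, uint64_t b)
{
        unsigned __int128 r = 0;

        for (int i = 0; i < 64; i++)
                if (a & (1ULL << i))
                        r ^= (unsigned __int128)b << i;
        return r;
}

/*
 * Model of one fold_16_bytes step after this change: the accumulator
 * acc = { low 64 bits, high 64 bits } is multiplied lane-wise by two
 * 16-bit fold constants (pmull for the low lanes, pmull2 for the high
 * lanes), and both up-to-80-bit products are XORed into dst.
 */
static void fold_16_bytes_model(uint64_t dst[2], const uint64_t acc[2],
                                uint16_t c_lo, uint16_t c_hi)
{
        unsigned __int128 lo = clmul(c_lo, acc[0]);
        unsigned __int128 hi = clmul(c_hi, acc[1]);

        dst[0] ^= (uint64_t)lo ^ (uint64_t)hi;
        dst[1] ^= (uint64_t)(lo >> 64) ^ (uint64_t)(hi >> 64);
}

int main(void)
{
        uint64_t data[2] = { 0x1111111111111111ULL, 0x2222222222222222ULL };
        const uint64_t acc[2] = { 0x3333333333333333ULL, 0x4444444444444444ULL };

        /* arbitrary illustrative values, not the real CRC-T10DIF constants */
        fold_16_bytes_model(data, acc, 0x1234, 0x0567);

        printf("folded accumulator: 0x%016llx%016llx\n",
               (unsigned long long)data[1], (unsigned long long)data[0]);
        return 0;
}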
@@ -296,7 +388,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )

         // Load the constants for folding across 128 bytes.
         ld1     {fold_consts.2d}, [fold_consts_ptr]
-        __pmull_pre_\p  fold_consts

         // Subtract 128 for the 128 data bytes just consumed. Subtract another
         // 128 to simplify the termination condition of the following loop.
@@ -318,7 +409,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
         // Fold across 64 bytes.
         add     fold_consts_ptr, fold_consts_ptr, #16
         ld1     {fold_consts.2d}, [fold_consts_ptr], #16
-        __pmull_pre_\p  fold_consts
         fold_16_bytes   \p, v0, v4
         fold_16_bytes   \p, v1, v5
         fold_16_bytes   \p, v2, v6
@@ -339,8 +429,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
         // into them, storing the result back into v7.
         b.lt    .Lfold_16_bytes_loop_done_\@
 .Lfold_16_bytes_loop_\@:
-        __pmull_\p      v8, v7, fold_consts
-        __pmull_\p      v7, v7, fold_consts, 2
+        pmull16x64_\p   fold_consts, v7, v8
         eor     v7.16b, v7.16b, v8.16b
         ldr     q0, [buf], #16
 CPU_LE( rev64   v0.16b, v0.16b )
@@ -387,9 +476,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
         bsl     v2.16b, v1.16b, v0.16b

         // Fold the first chunk into the second chunk, storing the result in v7.
-        __pmull_\p      v0, v3, fold_consts
-        __pmull_\p      v7, v3, fold_consts, 2
-        eor     v7.16b, v7.16b, v0.16b
+        pmull16x64_\p   fold_consts, v3, v0
+        eor     v7.16b, v3.16b, v0.16b
         eor     v7.16b, v7.16b, v2.16b

 .Lreduce_final_16_bytes_\@:
@@ -450,7 +538,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )

         // Load the fold-across-16-bytes constants.
         ld1     {fold_consts.2d}, [fold_consts_ptr], #16
-        __pmull_pre_\p  fold_consts

         cmp     len, #16
         b.eq    .Lreduce_final_16_bytes_\@      // len == 16