 //
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
-// Copyright (C) 2016 Linaro Ltd <[email protected]>
-// Copyright (C) 2019 Google LLC <[email protected]>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <[email protected]>
+//          Eric Biggers <[email protected]>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
 	sli	perm2.2d, perm1.2d, #56
 	sli	perm3.2d, perm1.2d, #48
 	sli	perm4.2d, perm1.2d, #40
+
+	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+	movi	bd1.4h, #8, lsl #8
+	orr	bd1.2s, #1, lsl #16
+	orr	bd1.2s, #1, lsl #24
+	zip1	bd1.16b, bd1.16b, bd1.16b
+	zip1	bd1.16b, bd1.16b, bd1.16b
 	.endm
 
 	.macro	__pmull_pre_p8, bd
@@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core)
 	ret
 SYM_FUNC_END(__pmull_p8_core)
 
+	.macro	pmull16x64_p64, a16, b64, c64
+	pmull2	\c64\().1q, \a16\().2d, \b64\().2d
+	pmull	\b64\().1q, \a16\().1d, \b64\().1d
+	.endm
+
+	/*
+	 * Pairwise long polynomial multiplication of two 16-bit values
+	 *
+	 *   { w0, w1 }, { y0, y1 }
+	 *
+	 * by two 64-bit values
+	 *
+	 *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+	 *
+	 * where each vector element is a byte, ordered from least to most
+	 * significant.
+	 *
+	 * This can be implemented using 8x8 long polynomial multiplication, by
+	 * reorganizing the input so that each pairwise 8x8 multiplication
+	 * produces one of the terms from the decomposition below, and
+	 * combining the results of each rank and shifting them into place.
+	 *
+	 * Rank
+	 *  0            w0 * x0 ^                   |        y0 * z0 ^
+	 *  1   (w0 * x1 ^ w1 * x0) <<  8 ^          |   (y0 * z1 ^ y1 * z0) <<  8 ^
+	 *  2   (w0 * x2 ^ w1 * x1) << 16 ^          |   (y0 * z2 ^ y1 * z1) << 16 ^
+	 *  3   (w0 * x3 ^ w1 * x2) << 24 ^          |   (y0 * z3 ^ y1 * z2) << 24 ^
+	 *  4   (w0 * x4 ^ w1 * x3) << 32 ^          |   (y0 * z4 ^ y1 * z3) << 32 ^
+	 *  5   (w0 * x5 ^ w1 * x4) << 40 ^          |   (y0 * z5 ^ y1 * z4) << 40 ^
+	 *  6   (w0 * x6 ^ w1 * x5) << 48 ^          |   (y0 * z6 ^ y1 * z5) << 48 ^
+	 *  7   (w0 * x7 ^ w1 * x6) << 56 ^          |   (y0 * z7 ^ y1 * z6) << 56 ^
+	 *  8            w1 * x7 << 64               |        y1 * z7 << 64
+	 *
+	 * The inputs can be reorganized into
+	 *
+	 *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+	 *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+	 *
+	 * and after performing 8x8->16 bit long polynomial multiplication of
+	 * each of the halves of the first vector with those of the second one,
+	 * we obtain the following four vectors of 16-bit elements:
+	 *
+	 *   a := { w0 * x0, w0 * x2, w0 * x4, w0 * x6 }, { y0 * z0, y0 * z2, y0 * z4, y0 * z6 }
+	 *   b := { w0 * x1, w0 * x3, w0 * x5, w0 * x7 }, { y0 * z1, y0 * z3, y0 * z5, y0 * z7 }
+	 *   c := { w1 * x0, w1 * x2, w1 * x4, w1 * x6 }, { y1 * z0, y1 * z2, y1 * z4, y1 * z6 }
+	 *   d := { w1 * x1, w1 * x3, w1 * x5, w1 * x7 }, { y1 * z1, y1 * z3, y1 * z5, y1 * z7 }
+	 *
+	 * Results b and c can be XORed together, as the vector elements have
+	 * matching ranks. Then, the final XOR (*) can be pulled forward, and
+	 * applied between the halves of each of the remaining three vectors,
+	 * which are then shifted into place, and combined to produce two
+	 * 80-bit results.
+	 *
+	 * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+	 * to the 64x64 bit one above, but XOR'ing the outputs together will
+	 * produce the expected result, and this is sufficient in the context of
+	 * this algorithm.
+	 */
+	.macro	pmull16x64_p8, a16, b64, c64
+	ext	t7.16b, \b64\().16b, \b64\().16b, #1
+	tbl	t5.16b, {\a16\().16b}, bd1.16b
+	uzp1	t7.16b, \b64\().16b, t7.16b
+	bl	__pmull_p8_16x64
+	ext	\b64\().16b, t4.16b, t4.16b, #15
+	eor	\c64\().16b, t8.16b, t5.16b
+	.endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+	ext	t6.16b, t5.16b, t5.16b, #8
+
+	pmull	t3.8h, t7.8b, t5.8b
+	pmull	t4.8h, t7.8b, t6.8b
+	pmull2	t5.8h, t7.16b, t5.16b
+	pmull2	t6.8h, t7.16b, t6.16b
+
+	ext	t8.16b, t3.16b, t3.16b, #8
+	eor	t4.16b, t4.16b, t6.16b
+	ext	t7.16b, t5.16b, t5.16b, #8
+	ext	t6.16b, t4.16b, t4.16b, #8
+	eor	t8.8b, t8.8b, t3.8b
+	eor	t5.8b, t5.8b, t7.8b
+	eor	t4.8b, t4.8b, t6.8b
+	ext	t5.16b, t5.16b, t5.16b, #14
+	ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
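As a sanity check on the decomposition documented in the comment above, here is a minimal C reference model. It is not part of the patch; clmul8(), clmul16x64() and struct p16x64 are names invented here for illustration, with clmul8() standing in for a single lane of an 8-bit PMULL.

#include <stdint.h>

/* 8x8 -> 16 bit carryless multiply: the scalar equivalent of one lane
 * of an 8-bit PMULL */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
	uint16_t r = 0;

	for (int i = 0; i < 8; i++)
		if (b & (1u << i))
			r ^= (uint16_t)a << i;
	return r;
}

/* 79-bit product, returned as two 64-bit halves, low half first */
struct p16x64 { uint64_t lo, hi; };

/*
 * Carryless multiplication of a 16-bit value w (bytes w0, w1) by a
 * 64-bit value x (bytes x0..x7).  Each byte xk contributes
 * (w0 * xk) << 8k ^ (w1 * xk) << 8(k+1), which regroups into exactly
 * the ranks listed in the comment above.
 */
static struct p16x64 clmul16x64(uint16_t w, uint64_t x)
{
	uint8_t w0 = w & 0xff, w1 = w >> 8;
	struct p16x64 res = { 0, 0 };

	for (int k = 0; k < 8; k++) {
		uint8_t xk = x >> (8 * k);
		uint64_t t = clmul8(w0, xk) ^ ((uint64_t)clmul8(w1, xk) << 8);
		int s = 8 * k;

		res.lo ^= t << s;
		if (s)			/* bits that spill past bit 63 */
			res.hi ^= t >> (64 - s);
	}
	return res;
}

Folding a 128-bit register then amounts to two such products, one per 64-bit lane, XORed into the accumulator, which is how the pmull16x64_\p results are consumed by the fold macros below.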
 	.macro	__pmull_p8, rq, ad, bd, i
 	.ifnc	\bd, fold_consts
 	.err
@@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core)
 	.macro	fold_32_bytes, p, reg1, reg2
 	ldp	q11, q12, [buf], #0x20
 
-	__pmull_\p	v8, \reg1, fold_consts, 2
-	__pmull_\p	\reg1, \reg1, fold_consts
+	pmull16x64_\p	fold_consts, \reg1, v8
 
 CPU_LE(	rev64	v11.16b, v11.16b	)
 CPU_LE(	rev64	v12.16b, v12.16b	)
 
-	__pmull_\p	v9, \reg2, fold_consts, 2
-	__pmull_\p	\reg2, \reg2, fold_consts
+	pmull16x64_\p	fold_consts, \reg2, v9
 
 CPU_LE(	ext	v11.16b, v11.16b, v11.16b, #8	)
 CPU_LE(	ext	v12.16b, v12.16b, v12.16b, #8	)
@@ -238,11 +332,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
 
 // Fold src_reg into dst_reg, optionally loading the next fold constants
 	.macro	fold_16_bytes, p, src_reg, dst_reg, load_next_consts
-	__pmull_\p	v8, \src_reg, fold_consts
-	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
+	pmull16x64_\p	fold_consts, \src_reg, v8
 	.ifnb	\load_next_consts
 	ld1	{fold_consts.2d}, [fold_consts_ptr], #16
-	__pmull_pre_\p	fold_consts
 	.endif
 	eor	\dst_reg\().16b, \dst_reg\().16b, v8.16b
 	eor	\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
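For reference, the net register-level effect of the rewritten fold step above can be sketched in C on top of the clmul16x64() model given earlier. Again an illustration only: fold_16_bytes_model(), k0 and k1 are invented names, with k0/k1 standing for the 16-bit constants held in the low and high 64-bit lanes of fold_consts.

/* Reuses struct p16x64 and clmul16x64() from the sketch above. */
static void fold_16_bytes_model(uint64_t dst[2], const uint64_t src[2],
				uint16_t k0, uint16_t k1)
{
	/* pmull16x64_\p: low lanes (pmull) and high lanes (pmull2) */
	struct p16x64 lo = clmul16x64(k0, src[0]);
	struct p16x64 hi = clmul16x64(k1, src[1]);

	/* the two eor instructions: dst ^= v8 ^ src, where src now holds
	 * the low-lane product and v8 the high-lane product */
	dst[0] ^= lo.lo ^ hi.lo;
	dst[1] ^= lo.hi ^ hi.hi;
}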
@@ -296,7 +388,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 
 	// Load the constants for folding across 128 bytes.
 	ld1	{fold_consts.2d}, [fold_consts_ptr]
-	__pmull_pre_\p	fold_consts
 
 	// Subtract 128 for the 128 data bytes just consumed. Subtract another
 	// 128 to simplify the termination condition of the following loop.
@@ -318,7 +409,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 	// Fold across 64 bytes.
 	add	fold_consts_ptr, fold_consts_ptr, #16
 	ld1	{fold_consts.2d}, [fold_consts_ptr], #16
-	__pmull_pre_\p	fold_consts
 	fold_16_bytes	\p, v0, v4
 	fold_16_bytes	\p, v1, v5
 	fold_16_bytes	\p, v2, v6
@@ -339,8 +429,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 	// into them, storing the result back into v7.
 	b.lt	.Lfold_16_bytes_loop_done_\@
 .Lfold_16_bytes_loop_\@:
-	__pmull_\p	v8, v7, fold_consts
-	__pmull_\p	v7, v7, fold_consts, 2
+	pmull16x64_\p	fold_consts, v7, v8
 	eor	v7.16b, v7.16b, v8.16b
 	ldr	q0, [buf], #16
 CPU_LE(	rev64	v0.16b, v0.16b	)
@@ -387,9 +476,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
 	bsl	v2.16b, v1.16b, v0.16b
 
 	// Fold the first chunk into the second chunk, storing the result in v7.
-	__pmull_\p	v0, v3, fold_consts
-	__pmull_\p	v7, v3, fold_consts, 2
-	eor	v7.16b, v7.16b, v0.16b
+	pmull16x64_\p	fold_consts, v3, v0
+	eor	v7.16b, v3.16b, v0.16b
 	eor	v7.16b, v7.16b, v2.16b
 
 .Lreduce_final_16_bytes_\@:
@@ -450,7 +538,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 
 	// Load the fold-across-16-bytes constants.
 	ld1	{fold_consts.2d}, [fold_consts_ptr], #16
-	__pmull_pre_\p	fold_consts
 
 	cmp	len, #16
 	b.eq	.Lreduce_final_16_bytes_\@	// len == 16