Commit 67dfb1b

ardbiesheuvel authored and herbertx committed
crypto: arm64/crct10dif - Use faster 16x64 bit polynomial multiply
The CRC-T10DIF implementation for arm64 has a version that uses 8x8
polynomial multiplication, for cores that lack the crypto extensions,
which cover the 64x64 polynomial multiplication instruction that the
algorithm was built around.

This fallback version rather naively adopted the 64x64 polynomial
multiplication algorithm that I ported from ARM for the GHASH driver,
which needs 8 PMULL8 instructions to implement one PMULL64. This is
reasonable, given that each 8-bit vector element needs to be multiplied
with each element in the other vector, producing 8 vectors with partial
results that need to be combined to yield the correct result.

However, most PMULL64 invocations in the CRC-T10DIF code involve
multiplication by a pair of 16-bit folding coefficients, and so all the
partial results from higher order bytes will be zero, and there is no
need to calculate them to begin with.

Then, the CRC-T10DIF algorithm always XORs the output values of the
PMULL64 instructions being issued in pairs, and so there is no need to
faithfully implement each individual PMULL64 instruction, as long as
XORing the results pairwise produces the expected result.

Implementing these improvements results in a speedup of 3.3x on low-end
platforms such as Raspberry Pi 4 (Cortex-A72).

Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
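
As a quick illustration of the point about 16-bit folding coefficients: in a carry-less (polynomial) product, each byte of one operand is multiplied with each byte of the other, so when one operand fits in 16 bits only the partial products involving its two low bytes can be non-zero. The Python sketch below is illustration only (clmul is a naive bit-by-bit reference multiply written for this note, not a kernel helper):

def clmul(a, b):
    # Naive bit-by-bit carry-less (GF(2) polynomial) multiplication.
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def clmul_bytewise(a, b, a_bytes, b_bytes):
    # Byte-wise decomposition: XOR together the products of every byte
    # pair, shifted into place, the way an 8x8 PMULL-based fallback
    # has to combine its partial results.
    r = 0
    for i in range(a_bytes):
        for j in range(b_bytes):
            r ^= clmul((a >> 8 * i) & 0xff, (b >> 8 * j) & 0xff) << (8 * (i + j))
    return r

import random
k = random.getrandbits(16)   # stand-in for one 16-bit folding coefficient
x = random.getrandbits(64)   # stand-in for one 64-bit half of the state

# Bytes 2..7 of k are zero, so only the rows for its two low bytes are
# needed; the remaining partial products are multiplications by zero.
assert clmul(k, x) == clmul_bytewise(k, x, a_bytes=2, b_bytes=8)
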
1 parent 7048c21 commit 67dfb1b

File tree

1 file changed: +104 -17 lines changed


arch/arm64/crypto/crct10dif-ce-core.S

Lines changed: 104 additions & 17 deletions
@@ -1,8 +1,11 @@
 //
 // Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
 //
-// Copyright (C) 2016 Linaro Ltd <[email protected]>
-// Copyright (C) 2019 Google LLC <[email protected]>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <[email protected]>
+//          Eric Biggers <[email protected]>
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License version 2 as
@@ -122,6 +125,13 @@
         sli             perm2.2d, perm1.2d, #56
         sli             perm3.2d, perm1.2d, #48
         sli             perm4.2d, perm1.2d, #40
+
+        // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+        movi            bd1.4h, #8, lsl #8
+        orr             bd1.2s, #1, lsl #16
+        orr             bd1.2s, #1, lsl #24
+        zip1            bd1.16b, bd1.16b, bd1.16b
+        zip1            bd1.16b, bd1.16b, bd1.16b
         .endm
 
         .macro          __pmull_pre_p8, bd
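
The movi/orr/zip1 sequence added in the hunk above can be cross-checked with a small Python model of the register bytes (little-endian byte lists; a sketch of the values only, not kernel code):

# Low 64 bits of bd1 after "movi bd1.4h, #8, lsl #8": four halfwords of
# 0x0800; the upper 64 bits are irrelevant for the zip1 steps below.
bd1 = [0x00, 0x08] * 4 + [0x00] * 8

# "orr bd1.2s, #1, lsl #16" and "orr bd1.2s, #1, lsl #24" turn each
# 32-bit word into 0x09010800.
for i in (2, 6):
    bd1[i] |= 0x01
for i in (3, 7):
    bd1[i] |= 0x01

def zip1(a, b):
    # ZIP1 Vd.16B, Vn.16B, Vm.16B: interleave the low 8 bytes of the
    # two source vectors.
    out = []
    for x, y in zip(a[:8], b[:8]):
        out += [x, y]
    return out

bd1 = zip1(bd1, bd1)
bd1 = zip1(bd1, bd1)
assert bd1 == [0, 0, 0, 0, 8, 8, 8, 8, 1, 1, 1, 1, 9, 9, 9, 9]

Used as a tbl index, this vector broadcasts bytes 0, 8, 1 and 9 of the coefficient register, i.e. the { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 } layout described in the comment block added further down.
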
@@ -196,6 +206,92 @@ SYM_FUNC_START_LOCAL(__pmull_p8_core)
         ret
 SYM_FUNC_END(__pmull_p8_core)
 
+        .macro          pmull16x64_p64, a16, b64, c64
+        pmull2          \c64\().1q, \a16\().2d, \b64\().2d
+        pmull           \b64\().1q, \a16\().1d, \b64\().1d
+        .endm
+
+        /*
+         * Pairwise long polynomial multiplication of two 16-bit values
+         *
+         *   { w0, w1 }, { y0, y1 }
+         *
+         * by two 64-bit values
+         *
+         *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+         *
+         * where each vector element is a byte, ordered from least to most
+         * significant.
+         *
+         * This can be implemented using 8x8 long polynomial multiplication, by
+         * reorganizing the input so that each pairwise 8x8 multiplication
+         * produces one of the terms from the decomposition below, and
+         * combining the results of each rank and shifting them into place.
+         *
+         * Rank
+         *  0            w0*x0 ^              |        y0*z0 ^
+         *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
+         *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
+         *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
+         *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
+         *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
+         *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
+         *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
+         *  8            w1*x7 << 64          |        y1*z7 << 64
+         *
+         * The inputs can be reorganized into
+         *
+         *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+         *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+         *
+         * and after performing 8x8->16 bit long polynomial multiplication of
+         * each of the halves of the first vector with those of the second one,
+         * we obtain the following four vectors of 16-bit elements:
+         *
+         *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+         *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+         *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+         *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+         *
+         * Results b and c can be XORed together, as the vector elements have
+         * matching ranks. Then, the final XOR (*) can be pulled forward, and
+         * applied between the halves of each of the remaining three vectors,
+         * which are then shifted into place, and combined to produce two
+         * 80-bit results.
+         *
+         * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+         * to the 64x64 bit one above, but XOR'ing the outputs together will
+         * produce the expected result, and this is sufficient in the context of
+         * this algorithm.
+         */
+        .macro          pmull16x64_p8, a16, b64, c64
+        ext             t7.16b, \b64\().16b, \b64\().16b, #1
+        tbl             t5.16b, {\a16\().16b}, bd1.16b
+        uzp1            t7.16b, \b64\().16b, t7.16b
+        bl              __pmull_p8_16x64
+        ext             \b64\().16b, t4.16b, t4.16b, #15
+        eor             \c64\().16b, t8.16b, t5.16b
+        .endm
+
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+        ext             t6.16b, t5.16b, t5.16b, #8
+
+        pmull           t3.8h, t7.8b, t5.8b
+        pmull           t4.8h, t7.8b, t6.8b
+        pmull2          t5.8h, t7.16b, t5.16b
+        pmull2          t6.8h, t7.16b, t6.16b
+
+        ext             t8.16b, t3.16b, t3.16b, #8
+        eor             t4.16b, t4.16b, t6.16b
+        ext             t7.16b, t5.16b, t5.16b, #8
+        ext             t6.16b, t4.16b, t4.16b, #8
+        eor             t8.8b, t8.8b, t3.8b
+        eor             t5.8b, t5.8b, t7.8b
+        eor             t4.8b, t4.8b, t6.8b
+        ext             t5.16b, t5.16b, t5.16b, #14
+        ret
+SYM_FUNC_END(__pmull_p8_16x64)
+
         .macro          __pmull_p8, rq, ad, bd, i
         .ifnc           \bd, fold_consts
         .err
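
The rank table in the comment block above can be checked numerically; the sketch below verifies the decomposition for one of the two pairwise products (clmul is again a naive reference multiply written for this check, not a kernel helper):

def clmul(a, b):
    # Naive bit-by-bit carry-less multiply, for checking only.
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

import random
w = random.getrandbits(16)                    # { w0, w1 }
x = random.getrandbits(64)                    # { x0, ..., x7 }
wb = [(w >> 8 * i) & 0xff for i in range(2)]
xb = [(x >> 8 * i) & 0xff for i in range(8)]

# XOR the nine ranks together, shifting each into place as in the table.
acc = 0
for rank in range(9):
    t0 = clmul(wb[0], xb[rank]) if rank < 8 else 0      # w0 * x_rank
    t1 = clmul(wb[1], xb[rank - 1]) if rank > 0 else 0  # w1 * x_(rank-1)
    acc ^= (t0 ^ t1) << (8 * rank)
assert acc == clmul(w, x)

The same identity holds for { y0, y1 } times { z0, ..., z7 }; the (*) note only says that the two individual outputs of pmull16x64_p8 differ from true 16x64 products, while their XOR does not.
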
@@ -218,14 +314,12 @@ SYM_FUNC_END(__pmull_p8_core)
         .macro          fold_32_bytes, p, reg1, reg2
         ldp             q11, q12, [buf], #0x20
 
-        __pmull_\p      v8, \reg1, fold_consts, 2
-        __pmull_\p      \reg1, \reg1, fold_consts
+        pmull16x64_\p   fold_consts, \reg1, v8
 
 CPU_LE( rev64 v11.16b, v11.16b )
 CPU_LE( rev64 v12.16b, v12.16b )
 
-        __pmull_\p      v9, \reg2, fold_consts, 2
-        __pmull_\p      \reg2, \reg2, fold_consts
+        pmull16x64_\p   fold_consts, \reg2, v9
 
 CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
 CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
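
Each pmull16x64_\p invocation replaces a pair of __pmull_\p calls: it multiplies the two 64-bit halves of a state register by the two 16-bit constants packed in fold_consts, and the surrounding code then XORs both products into the running state along with newly loaded data. A rough Python model of one such 128-bit fold step, assuming the usual carry-less-multiplication folding scheme (the names fold128, k_hi and k_lo are invented for this sketch, which is not byte-exact with the kernel code):

def clmul(a, b):
    # Naive bit-by-bit carry-less multiply, for illustration only.
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def fold128(state, data, k_hi, k_lo):
    # Split the 128-bit state into 64-bit halves, multiply each half by
    # its 16-bit folding constant, and XOR both products with the next
    # 128-bit block of input.  With 16-bit constants, each product fits
    # comfortably within 128 bits.
    s_hi = state >> 64
    s_lo = state & ((1 << 64) - 1)
    return clmul(s_hi, k_hi) ^ clmul(s_lo, k_lo) ^ data

This is the same pair of products that the replaced __pmull_\p call pairs computed as full 64x64 multiplies.
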
@@ -238,11 +332,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
 
         // Fold src_reg into dst_reg, optionally loading the next fold constants
         .macro          fold_16_bytes, p, src_reg, dst_reg, load_next_consts
-        __pmull_\p      v8, \src_reg, fold_consts
-        __pmull_\p      \src_reg, \src_reg, fold_consts, 2
+        pmull16x64_\p   fold_consts, \src_reg, v8
         .ifnb           \load_next_consts
         ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-        __pmull_pre_\p  fold_consts
         .endif
         eor             \dst_reg\().16b, \dst_reg\().16b, v8.16b
         eor             \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
@@ -296,7 +388,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 
         // Load the constants for folding across 128 bytes.
         ld1             {fold_consts.2d}, [fold_consts_ptr]
-        __pmull_pre_\p  fold_consts
 
         // Subtract 128 for the 128 data bytes just consumed. Subtract another
         // 128 to simplify the termination condition of the following loop.
@@ -318,7 +409,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
         // Fold across 64 bytes.
         add             fold_consts_ptr, fold_consts_ptr, #16
         ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-        __pmull_pre_\p  fold_consts
         fold_16_bytes   \p, v0, v4
         fold_16_bytes   \p, v1, v5
         fold_16_bytes   \p, v2, v6
@@ -339,8 +429,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
         // into them, storing the result back into v7.
         b.lt            .Lfold_16_bytes_loop_done_\@
 .Lfold_16_bytes_loop_\@:
-        __pmull_\p      v8, v7, fold_consts
-        __pmull_\p      v7, v7, fold_consts, 2
+        pmull16x64_\p   fold_consts, v7, v8
         eor             v7.16b, v7.16b, v8.16b
         ldr             q0, [buf], #16
 CPU_LE( rev64 v0.16b, v0.16b )
@@ -387,9 +476,8 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
         bsl             v2.16b, v1.16b, v0.16b
 
         // Fold the first chunk into the second chunk, storing the result in v7.
-        __pmull_\p      v0, v3, fold_consts
-        __pmull_\p      v7, v3, fold_consts, 2
-        eor             v7.16b, v7.16b, v0.16b
+        pmull16x64_\p   fold_consts, v3, v0
+        eor             v7.16b, v3.16b, v0.16b
         eor             v7.16b, v7.16b, v2.16b
 
 .Lreduce_final_16_bytes_\@:
@@ -450,7 +538,6 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 
         // Load the fold-across-16-bytes constants.
         ld1             {fold_consts.2d}, [fold_consts_ptr], #16
-        __pmull_pre_\p  fold_consts
 
         cmp             len, #16
         b.eq            .Lreduce_final_16_bytes_\@      // len == 16
