
Commit 779cee8

ardbiesheuvel authored and herbertx committed
crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code
The only remaining user of the fallback implementation of 64x64 polynomial
multiplication using 8x8 PMULL instructions is the final reduction from a
16 byte vector to a 16-bit CRC.

The fallback code is complicated and messy, and this reduction has little
impact on the overall performance, so instead, let's calculate the final
CRC by passing the 16 byte vector to the generic CRC-T10DIF implementation
when running the fallback version.

Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 67dfb1b commit 779cee8
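
For orientation before reading the diff: the sketch below is not part of the patch; it is a simplified C rendering of the glue-code hunk further down, and the helper name is made up for illustration. It shows the new fallback flow on CPUs without 64x64 PMULL: the NEON routine now only folds the input down to a 16 byte vector, and the final 16-bit CRC comes from running the generic table-driven implementation over that vector with a zero seed.

#include <linux/linkage.h>
#include <linux/types.h>
#include <linux/crc-t10dif.h>           /* for crc_t10dif_generic() */

/* Prototype as declared in the updated glue code below. */
asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
                                    u8 out[16]);

/* Hypothetical helper, mirroring crct10dif_update_pmull_p8() below. */
static u16 crct10dif_p8_fold_then_generic(u16 crc, const u8 *data,
                                          unsigned int length)
{
        u8 folded[16];

        /*
         * NEON fallback: fold the whole buffer into a 16 byte vector.
         * The real glue code brackets this call with kernel_neon_begin()/
         * kernel_neon_end() and uses the generic path directly for short
         * inputs or when the NEON unit is not usable.
         */
        crc_t10dif_pmull_p8(crc, data, length, folded);

        /* The final reduction to the 16-bit CRC is left to the generic code. */
        return crc_t10dif_generic(0, folded, sizeof(folded));
}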

File tree

2 files changed (+68, -194 lines)


arch/arm64/crypto/crct10dif-ce-core.S

Lines changed: 56 additions & 188 deletions
@@ -74,137 +74,18 @@
         init_crc .req w0
         buf .req x1
         len .req x2
-        fold_consts_ptr .req x3
+        fold_consts_ptr .req x5
 
         fold_consts .req v10
 
-        ad .req v14
-
-        k00_16 .req v15
-        k32_48 .req v16
-
         t3 .req v17
         t4 .req v18
         t5 .req v19
         t6 .req v20
         t7 .req v21
         t8 .req v22
-        t9 .req v23
-
-        perm1 .req v24
-        perm2 .req v25
-        perm3 .req v26
-        perm4 .req v27
-
-        bd1 .req v28
-        bd2 .req v29
-        bd3 .req v30
-        bd4 .req v31
-
-        .macro __pmull_init_p64
-        .endm
 
-        .macro __pmull_pre_p64, bd
-        .endm
-
-        .macro __pmull_init_p8
-        // k00_16 := 0x0000000000000000_000000000000ffff
-        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
-        movi k32_48.2d, #0xffffffff
-        mov k32_48.h[2], k32_48.h[0]
-        ushr k00_16.2d, k32_48.2d, #32
-
-        // prepare the permutation vectors
-        mov_q x5, 0x080f0e0d0c0b0a09
-        movi perm4.8b, #8
-        dup perm1.2d, x5
-        eor perm1.16b, perm1.16b, perm4.16b
-        ushr perm2.2d, perm1.2d, #8
-        ushr perm3.2d, perm1.2d, #16
-        ushr perm4.2d, perm1.2d, #24
-        sli perm2.2d, perm1.2d, #56
-        sli perm3.2d, perm1.2d, #48
-        sli perm4.2d, perm1.2d, #40
-
-        // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
-        movi bd1.4h, #8, lsl #8
-        orr bd1.2s, #1, lsl #16
-        orr bd1.2s, #1, lsl #24
-        zip1 bd1.16b, bd1.16b, bd1.16b
-        zip1 bd1.16b, bd1.16b, bd1.16b
-        .endm
-
-        .macro __pmull_pre_p8, bd
-        tbl bd1.16b, {\bd\().16b}, perm1.16b
-        tbl bd2.16b, {\bd\().16b}, perm2.16b
-        tbl bd3.16b, {\bd\().16b}, perm3.16b
-        tbl bd4.16b, {\bd\().16b}, perm4.16b
-        .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_core)
-.L__pmull_p8_core:
-        ext t4.8b, ad.8b, ad.8b, #1 // A1
-        ext t5.8b, ad.8b, ad.8b, #2 // A2
-        ext t6.8b, ad.8b, ad.8b, #3 // A3
-
-        pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
-        pmull t8.8h, ad.8b, bd1.8b // E = A*B1
-        pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
-        pmull t7.8h, ad.8b, bd2.8b // G = A*B2
-        pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
-        pmull t9.8h, ad.8b, bd3.8b // I = A*B3
-        pmull t3.8h, ad.8b, bd4.8b // K = A*B4
-        b 0f
-
-.L__pmull_p8_core2:
-        tbl t4.16b, {ad.16b}, perm1.16b // A1
-        tbl t5.16b, {ad.16b}, perm2.16b // A2
-        tbl t6.16b, {ad.16b}, perm3.16b // A3
-
-        pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
-        pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
-        pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
-        pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
-        pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
-        pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
-        pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
-
-0:      eor t4.16b, t4.16b, t8.16b // L = E + F
-        eor t5.16b, t5.16b, t7.16b // M = G + H
-        eor t6.16b, t6.16b, t9.16b // N = I + J
-
-        uzp1 t8.2d, t4.2d, t5.2d
-        uzp2 t4.2d, t4.2d, t5.2d
-        uzp1 t7.2d, t6.2d, t3.2d
-        uzp2 t6.2d, t6.2d, t3.2d
-
-        // t4 = (L) (P0 + P1) << 8
-        // t5 = (M) (P2 + P3) << 16
-        eor t8.16b, t8.16b, t4.16b
-        and t4.16b, t4.16b, k32_48.16b
-
-        // t6 = (N) (P4 + P5) << 24
-        // t7 = (K) (P6 + P7) << 32
-        eor t7.16b, t7.16b, t6.16b
-        and t6.16b, t6.16b, k00_16.16b
-
-        eor t8.16b, t8.16b, t4.16b
-        eor t7.16b, t7.16b, t6.16b
-
-        zip2 t5.2d, t8.2d, t4.2d
-        zip1 t4.2d, t8.2d, t4.2d
-        zip2 t3.2d, t7.2d, t6.2d
-        zip1 t6.2d, t7.2d, t6.2d
-
-        ext t4.16b, t4.16b, t4.16b, #15
-        ext t5.16b, t5.16b, t5.16b, #14
-        ext t6.16b, t6.16b, t6.16b, #13
-        ext t3.16b, t3.16b, t3.16b, #12
-
-        eor t4.16b, t4.16b, t5.16b
-        eor t6.16b, t6.16b, t3.16b
-        ret
-SYM_FUNC_END(__pmull_p8_core)
+        perm .req v27
 
         .macro pmull16x64_p64, a16, b64, c64
         pmull2 \c64\().1q, \a16\().2d, \b64\().2d
@@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core)
  */
         .macro pmull16x64_p8, a16, b64, c64
         ext t7.16b, \b64\().16b, \b64\().16b, #1
-        tbl t5.16b, {\a16\().16b}, bd1.16b
+        tbl t5.16b, {\a16\().16b}, perm.16b
         uzp1 t7.16b, \b64\().16b, t7.16b
         bl __pmull_p8_16x64
         ext \b64\().16b, t4.16b, t4.16b, #15
@@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
         ret
 SYM_FUNC_END(__pmull_p8_16x64)
 
-        .macro __pmull_p8, rq, ad, bd, i
-        .ifnc \bd, fold_consts
-        .err
-        .endif
-        mov ad.16b, \ad\().16b
-        .ifb \i
-        pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
-        .else
-        pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
-        .endif
-
-        bl .L__pmull_p8_core\i
-
-        eor \rq\().16b, \rq\().16b, t4.16b
-        eor \rq\().16b, \rq\().16b, t6.16b
-        .endm
 
         // Fold reg1, reg2 into the next 32 data bytes, storing the result back
         // into reg1, reg2.
@@ -340,16 +205,7 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
         eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
         .endm
 
-        .macro __pmull_p64, rd, rn, rm, n
-        .ifb \n
-        pmull \rd\().1q, \rn\().1d, \rm\().1d
-        .else
-        pmull2 \rd\().1q, \rn\().2d, \rm\().2d
-        .endif
-        .endm
-
         .macro crc_t10dif_pmull, p
-        __pmull_init_\p
 
         // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
         cmp len, #256
@@ -479,47 +335,7 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
         pmull16x64_\p fold_consts, v3, v0
         eor v7.16b, v3.16b, v0.16b
         eor v7.16b, v7.16b, v2.16b
-
-.Lreduce_final_16_bytes_\@:
-        // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
-        movi v2.16b, #0 // init zero register
-
-        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
-        __pmull_pre_\p fold_consts
-
-        // Fold the high 64 bits into the low 64 bits, while also multiplying by
-        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
-        // whose low 48 bits are 0.
-        ext v0.16b, v2.16b, v7.16b, #8
-        __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
-        eor v0.16b, v0.16b, v7.16b // + low bits * x^64
-
-        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
-        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
-        ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
-        mov v0.s[3], v2.s[0] // zero high 32 bits
-        __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
-        eor v0.16b, v0.16b, v1.16b // + low bits
-
-        // Load G(x) and floor(x^48 / G(x)).
-        ld1 {fold_consts.2d}, [fold_consts_ptr]
-        __pmull_pre_\p fold_consts
-
-        // Use Barrett reduction to compute the final CRC value.
-        __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
-        ushr v1.2d, v1.2d, #32 // /= x^32
-        __pmull_\p v1, v1, fold_consts // *= G(x)
-        ushr v0.2d, v0.2d, #48
-        eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
-        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
-        umov w0, v0.h[0]
-        .ifc \p, p8
-        frame_pop
-        .endif
-        ret
+        b .Lreduce_final_16_bytes_\@
 
 .Lless_than_256_bytes_\@:
         // Checksumming a buffer of length 16...255 bytes
@@ -545,6 +361,8 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
         b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
         add len, len, #16
         b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
         .endm
 
         //
@@ -554,7 +372,22 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
 //
 SYM_FUNC_START(crc_t10dif_pmull_p8)
         frame_push 1
+
+        // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+        movi perm.4h, #8, lsl #8
+        orr perm.2s, #1, lsl #16
+        orr perm.2s, #1, lsl #24
+        zip1 perm.16b, perm.16b, perm.16b
+        zip1 perm.16b, perm.16b, perm.16b
+
         crc_t10dif_pmull p8
+
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+        str q7, [x3]
+
+        frame_pop
+        ret
 SYM_FUNC_END(crc_t10dif_pmull_p8)
 
         .align 5
@@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8)
 //
 SYM_FUNC_START(crc_t10dif_pmull_p64)
         crc_t10dif_pmull p64
+
+        // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+        movi v2.16b, #0 // init zero register
+
+        // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+        ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+
+        // Fold the high 64 bits into the low 64 bits, while also multiplying by
+        // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+        // whose low 48 bits are 0.
+        ext v0.16b, v2.16b, v7.16b, #8
+        pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
+        eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+        // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+        // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+        ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+        mov v0.s[3], v2.s[0] // zero high 32 bits
+        pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
+        eor v0.16b, v0.16b, v1.16b // + low bits
+
+        // Load G(x) and floor(x^48 / G(x)).
+        ld1 {fold_consts.2d}, [fold_consts_ptr]
+
+        // Use Barrett reduction to compute the final CRC value.
+        pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
+        ushr v1.2d, v1.2d, #32 // /= x^32
+        pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
+        ushr v0.2d, v0.2d, #48
+        eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+        // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+        umov w0, v0.h[0]
+        ret
 SYM_FUNC_END(crc_t10dif_pmull_p64)
 
         .section ".rodata", "a"
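
One observation about the register assignments at the top of this file (not part of the patch text): crc_t10dif_pmull_p8() gains a fourth argument in the glue code below, so under the AAPCS64 procedure call standard x3 now carries the output pointer that the new "str q7, [x3]" stores through, which is presumably why fold_consts_ptr moves from x3 to x5. A sketch of the assumed argument-to-register mapping:

asmlinkage void crc_t10dif_pmull_p8(u16 init_crc,        /* w0 */
                                    const u8 *buf,       /* x1 */
                                    size_t len,          /* x2 */
                                    u8 out[16]);         /* x3, written by "str q7, [x3]" */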

arch/arm64/crypto/crct10dif-ce-glue.c

Lines changed: 12 additions & 6 deletions
@@ -20,7 +20,8 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
 
-asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+                                    u8 out[16]);
 asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
 
 static int crct10dif_init(struct shash_desc *desc)
@@ -34,16 +35,21 @@ static int crct10dif_init(struct shash_desc *desc)
 static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
                                      unsigned int length)
 {
-        u16 *crc = shash_desc_ctx(desc);
+        u16 *crcp = shash_desc_ctx(desc);
+        u16 crc = *crcp;
+        u8 buf[16];
 
-        if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+        if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
                 kernel_neon_begin();
-                *crc = crc_t10dif_pmull_p8(*crc, data, length);
+                crc_t10dif_pmull_p8(crc, data, length, buf);
                 kernel_neon_end();
-        } else {
-                *crc = crc_t10dif_generic(*crc, data, length);
+
+                crc = 0;
+                data = buf;
+                length = sizeof(buf);
         }
 
+        *crcp = crc_t10dif_generic(crc, data, length);
         return 0;
 }
 