Skip to content

Commit 933e897

Browse files
ebiggers authored and herbertx committed
crypto: x86/aegis128 - optimize partial block handling using SSE4.1
Optimize the code that loads and stores partial blocks, taking advantage of SSE4.1. The code is adapted from that in aes-gcm-aesni-x86_64.S.

Reviewed-by: Ondrej Mosnacek <[email protected]>
Signed-off-by: Eric Biggers <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 8da94b3 commit 933e897

File tree

1 file changed

+95
-141
lines changed

1 file changed

+95
-141
lines changed

arch/x86/crypto/aegis128-aesni-asm.S

Lines changed: 95 additions & 141 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
*
55
* Copyright (c) 2017-2018 Ondrej Mosnacek <[email protected]>
66
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
7+
* Copyright 2024 Google LLC
78
*/
89

910
#include <linux/linkage.h>
@@ -28,11 +29,11 @@
2829
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
2930
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
3031

31-
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
32-
.align 16
33-
.Laegis128_counter:
34-
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
35-
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
32+
.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32
33+
.align 32
34+
.Lzeropad_mask:
35+
.octa 0xffffffffffffffffffffffffffffffff
36+
.octa 0
3637

3738
.text
3839

@@ -55,132 +56,86 @@
5556
.endm
5657

5758
/*
58-
* __load_partial: internal ABI
59-
* input:
60-
* LEN - bytes
61-
* SRC - src
62-
* output:
63-
* MSG - message block
64-
* changed:
65-
* T0
66-
* %r8
67-
* %r9
59+
* Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register
60+
* MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8.
6861
*/
69-
SYM_FUNC_START_LOCAL(__load_partial)
70-
.set LEN, %ecx
71-
.set SRC, %rsi
72-
xor %r9d, %r9d
73-
pxor MSG, MSG
74-
75-
mov LEN, %r8d
76-
and $0x1, %r8
77-
jz .Lld_partial_1
78-
79-
mov LEN, %r8d
80-
and $0x1E, %r8
81-
add SRC, %r8
82-
mov (%r8), %r9b
83-
84-
.Lld_partial_1:
85-
mov LEN, %r8d
86-
and $0x2, %r8
87-
jz .Lld_partial_2
88-
89-
mov LEN, %r8d
90-
and $0x1C, %r8
91-
add SRC, %r8
92-
shl $0x10, %r9
93-
mov (%r8), %r9w
94-
95-
.Lld_partial_2:
96-
mov LEN, %r8d
97-
and $0x4, %r8
98-
jz .Lld_partial_4
99-
100-
mov LEN, %r8d
101-
and $0x18, %r8
102-
add SRC, %r8
103-
shl $32, %r9
104-
mov (%r8), %r8d
105-
xor %r8, %r9
106-
107-
.Lld_partial_4:
108-
movq %r9, MSG
109-
110-
mov LEN, %r8d
111-
and $0x8, %r8
112-
jz .Lld_partial_8
113-
114-
mov LEN, %r8d
115-
and $0x10, %r8
116-
add SRC, %r8
117-
pslldq $8, MSG
118-
movq (%r8), T0
119-
pxor T0, MSG
120-
121-
.Lld_partial_8:
122-
RET
123-
SYM_FUNC_END(__load_partial)
62+
.macro load_partial
63+
sub $8, %ecx /* LEN - 8 */
64+
jle .Lle8\@
65+
66+
/* Load 9 <= LEN <= 15 bytes: */
67+
movq (SRC), MSG /* Load first 8 bytes */
68+
mov (SRC, %rcx), %rax /* Load last 8 bytes */
69+
neg %ecx
70+
shl $3, %ecx
71+
shr %cl, %rax /* Discard overlapping bytes */
72+
pinsrq $1, %rax, MSG
73+
jmp .Ldone\@
74+
75+
.Lle8\@:
76+
add $4, %ecx /* LEN - 4 */
77+
jl .Llt4\@
78+
79+
/* Load 4 <= LEN <= 8 bytes: */
80+
mov (SRC), %eax /* Load first 4 bytes */
81+
mov (SRC, %rcx), %r8d /* Load last 4 bytes */
82+
jmp .Lcombine\@
83+
84+
.Llt4\@:
85+
/* Load 1 <= LEN <= 3 bytes: */
86+
add $2, %ecx /* LEN - 2 */
87+
movzbl (SRC), %eax /* Load first byte */
88+
jl .Lmovq\@
89+
movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */
90+
.Lcombine\@:
91+
shl $3, %ecx
92+
shl %cl, %r8
93+
or %r8, %rax /* Combine the two parts */
94+
.Lmovq\@:
95+
movq %rax, MSG
96+
.Ldone\@:
97+
.endm
12498

12599
/*
126-
* __store_partial: internal ABI
127-
* input:
128-
* LEN - bytes
129-
* DST - dst
130-
* output:
131-
* T0 - message block
132-
* changed:
133-
* %r8
134-
* %r9
135-
* %r10
100+
* Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer
101+
* DST. Clobbers %rax, %rcx, and %r8.
136102
*/
137-
SYM_FUNC_START_LOCAL(__store_partial)
138-
.set LEN, %ecx
139-
.set DST, %rdx
140-
mov LEN, %r8d
141-
mov DST, %r9
142-
143-
movq T0, %r10
144-
145-
cmp $8, %r8
146-
jl .Lst_partial_8
147-
148-
mov %r10, (%r9)
149-
psrldq $8, T0
150-
movq T0, %r10
151-
152-
sub $8, %r8
153-
add $8, %r9
154-
155-
.Lst_partial_8:
156-
cmp $4, %r8
157-
jl .Lst_partial_4
158-
159-
mov %r10d, (%r9)
160-
shr $32, %r10
161-
162-
sub $4, %r8
163-
add $4, %r9
164-
165-
.Lst_partial_4:
166-
cmp $2, %r8
167-
jl .Lst_partial_2
168-
169-
mov %r10w, (%r9)
170-
shr $0x10, %r10
171-
172-
sub $2, %r8
173-
add $2, %r9
174-
175-
.Lst_partial_2:
176-
cmp $1, %r8
177-
jl .Lst_partial_1
178-
179-
mov %r10b, (%r9)
180-
181-
.Lst_partial_1:
182-
RET
183-
SYM_FUNC_END(__store_partial)
103+
.macro store_partial msg
104+
sub $8, %ecx /* LEN - 8 */
105+
jl .Llt8\@
106+
107+
/* Store 8 <= LEN <= 15 bytes: */
108+
pextrq $1, \msg, %rax
109+
mov %ecx, %r8d
110+
shl $3, %ecx
111+
ror %cl, %rax
112+
mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */
113+
movq \msg, (DST) /* Store first 8 bytes */
114+
jmp .Ldone\@
115+
116+
.Llt8\@:
117+
add $4, %ecx /* LEN - 4 */
118+
jl .Llt4\@
119+
120+
/* Store 4 <= LEN <= 7 bytes: */
121+
pextrd $1, \msg, %eax
122+
mov %ecx, %r8d
123+
shl $3, %ecx
124+
ror %cl, %eax
125+
mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */
126+
movd \msg, (DST) /* Store first 4 bytes */
127+
jmp .Ldone\@
128+
129+
.Llt4\@:
130+
/* Store 1 <= LEN <= 3 bytes: */
131+
pextrb $0, \msg, 0(DST)
132+
cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */
133+
jl .Ldone\@
134+
pextrb $1, \msg, 1(DST)
135+
je .Ldone\@
136+
pextrb $2, \msg, 2(DST)
137+
.Ldone\@:
138+
.endm
184139

185140
/*
186141
* void aegis128_aesni_init(struct aegis_state *state,
@@ -453,7 +408,7 @@ SYM_FUNC_START(aegis128_aesni_enc_tail)
453408
.set STATEP, %rdi
454409
.set SRC, %rsi
455410
.set DST, %rdx
456-
.set LEN, %ecx
411+
.set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
457412
FRAME_BEGIN
458413

459414
/* load the state: */
@@ -464,7 +419,8 @@ SYM_FUNC_START(aegis128_aesni_enc_tail)
464419
movdqu 0x40(STATEP), STATE4
465420

466421
/* encrypt message: */
467-
call __load_partial
422+
mov LEN, %r9d
423+
load_partial
468424

469425
movdqa MSG, T0
470426
pxor STATE1, T0
@@ -473,7 +429,8 @@ SYM_FUNC_START(aegis128_aesni_enc_tail)
473429
pand STATE3, T1
474430
pxor T1, T0
475431

476-
call __store_partial
432+
mov %r9d, LEN
433+
store_partial T0
477434

478435
aegis128_update
479436
pxor MSG, STATE4
@@ -598,7 +555,7 @@ SYM_FUNC_START(aegis128_aesni_dec_tail)
598555
.set STATEP, %rdi
599556
.set SRC, %rsi
600557
.set DST, %rdx
601-
.set LEN, %ecx
558+
.set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
602559
FRAME_BEGIN
603560

604561
/* load the state: */
@@ -609,25 +566,22 @@ SYM_FUNC_START(aegis128_aesni_dec_tail)
609566
movdqu 0x40(STATEP), STATE4
610567

611568
/* decrypt message: */
612-
call __load_partial
569+
mov LEN, %r9d
570+
load_partial
613571

614572
pxor STATE1, MSG
615573
pxor STATE4, MSG
616574
movdqa STATE2, T1
617575
pand STATE3, T1
618576
pxor T1, MSG
619577

620-
movdqa MSG, T0
621-
call __store_partial
578+
mov %r9d, LEN
579+
store_partial MSG
622580

623581
/* mask with byte count: */
624-
movd LEN, T0
625-
punpcklbw T0, T0
626-
punpcklbw T0, T0
627-
punpcklbw T0, T0
628-
punpcklbw T0, T0
629-
movdqa .Laegis128_counter(%rip), T1
630-
pcmpgtb T1, T0
582+
lea .Lzeropad_mask+16(%rip), %rax
583+
sub %r9, %rax
584+
movdqu (%rax), T0
631585
pand T0, MSG
632586

633587
aegis128_update

0 commit comments

Comments (0)