Skip to content

Commit 571e557

Browse files
ardbiesheuvel authored and herbertx committed
crypto: arm64/aes-ce - Simplify round key load sequence
Tweak the round key logic so that they can be loaded using a single branchless sequence using overlapping loads. This is shorter and simpler, and puts the conditional branches based on the key size further apart, which might benefit microarchitectures that cannot record taken branches at every instruction. For these branches, use test-bit-branch instructions that don't clobber the condition flags. Note that none of this has any impact on performance, positive or otherwise (and the branch prediction benefit would only benefit AES-192 which nobody uses). It does make for nicer code, though. While at it, use \@ to generate the labels inside the macros, which is more robust than using fixed numbers, which could clash inadvertently. Also, bring aes-neon.S in line with these changes, including the switch to test-and-branch instructions, to avoid surprises in the future when we might start relying on the condition flags being preserved in the chaining mode wrappers in aes-modes.S Signed-off-by: Ard Biesheuvel <[email protected]> Reviewed-by: Eric Biggers <[email protected]> Signed-off-by: Herbert Xu <[email protected]>
1 parent 3f4d148 commit 571e557

File tree

2 files changed

+24
-30
lines changed

2 files changed

+24
-30
lines changed

arch/arm64/crypto/aes-ce.S

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,33 +25,28 @@
2525
.endm
2626

2727
/* preload all round keys */
28-
.macro load_round_keys, rounds, rk
29-
cmp \rounds, #12
30-
blo 2222f /* 128 bits */
31-
beq 1111f /* 192 bits */
32-
ld1 {v17.4s-v18.4s}, [\rk], #32
33-
1111: ld1 {v19.4s-v20.4s}, [\rk], #32
34-
2222: ld1 {v21.4s-v24.4s}, [\rk], #64
35-
ld1 {v25.4s-v28.4s}, [\rk], #64
36-
ld1 {v29.4s-v31.4s}, [\rk]
28+
.macro load_round_keys, rk, nr, tmp
29+
add \tmp, \rk, \nr, sxtw #4
30+
sub \tmp, \tmp, #160
31+
ld1 {v17.4s-v20.4s}, [\rk]
32+
ld1 {v21.4s-v24.4s}, [\tmp], #64
33+
ld1 {v25.4s-v28.4s}, [\tmp], #64
34+
ld1 {v29.4s-v31.4s}, [\tmp]
3735
.endm
3836

3937
/* prepare for encryption with key in rk[] */
4038
.macro enc_prepare, rounds, rk, temp
41-
mov \temp, \rk
42-
load_round_keys \rounds, \temp
39+
load_round_keys \rk, \rounds, \temp
4340
.endm
4441

4542
/* prepare for encryption (again) but with new key in rk[] */
4643
.macro enc_switch_key, rounds, rk, temp
47-
mov \temp, \rk
48-
load_round_keys \rounds, \temp
44+
load_round_keys \rk, \rounds, \temp
4945
.endm
5046

5147
/* prepare for decryption with key in rk[] */
5248
.macro dec_prepare, rounds, rk, temp
53-
mov \temp, \rk
54-
load_round_keys \rounds, \temp
49+
load_round_keys \rk, \rounds, \temp
5550
.endm
5651

5752
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
@@ -110,14 +105,13 @@
110105

111106
/* up to 5 interleaved blocks */
112107
.macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
113-
cmp \rounds, #12
114-
blo 2222f /* 128 bits */
115-
beq 1111f /* 192 bits */
108+
tbz \rounds, #2, .L\@ /* 128 bits */
116109
round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4
117110
round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4
118-
1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4
111+
tbz \rounds, #1, .L\@ /* 192 bits */
112+
round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4
119113
round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4
120-
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
114+
.L\@: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
121115
round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4
122116
.endr
123117
fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4

arch/arm64/crypto/aes-neon.S

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -99,16 +99,16 @@
9999
ld1 {v15.4s}, [\rk]
100100
add \rkp, \rk, #16
101101
mov \i, \rounds
102-
1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
102+
.La\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
103103
movi v15.16b, #0x40
104104
tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
105105
sub_bytes \in
106-
subs \i, \i, #1
106+
sub \i, \i, #1
107107
ld1 {v15.4s}, [\rkp], #16
108-
beq 2222f
108+
cbz \i, .Lb\@
109109
mix_columns \in, \enc
110-
b 1111b
111-
2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
110+
b .La\@
111+
.Lb\@: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
112112
.endm
113113

114114
.macro encrypt_block, in, rounds, rk, rkp, i
@@ -206,7 +206,7 @@
206206
ld1 {v15.4s}, [\rk]
207207
add \rkp, \rk, #16
208208
mov \i, \rounds
209-
1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
209+
.La\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
210210
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
211211
eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
212212
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
@@ -216,13 +216,13 @@
216216
tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
217217
tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
218218
sub_bytes_4x \in0, \in1, \in2, \in3
219-
subs \i, \i, #1
219+
sub \i, \i, #1
220220
ld1 {v15.4s}, [\rkp], #16
221-
beq 2222f
221+
cbz \i, .Lb\@
222222
mix_columns_2x \in0, \in1, \enc
223223
mix_columns_2x \in2, \in3, \enc
224-
b 1111b
225-
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
224+
b .La\@
225+
.Lb\@: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
226226
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
227227
eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
228228
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */

0 commit comments

Comments (0)