Skip to content

Commit 09c31cb

Browse files
committed
AArch64: Use alignment-safe Neon loads/stores in Keccak x2/x4
The Keccak x2/x4 implementations process two sequential Keccak states. The first state is 16-byte aligned, but the second starts at offset 0xc8 (200 bytes), which is only 8-byte aligned. `ldp`/`stp` of q-registers access 16-byte elements, so they need 16-byte alignment whenever unaligned accesses are not permitted; this faults on bare-metal AArch64 with the MMU disabled, where data accesses are treated as Device memory. Replace `ldp`/`stp` for the second state pointer with `ld1`/`st1` using the `.2d` arrangement, which only need element-size (8-byte) alignment and are therefore safe at offset 0xc8. Both state pointers now use post-increment addressing, removing the lane index parameter (`idx`) from the load/store macros.
Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
1 parent 6dc035b commit 09c31cb

14 files changed

+404
-390
lines changed

dev/fips202/aarch64/src/keccak_f1600_x2_v84a_asm.S

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -146,56 +146,57 @@
146146

147147
/************************ MACROS ****************************/
148148

149-
.macro load_lane out0, out1, out2, out3, idx
150-
ldp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
151-
ldp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
149+
.macro load_lane out0, out1, out2, out3
150+
ldp tmp0q, tmp1q, [input_addr], #32
151+
ld1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
152152
trn1 \out0\().2d, tmp0.2d, tmp2.2d
153153
trn2 \out1\().2d, tmp0.2d, tmp2.2d
154154
trn1 \out2\().2d, tmp1.2d, tmp3.2d
155155
trn2 \out3\().2d, tmp1.2d, tmp3.2d
156156
.endm
157157

158-
.macro load_lane_single out, idx
159-
ldr tmp0d, [input_addr, #(16*(\idx))]
160-
ldr tmp2d, [input_addr_hi, #(16*(\idx))]
158+
.macro load_lane_single out
159+
ldr tmp0d, [input_addr]
160+
ldr tmp2d, [input_addr_hi]
161161
trn1 \out\().2d, tmp0.2d, tmp2.2d
162162
.endm
163163

164-
.macro store_lane out0, out1, out2, out3, idx
164+
.macro store_lane out0, out1, out2, out3
165165
trn1 tmp0\().2d, \out0\().2d, \out1\().2d
166166
trn1 tmp1\().2d, \out2\().2d, \out3\().2d
167-
stp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
167+
stp tmp0q, tmp1q, [input_addr], #32
168168
trn2 tmp2\().2d, \out0\().2d, \out1\().2d
169169
trn2 tmp3\().2d, \out2\().2d, \out3\().2d
170-
stp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
170+
st1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
171171
.endm
172172

173-
.macro store_lane_single out, idx
174-
str \out\()d, [input_addr, #(16*(\idx))]
173+
.macro store_lane_single out
174+
str \out\()d, [input_addr]
175175
trn2 tmp0.2d, \out\().2d, \out\().2d
176-
str tmp0d, [input_addr_hi, #(16*(\idx))]
176+
str tmp0d, [input_addr_hi]
177177
.endm
178178

179179
.macro load_input
180180
add input_addr_hi, input_addr, #0xc8
181-
load_lane Aba, Abe, Abi, Abo, 0
182-
load_lane Abu, Aga, Age, Agi, 2
183-
load_lane Ago, Agu, Aka, Ake, 4
184-
load_lane Aki, Ako, Aku, Ama, 6
185-
load_lane Ame, Ami, Amo, Amu, 8
186-
load_lane Asa, Ase, Asi, Aso, 10
187-
load_lane_single Asu, 12
181+
load_lane Aba, Abe, Abi, Abo
182+
load_lane Abu, Aga, Age, Agi
183+
load_lane Ago, Agu, Aka, Ake
184+
load_lane Aki, Ako, Aku, Ama
185+
load_lane Ame, Ami, Amo, Amu
186+
load_lane Asa, Ase, Asi, Aso
187+
load_lane_single Asu
188188
.endm
189189

190190
.macro store_input
191+
sub input_addr, input_addr, #0xc0
191192
add input_addr_hi, input_addr, #0xc8
192-
store_lane Aba, Abe, Abi, Abo, 0
193-
store_lane Abu, Aga, Age, Agi, 2
194-
store_lane Ago, Agu, Aka, Ake, 4
195-
store_lane Aki, Ako, Aku, Ama, 6
196-
store_lane Ame, Ami, Amo, Amu, 8
197-
store_lane Asa, Ase, Asi, Aso, 10
198-
store_lane_single Asu, 12
193+
store_lane Aba, Abe, Abi, Abo
194+
store_lane Abu, Aga, Age, Agi
195+
store_lane Ago, Agu, Aka, Ake
196+
store_lane Aki, Ako, Aku, Ama
197+
store_lane Ame, Ami, Amo, Amu
198+
store_lane Asa, Ase, Asi, Aso
199+
store_lane_single Asu
199200
.endm
200201

201202
#define STACK_SIZE (16*4) /* VREGS (16*4) */

dev/fips202/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -188,56 +188,57 @@
188188
eor \d\().16b, vtmp.16b, \s0\().16b
189189
.endm
190190

191-
.macro load_lane out0, out1, out2, out3, idx
192-
ldp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
193-
ldp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
191+
.macro load_lane out0, out1, out2, out3
192+
ldp tmp0q, tmp1q, [input_addr], #32
193+
ld1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
194194
trn1 \out0\().2d, tmp0.2d, tmp2.2d
195195
trn2 \out1\().2d, tmp0.2d, tmp2.2d
196196
trn1 \out2\().2d, tmp1.2d, tmp3.2d
197197
trn2 \out3\().2d, tmp1.2d, tmp3.2d
198198
.endm
199199

200-
.macro load_lane_single out, idx
201-
ldr tmp0d, [input_addr, #(16*(\idx))]
202-
ldr tmp2d, [input_addr_hi, #(16*(\idx))]
200+
.macro load_lane_single out
201+
ldr tmp0d, [input_addr]
202+
ldr tmp2d, [input_addr_hi]
203203
trn1 \out\().2d, tmp0.2d, tmp2.2d
204204
.endm
205205

206-
.macro store_lane out0, out1, out2, out3, idx
206+
.macro store_lane out0, out1, out2, out3
207207
trn1 tmp0\().2d, \out0\().2d, \out1\().2d
208208
trn1 tmp1\().2d, \out2\().2d, \out3\().2d
209-
stp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
209+
stp tmp0q, tmp1q, [input_addr], #32
210210
trn2 tmp2\().2d, \out0\().2d, \out1\().2d
211211
trn2 tmp3\().2d, \out2\().2d, \out3\().2d
212-
stp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
212+
st1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
213213
.endm
214214

215-
.macro store_lane_single out, idx
216-
str \out\()d, [input_addr, #(16*(\idx))]
215+
.macro store_lane_single out
216+
str \out\()d, [input_addr]
217217
trn2 tmp0.2d, \out\().2d, \out\().2d
218-
str tmp0d, [input_addr_hi, #(16*(\idx))]
218+
str tmp0d, [input_addr_hi]
219219
.endm
220220

221221
.macro load_input_vector
222222
add input_addr_hi, input_addr, #0xc8
223-
load_lane vAba, vAbe, vAbi, vAbo, 0
224-
load_lane vAbu, vAga, vAge, vAgi, 2
225-
load_lane vAgo, vAgu, vAka, vAke, 4
226-
load_lane vAki, vAko, vAku, vAma, 6
227-
load_lane vAme, vAmi, vAmo, vAmu, 8
228-
load_lane vAsa, vAse, vAsi, vAso, 10
229-
load_lane_single vAsu, 12
223+
load_lane vAba, vAbe, vAbi, vAbo
224+
load_lane vAbu, vAga, vAge, vAgi
225+
load_lane vAgo, vAgu, vAka, vAke
226+
load_lane vAki, vAko, vAku, vAma
227+
load_lane vAme, vAmi, vAmo, vAmu
228+
load_lane vAsa, vAse, vAsi, vAso
229+
load_lane_single vAsu
230+
sub input_addr, input_addr, #0xc0
230231
.endm
231232

232233
.macro store_input_vector
233234
add input_addr_hi, input_addr, #0xc8
234-
store_lane vAba, vAbe, vAbi, vAbo, 0
235-
store_lane vAbu, vAga, vAge, vAgi, 2
236-
store_lane vAgo, vAgu, vAka, vAke, 4
237-
store_lane vAki, vAko, vAku, vAma, 6
238-
store_lane vAme, vAmi, vAmo, vAmu, 8
239-
store_lane vAsa, vAse, vAsi, vAso, 10
240-
store_lane_single vAsu, 12
235+
store_lane vAba, vAbe, vAbi, vAbo
236+
store_lane vAbu, vAga, vAge, vAgi
237+
store_lane vAgo, vAgu, vAka, vAke
238+
store_lane vAki, vAko, vAku, vAma
239+
store_lane vAme, vAmi, vAmo, vAmu
240+
store_lane vAsa, vAse, vAsi, vAso
241+
store_lane_single vAsu
241242
.endm
242243

243244
.macro store_input_scalar idx

dev/fips202/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -206,56 +206,57 @@
206206
eor \d\().16b, vtmp.16b, \s0\().16b
207207
.endm
208208

209-
.macro load_lane out0, out1, out2, out3, idx
210-
ldp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
211-
ldp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
209+
.macro load_lane out0, out1, out2, out3
210+
ldp tmp0q, tmp1q, [input_addr], #32
211+
ld1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
212212
trn1 \out0\().2d, tmp0.2d, tmp2.2d
213213
trn2 \out1\().2d, tmp0.2d, tmp2.2d
214214
trn1 \out2\().2d, tmp1.2d, tmp3.2d
215215
trn2 \out3\().2d, tmp1.2d, tmp3.2d
216216
.endm
217217

218-
.macro load_lane_single out, idx
219-
ldr tmp0d, [input_addr, #(16*(\idx))]
220-
ldr tmp2d, [input_addr_hi, #(16*(\idx))]
218+
.macro load_lane_single out
219+
ldr tmp0d, [input_addr]
220+
ldr tmp2d, [input_addr_hi]
221221
trn1 \out\().2d, tmp0.2d, tmp2.2d
222222
.endm
223223

224-
.macro store_lane out0, out1, out2, out3, idx
224+
.macro store_lane out0, out1, out2, out3
225225
trn1 tmp0\().2d, \out0\().2d, \out1\().2d
226226
trn1 tmp1\().2d, \out2\().2d, \out3\().2d
227-
stp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
227+
stp tmp0q, tmp1q, [input_addr], #32
228228
trn2 tmp2\().2d, \out0\().2d, \out1\().2d
229229
trn2 tmp3\().2d, \out2\().2d, \out3\().2d
230-
stp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
230+
st1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
231231
.endm
232232

233-
.macro store_lane_single out, idx
234-
str \out\()d, [input_addr, #(16*(\idx))]
233+
.macro store_lane_single out
234+
str \out\()d, [input_addr]
235235
trn2 tmp0.2d, \out\().2d, \out\().2d
236-
str tmp0d, [input_addr_hi, #(16*(\idx))]
236+
str tmp0d, [input_addr_hi]
237237
.endm
238238

239239
.macro load_input_vector
240240
add input_addr_hi, input_addr, #0xc8
241-
load_lane vAba, vAbe, vAbi, vAbo, 0
242-
load_lane vAbu, vAga, vAge, vAgi, 2
243-
load_lane vAgo, vAgu, vAka, vAke, 4
244-
load_lane vAki, vAko, vAku, vAma, 6
245-
load_lane vAme, vAmi, vAmo, vAmu, 8
246-
load_lane vAsa, vAse, vAsi, vAso, 10
247-
load_lane_single vAsu, 12
241+
load_lane vAba, vAbe, vAbi, vAbo
242+
load_lane vAbu, vAga, vAge, vAgi
243+
load_lane vAgo, vAgu, vAka, vAke
244+
load_lane vAki, vAko, vAku, vAma
245+
load_lane vAme, vAmi, vAmo, vAmu
246+
load_lane vAsa, vAse, vAsi, vAso
247+
load_lane_single vAsu
248+
sub input_addr, input_addr, #0xc0
248249
.endm
249250

250251
.macro store_input_vector
251252
add input_addr_hi, input_addr, #0xc8
252-
store_lane vAba, vAbe, vAbi, vAbo, 0
253-
store_lane vAbu, vAga, vAge, vAgi, 2
254-
store_lane vAgo, vAgu, vAka, vAke, 4
255-
store_lane vAki, vAko, vAku, vAma, 6
256-
store_lane vAme, vAmi, vAmo, vAmu, 8
257-
store_lane vAsa, vAse, vAsi, vAso, 10
258-
store_lane_single vAsu, 12
253+
store_lane vAba, vAbe, vAbi, vAbo
254+
store_lane vAbu, vAga, vAge, vAgi
255+
store_lane vAgo, vAgu, vAka, vAke
256+
store_lane vAki, vAko, vAku, vAma
257+
store_lane vAme, vAmi, vAmo, vAmu
258+
store_lane vAsa, vAse, vAsi, vAso
259+
store_lane_single vAsu
259260
.endm
260261

261262
.macro store_input_scalar idx

dev/fips202/aarch64_symbolic/keccak_f1600_x4_v8a_scalar_hybrid_clean.S

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -188,56 +188,57 @@
188188
eor \d\().16b, vtmp.16b, \s0\().16b
189189
.endm
190190

191-
.macro load_lane out0, out1, out2, out3, idx
192-
ldp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
193-
ldp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
191+
.macro load_lane out0, out1, out2, out3
192+
ldp tmp0q, tmp1q, [input_addr], #32
193+
ld1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
194194
trn1 \out0\().2d, tmp0.2d, tmp2.2d
195195
trn2 \out1\().2d, tmp0.2d, tmp2.2d
196196
trn1 \out2\().2d, tmp1.2d, tmp3.2d
197197
trn2 \out3\().2d, tmp1.2d, tmp3.2d
198198
.endm
199199

200-
.macro load_lane_single out, idx
201-
ldr tmp0d, [input_addr, #(16*(\idx))]
202-
ldr tmp2d, [input_addr_hi, #(16*(\idx))]
200+
.macro load_lane_single out
201+
ldr tmp0d, [input_addr]
202+
ldr tmp2d, [input_addr_hi]
203203
trn1 \out\().2d, tmp0.2d, tmp2.2d
204204
.endm
205205

206-
.macro store_lane out0, out1, out2, out3, idx
206+
.macro store_lane out0, out1, out2, out3
207207
trn1 tmp0\().2d, \out0\().2d, \out1\().2d
208208
trn1 tmp1\().2d, \out2\().2d, \out3\().2d
209-
stp tmp0q, tmp1q, [input_addr, #(16*(\idx))]
209+
stp tmp0q, tmp1q, [input_addr], #32
210210
trn2 tmp2\().2d, \out0\().2d, \out1\().2d
211211
trn2 tmp3\().2d, \out2\().2d, \out3\().2d
212-
stp tmp2q, tmp3q, [input_addr_hi, #(16*(\idx))]
212+
st1 {tmp2.2d, tmp3.2d}, [input_addr_hi], #32
213213
.endm
214214

215-
.macro store_lane_single out, idx
216-
str \out\()d, [input_addr, #(16*(\idx))]
215+
.macro store_lane_single out
216+
str \out\()d, [input_addr]
217217
trn2 tmp0.2d, \out\().2d, \out\().2d
218-
str tmp0d, [input_addr_hi, #(16*(\idx))]
218+
str tmp0d, [input_addr_hi]
219219
.endm
220220

221221
.macro load_input_vector
222222
add input_addr_hi, input_addr, #0xc8
223-
load_lane vAba, vAbe, vAbi, vAbo, 0
224-
load_lane vAbu, vAga, vAge, vAgi, 2
225-
load_lane vAgo, vAgu, vAka, vAke, 4
226-
load_lane vAki, vAko, vAku, vAma, 6
227-
load_lane vAme, vAmi, vAmo, vAmu, 8
228-
load_lane vAsa, vAse, vAsi, vAso, 10
229-
load_lane_single vAsu, 12
223+
load_lane vAba, vAbe, vAbi, vAbo
224+
load_lane vAbu, vAga, vAge, vAgi
225+
load_lane vAgo, vAgu, vAka, vAke
226+
load_lane vAki, vAko, vAku, vAma
227+
load_lane vAme, vAmi, vAmo, vAmu
228+
load_lane vAsa, vAse, vAsi, vAso
229+
load_lane_single vAsu
230+
sub input_addr, input_addr, #0xc0
230231
.endm
231232

232233
.macro store_input_vector
233234
add input_addr_hi, input_addr, #0xc8
234-
store_lane vAba, vAbe, vAbi, vAbo, 0
235-
store_lane vAbu, vAga, vAge, vAgi, 2
236-
store_lane vAgo, vAgu, vAka, vAke, 4
237-
store_lane vAki, vAko, vAku, vAma, 6
238-
store_lane vAme, vAmi, vAmo, vAmu, 8
239-
store_lane vAsa, vAse, vAsi, vAso, 10
240-
store_lane_single vAsu, 12
235+
store_lane vAba, vAbe, vAbi, vAbo
236+
store_lane vAbu, vAga, vAge, vAgi
237+
store_lane vAgo, vAgu, vAka, vAke
238+
store_lane vAki, vAko, vAku, vAma
239+
store_lane vAme, vAmi, vAmo, vAmu
240+
store_lane vAsa, vAse, vAsi, vAso
241+
store_lane_single vAsu
241242
.endm
242243

243244
.macro store_input_scalar idx

0 commit comments

Comments
 (0)