Skip to content

Commit 6dc035b

Browse files
committed
AArch64: Use alignment-safe Neon stores in rej_uniform
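The output pointer in rej_uniform_asm.S advances in multiples of 2 bytes (sizeof(int16_t)), so it is not 16-byte aligned in general. A `str qN, [ptr]` store is alignment-checked against its full 16-byte access size; that check is always enforced for Device memory, which is how data accesses are treated on bare-metal AArch64 when the MMU is disabled, so the store faults there. Replace it with `st1 {vN.8h}, [ptr]`, which is only alignment-checked against its 2-byte element size, a requirement that an int16_t output pointer already satisfies. Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>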
1 parent 3950a3e commit 6dc035b

File tree

4 files changed

+24
-24
lines changed

4 files changed

+24
-24
lines changed

dev/aarch64_opt/src/rej_uniform_asm.S

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -283,16 +283,16 @@ rej_uniform_loop48:
283283
tbl val2.16b, {val2.16b}, table2.16b
284284
tbl val3.16b, {val3.16b}, table3.16b
285285

286-
str val0q, [output_tmp]
286+
st1 {val0.8h}, [output_tmp]
287287
add output_tmp, output_tmp, ctr0, lsl #1
288288

289-
str val1q, [output_tmp]
289+
st1 {val1.8h}, [output_tmp]
290290
add output_tmp, output_tmp, ctr1, lsl #1
291291

292-
str val2q, [output_tmp]
292+
st1 {val2.8h}, [output_tmp]
293293
add output_tmp, output_tmp, ctr2, lsl #1
294294

295-
str val3q, [output_tmp]
295+
st1 {val3.8h}, [output_tmp]
296296
add output_tmp, output_tmp, ctr3, lsl #1
297297

298298
add ctr01, ctr0, ctr1
@@ -350,10 +350,10 @@ rej_uniform_loop48_end:
350350
tbl val0.16b, {val0.16b}, table0.16b
351351
tbl val1.16b, {val1.16b}, table1.16b
352352

353-
str val0q, [output_tmp]
353+
st1 {val0.8h}, [output_tmp]
354354
add output_tmp, output_tmp, ctr0, lsl #1
355355

356-
str val1q, [output_tmp]
356+
st1 {val1.8h}, [output_tmp]
357357
add output_tmp, output_tmp, ctr1, lsl #1
358358

359359
add count, count, ctr0

mlkem/src/native/aarch64/src/rej_uniform_asm.S

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,13 +141,13 @@ Lrej_uniform_loop48:
141141
tbl v17.16b, { v17.16b }, v25.16b
142142
tbl v18.16b, { v18.16b }, v26.16b
143143
tbl v19.16b, { v19.16b }, v27.16b
144-
str q16, [x7]
144+
st1 { v16.8h }, [x7]
145145
add x7, x7, x12, lsl #1
146-
str q17, [x7]
146+
st1 { v17.8h }, [x7]
147147
add x7, x7, x13, lsl #1
148-
str q18, [x7]
148+
st1 { v18.8h }, [x7]
149149
add x7, x7, x14, lsl #1
150-
str q19, [x7]
150+
st1 { v19.8h }, [x7]
151151
add x7, x7, x15, lsl #1
152152
add x12, x12, x13
153153
add x14, x14, x15
@@ -187,9 +187,9 @@ Lrej_uniform_loop48_end:
187187
fmov w13, s21
188188
tbl v16.16b, { v16.16b }, v24.16b
189189
tbl v17.16b, { v17.16b }, v25.16b
190-
str q16, [x7]
190+
st1 { v16.8h }, [x7]
191191
add x7, x7, x12, lsl #1
192-
str q17, [x7]
192+
st1 { v17.8h }, [x7]
193193
add x7, x7, x13, lsl #1
194194
add x9, x9, x12
195195
add x9, x9, x13

proofs/hol_light/aarch64/mlkem/mlkem_rej_uniform.S

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -143,13 +143,13 @@ Lrej_uniform_loop48:
143143
tbl v17.16b, { v17.16b }, v25.16b
144144
tbl v18.16b, { v18.16b }, v26.16b
145145
tbl v19.16b, { v19.16b }, v27.16b
146-
str q16, [x7]
146+
st1 { v16.8h }, [x7]
147147
add x7, x7, x12, lsl #1
148-
str q17, [x7]
148+
st1 { v17.8h }, [x7]
149149
add x7, x7, x13, lsl #1
150-
str q18, [x7]
150+
st1 { v18.8h }, [x7]
151151
add x7, x7, x14, lsl #1
152-
str q19, [x7]
152+
st1 { v19.8h }, [x7]
153153
add x7, x7, x15, lsl #1
154154
add x12, x12, x13
155155
add x14, x14, x15
@@ -189,9 +189,9 @@ Lrej_uniform_loop48_end:
189189
fmov w13, s21
190190
tbl v16.16b, { v16.16b }, v24.16b
191191
tbl v17.16b, { v17.16b }, v25.16b
192-
str q16, [x7]
192+
st1 { v16.8h }, [x7]
193193
add x7, x7, x12, lsl #1
194-
str q17, [x7]
194+
st1 { v17.8h }, [x7]
195195
add x7, x7, x13, lsl #1
196196
add x9, x9, x12
197197
add x9, x9, x13

proofs/hol_light/aarch64/proofs/mlkem_rej_uniform.ml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,13 @@ let mlkem_rej_uniform_mc = define_assert_from_elf
102102
0x4e190231; (* arm_TBL Q17 [Q17] Q25 128 *)
103103
0x4e1a0252; (* arm_TBL Q18 [Q18] Q26 128 *)
104104
0x4e1b0273; (* arm_TBL Q19 [Q19] Q27 128 *)
105-
0x3d8000f0; (* arm_STR Q16 X7 (Immediate_Offset (word 0)) *)
105+
0x4c0074f0; (* arm_STR Q16 X7 No_Offset *)
106106
0x8b0c04e7; (* arm_ADD X7 X7 (Shiftedreg X12 LSL 1) *)
107-
0x3d8000f1; (* arm_STR Q17 X7 (Immediate_Offset (word 0)) *)
107+
0x4c0074f1; (* arm_STR Q17 X7 No_Offset *)
108108
0x8b0d04e7; (* arm_ADD X7 X7 (Shiftedreg X13 LSL 1) *)
109-
0x3d8000f2; (* arm_STR Q18 X7 (Immediate_Offset (word 0)) *)
109+
0x4c0074f2; (* arm_STR Q18 X7 No_Offset *)
110110
0x8b0e04e7; (* arm_ADD X7 X7 (Shiftedreg X14 LSL 1) *)
111-
0x3d8000f3; (* arm_STR Q19 X7 (Immediate_Offset (word 0)) *)
111+
0x4c0074f3; (* arm_STR Q19 X7 No_Offset *)
112112
0x8b0f04e7; (* arm_ADD X7 X7 (Shiftedreg X15 LSL 1) *)
113113
0x8b0d018c; (* arm_ADD X12 X12 X13 *)
114114
0x8b0f01ce; (* arm_ADD X14 X14 X15 *)
@@ -146,9 +146,9 @@ let mlkem_rej_uniform_mc = define_assert_from_elf
146146
0x1e2602ad; (* arm_FMOV_FtoI W13 Q21 0 32 *)
147147
0x4e180210; (* arm_TBL Q16 [Q16] Q24 128 *)
148148
0x4e190231; (* arm_TBL Q17 [Q17] Q25 128 *)
149-
0x3d8000f0; (* arm_STR Q16 X7 (Immediate_Offset (word 0)) *)
149+
0x4c0074f0; (* arm_STR Q16 X7 No_Offset *)
150150
0x8b0c04e7; (* arm_ADD X7 X7 (Shiftedreg X12 LSL 1) *)
151-
0x3d8000f1; (* arm_STR Q17 X7 (Immediate_Offset (word 0)) *)
151+
0x4c0074f1; (* arm_STR Q17 X7 No_Offset *)
152152
0x8b0d04e7; (* arm_ADD X7 X7 (Shiftedreg X13 LSL 1) *)
153153
0x8b0c0129; (* arm_ADD X9 X9 X12 *)
154154
0x8b0d0129; (* arm_ADD X9 X9 X13 *)

0 commit comments

Comments
 (0)