Skip to content

Commit 2022b76

Browse files
committed
aarch64: Optimize instruction scheduling in gf_5vect_dot_prod_neon
Implement advanced register allocation strategy that:

- Allocates additional stack space for temporary register spilling
- Uses shared temporary registers between adjacent sections (p4_0/p4_1 for sections 1-2, p1_0/p1_1 for sections 3-4, p2_0/p2_1 for section 5), spilling each pair to the stack while it is borrowed and restoring it afterwards
- Groups table lookup operations to improve instruction-level parallelism
- Replaces the four individual `ldr` loads of source data with a single multi-register `ld1` vector load for better memory access patterns
- Removes unnecessary prefetch instructions

This optimization improves encode performance by approximately 9.4%.
1 parent 0b09cec commit 2022b76

File tree

1 file changed

+78
-59
lines changed

1 file changed

+78
-59
lines changed

erasure_code/aarch64/gf_5vect_dot_prod_neon.S

Lines changed: 78 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -180,12 +180,13 @@ cdecl(gf_5vect_dot_prod_neon):
180180
cmp x_len, #64
181181
blt .Lloop16_init
182182

183-
/* save d8 ~ d15 to stack */
184-
sub sp, sp, #64
183+
/* save d8 ~ d15 to stack and allocate additional space for register spilling */
184+
sub sp, sp, #128
185185
stp d8, d9, [sp]
186186
stp d10, d11, [sp, #16]
187187
stp d12, d13, [sp, #32]
188188
stp d14, d15, [sp, #48]
189+
/* Bytes sp+64 .. sp+127 are reserved for register spilling; the loop below spills one q-register pair (32 bytes) at [sp, #64] */
189190

190191
sub x_len, x_len, #64
191192

@@ -216,11 +217,7 @@ cdecl(gf_5vect_dot_prod_neon):
216217
ldr x_ptr, [x_src, x_vec_i]
217218
add x_ptr, x_ptr, x_pos
218219

219-
ldr q_data_0, [x_ptr], #16
220-
ldr q_data_1, [x_ptr], #16
221-
ldr q_data_2, [x_ptr], #16
222-
ldr q_data_3, [x_ptr], #16
223-
prfm pldl2keep, [x_ptr]
220+
ld1 { v_data_0.16b, v_data_1.16b, v_data_2.16b, v_data_3.16b }, [x_ptr], #64
224221

225222
movi v_mask0f.16b, #0x0f
226223
and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
@@ -236,127 +233,149 @@ cdecl(gf_5vect_dot_prod_neon):
236233
add x_tmp, x_tbl, x_vec_i, lsl #2
237234
add x_vec_i, x_vec_i, #8
238235
ldp q_gft_lo, q_gft_hi, [x_tmp]
239-
prfm pldl3keep, [x_tmp, #32]
240236
add x_tmp, x_tmp, x_vec, lsl #2
241237

238+
// Spill p4 registers to stack to free them for temporary use
239+
stp q_p4_0, q_p4_1, [sp, #64]
240+
241+
// Use p4_0 and p4_1 registers as temporaries for instruction reordering
242242
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
243243
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
244+
tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
245+
tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
246+
244247
eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
248+
eor v_p1_1.16b, v_p4_0.16b, v_p1_1.16b
245249
eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b
246-
247-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
248-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
249-
eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
250-
eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b
250+
eor v_p1_1.16b, v_p1_1.16b, v_p4_1.16b
251251

252252
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
253253
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
254+
tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
255+
tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
256+
254257
eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
258+
eor v_p1_3.16b, v_p4_0.16b, v_p1_3.16b
255259
eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b
260+
eor v_p1_3.16b, v_p1_3.16b, v_p4_1.16b
256261

257-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
258-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
259-
eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
260-
eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b
262+
// Note: Not restoring p4 registers yet as they will be used in section 2
261263

262264
/* v_p2_x */
263265
ldp q_gft_lo, q_gft_hi, [x_tmp]
264-
prfm pldl3keep, [x_tmp, #32]
265266
add x_tmp, x_tmp, x_vec, lsl #2
266267

268+
// Continue using p4_0 and p4_1 registers as temporaries for instruction reordering
267269
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
268270
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
271+
tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
272+
tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
273+
269274
eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
275+
eor v_p2_1.16b, v_p4_0.16b, v_p2_1.16b
270276
eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b
271-
272-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
273-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
274-
eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
275-
eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b
277+
eor v_p2_1.16b, v_p2_1.16b, v_p4_1.16b
276278

277279
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
278280
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
281+
tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
282+
tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
283+
279284
eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
285+
eor v_p2_3.16b, v_p4_0.16b, v_p2_3.16b
280286
eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b
287+
eor v_p2_3.16b, v_p2_3.16b, v_p4_1.16b
281288

282-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
283-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
284-
eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
285-
eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b
289+
// Now restore p4 registers after using them for sections 1 and 2
290+
ldp q_p4_0, q_p4_1, [sp, #64]
286291

287292
/* v_p3_x */
288293
ldp q_gft_lo, q_gft_hi, [x_tmp]
289-
prfm pldl3keep, [x_tmp, #32]
290294
add x_tmp, x_tmp, x_vec, lsl #2
291295

296+
// Spill p1 registers to stack to free them for temporary use
297+
stp q_p1_0, q_p1_1, [sp, #64]
298+
299+
// Use p1_0 and p1_1 registers as temporaries for instruction reordering
292300
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
293301
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
302+
tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
303+
tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
304+
294305
eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
306+
eor v_p3_1.16b, v_p1_0.16b, v_p3_1.16b
295307
eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b
296-
297-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
298-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
299-
eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
300-
eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b
308+
eor v_p3_1.16b, v_p3_1.16b, v_p1_1.16b
301309

302310
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
303311
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
312+
tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
313+
tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
314+
304315
eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
316+
eor v_p3_3.16b, v_p1_0.16b, v_p3_3.16b
305317
eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b
318+
eor v_p3_3.16b, v_p3_3.16b, v_p1_1.16b
306319

307-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
308-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
309-
eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
310-
eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b
320+
// Note: Not restoring p1 registers yet as they will be used in section 4
311321

312322
/* v_p4_x */
313323
ldp q_gft_lo, q_gft_hi, [x_tmp]
314-
prfm pldl3keep, [x_tmp, #32]
315324
add x_tmp, x_tmp, x_vec, lsl #2
316325

326+
// Continue using p1_0 and p1_1 registers as temporaries for instruction reordering
317327
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
318328
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
329+
tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
330+
tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
331+
319332
eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
333+
eor v_p4_1.16b, v_p1_0.16b, v_p4_1.16b
320334
eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b
321-
322-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
323-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
324-
eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
325-
eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b
335+
eor v_p4_1.16b, v_p4_1.16b, v_p1_1.16b
326336

327337
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
328338
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
339+
tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
340+
tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
341+
329342
eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
343+
eor v_p4_3.16b, v_p1_0.16b, v_p4_3.16b
330344
eor v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b
345+
eor v_p4_3.16b, v_p4_3.16b, v_p1_1.16b
331346

332-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
333-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
334-
eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
335-
eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b
347+
// Now restore p1 registers after using them for sections 3 and 4
348+
ldp q_p1_0, q_p1_1, [sp, #64]
336349

337350
/* v_p5_x */
338351
ldp q_gft_lo, q_gft_hi, [x_tmp]
339-
prfm pldl3keep, [x_tmp, #32]
340352

353+
// Spill p2 registers to stack to free them for temporary use
354+
stp q_p2_0, q_p2_1, [sp, #64]
355+
356+
// Use p2_0 and p2_1 registers as temporaries for instruction reordering
341357
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
342358
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
359+
tbl v_p2_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
360+
tbl v_p2_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
361+
343362
eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
363+
eor v_p5_1.16b, v_p2_0.16b, v_p5_1.16b
344364
eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b
345-
346-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
347-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
348-
eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
349-
eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b
365+
eor v_p5_1.16b, v_p5_1.16b, v_p2_1.16b
350366

351367
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
352368
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
369+
tbl v_p2_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
370+
tbl v_p2_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
371+
353372
eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
373+
eor v_p5_3.16b, v_p2_0.16b, v_p5_3.16b
354374
eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b
375+
eor v_p5_3.16b, v_p5_3.16b, v_p2_1.16b
355376

356-
tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
357-
tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
358-
eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
359-
eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b
377+
// Restore the p2 registers
378+
ldp q_p2_0, q_p2_1, [sp, #64]
360379

361380
cmp x_vec_i, x_vec
362381
blt .Lloop64_vects
@@ -387,12 +406,12 @@ cdecl(gf_5vect_dot_prod_neon):
387406
ble .Lloop64
388407

389408
.Lloop64_end:
390-
/* restore d8 ~ d15 */
409+
/* restore d8 ~ d15 and deallocate additional space for register spilling */
391410
ldp d8, d9, [sp]
392411
ldp d10, d11, [sp, #16]
393412
ldp d12, d13, [sp, #32]
394413
ldp d14, d15, [sp, #48]
395-
add sp, sp, #64
414+
add sp, sp, #128
396415

397416
add x_len, x_len, #64
398417
cmp x_pos, x_len

0 commit comments

Comments
 (0)