@@ -180,12 +180,13 @@ cdecl(gf_5vect_dot_prod_neon):
180
180
cmp x_len , # 64
181
181
blt .Lloop16_init
182
182
183
- / * save d8 ~ d15 to stack * /
184
- sub sp , sp , # 64
183
+ / * save d8 ~ d15 to stack and allocate additional space for register spilling * /
184
+ sub sp , sp , # 128
185
185
stp d8 , d9 , [ sp ]
186
186
stp d10 , d11 , [ sp , # 16 ]
187
187
stp d12 , d13 , [ sp , # 32 ]
188
188
stp d14 , d15 , [ sp , # 48 ]
189
+ / * Bytes sp + 64 .. sp + 127 are reserved for register spilling ; the spills in the 64-byte loop store one q-register pair ( 32 bytes ) at sp + 64 * /
189
190
190
191
sub x_len , x_len , # 64
191
192
@@ -216,11 +217,7 @@ cdecl(gf_5vect_dot_prod_neon):
216
217
ldr x_ptr , [ x_src , x_vec_i ]
217
218
add x_ptr , x_ptr , x_pos
218
219
219
- ldr q_data_0 , [ x_ptr ], # 16
220
- ldr q_data_1 , [ x_ptr ], # 16
221
- ldr q_data_2 , [ x_ptr ], # 16
222
- ldr q_data_3 , [ x_ptr ], # 16
223
- prfm pldl2keep , [ x_ptr ]
220
+ ld1 { v_data_0.16b , v_data_1.16b , v_data_2.16b , v_data_3.16b } , [ x_ptr ], # 64
224
221
225
222
movi v_mask0f.16b , # 0x0f
226
223
and v_data_0_lo.16b , v_data_0.16b , v_mask0f.16b
@@ -236,127 +233,149 @@ cdecl(gf_5vect_dot_prod_neon):
236
233
add x_tmp , x_tbl , x_vec_i , lsl # 2
237
234
add x_vec_i , x_vec_i , # 8
238
235
ldp q_gft_lo , q_gft_hi , [ x_tmp ]
239
- prfm pldl3keep , [ x_tmp , # 32 ]
240
236
add x_tmp , x_tmp , x_vec , lsl # 2
241
237
238
+ // Spill accumulators p4_0 and p4_1 to the stack so they can serve as extra temporaries
239
+ stp q_p4_0 , q_p4_1 , [ sp , # 64 ]
240
+
241
+ // Use p4_0 and p4_1 registers as temporaries for instruction reordering
242
242
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_0_lo.16b
243
243
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_0_hi.16b
244
+ tbl v_p4_0.16b , {v_gft_lo.16b} , v_data_1_lo.16b
245
+ tbl v_p4_1.16b , {v_gft_hi.16b} , v_data_1_hi.16b
246
+
244
247
eor v_p1_0.16b , v_tmp_lo.16b , v_p1_0.16b
248
+ eor v_p1_1.16b , v_p4_0.16b , v_p1_1.16b
245
249
eor v_p1_0.16b , v_p1_0.16b , v_tmp_hi.16b
246
-
247
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_1_lo.16b
248
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_1_hi.16b
249
- eor v_p1_1.16b , v_tmp_lo.16b , v_p1_1.16b
250
- eor v_p1_1.16b , v_p1_1.16b , v_tmp_hi.16b
250
+ eor v_p1_1.16b , v_p1_1.16b , v_p4_1.16b
251
251
252
252
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_2_lo.16b
253
253
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_2_hi.16b
254
+ tbl v_p4_0.16b , {v_gft_lo.16b} , v_data_3_lo.16b
255
+ tbl v_p4_1.16b , {v_gft_hi.16b} , v_data_3_hi.16b
256
+
254
257
eor v_p1_2.16b , v_tmp_lo.16b , v_p1_2.16b
258
+ eor v_p1_3.16b , v_p4_0.16b , v_p1_3.16b
255
259
eor v_p1_2.16b , v_p1_2.16b , v_tmp_hi.16b
260
+ eor v_p1_3.16b , v_p1_3.16b , v_p4_1.16b
256
261
257
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_3_lo.16b
258
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_3_hi.16b
259
- eor v_p1_3.16b , v_tmp_lo.16b , v_p1_3.16b
260
- eor v_p1_3.16b , v_p1_3.16b , v_tmp_hi.16b
262
+ // Note: p4_0 and p4_1 stay spilled here; the v_p2_x section below reuses them as temporaries
261
263
262
264
/ * v_p2_x * /
263
265
ldp q_gft_lo , q_gft_hi , [ x_tmp ]
264
- prfm pldl3keep , [ x_tmp , # 32 ]
265
266
add x_tmp , x_tmp , x_vec , lsl # 2
266
267
268
+ // Continue using p4_0 and p4_1 registers as temporaries for instruction reordering
267
269
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_0_lo.16b
268
270
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_0_hi.16b
271
+ tbl v_p4_0.16b , {v_gft_lo.16b} , v_data_1_lo.16b
272
+ tbl v_p4_1.16b , {v_gft_hi.16b} , v_data_1_hi.16b
273
+
269
274
eor v_p2_0.16b , v_tmp_lo.16b , v_p2_0.16b
275
+ eor v_p2_1.16b , v_p4_0.16b , v_p2_1.16b
270
276
eor v_p2_0.16b , v_p2_0.16b , v_tmp_hi.16b
271
-
272
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_1_lo.16b
273
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_1_hi.16b
274
- eor v_p2_1.16b , v_tmp_lo.16b , v_p2_1.16b
275
- eor v_p2_1.16b , v_p2_1.16b , v_tmp_hi.16b
277
+ eor v_p2_1.16b , v_p2_1.16b , v_p4_1.16b
276
278
277
279
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_2_lo.16b
278
280
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_2_hi.16b
281
+ tbl v_p4_0.16b , {v_gft_lo.16b} , v_data_3_lo.16b
282
+ tbl v_p4_1.16b , {v_gft_hi.16b} , v_data_3_hi.16b
283
+
279
284
eor v_p2_2.16b , v_tmp_lo.16b , v_p2_2.16b
285
+ eor v_p2_3.16b , v_p4_0.16b , v_p2_3.16b
280
286
eor v_p2_2.16b , v_p2_2.16b , v_tmp_hi.16b
287
+ eor v_p2_3.16b , v_p2_3.16b , v_p4_1.16b
281
288
282
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_3_lo.16b
283
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_3_hi.16b
284
- eor v_p2_3.16b , v_tmp_lo.16b , v_p2_3.16b
285
- eor v_p2_3.16b , v_p2_3.16b , v_tmp_hi.16b
289
+ // Reload the spilled p4_0 and p4_1 accumulators now that the p1 and p2 sections no longer need them as temporaries
290
+ ldp q_p4_0 , q_p4_1 , [ sp , # 64 ]
286
291
287
292
/ * v_p3_x * /
288
293
ldp q_gft_lo , q_gft_hi , [ x_tmp ]
289
- prfm pldl3keep , [ x_tmp , # 32 ]
290
294
add x_tmp , x_tmp , x_vec , lsl # 2
291
295
296
+ // Spill accumulators p1_0 and p1_1 to the stack so they can serve as extra temporaries
297
+ stp q_p1_0 , q_p1_1 , [ sp , # 64 ]
298
+
299
+ // Use p1_0 and p1_1 registers as temporaries for instruction reordering
292
300
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_0_lo.16b
293
301
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_0_hi.16b
302
+ tbl v_p1_0.16b , {v_gft_lo.16b} , v_data_1_lo.16b
303
+ tbl v_p1_1.16b , {v_gft_hi.16b} , v_data_1_hi.16b
304
+
294
305
eor v_p3_0.16b , v_tmp_lo.16b , v_p3_0.16b
306
+ eor v_p3_1.16b , v_p1_0.16b , v_p3_1.16b
295
307
eor v_p3_0.16b , v_p3_0.16b , v_tmp_hi.16b
296
-
297
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_1_lo.16b
298
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_1_hi.16b
299
- eor v_p3_1.16b , v_tmp_lo.16b , v_p3_1.16b
300
- eor v_p3_1.16b , v_p3_1.16b , v_tmp_hi.16b
308
+ eor v_p3_1.16b , v_p3_1.16b , v_p1_1.16b
301
309
302
310
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_2_lo.16b
303
311
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_2_hi.16b
312
+ tbl v_p1_0.16b , {v_gft_lo.16b} , v_data_3_lo.16b
313
+ tbl v_p1_1.16b , {v_gft_hi.16b} , v_data_3_hi.16b
314
+
304
315
eor v_p3_2.16b , v_tmp_lo.16b , v_p3_2.16b
316
+ eor v_p3_3.16b , v_p1_0.16b , v_p3_3.16b
305
317
eor v_p3_2.16b , v_p3_2.16b , v_tmp_hi.16b
318
+ eor v_p3_3.16b , v_p3_3.16b , v_p1_1.16b
306
319
307
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_3_lo.16b
308
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_3_hi.16b
309
- eor v_p3_3.16b , v_tmp_lo.16b , v_p3_3.16b
310
- eor v_p3_3.16b , v_p3_3.16b , v_tmp_hi.16b
320
+ // Note: p1_0 and p1_1 stay spilled here; the v_p4_x section below reuses them as temporaries
311
321
312
322
/ * v_p4_x * /
313
323
ldp q_gft_lo , q_gft_hi , [ x_tmp ]
314
- prfm pldl3keep , [ x_tmp , # 32 ]
315
324
add x_tmp , x_tmp , x_vec , lsl # 2
316
325
326
+ // Continue using p1_0 and p1_1 registers as temporaries for instruction reordering
317
327
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_0_lo.16b
318
328
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_0_hi.16b
329
+ tbl v_p1_0.16b , {v_gft_lo.16b} , v_data_1_lo.16b
330
+ tbl v_p1_1.16b , {v_gft_hi.16b} , v_data_1_hi.16b
331
+
319
332
eor v_p4_0.16b , v_tmp_lo.16b , v_p4_0.16b
333
+ eor v_p4_1.16b , v_p1_0.16b , v_p4_1.16b
320
334
eor v_p4_0.16b , v_p4_0.16b , v_tmp_hi.16b
321
-
322
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_1_lo.16b
323
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_1_hi.16b
324
- eor v_p4_1.16b , v_tmp_lo.16b , v_p4_1.16b
325
- eor v_p4_1.16b , v_p4_1.16b , v_tmp_hi.16b
335
+ eor v_p4_1.16b , v_p4_1.16b , v_p1_1.16b
326
336
327
337
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_2_lo.16b
328
338
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_2_hi.16b
339
+ tbl v_p1_0.16b , {v_gft_lo.16b} , v_data_3_lo.16b
340
+ tbl v_p1_1.16b , {v_gft_hi.16b} , v_data_3_hi.16b
341
+
329
342
eor v_p4_2.16b , v_tmp_lo.16b , v_p4_2.16b
343
+ eor v_p4_3.16b , v_p1_0.16b , v_p4_3.16b
330
344
eor v_p4_2.16b , v_p4_2.16b , v_tmp_hi.16b
345
+ eor v_p4_3.16b , v_p4_3.16b , v_p1_1.16b
331
346
332
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_3_lo.16b
333
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_3_hi.16b
334
- eor v_p4_3.16b , v_tmp_lo.16b , v_p4_3.16b
335
- eor v_p4_3.16b , v_p4_3.16b , v_tmp_hi.16b
347
+ // Reload the spilled p1_0 and p1_1 accumulators now that the p3 and p4 sections no longer need them as temporaries
348
+ ldp q_p1_0 , q_p1_1 , [ sp , # 64 ]
336
349
337
350
/ * v_p5_x * /
338
351
ldp q_gft_lo , q_gft_hi , [ x_tmp ]
339
- prfm pldl3keep , [ x_tmp , # 32 ]
340
352
353
+ // Spill accumulators p2_0 and p2_1 to the stack so they can serve as extra temporaries
354
+ stp q_p2_0 , q_p2_1 , [ sp , # 64 ]
355
+
356
+ // Use p2_0 and p2_1 registers as temporaries for instruction reordering
341
357
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_0_lo.16b
342
358
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_0_hi.16b
359
+ tbl v_p2_0.16b , {v_gft_lo.16b} , v_data_1_lo.16b
360
+ tbl v_p2_1.16b , {v_gft_hi.16b} , v_data_1_hi.16b
361
+
343
362
eor v_p5_0.16b , v_tmp_lo.16b , v_p5_0.16b
363
+ eor v_p5_1.16b , v_p2_0.16b , v_p5_1.16b
344
364
eor v_p5_0.16b , v_p5_0.16b , v_tmp_hi.16b
345
-
346
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_1_lo.16b
347
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_1_hi.16b
348
- eor v_p5_1.16b , v_tmp_lo.16b , v_p5_1.16b
349
- eor v_p5_1.16b , v_p5_1.16b , v_tmp_hi.16b
365
+ eor v_p5_1.16b , v_p5_1.16b , v_p2_1.16b
350
366
351
367
tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_2_lo.16b
352
368
tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_2_hi.16b
369
+ tbl v_p2_0.16b , {v_gft_lo.16b} , v_data_3_lo.16b
370
+ tbl v_p2_1.16b , {v_gft_hi.16b} , v_data_3_hi.16b
371
+
353
372
eor v_p5_2.16b , v_tmp_lo.16b , v_p5_2.16b
373
+ eor v_p5_3.16b , v_p2_0.16b , v_p5_3.16b
354
374
eor v_p5_2.16b , v_p5_2.16b , v_tmp_hi.16b
375
+ eor v_p5_3.16b , v_p5_3.16b , v_p2_1.16b
355
376
356
- tbl v_tmp_lo.16b , {v_gft_lo.16b} , v_data_3_lo.16b
357
- tbl v_tmp_hi.16b , {v_gft_hi.16b} , v_data_3_hi.16b
358
- eor v_p5_3.16b , v_tmp_lo.16b , v_p5_3.16b
359
- eor v_p5_3.16b , v_p5_3.16b , v_tmp_hi.16b
377
+ // Reload the spilled p2_0 and p2_1 accumulators now that the p5 section no longer needs them as temporaries
378
+ ldp q_p2_0 , q_p2_1 , [ sp , # 64 ]
360
379
361
380
cmp x_vec_i , x_vec
362
381
blt .Lloop64_vects
@@ -387,12 +406,12 @@ cdecl(gf_5vect_dot_prod_neon):
387
406
ble .Lloop64
388
407
389
408
.Lloop64_end:
390
- / * restore d8 ~ d15 * /
409
+ / * restore d8 ~ d15 and deallocate additional space for register spilling * /
391
410
ldp d8 , d9 , [ sp ]
392
411
ldp d10 , d11 , [ sp , # 16 ]
393
412
ldp d12 , d13 , [ sp , # 32 ]
394
413
ldp d14 , d15 , [ sp , # 48 ]
395
- add sp , sp , # 64
414
+ add sp , sp , # 128
396
415
397
416
add x_len , x_len , # 64
398
417
cmp x_pos , x_len
0 commit comments