Skip to content

Commit 0b09cec

Browse files
committed
aarch64: Optimize instruction scheduling in gf_4vect_dot_prod_neon
Improve performance by:
- Grouping table lookup (tbl) instructions to enhance instruction-level parallelism
- Replacing individual loads with paired loads (ldp) for better memory access patterns
- Removing unnecessary prefetch instructions
- Reordering operations to reduce pipeline stalls and data dependencies

This optimization improves decode performance by approximately 6.6%.
1 parent d414b27 commit 0b09cec

File tree

1 file changed

+42
-53
lines changed

1 file changed

+42
-53
lines changed

erasure_code/aarch64/gf_4vect_dot_prod_neon.S

Lines changed: 42 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -186,126 +186,115 @@ cdecl(gf_4vect_dot_prod_neon):
186186
add x_tbl3, x_tbl2, x_vec, lsl #2
187187
add x_tbl4, x_tbl3, x_vec, lsl #2
188188
mov x_vec_i, #0
189-
prfm pldl1keep, [x_tbl1]
190-
prfm pldl1keep, [x_tbl2]
191-
prfm pldl1keep, [x_tbl3]
192-
prfm pldl1keep, [x_tbl4]
193189

194190
.Lloop64_vects:
195191
ldr x_ptr, [x_src, x_vec_i]
196192
add x_vec_i, x_vec_i, #8
197193
add x_ptr, x_ptr, x_pos
198194

199-
ldr q_data_0, [x_ptr], #16
200-
ldr q_data_1, [x_ptr], #16
195+
ldp q_data_0, q_data_1, [x_ptr], #32
201196
ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32
202197
ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32
203198
ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32
204199
ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32
205-
ldr q_data_2, [x_ptr], #16
206-
ldr q_data_3, [x_ptr], #16
207-
208-
prfm pldl1strm, [x_ptr]
209-
prfm pldl1keep, [x_tbl1]
210-
prfm pldl1keep, [x_tbl2]
211-
prfm pldl1keep, [x_tbl3]
212-
prfm pldl1keep, [x_tbl4]
213200

214201
/* data_0 */
215202
and v_tmp1.16b, v_data_0.16b, v_mask0f.16b
216203
ushr v_data_0.16b, v_data_0.16b, #4
217204

218205
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
219206
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
207+
tbl v_data_3.16b, {v_gft2_lo.16b}, v_tmp1.16b
208+
tbl v_data_2.16b, {v_gft2_hi.16b}, v_data_0.16b
209+
220210
eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
211+
eor v_p2_0.16b, v_data_3.16b, v_p2_0.16b
221212
eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
222-
223-
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
224-
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
225-
eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
226-
eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
213+
eor v_p2_0.16b, v_p2_0.16b, v_data_2.16b
227214

228215
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
229216
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
217+
tbl v_data_2.16b, {v_gft4_lo.16b}, v_tmp1.16b
218+
tbl v_data_3.16b, {v_gft4_hi.16b}, v_data_0.16b
219+
230220
eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
231221
eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
232-
233-
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
234-
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
235-
eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
236-
eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b
222+
eor v_p4_0.16b, v_data_2.16b, v_p4_0.16b
223+
eor v_p4_0.16b, v_p4_0.16b, v_data_3.16b
237224

238225
/* data_1 */
239226
and v_tmp1.16b, v_data_1.16b, v_mask0f.16b
240227
ushr v_data_1.16b, v_data_1.16b, #4
241228

242229
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
243230
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
231+
tbl v_data_2.16b, {v_gft2_lo.16b}, v_tmp1.16b
232+
tbl v_data_3.16b, {v_gft2_hi.16b}, v_data_1.16b
233+
244234
eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
235+
eor v_p2_1.16b, v_data_2.16b, v_p2_1.16b
245236
eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
246-
247-
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
248-
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
249-
eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
250-
eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
237+
eor v_p2_1.16b, v_p2_1.16b, v_data_3.16b
251238

252239
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
253240
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
241+
tbl v_data_2.16b, {v_gft4_lo.16b}, v_tmp1.16b
242+
tbl v_data_3.16b, {v_gft4_hi.16b}, v_data_1.16b
243+
254244
eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
255245
eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
246+
eor v_p4_1.16b, v_data_2.16b, v_p4_1.16b
247+
eor v_p4_1.16b, v_p4_1.16b, v_data_3.16b
256248

257-
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
258-
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
259-
eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
260-
eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b
249+
ldp q_data_2, q_data_3, [x_ptr], #32
261250

262251
/* data_2 */
263252
and v_tmp1.16b, v_data_2.16b, v_mask0f.16b
264253
ushr v_data_2.16b, v_data_2.16b, #4
265254

266255
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
267256
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
257+
tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b
258+
tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_2.16b
259+
268260
eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
261+
eor v_p2_2.16b, v_data_0.16b, v_p2_2.16b
269262
eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
270-
271-
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
272-
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
273-
eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
274-
eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
263+
eor v_p2_2.16b, v_p2_2.16b, v_data_1.16b
275264

276265
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
277266
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
267+
tbl v_data_0.16b, {v_gft4_lo.16b}, v_tmp1.16b
268+
tbl v_data_1.16b, {v_gft4_hi.16b}, v_data_2.16b
269+
278270
eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
279271
eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
280-
281-
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
282-
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
283-
eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
284-
eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b
272+
eor v_p4_2.16b, v_data_0.16b, v_p4_2.16b
273+
eor v_p4_2.16b, v_p4_2.16b, v_data_1.16b
285274

286275
/* data_3 */
287276
and v_tmp1.16b, v_data_3.16b, v_mask0f.16b
288277
ushr v_data_3.16b, v_data_3.16b, #4
289278

290279
tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
291280
tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
281+
tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b
282+
tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_3.16b
283+
292284
eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
285+
eor v_p2_3.16b, v_data_0.16b, v_p2_3.16b
293286
eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
294-
295-
tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
296-
tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
297-
eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
298-
eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
287+
eor v_p2_3.16b, v_p2_3.16b, v_data_1.16b
299288

300289
tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
301290
tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
291+
tbl v_data_0.16b, {v_gft4_lo.16b}, v_tmp1.16b
292+
tbl v_data_1.16b, {v_gft4_hi.16b}, v_data_3.16b
293+
302294
eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
303295
eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
304-
305-
tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
306-
tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
307-
eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
308-
eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b
296+
eor v_p4_3.16b, v_data_0.16b, v_p4_3.16b
297+
eor v_p4_3.16b, v_p4_3.16b, v_data_1.16b
309298

310299
cmp x_vec_i, x_vec
311300
blt .Lloop64_vects

0 commit comments

Comments
 (0)