From 4e27f0b9bfb8678f4d3eeaf8ff3a3b6757079a25 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Mon, 21 Jul 2025 11:46:28 -0500 Subject: [PATCH 1/5] aarch64: Optimize instruction scheduling in gf_4vect_dot_prod_neon Improve performance by: - Grouping table lookup (tbl) instructions to enhance instruction-level parallelism - Replacing individual loads with paired loads (ldp) for better memory access patterns - Removing unnecessary prefetch instructions - Reordering operations to reduce pipeline stalls and data dependencies This optimization improves decode performance by approximately 6.6%. Signed-off-by: Jonathan Swinney --- erasure_code/aarch64/gf_4vect_dot_prod_neon.S | 95 ++++++++----------- 1 file changed, 42 insertions(+), 53 deletions(-) diff --git a/erasure_code/aarch64/gf_4vect_dot_prod_neon.S b/erasure_code/aarch64/gf_4vect_dot_prod_neon.S index 6204102f..4d38295d 100644 --- a/erasure_code/aarch64/gf_4vect_dot_prod_neon.S +++ b/erasure_code/aarch64/gf_4vect_dot_prod_neon.S @@ -186,30 +186,17 @@ cdecl(gf_4vect_dot_prod_neon): add x_tbl3, x_tbl2, x_vec, lsl #2 add x_tbl4, x_tbl3, x_vec, lsl #2 mov x_vec_i, #0 - prfm pldl1keep, [x_tbl1] - prfm pldl1keep, [x_tbl2] - prfm pldl1keep, [x_tbl3] - prfm pldl1keep, [x_tbl4] .Lloop64_vects: ldr x_ptr, [x_src, x_vec_i] add x_vec_i, x_vec_i, #8 add x_ptr, x_ptr, x_pos - ldr q_data_0, [x_ptr], #16 - ldr q_data_1, [x_ptr], #16 + ldp q_data_0, q_data_1, [x_ptr], #32 ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 ldp q_gft4_lo, q_gft4_hi, [x_tbl4], #32 - ldr q_data_2, [x_ptr], #16 - ldr q_data_3, [x_ptr], #16 - - prfm pldl1strm, [x_ptr] - prfm pldl1keep, [x_tbl1] - prfm pldl1keep, [x_tbl2] - prfm pldl1keep, [x_tbl3] - prfm pldl1keep, [x_tbl4] /* data_0 */ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b @@ -217,23 +204,23 @@ cdecl(gf_4vect_dot_prod_neon): tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + tbl v_data_3.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_2.16b, {v_gft2_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p2_0.16b, v_data_3.16b, v_p2_0.16b eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b - eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b - eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b + eor v_p2_0.16b, v_p2_0.16b, v_data_2.16b tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b + tbl v_data_2.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_data_3.16b, {v_gft4_hi.16b}, v_data_0.16b + eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b - eor v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b - eor v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b + eor v_p4_0.16b, v_data_2.16b, v_p4_0.16b + eor v_p4_0.16b, v_p4_0.16b, v_data_3.16b /* data_1 */ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b @@ -241,23 +228,25 @@ cdecl(gf_4vect_dot_prod_neon): tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + tbl v_data_2.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_3.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p2_1.16b, v_data_2.16b, v_p2_1.16b eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b - eor 
v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b - eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b + eor v_p2_1.16b, v_p2_1.16b, v_data_3.16b tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b + tbl v_data_2.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_data_3.16b, {v_gft4_hi.16b}, v_data_1.16b + eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b + eor v_p4_1.16b, v_data_2.16b, v_p4_1.16b + eor v_p4_1.16b, v_p4_1.16b, v_data_3.16b - tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b - eor v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b - eor v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b + ldp q_data_2, q_data_3, [x_ptr], #32 /* data_2 */ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b @@ -265,23 +254,23 @@ cdecl(gf_4vect_dot_prod_neon): tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p2_2.16b, v_data_0.16b, v_p2_2.16b eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b - eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b - eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b + eor v_p2_2.16b, v_p2_2.16b, v_data_1.16b tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b + tbl v_data_0.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_data_1.16b, {v_gft4_hi.16b}, v_data_2.16b + eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b - eor v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b - eor v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b + eor v_p4_2.16b, v_data_0.16b, v_p4_2.16b + eor v_p4_2.16b, v_p4_2.16b, v_data_1.16b /* data_3 */ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b @@ -289,23 +278,23 @@ cdecl(gf_4vect_dot_prod_neon): tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p2_3.16b, v_data_0.16b, v_p2_3.16b eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b - eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b - eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + eor v_p2_3.16b, v_p2_3.16b, v_data_1.16b tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b + tbl v_data_0.16b, {v_gft4_lo.16b}, v_tmp1.16b + tbl v_data_1.16b, {v_gft4_hi.16b}, v_data_3.16b + eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b - eor v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b - eor v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b + eor v_p4_3.16b, v_data_0.16b, v_p4_3.16b + eor v_p4_3.16b, v_p4_3.16b, v_data_1.16b cmp x_vec_i, x_vec blt .Lloop64_vects From a4f7b40b939c214178ce38952021e404c3ec3023 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Mon, 21 Jul 2025 11:46:38 -0500 Subject: [PATCH 2/5] aarch64: Optimize instruction scheduling in gf_5vect_dot_prod_neon Implement advanced register allocation strategy that: - Allocates additional stack space for temporary register spilling - Uses shared 
temporary registers between adjacent sections (p4 for sections 1-2, p1 for sections 3-4, p2 for section 5) - Groups table lookup operations to improve instruction-level parallelism - Replaces individual loads with vector loads for better memory access patterns - Removes unnecessary prefetch instructions This optimization improves encode performance by approximately 9.4%. Signed-off-by: Jonathan Swinney --- erasure_code/aarch64/gf_5vect_dot_prod_neon.S | 137 ++++++++++-------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/erasure_code/aarch64/gf_5vect_dot_prod_neon.S b/erasure_code/aarch64/gf_5vect_dot_prod_neon.S index 13166665..14485257 100644 --- a/erasure_code/aarch64/gf_5vect_dot_prod_neon.S +++ b/erasure_code/aarch64/gf_5vect_dot_prod_neon.S @@ -180,12 +180,13 @@ cdecl(gf_5vect_dot_prod_neon): cmp x_len, #64 blt .Lloop16_init - /* save d8 ~ d15 to stack */ - sub sp, sp, #64 + /* save d8 ~ d15 to stack and allocate additional space for register spilling */ + sub sp, sp, #128 stp d8, d9, [sp] stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] + /* Space from sp+64 to sp+128 is reserved for register spilling */ sub x_len, x_len, #64 @@ -216,11 +217,7 @@ cdecl(gf_5vect_dot_prod_neon): ldr x_ptr, [x_src, x_vec_i] add x_ptr, x_ptr, x_pos - ldr q_data_0, [x_ptr], #16 - ldr q_data_1, [x_ptr], #16 - ldr q_data_2, [x_ptr], #16 - ldr q_data_3, [x_ptr], #16 - prfm pldl2keep, [x_ptr] + ld1 { v_data_0.16b, v_data_1.16b, v_data_2.16b, v_data_3.16b }, [x_ptr], #64 movi v_mask0f.16b, #0x0f and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b @@ -236,127 +233,149 @@ cdecl(gf_5vect_dot_prod_neon): add x_tmp, x_tbl, x_vec_i, lsl #2 add x_vec_i, x_vec_i, #8 ldp q_gft_lo, q_gft_hi, [x_tmp] - prfm pldl3keep, [x_tmp, #32] add x_tmp, x_tmp, x_vec, lsl #2 + // Spill p4 registers to stack to free them for temporary use + stp q_p4_0, q_p4_1, [sp, #64] + + // Use p4_0 and p4_1 registers as temporaries for instruction reordering tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b + eor v_p1_1.16b, v_p4_0.16b, v_p1_1.16b eor v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b - - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b - eor v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b - eor v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b + eor v_p1_1.16b, v_p1_1.16b, v_p4_1.16b tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b + eor v_p1_3.16b, v_p4_0.16b, v_p1_3.16b eor v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b + eor v_p1_3.16b, v_p1_3.16b, v_p4_1.16b - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b - eor v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b - eor v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b + // Note: Not restoring p4 registers yet as they will be used in section 2 /* v_p2_x */ ldp q_gft_lo, q_gft_hi, [x_tmp] - prfm pldl3keep, [x_tmp, #32] add x_tmp, x_tmp, x_vec, lsl #2 + // Continue using p4_0 and p4_1 registers as temporaries for instruction reordering tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b 
+ eor v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b + eor v_p2_1.16b, v_p4_0.16b, v_p2_1.16b eor v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b - - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b - eor v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b - eor v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b + eor v_p2_1.16b, v_p2_1.16b, v_p4_1.16b tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + tbl v_p4_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_p4_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b + eor v_p2_3.16b, v_p4_0.16b, v_p2_3.16b eor v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b + eor v_p2_3.16b, v_p2_3.16b, v_p4_1.16b - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b - eor v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b - eor v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b + // Now restore p4 registers after using them for sections 1 and 2 + ldp q_p4_0, q_p4_1, [sp, #64] /* v_p3_x */ ldp q_gft_lo, q_gft_hi, [x_tmp] - prfm pldl3keep, [x_tmp, #32] add x_tmp, x_tmp, x_vec, lsl #2 + // Spill p1 registers to stack to free them for temporary use + stp q_p1_0, q_p1_1, [sp, #64] + + // Use p1_0 and p1_1 registers as temporaries for instruction reordering tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b + eor v_p3_1.16b, v_p1_0.16b, v_p3_1.16b eor v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b - - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b - eor v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b - eor v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b + eor v_p3_1.16b, v_p3_1.16b, v_p1_1.16b tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b + eor v_p3_3.16b, v_p1_0.16b, v_p3_3.16b eor v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b + eor v_p3_3.16b, v_p3_3.16b, v_p1_1.16b - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b - eor v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b - eor v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b + // Note: Not restoring p1 registers yet as they will be used in section 4 /* v_p4_x */ ldp q_gft_lo, q_gft_hi, [x_tmp] - prfm pldl3keep, [x_tmp, #32] add x_tmp, x_tmp, x_vec, lsl #2 + // Continue using p1_0 and p1_1 registers as temporaries for instruction reordering tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b + eor v_p4_1.16b, v_p1_0.16b, v_p4_1.16b eor v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b - - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b - eor v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b - eor v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b + eor v_p4_1.16b, v_p4_1.16b, v_p1_1.16b tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + tbl v_p1_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_p1_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b + eor v_p4_3.16b, v_p1_0.16b, v_p4_3.16b eor v_p4_2.16b, v_p4_2.16b, 
v_tmp_hi.16b + eor v_p4_3.16b, v_p4_3.16b, v_p1_1.16b - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b - eor v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b - eor v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b + // Now restore p1 registers after using them for sections 3 and 4 + ldp q_p1_0, q_p1_1, [sp, #64] /* v_p5_x */ ldp q_gft_lo, q_gft_hi, [x_tmp] - prfm pldl3keep, [x_tmp, #32] + // Spill p2 registers to stack to free them for temporary use + stp q_p2_0, q_p2_1, [sp, #64] + + // Use p2_0 and p2_1 registers as temporaries for instruction reordering tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b + tbl v_p2_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b + tbl v_p2_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b + eor v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b + eor v_p5_1.16b, v_p2_0.16b, v_p5_1.16b eor v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b - - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b - eor v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b - eor v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b + eor v_p5_1.16b, v_p5_1.16b, v_p2_1.16b tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b + tbl v_p2_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b + tbl v_p2_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b + eor v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b + eor v_p5_3.16b, v_p2_0.16b, v_p5_3.16b eor v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b + eor v_p5_3.16b, v_p5_3.16b, v_p2_1.16b - tbl v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b - tbl v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b - eor v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b - eor v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b + // Restore the p2 registers + ldp q_p2_0, q_p2_1, [sp, #64] cmp x_vec_i, x_vec blt .Lloop64_vects @@ -387,12 +406,12 @@ cdecl(gf_5vect_dot_prod_neon): ble .Lloop64 .Lloop64_end: - /* restore d8 ~ d15 */ + /* restore d8 ~ d15 and deallocate additional space for register spilling */ ldp d8, d9, [sp] ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] - add sp, sp, #64 + add sp, sp, #128 add x_len, x_len, #64 cmp x_pos, x_len From 006ecc845fba5e66c099dbf52829f820faf297b5 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Tue, 2 Sep 2025 00:10:18 +0000 Subject: [PATCH 3/5] aarch64: Optimize instruction scheduling in gf_3vect_dot_prod_neon Implement instruction scheduling optimization using strategic register reuse: - Load data registers just-in-time before processing each section - Reuse other data registers as temporaries for table lookups - Group table lookup instructions together for better parallelism - Group eor instructions together to reduce pipeline stalls - Remove unnecessary prefetch instructions This approach achieves instruction-level parallelism benefits without stack spilling overhead by cleverly reusing data registers that are not currently being processed as temporary storage. 
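
To make the register-reuse pattern concrete, here is the data_0/p2 path,
condensed from the hunk below (the other data words and outputs follow the
same shape). Before:

    tbl  v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
    tbl  v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
    eor  v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
    eor  v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

After, with v_data_1 and v_data_2 serving as scratch until they are loaded
for their own pass (in the actual code the eor instructions are further
interleaved with the p1/p3 accumulations):

    tbl  v_data_1.16b, {v_gft2_lo.16b}, v_tmp1.16b    // data_1 as temp
    tbl  v_data_2.16b, {v_gft2_hi.16b}, v_data_0.16b  // data_2 as temp
    eor  v_p2_0.16b, v_data_1.16b, v_p2_0.16b
    eor  v_p2_0.16b, v_p2_0.16b, v_data_2.16b
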
Signed-off-by: Jonathan Swinney --- erasure_code/aarch64/gf_3vect_dot_prod_neon.S | 106 +++++++++--------- 1 file changed, 52 insertions(+), 54 deletions(-) diff --git a/erasure_code/aarch64/gf_3vect_dot_prod_neon.S b/erasure_code/aarch64/gf_3vect_dot_prod_neon.S index cff34fc3..ba0f71da 100644 --- a/erasure_code/aarch64/gf_3vect_dot_prod_neon.S +++ b/erasure_code/aarch64/gf_3vect_dot_prod_neon.S @@ -170,95 +170,93 @@ cdecl(gf_3vect_dot_prod_neon): add x_vec_i, x_vec_i, #8 add x_ptr, x_ptr, x_pos - ldr q_data_0, [x_ptr], #16 - ldr q_data_1, [x_ptr], #16 - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 ldp q_gft3_lo, q_gft3_hi, [x_tbl3], #32 - ldr q_data_2, [x_ptr], #16 - ldr q_data_3, [x_ptr], #16 - prfm pldl1strm, [x_ptr] - prfm pldl1keep, [x_tbl1] - prfm pldl1keep, [x_tbl2] - prfm pldl1keep, [x_tbl3] - - /* data_0 */ + /* data_0 - load immediately before use */ + ldr q_data_0, [x_ptr], #16 and v_tmp1.16b, v_data_0.16b, v_mask0f.16b ushr v_data_0.16b, v_data_0.16b, #4 + /* Group all tbl instructions for data_0 using data_1, data_2, data_3 as temps */ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + tbl v_data_1.16b, {v_gft2_lo.16b}, v_tmp1.16b // data_1 as temp + tbl v_data_2.16b, {v_gft2_hi.16b}, v_data_0.16b // data_2 as temp + tbl v_data_3.16b, {v_gft3_lo.16b}, v_tmp1.16b // data_3 as temp + tbl v_data_0.16b, {v_gft3_hi.16b}, v_data_0.16b // data_0 as temp + + /* Group all eor instructions for data_0 */ eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p2_0.16b, v_data_1.16b, v_p2_0.16b + eor v_p3_0.16b, v_data_3.16b, v_p3_0.16b eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + eor v_p2_0.16b, v_p2_0.16b, v_data_2.16b + eor v_p3_0.16b, v_p3_0.16b, v_data_0.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b - eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b - eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b - eor v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b - eor v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b - - /* data_1 */ + /* data_1 - load immediately before use */ + ldr q_data_1, [x_ptr], #16 and v_tmp1.16b, v_data_1.16b, v_mask0f.16b ushr v_data_1.16b, v_data_1.16b, #4 + /* Group all tbl instructions for data_1 using data_0, data_2, data_3 as temps */ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b // data_0 as temp + tbl v_data_2.16b, {v_gft2_hi.16b}, v_data_1.16b // data_2 as temp + tbl v_data_3.16b, {v_gft3_lo.16b}, v_tmp1.16b // data_3 as temp + tbl v_data_1.16b, {v_gft3_hi.16b}, v_data_1.16b // data_1 as temp + + /* Group all eor instructions for data_1 */ eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p2_1.16b, v_data_0.16b, v_p2_1.16b + eor v_p3_1.16b, v_data_3.16b, v_p3_1.16b eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + eor v_p2_1.16b, v_p2_1.16b, v_data_2.16b + eor v_p3_1.16b, v_p3_1.16b, v_data_1.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b - eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b - eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b - eor v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b - eor v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b - - /* data_2 */ + /* data_2 - load immediately before use */ + ldr q_data_2, [x_ptr], #16 and v_tmp1.16b, 
v_data_2.16b, v_mask0f.16b ushr v_data_2.16b, v_data_2.16b, #4 + /* Group all tbl instructions for data_2 using data_0, data_1, data_3 as temps */ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b // data_0 as temp + tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_2.16b // data_1 as temp + tbl v_data_3.16b, {v_gft3_lo.16b}, v_tmp1.16b // data_3 as temp + tbl v_data_2.16b, {v_gft3_hi.16b}, v_data_2.16b // data_2 as temp + + /* Group all eor instructions for data_2 */ eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p2_2.16b, v_data_0.16b, v_p2_2.16b + eor v_p3_2.16b, v_data_3.16b, v_p3_2.16b eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + eor v_p2_2.16b, v_p2_2.16b, v_data_1.16b + eor v_p3_2.16b, v_p3_2.16b, v_data_2.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b - eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b - eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b - eor v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b - eor v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b - - /* data_3 */ + /* data_3 - load immediately before use */ + ldr q_data_3, [x_ptr], #16 and v_tmp1.16b, v_data_3.16b, v_mask0f.16b ushr v_data_3.16b, v_data_3.16b, #4 + /* Group all tbl instructions for data_3 using data_0, data_1, data_2 as temps */ tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b // data_0 as temp + tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_3.16b // data_1 as temp + tbl v_data_2.16b, {v_gft3_lo.16b}, v_tmp1.16b // data_2 as temp + tbl v_data_3.16b, {v_gft3_hi.16b}, v_data_3.16b // data_3 as temp + + /* Group all eor instructions for data_3 */ eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p2_3.16b, v_data_0.16b, v_p2_3.16b + eor v_p3_3.16b, v_data_2.16b, v_p3_3.16b eor v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b - eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b - eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b - eor v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b - eor v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b + eor v_p2_3.16b, v_p2_3.16b, v_data_1.16b + eor v_p3_3.16b, v_p3_3.16b, v_data_3.16b cmp x_vec_i, x_vec blt .Lloop64_vects From 6e844cbb9104522d0b14333711d06bfd3a2cff62 Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Tue, 2 Sep 2025 00:10:29 +0000 Subject: [PATCH 4/5] aarch64: Optimize instruction scheduling in gf_2vect_dot_prod_neon Implement comprehensive optimization using advanced register reuse and efficient memory access patterns: - Use ld1 4-register loads for maximum memory bandwidth utilization - Delay loading of data_4-7 until needed after processing data_0-3 - Reuse unloaded data registers as temporaries for table lookups - Group table lookup and eor instructions for better parallelism - Remove unnecessary prefetch instructions This approach achieves optimal instruction scheduling without stack spilling overhead by strategically timing data loads and reusing registers as temporaries when they are not needed. 
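
Condensed from the hunk below, the per-source load pattern changes from
four back-to-back ldp loads of data_0 through data_7 into two ld1
four-register loads, the second deferred until the registers it overwrites
are no longer needed as scratch:

    ld1  {v_data_0.16b, v_data_1.16b, v_data_2.16b, v_data_3.16b}, [x_ptr], #64
    /* process data_0 .. data_3; v_data_4 .. v_data_7 hold the gft2
       lookup results in the meantime */
    ld1  {v_data_4.16b, v_data_5.16b, v_data_6.16b, v_data_7.16b}, [x_ptr], #64
    /* process data_4 .. data_7, reusing the already-consumed
       v_data_0 .. v_data_3 as temporaries */
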
Signed-off-by: Jonathan Swinney --- erasure_code/aarch64/gf_2vect_dot_prod_neon.S | 110 +++++++++--------- 1 file changed, 53 insertions(+), 57 deletions(-) diff --git a/erasure_code/aarch64/gf_2vect_dot_prod_neon.S b/erasure_code/aarch64/gf_2vect_dot_prod_neon.S index 4ff7e7ce..cacb1120 100644 --- a/erasure_code/aarch64/gf_2vect_dot_prod_neon.S +++ b/erasure_code/aarch64/gf_2vect_dot_prod_neon.S @@ -186,128 +186,124 @@ cdecl(gf_2vect_dot_prod_neon): add x_vec_i, x_vec_i, #8 add x_ptr, x_ptr, x_pos - ldp q_data_0, q_data_1, [x_ptr], #32 - ldp q_data_2, q_data_3, [x_ptr], #32 + ld1 {v_data_0.16b, v_data_1.16b, v_data_2.16b, v_data_3.16b}, [x_ptr], #64 + ld1 {v_gft1_lo.16b, v_gft1_hi.16b}, [x_tbl1], #32 + ld1 {v_gft2_lo.16b, v_gft2_hi.16b}, [x_tbl2], #32 - ldp q_gft1_lo, q_gft1_hi, [x_tbl1], #32 - ldp q_gft2_lo, q_gft2_hi, [x_tbl2], #32 - ldp q_data_4, q_data_5, [x_ptr], #32 - ldp q_data_6, q_data_7, [x_ptr], #32 - prfm pldl1strm, [x_ptr] - prfm pldl1keep, [x_tbl1] - prfm pldl1keep, [x_tbl2] - - /* data_0 */ + /* data_0 - use data_4,5,6,7 as temporaries */ and v_tmp1.16b, v_data_0.16b, v_mask0f.16b ushr v_data_0.16b, v_data_0.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b + tbl v_data_4.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_5.16b, {v_gft2_hi.16b}, v_data_0.16b + eor v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b + eor v_p2_0.16b, v_data_4.16b, v_p2_0.16b eor v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b + eor v_p2_0.16b, v_p2_0.16b, v_data_5.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b - eor v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b - eor v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b - - /* data_1 */ + /* data_1 - use data_4,5,6,7 as temporaries */ and v_tmp1.16b, v_data_1.16b, v_mask0f.16b ushr v_data_1.16b, v_data_1.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b + tbl v_data_4.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_5.16b, {v_gft2_hi.16b}, v_data_1.16b + eor v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b + eor v_p2_1.16b, v_data_4.16b, v_p2_1.16b eor v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b + eor v_p2_1.16b, v_p2_1.16b, v_data_5.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b - eor v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b - eor v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b - - /* data_2 */ + /* data_2 - use data_6,7 as temporaries */ and v_tmp1.16b, v_data_2.16b, v_mask0f.16b ushr v_data_2.16b, v_data_2.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b + tbl v_data_6.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_7.16b, {v_gft2_hi.16b}, v_data_2.16b + eor v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b + eor v_p2_2.16b, v_data_6.16b, v_p2_2.16b eor v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b + eor v_p2_2.16b, v_p2_2.16b, v_data_7.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b - eor v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b - eor v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b - - /* data_3 */ + /* data_3 - use data_6,7 as temporaries */ and v_tmp1.16b, v_data_3.16b, v_mask0f.16b ushr v_data_3.16b, v_data_3.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b + tbl v_data_6.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_7.16b, {v_gft2_hi.16b}, v_data_3.16b + eor v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b + eor v_p2_3.16b, v_data_6.16b, v_p2_3.16b eor v_p1_3.16b, v_p1_3.16b, 
v_tmp1_hi.16b + eor v_p2_3.16b, v_p2_3.16b, v_data_7.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b - eor v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b - eor v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b + /* Load data_4-7 now that we need them */ + ld1 {v_data_4.16b, v_data_5.16b, v_data_6.16b, v_data_7.16b}, [x_ptr], #64 - /* data_4 */ + /* data_4 - use data_0,1 as temporaries */ and v_tmp1.16b, v_data_4.16b, v_mask0f.16b ushr v_data_4.16b, v_data_4.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b + tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_4.16b + eor v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b + eor v_p2_4.16b, v_data_0.16b, v_p2_4.16b eor v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b + eor v_p2_4.16b, v_p2_4.16b, v_data_1.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b - eor v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b - eor v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b - - /* data_5 */ + /* data_5 - use data_0,1 as temporaries */ and v_tmp1.16b, v_data_5.16b, v_mask0f.16b ushr v_data_5.16b, v_data_5.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b + tbl v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_1.16b, {v_gft2_hi.16b}, v_data_5.16b + eor v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b + eor v_p2_5.16b, v_data_0.16b, v_p2_5.16b eor v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b + eor v_p2_5.16b, v_p2_5.16b, v_data_1.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b - eor v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b - eor v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b - - /* data_6 */ + /* data_6 - use data_2,3 as temporaries */ and v_tmp1.16b, v_data_6.16b, v_mask0f.16b ushr v_data_6.16b, v_data_6.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b + tbl v_data_2.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_3.16b, {v_gft2_hi.16b}, v_data_6.16b + eor v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b + eor v_p2_6.16b, v_data_2.16b, v_p2_6.16b eor v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b + eor v_p2_6.16b, v_p2_6.16b, v_data_3.16b - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b - eor v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b - eor v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b - - /* data_7 */ + /* data_7 - use data_2,3 as temporaries */ and v_tmp1.16b, v_data_7.16b, v_mask0f.16b ushr v_data_7.16b, v_data_7.16b, #4 tbl v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b tbl v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b + tbl v_data_2.16b, {v_gft2_lo.16b}, v_tmp1.16b + tbl v_data_3.16b, {v_gft2_hi.16b}, v_data_7.16b + eor v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b + eor v_p2_7.16b, v_data_2.16b, v_p2_7.16b eor v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b - - tbl v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b - tbl v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b - eor v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b - eor v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b + eor v_p2_7.16b, v_p2_7.16b, v_data_3.16b cmp x_vec_i, x_vec blt .Lloop128_vects From 5f32ae7fd91b2efaf24c4744ddf690b9d488541d Mon Sep 17 00:00:00 2001 From: Jonathan Swinney Date: Tue, 2 Sep 2025 00:10:35 +0000 Subject: [PATCH 5/5] aarch64: Optimize instruction scheduling in gf_vect_dot_prod_neon Improve instruction-level parallelism through strategic instruction reordering: - Remove unnecessary prefetch instructions - Reorder dependent eor 
instruction pairs for better pipeline utilization - Group independent operations together to reduce pipeline stalls - Separate dependent instructions to allow parallel execution This optimization reduces pipeline stalls by allowing the CPU to execute more instructions in parallel, improving overall performance through better utilization of the instruction pipeline. Signed-off-by: Jonathan Swinney --- erasure_code/aarch64/gf_vect_dot_prod_neon.S | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/erasure_code/aarch64/gf_vect_dot_prod_neon.S b/erasure_code/aarch64/gf_vect_dot_prod_neon.S index 4d173628..27cd351f 100644 --- a/erasure_code/aarch64/gf_vect_dot_prod_neon.S +++ b/erasure_code/aarch64/gf_vect_dot_prod_neon.S @@ -169,9 +169,6 @@ cdecl(gf_vect_dot_prod_neon): ldp q_data_4, q_data_5, [x_ptr], #32 ldp q_data_6, q_data_7, [x_ptr] - prfm pldl1keep, [x_tbl1] - prfm pldl1strm, [x_ptr] - and v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b and v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b and v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b @@ -209,20 +206,20 @@ cdecl(gf_vect_dot_prod_neon): tbl v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b eor v_p0.16b, v_data_0_lo.16b, v_p0.16b - eor v_p0.16b, v_p0.16b, v_data_0_hi.16b eor v_p1.16b, v_data_1_lo.16b, v_p1.16b - eor v_p1.16b, v_p1.16b, v_data_1_hi.16b eor v_p2.16b, v_data_2_lo.16b, v_p2.16b - eor v_p2.16b, v_p2.16b, v_data_2_hi.16b eor v_p3.16b, v_data_3_lo.16b, v_p3.16b - eor v_p3.16b, v_p3.16b, v_data_3_hi.16b eor v_p4.16b, v_data_4_lo.16b, v_p4.16b - eor v_p4.16b, v_p4.16b, v_data_4_hi.16b eor v_p5.16b, v_data_5_lo.16b, v_p5.16b - eor v_p5.16b, v_p5.16b, v_data_5_hi.16b eor v_p6.16b, v_data_6_lo.16b, v_p6.16b - eor v_p6.16b, v_p6.16b, v_data_6_hi.16b eor v_p7.16b, v_data_7_lo.16b, v_p7.16b + eor v_p0.16b, v_p0.16b, v_data_0_hi.16b + eor v_p1.16b, v_p1.16b, v_data_1_hi.16b + eor v_p2.16b, v_p2.16b, v_data_2_hi.16b + eor v_p3.16b, v_p3.16b, v_data_3_hi.16b + eor v_p4.16b, v_p4.16b, v_data_4_hi.16b + eor v_p5.16b, v_p5.16b, v_data_5_hi.16b + eor v_p6.16b, v_p6.16b, v_data_6_hi.16b eor v_p7.16b, v_p7.16b, v_data_7_hi.16b cmp x_vec_i, x_vec