diff --git a/erasure_code/aarch64/gf_2vect_dot_prod_neon.S b/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
index 4ff7e7ce..cacb1120 100644
--- a/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
+++ b/erasure_code/aarch64/gf_2vect_dot_prod_neon.S
@@ -186,128 +186,124 @@ cdecl(gf_2vect_dot_prod_neon):
 	add	x_vec_i, x_vec_i, #8
 	add	x_ptr, x_ptr, x_pos
 
-	ldp	q_data_0, q_data_1, [x_ptr], #32
-	ldp	q_data_2, q_data_3, [x_ptr], #32
+	ld1	{v_data_0.16b, v_data_1.16b, v_data_2.16b, v_data_3.16b}, [x_ptr], #64
+	ld1	{v_gft1_lo.16b, v_gft1_hi.16b}, [x_tbl1], #32
+	ld1	{v_gft2_lo.16b, v_gft2_hi.16b}, [x_tbl2], #32
 
-	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
-	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
-	ldp	q_data_4, q_data_5, [x_ptr], #32
-	ldp	q_data_6, q_data_7, [x_ptr], #32
-	prfm	pldl1strm, [x_ptr]
-	prfm	pldl1keep, [x_tbl1]
-	prfm	pldl1keep, [x_tbl2]
-
-	/* data_0 */
+	/* data_0 - use data_4,5,6,7 as temporaries */
 	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
 	ushr	v_data_0.16b, v_data_0.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+	tbl	v_data_4.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_5.16b, {v_gft2_hi.16b}, v_data_0.16b
+
 	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+	eor	v_p2_0.16b, v_data_4.16b, v_p2_0.16b
 	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+	eor	v_p2_0.16b, v_p2_0.16b, v_data_5.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
-	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
-	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
-
-	/* data_1 */
+	/* data_1 - use data_4,5,6,7 as temporaries */
 	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
 	ushr	v_data_1.16b, v_data_1.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+	tbl	v_data_4.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_5.16b, {v_gft2_hi.16b}, v_data_1.16b
+
 	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+	eor	v_p2_1.16b, v_data_4.16b, v_p2_1.16b
 	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+	eor	v_p2_1.16b, v_p2_1.16b, v_data_5.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
-	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
-	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
-
-	/* data_2 */
+	/* data_2 - use data_6,7 as temporaries */
 	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
 	ushr	v_data_2.16b, v_data_2.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+	tbl	v_data_6.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_7.16b, {v_gft2_hi.16b}, v_data_2.16b
+
 	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+	eor	v_p2_2.16b, v_data_6.16b, v_p2_2.16b
 	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+	eor	v_p2_2.16b, v_p2_2.16b, v_data_7.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
-	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
-	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
-
-	/* data_3 */
+	/* data_3 - use data_6,7 as temporaries */
 	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
 	ushr	v_data_3.16b, v_data_3.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+	tbl	v_data_6.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_7.16b, {v_gft2_hi.16b}, v_data_3.16b
+
 	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+	eor	v_p2_3.16b, v_data_6.16b, v_p2_3.16b
 	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
+	eor	v_p2_3.16b, v_p2_3.16b, v_data_7.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
-	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
-	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+	/* Load data_4-7 now that we need them */
+	ld1	{v_data_4.16b, v_data_5.16b, v_data_6.16b, v_data_7.16b}, [x_ptr], #64
 
-	/* data_4 */
+	/* data_4 - use data_0,1 as temporaries */
 	and	v_tmp1.16b, v_data_4.16b, v_mask0f.16b
 	ushr	v_data_4.16b, v_data_4.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b
+	tbl	v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_1.16b, {v_gft2_hi.16b}, v_data_4.16b
+
 	eor	v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b
+	eor	v_p2_4.16b, v_data_0.16b, v_p2_4.16b
 	eor	v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b
+	eor	v_p2_4.16b, v_p2_4.16b, v_data_1.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b
-	eor	v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b
-	eor	v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b
-
-	/* data_5 */
+	/* data_5 - use data_0,1 as temporaries */
 	and	v_tmp1.16b, v_data_5.16b, v_mask0f.16b
 	ushr	v_data_5.16b, v_data_5.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b
+	tbl	v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_1.16b, {v_gft2_hi.16b}, v_data_5.16b
+
 	eor	v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b
+	eor	v_p2_5.16b, v_data_0.16b, v_p2_5.16b
 	eor	v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b
+	eor	v_p2_5.16b, v_p2_5.16b, v_data_1.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b
-	eor	v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b
-	eor	v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b
-
-	/* data_6 */
+	/* data_6 - use data_2,3 as temporaries */
 	and	v_tmp1.16b, v_data_6.16b, v_mask0f.16b
 	ushr	v_data_6.16b, v_data_6.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b
+	tbl	v_data_2.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_3.16b, {v_gft2_hi.16b}, v_data_6.16b
+
 	eor	v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b
+	eor	v_p2_6.16b, v_data_2.16b, v_p2_6.16b
 	eor	v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b
+	eor	v_p2_6.16b, v_p2_6.16b, v_data_3.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b
-	eor	v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b
-	eor	v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b
-
-	/* data_7 */
+	/* data_7 - use data_2,3 as temporaries */
 	and	v_tmp1.16b, v_data_7.16b, v_mask0f.16b
 	ushr	v_data_7.16b, v_data_7.16b, #4
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b
+	tbl	v_data_2.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_3.16b, {v_gft2_hi.16b}, v_data_7.16b
+
 	eor	v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b
+	eor	v_p2_7.16b, v_data_2.16b, v_p2_7.16b
 	eor	v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b
-	eor	v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b
-	eor	v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b
+	eor	v_p2_7.16b, v_p2_7.16b, v_data_3.16b
 
 	cmp	x_vec_i, x_vec
 	blt	.Lloop128_vects
diff --git a/erasure_code/aarch64/gf_3vect_dot_prod_neon.S b/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
index cff34fc3..ba0f71da 100644
--- a/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
+++ b/erasure_code/aarch64/gf_3vect_dot_prod_neon.S
@@ -170,95 +170,93 @@ cdecl(gf_3vect_dot_prod_neon):
 	add	x_vec_i, x_vec_i, #8
 	add	x_ptr, x_ptr, x_pos
 
-	ldr	q_data_0, [x_ptr], #16
-	ldr	q_data_1, [x_ptr], #16
-
 	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
 	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
 	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
-	ldr	q_data_2, [x_ptr], #16
-	ldr	q_data_3, [x_ptr], #16
-	prfm	pldl1strm, [x_ptr]
-	prfm	pldl1keep, [x_tbl1]
-	prfm	pldl1keep, [x_tbl2]
-	prfm	pldl1keep, [x_tbl3]
-
-	/* data_0 */
+	/* data_0 - load immediately before use */
+	ldr	q_data_0, [x_ptr], #16
 	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
 	ushr	v_data_0.16b, v_data_0.16b, #4
+	/* Group all tbl instructions for data_0 using data_1, data_2, data_3 as temps */
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+	tbl	v_data_1.16b, {v_gft2_lo.16b}, v_tmp1.16b	// data_1 as temp
+	tbl	v_data_2.16b, {v_gft2_hi.16b}, v_data_0.16b	// data_2 as temp
+	tbl	v_data_3.16b, {v_gft3_lo.16b}, v_tmp1.16b	// data_3 as temp
+	tbl	v_data_0.16b, {v_gft3_hi.16b}, v_data_0.16b	// data_0 as temp
+
+	/* Group all eor instructions for data_0 */
 	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+	eor	v_p2_0.16b, v_data_1.16b, v_p2_0.16b
+	eor	v_p3_0.16b, v_data_3.16b, v_p3_0.16b
 	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
+	eor	v_p2_0.16b, v_p2_0.16b, v_data_2.16b
+	eor	v_p3_0.16b, v_p3_0.16b, v_data_0.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
-	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
-	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
-	eor	v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
-	eor	v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
-
-	/* data_1 */
+	/* data_1 - load immediately before use */
+	ldr	q_data_1, [x_ptr], #16
 	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
 	ushr	v_data_1.16b, v_data_1.16b, #4
+	/* Group all tbl instructions for data_1 using data_0, data_2, data_3 as temps */
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+	tbl	v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b	// data_0 as temp
+	tbl	v_data_2.16b, {v_gft2_hi.16b}, v_data_1.16b	// data_2 as temp
+	tbl	v_data_3.16b, {v_gft3_lo.16b}, v_tmp1.16b	// data_3 as temp
+	tbl	v_data_1.16b, {v_gft3_hi.16b}, v_data_1.16b	// data_1 as temp
+
+	/* Group all eor instructions for data_1 */
 	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+	eor	v_p2_1.16b, v_data_0.16b, v_p2_1.16b
+	eor	v_p3_1.16b, v_data_3.16b, v_p3_1.16b
 	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
+	eor	v_p2_1.16b, v_p2_1.16b, v_data_2.16b
+	eor	v_p3_1.16b, v_p3_1.16b, v_data_1.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
-	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
-	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
-	eor	v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
-	eor	v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
-
-	/* data_2 */
+	/* data_2 - load immediately before use */
+	ldr	q_data_2, [x_ptr], #16
 	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
 	ushr	v_data_2.16b, v_data_2.16b, #4
+	/* Group all tbl instructions for data_2 using data_0, data_1, data_3 as temps */
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+	tbl	v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b	// data_0 as temp
+	tbl	v_data_1.16b, {v_gft2_hi.16b}, v_data_2.16b	// data_1 as temp
+	tbl	v_data_3.16b, {v_gft3_lo.16b}, v_tmp1.16b	// data_3 as temp
+	tbl	v_data_2.16b, {v_gft3_hi.16b}, v_data_2.16b	// data_2 as temp
+
+	/* Group all eor instructions for data_2 */
 	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+	eor	v_p2_2.16b, v_data_0.16b, v_p2_2.16b
+	eor	v_p3_2.16b, v_data_3.16b, v_p3_2.16b
 	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
+	eor	v_p2_2.16b, v_p2_2.16b, v_data_1.16b
+	eor	v_p3_2.16b, v_p3_2.16b, v_data_2.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
-	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
-	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
-	eor	v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
-	eor	v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
-
-	/* data_3 */
+	/* data_3 - load immediately before use */
+	ldr	q_data_3, [x_ptr], #16
 	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
 	ushr	v_data_3.16b, v_data_3.16b, #4
+	/* Group all tbl instructions for data_3 using data_0, data_1, data_2 as temps */
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+	tbl	v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b	// data_0 as temp
+	tbl	v_data_1.16b, {v_gft2_hi.16b}, v_data_3.16b	// data_1 as temp
+	tbl	v_data_2.16b, {v_gft3_lo.16b}, v_tmp1.16b	// data_2 as temp
+	tbl	v_data_3.16b, {v_gft3_hi.16b}, v_data_3.16b	// data_3 as temp
+
+	/* Group all eor instructions for data_3 */
 	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+	eor	v_p2_3.16b, v_data_0.16b, v_p2_3.16b
+	eor	v_p3_3.16b, v_data_2.16b, v_p3_3.16b
 	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
-	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
-	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
-	eor	v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
-	eor	v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
+	eor	v_p2_3.16b, v_p2_3.16b, v_data_1.16b
+	eor	v_p3_3.16b, v_p3_3.16b, v_data_3.16b
 
 	cmp	x_vec_i, x_vec
 	blt	.Lloop64_vects
diff --git a/erasure_code/aarch64/gf_4vect_dot_prod_neon.S b/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
index 6204102f..4d38295d 100644
--- a/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
+++ b/erasure_code/aarch64/gf_4vect_dot_prod_neon.S
@@ -186,30 +186,17 @@ cdecl(gf_4vect_dot_prod_neon):
 	add	x_tbl3, x_tbl2, x_vec, lsl #2
 	add	x_tbl4, x_tbl3, x_vec, lsl #2
 	mov	x_vec_i, #0
-	prfm	pldl1keep, [x_tbl1]
-	prfm	pldl1keep, [x_tbl2]
-	prfm	pldl1keep, [x_tbl3]
-	prfm	pldl1keep, [x_tbl4]
 
 .Lloop64_vects:
 	ldr	x_ptr, [x_src, x_vec_i]
 	add	x_vec_i, x_vec_i, #8
 	add	x_ptr, x_ptr, x_pos
 
-	ldr	q_data_0, [x_ptr], #16
-	ldr	q_data_1, [x_ptr], #16
+	ldp	q_data_0, q_data_1, [x_ptr], #32
 	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
 	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
 	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
 	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
-	ldr	q_data_2, [x_ptr], #16
-	ldr	q_data_3, [x_ptr], #16
-
-	prfm	pldl1strm, [x_ptr]
-	prfm	pldl1keep, [x_tbl1]
-	prfm	pldl1keep, [x_tbl2]
-	prfm	pldl1keep, [x_tbl3]
-	prfm	pldl1keep, [x_tbl4]
 
 	/* data_0 */
 	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
@@ -217,23 +204,23 @@ cdecl(gf_4vect_dot_prod_neon):
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
+	tbl	v_data_3.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_2.16b, {v_gft2_hi.16b}, v_data_0.16b
+
 	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
+	eor	v_p2_0.16b, v_data_3.16b, v_p2_0.16b
 	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
-	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
-	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b
+	eor	v_p2_0.16b, v_p2_0.16b, v_data_2.16b
 
 	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
+	tbl	v_data_2.16b, {v_gft4_lo.16b}, v_tmp1.16b
+	tbl	v_data_3.16b, {v_gft4_hi.16b}, v_data_0.16b
+
 	eor	v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
 	eor	v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
-	eor	v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
-	eor	v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b
+	eor	v_p4_0.16b, v_data_2.16b, v_p4_0.16b
+	eor	v_p4_0.16b, v_p4_0.16b, v_data_3.16b
 
 	/* data_1 */
 	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
@@ -241,23 +228,25 @@ cdecl(gf_4vect_dot_prod_neon):
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
+	tbl	v_data_2.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_3.16b, {v_gft2_hi.16b}, v_data_1.16b
+
 	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
+	eor	v_p2_1.16b, v_data_2.16b, v_p2_1.16b
 	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
-	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
-	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b
+	eor	v_p2_1.16b, v_p2_1.16b, v_data_3.16b
 
 	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
+	tbl	v_data_2.16b, {v_gft4_lo.16b}, v_tmp1.16b
+	tbl	v_data_3.16b, {v_gft4_hi.16b}, v_data_1.16b
+
 	eor	v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
 	eor	v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b
+	eor	v_p4_1.16b, v_data_2.16b, v_p4_1.16b
+	eor	v_p4_1.16b, v_p4_1.16b, v_data_3.16b
 
-	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
-	eor	v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
-	eor	v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b
+	ldp	q_data_2, q_data_3, [x_ptr], #32
 
 	/* data_2 */
 	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
@@ -265,23 +254,23 @@ cdecl(gf_4vect_dot_prod_neon):
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
+	tbl	v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_1.16b, {v_gft2_hi.16b}, v_data_2.16b
+
 	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
+	eor	v_p2_2.16b, v_data_0.16b, v_p2_2.16b
 	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
-	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
-	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b
+	eor	v_p2_2.16b, v_p2_2.16b, v_data_1.16b
 
 	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
+	tbl	v_data_0.16b, {v_gft4_lo.16b}, v_tmp1.16b
+	tbl	v_data_1.16b, {v_gft4_hi.16b}, v_data_2.16b
+
 	eor	v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
 	eor	v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
-	eor	v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
-	eor	v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b
+	eor	v_p4_2.16b, v_data_0.16b, v_p4_2.16b
+	eor	v_p4_2.16b, v_p4_2.16b, v_data_1.16b
 
 	/* data_3 */
 	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
@@ -289,23 +278,23 @@ cdecl(gf_4vect_dot_prod_neon):
 	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
+	tbl	v_data_0.16b, {v_gft2_lo.16b}, v_tmp1.16b
+	tbl	v_data_1.16b, {v_gft2_hi.16b}, v_data_3.16b
+
 	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
+	eor	v_p2_3.16b, v_data_0.16b, v_p2_3.16b
 	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
-	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
-	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b
+	eor	v_p2_3.16b, v_p2_3.16b, v_data_1.16b
 
 	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
 	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
+	tbl	v_data_0.16b, {v_gft4_lo.16b}, v_tmp1.16b
+	tbl	v_data_1.16b, {v_gft4_hi.16b}, v_data_3.16b
+
 	eor	v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
 	eor	v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b
-
-	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
-	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
-	eor	v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
-	eor	v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b
+	eor	v_p4_3.16b, v_data_0.16b, v_p4_3.16b
+	eor	v_p4_3.16b, v_p4_3.16b, v_data_1.16b
 
 	cmp	x_vec_i, x_vec
 	blt	.Lloop64_vects
diff --git a/erasure_code/aarch64/gf_5vect_dot_prod_neon.S b/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
index 13166665..14485257 100644
--- a/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
+++ b/erasure_code/aarch64/gf_5vect_dot_prod_neon.S
@@ -180,12 +180,13 @@ cdecl(gf_5vect_dot_prod_neon):
 	cmp	x_len, #64
 	blt	.Lloop16_init
 
-	/* save d8 ~ d15 to stack */
-	sub	sp, sp, #64
+	/* save d8 ~ d15 to stack and allocate additional space for register spilling */
+	sub	sp, sp, #128
 	stp	d8, d9, [sp]
 	stp	d10, d11, [sp, #16]
 	stp	d12, d13, [sp, #32]
 	stp	d14, d15, [sp, #48]
+	/* Space from sp+64 to sp+128 is reserved for register spilling */
 
 	sub	x_len, x_len, #64
@@ -216,11 +217,7 @@ cdecl(gf_5vect_dot_prod_neon):
 	ldr	x_ptr, [x_src, x_vec_i]
 	add	x_ptr, x_ptr, x_pos
 
-	ldr	q_data_0, [x_ptr], #16
-	ldr	q_data_1, [x_ptr], #16
-	ldr	q_data_2, [x_ptr], #16
-	ldr	q_data_3, [x_ptr], #16
-	prfm	pldl2keep, [x_ptr]
+	ld1	{ v_data_0.16b, v_data_1.16b, v_data_2.16b, v_data_3.16b }, [x_ptr], #64
 
 	movi	v_mask0f.16b, #0x0f
 	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
@@ -236,127 +233,149 @@ cdecl(gf_5vect_dot_prod_neon):
 	add	x_tmp, x_tbl, x_vec_i, lsl #2
 	add	x_vec_i, x_vec_i, #8
 	ldp	q_gft_lo, q_gft_hi, [x_tmp]
-	prfm	pldl3keep, [x_tmp, #32]
 	add	x_tmp, x_tmp, x_vec, lsl #2
 
+	// Spill p4 registers to stack to free them for temporary use
+	stp	q_p4_0, q_p4_1, [sp, #64]
+
+	// Use p4_0 and p4_1 registers as temporaries for instruction reordering
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+	tbl	v_p4_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+	tbl	v_p4_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+
 	eor	v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
+	eor	v_p1_1.16b, v_p4_0.16b, v_p1_1.16b
 	eor	v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b
-
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
-	eor	v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
-	eor	v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b
+	eor	v_p1_1.16b, v_p1_1.16b, v_p4_1.16b
 
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+	tbl	v_p4_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+	tbl	v_p4_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+
 	eor	v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
+	eor	v_p1_3.16b, v_p4_0.16b, v_p1_3.16b
 	eor	v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b
+	eor	v_p1_3.16b, v_p1_3.16b, v_p4_1.16b
 
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
-	eor	v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
-	eor	v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b
+	// Note: Not restoring p4 registers yet as they will be used in section 2
 
 	/* v_p2_x */
 	ldp	q_gft_lo, q_gft_hi, [x_tmp]
-	prfm	pldl3keep, [x_tmp, #32]
 	add	x_tmp, x_tmp, x_vec, lsl #2
 
+	// Continue using p4_0 and p4_1 registers as temporaries for instruction reordering
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+	tbl	v_p4_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+	tbl	v_p4_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+
 	eor	v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
+	eor	v_p2_1.16b, v_p4_0.16b, v_p2_1.16b
 	eor	v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b
-
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
-	eor	v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
-	eor	v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b
+	eor	v_p2_1.16b, v_p2_1.16b, v_p4_1.16b
 
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+	tbl	v_p4_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+	tbl	v_p4_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+
 	eor	v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
+	eor	v_p2_3.16b, v_p4_0.16b, v_p2_3.16b
 	eor	v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b
+	eor	v_p2_3.16b, v_p2_3.16b, v_p4_1.16b
 
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
-	eor	v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
-	eor	v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b
+	// Now restore p4 registers after using them for sections 1 and 2
+	ldp	q_p4_0, q_p4_1, [sp, #64]
 
 	/* v_p3_x */
 	ldp	q_gft_lo, q_gft_hi, [x_tmp]
-	prfm	pldl3keep, [x_tmp, #32]
 	add	x_tmp, x_tmp, x_vec, lsl #2
 
+	// Spill p1 registers to stack to free them for temporary use
+	stp	q_p1_0, q_p1_1, [sp, #64]
+
+	// Use p1_0 and p1_1 registers as temporaries for instruction reordering
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+	tbl	v_p1_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+	tbl	v_p1_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+
 	eor	v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
+	eor	v_p3_1.16b, v_p1_0.16b, v_p3_1.16b
 	eor	v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b
-
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
-	eor	v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
-	eor	v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b
+	eor	v_p3_1.16b, v_p3_1.16b, v_p1_1.16b
 
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+	tbl	v_p1_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+	tbl	v_p1_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+
 	eor	v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
+	eor	v_p3_3.16b, v_p1_0.16b, v_p3_3.16b
 	eor	v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b
+	eor	v_p3_3.16b, v_p3_3.16b, v_p1_1.16b
 
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
-	eor	v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
-	eor	v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b
+	// Note: Not restoring p1 registers yet as they will be used in section 4
 
 	/* v_p4_x */
 	ldp	q_gft_lo, q_gft_hi, [x_tmp]
-	prfm	pldl3keep, [x_tmp, #32]
 	add	x_tmp, x_tmp, x_vec, lsl #2
 
+	// Continue using p1_0 and p1_1 registers as temporaries for instruction reordering
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+	tbl	v_p1_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+	tbl	v_p1_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+
 	eor	v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
+	eor	v_p4_1.16b, v_p1_0.16b, v_p4_1.16b
 	eor	v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b
-
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
-	eor	v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
-	eor	v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b
+	eor	v_p4_1.16b, v_p4_1.16b, v_p1_1.16b
 
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+	tbl	v_p1_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+	tbl	v_p1_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+
 	eor	v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
+	eor	v_p4_3.16b, v_p1_0.16b, v_p4_3.16b
 	eor	v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b
+	eor	v_p4_3.16b, v_p4_3.16b, v_p1_1.16b
 
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
-	eor	v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
-	eor	v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b
+	// Now restore p1 registers after using them for sections 3 and 4
+	ldp	q_p1_0, q_p1_1, [sp, #64]
 
 	/* v_p5_x */
 	ldp	q_gft_lo, q_gft_hi, [x_tmp]
-	prfm	pldl3keep, [x_tmp, #32]
 
+	// Spill p2 registers to stack to free them for temporary use
+	stp	q_p2_0, q_p2_1, [sp, #64]
+
+	// Use p2_0 and p2_1 registers as temporaries for instruction reordering
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
+	tbl	v_p2_0.16b, {v_gft_lo.16b}, v_data_1_lo.16b
+	tbl	v_p2_1.16b, {v_gft_hi.16b}, v_data_1_hi.16b
+
 	eor	v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
+	eor	v_p5_1.16b, v_p2_0.16b, v_p5_1.16b
 	eor	v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b
-
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
-	eor	v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
-	eor	v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b
+	eor	v_p5_1.16b, v_p5_1.16b, v_p2_1.16b
 
 	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
 	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
+	tbl	v_p2_0.16b, {v_gft_lo.16b}, v_data_3_lo.16b
+	tbl	v_p2_1.16b, {v_gft_hi.16b}, v_data_3_hi.16b
+
 	eor	v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
+	eor	v_p5_3.16b, v_p2_0.16b, v_p5_3.16b
 	eor	v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b
+	eor	v_p5_3.16b, v_p5_3.16b, v_p2_1.16b
 
-	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
-	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
-	eor	v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
-	eor	v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b
+	// Restore the p2 registers
+	ldp	q_p2_0, q_p2_1, [sp, #64]
 
 	cmp	x_vec_i, x_vec
 	blt	.Lloop64_vects
@@ -387,12 +406,12 @@ cdecl(gf_5vect_dot_prod_neon):
 	ble	.Lloop64
 
 .Lloop64_end:
-	/* restore d8 ~ d15 */
+	/* restore d8 ~ d15 and deallocate additional space for register spilling */
 	ldp	d8, d9, [sp]
 	ldp	d10, d11, [sp, #16]
 	ldp	d12, d13, [sp, #32]
 	ldp	d14, d15, [sp, #48]
-	add	sp, sp, #64
+	add	sp, sp, #128
 
 	add	x_len, x_len, #64
 	cmp	x_pos, x_len
diff --git a/erasure_code/aarch64/gf_vect_dot_prod_neon.S b/erasure_code/aarch64/gf_vect_dot_prod_neon.S
index 4d173628..27cd351f 100644
--- a/erasure_code/aarch64/gf_vect_dot_prod_neon.S
+++ b/erasure_code/aarch64/gf_vect_dot_prod_neon.S
@@ -169,9 +169,6 @@ cdecl(gf_vect_dot_prod_neon):
 	ldp	q_data_4, q_data_5, [x_ptr], #32
 	ldp	q_data_6, q_data_7, [x_ptr]
 
-	prfm	pldl1keep, [x_tbl1]
-	prfm	pldl1strm, [x_ptr]
-
 	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
 	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
 	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
@@ -209,20 +206,20 @@ cdecl(gf_vect_dot_prod_neon):
 	tbl	v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
 
 	eor	v_p0.16b, v_data_0_lo.16b, v_p0.16b
-	eor	v_p0.16b, v_p0.16b, v_data_0_hi.16b
 	eor	v_p1.16b, v_data_1_lo.16b, v_p1.16b
-	eor	v_p1.16b, v_p1.16b, v_data_1_hi.16b
 	eor	v_p2.16b, v_data_2_lo.16b, v_p2.16b
-	eor	v_p2.16b, v_p2.16b, v_data_2_hi.16b
 	eor	v_p3.16b, v_data_3_lo.16b, v_p3.16b
-	eor	v_p3.16b, v_p3.16b, v_data_3_hi.16b
 	eor	v_p4.16b, v_data_4_lo.16b, v_p4.16b
-	eor	v_p4.16b, v_p4.16b, v_data_4_hi.16b
 	eor	v_p5.16b, v_data_5_lo.16b, v_p5.16b
-	eor	v_p5.16b, v_p5.16b, v_data_5_hi.16b
 	eor	v_p6.16b, v_data_6_lo.16b, v_p6.16b
-	eor	v_p6.16b, v_p6.16b, v_data_6_hi.16b
 	eor	v_p7.16b, v_data_7_lo.16b, v_p7.16b
+	eor	v_p0.16b, v_p0.16b, v_data_0_hi.16b
+	eor	v_p1.16b, v_p1.16b, v_data_1_hi.16b
+	eor	v_p2.16b, v_p2.16b, v_data_2_hi.16b
+	eor	v_p3.16b, v_p3.16b, v_data_3_hi.16b
+	eor	v_p4.16b, v_p4.16b, v_data_4_hi.16b
+	eor	v_p5.16b, v_p5.16b, v_data_5_hi.16b
+	eor	v_p6.16b, v_p6.16b, v_data_6_hi.16b
 	eor	v_p7.16b, v_p7.16b, v_data_7_hi.16b
 
 	cmp	x_vec_i, x_vec