diff --git a/configure.ac b/configure.ac index 47f34278..a425c607 100644 --- a/configure.ac +++ b/configure.ac @@ -71,8 +71,8 @@ case "${CPU}" in AM_CONDITIONAL([HAVE_RVV], [false]) rvv=no] ) if test "x$rvv" = "xyes"; then - CFLAGS+=" -march=rv64gcv" - CCASFLAGS+=" -march=rv64gcv" + CFLAGS+=" -march=rv64gcv" + CCASFLAGS+=" -march=rv64gcv" fi AC_MSG_RESULT([$rvv]) ;; diff --git a/erasure_code/riscv64/gf_2vect_dot_prod_rvv.S b/erasure_code/riscv64/gf_2vect_dot_prod_rvv.S index 1f015952..338b70c8 100644 --- a/erasure_code/riscv64/gf_2vect_dot_prod_rvv.S +++ b/erasure_code/riscv64/gf_2vect_dot_prod_rvv.S @@ -74,9 +74,10 @@ gf_2vect_dot_prod_rvv: li t6, 16 blt x_len, t6, .return_fail - vsetvli a5, x0, e8, m1 /* Set vector length to maximum */ - + vsetvli a5, x0, e8, m1, ta, ma /* Set vector length to maximum */ li x_pos, 0 + slli x_vec, x_vec, 3 + ld x_dest1, 0(x_dest) ld x_dest2, 8(x_dest) @@ -92,16 +93,13 @@ gf_2vect_dot_prod_rvv: /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ mv x_tbl1, x_tbl /* reset x_tbl1 */ - slli t6, x_vec, 5 + slli t6, x_vec, 2 add x_tbl2, x_tbl1, t6 /* reset x_tbl2 */ /* Loop 2: x_vec, number of source vectors (ie. data blocks) */ .Llooprvv_vl_vects: /* load src data */ - slli a6, x_vec_i, 3 - add a6,x_src,a6 - ld x_ptr, 0(a6) - add x_ptr,x_ptr,x_pos + add x_ptr, x_ptr, x_pos vle8.v v_src, (x_ptr) /* load from: src base + pos offset */ /* split 4-bit lo; 4-bit hi */ @@ -120,6 +118,11 @@ gf_2vect_dot_prod_rvv: vle8.v v_gft2_hi, (x_tbl2) addi x_tbl2, x_tbl2, 16 + /* calc for next */ + addi x_vec_i, x_vec_i, 8 /* move x_vec_i to next */ + add a6, x_src, x_vec_i + ld x_ptr, 0(a6) + /* dest 1 */ /* table indexing, ie. gf(2^8) multiplication */ vrgather.vv v26, v_gft1_lo, v_src_lo @@ -134,16 +137,14 @@ gf_2vect_dot_prod_rvv: vxor.vv v_dest2, v_dest2, v26 vxor.vv v_dest2, v_dest2, v27 - /* calc for next */ - addi x_vec_i, x_vec_i, 1 /* move x_vec_i to next */ blt x_vec_i, x_vec, .Llooprvv_vl_vects /* end of Loop 2 */ /* store dest data */ vse8.v v_dest1, (x_dest1) vse8.v v_dest2, (x_dest2) - add x_dest1,x_dest1,a5 - add x_dest2,x_dest2,a5 + add x_dest1, x_dest1, a5 + add x_dest2, x_dest2, a5 /* increment one vector length */ add x_pos, x_pos, a5 diff --git a/erasure_code/riscv64/gf_2vect_mad_rvv.S b/erasure_code/riscv64/gf_2vect_mad_rvv.S index fb90f3a9..fd569e57 100644 --- a/erasure_code/riscv64/gf_2vect_mad_rvv.S +++ b/erasure_code/riscv64/gf_2vect_mad_rvv.S @@ -71,7 +71,7 @@ gf_2vect_mad_rvv: li t3, 16 blt x_len, t3, .return_fail - vsetvli t4, x0, e8, m1 + vsetvli t4, x0, e8, m1, ta, ma /* load table 1 */ slli t3, x_vec_i, 5 diff --git a/erasure_code/riscv64/gf_3vect_dot_prod_rvv.S b/erasure_code/riscv64/gf_3vect_dot_prod_rvv.S index c617ab3e..6975c087 100644 --- a/erasure_code/riscv64/gf_3vect_dot_prod_rvv.S +++ b/erasure_code/riscv64/gf_3vect_dot_prod_rvv.S @@ -57,7 +57,6 @@ #define x_dest3 a5 #define t_offset a6 - /* vectors */ #define v_src v1 #define v_src_lo v2 @@ -84,10 +83,11 @@ gf_3vect_dot_prod_rvv: sd s0, 0(sp) sd s1, 8(sp) - vsetvli a7, x0, e8, m1 /* Set vector length to maximum */ - + vsetvli a7, x0, e8, m1, ta, ma /* Set vector length to maximum */ li x_pos, 0 - slli t_offset, x_vec, 5 + slli x_vec, x_vec, 3 + + slli t_offset, x_vec, 2 ld x_dest1, 0(x_dest) ld x_dest2, 8(x_dest) ld x_dest3, 16(x_dest) @@ -101,20 +101,19 @@ gf_3vect_dot_prod_rvv: vmv.v.i v_dest2, 0 vmv.v.i v_dest3, 0 + /* Loop 2: x_vec, number of source vectors (ie. data blocks) */ + li x_vec_i, 0 + /* load source pointer */ + ld x_ptr, 0(x_src) + /* Reset table pointers */ mv x_tbl1, x_tbl add x_tbl2, x_tbl1, t_offset add x_tbl3, x_tbl2, t_offset - /* Loop 2: x_vec, number of source vectors (ie. data blocks) */ - li x_vec_i, 0 .Lloop_rvv_vl_vects: - /* Load source data */ - slli t0, x_vec_i, 3 - add t0,x_src,t0 - ld x_ptr, 0(t0) - add x_ptr,x_ptr,x_pos - + /* load source data */ + add x_ptr, x_ptr, x_pos vle8.v v_src, (x_ptr) /* Split 4-bit lo; 4-bit hi */ @@ -131,6 +130,10 @@ gf_3vect_dot_prod_rvv: vle8.v v_gft2_hi, (x_tbl2) addi x_tbl2, x_tbl2, 16 + /* Move to next source vector */ + addi x_vec_i, x_vec_i, 8 + add t0, x_src, x_vec_i + ld x_ptr, 0(t0) /* Load next gf_table's */ vle8.v v_gft3_lo, (x_tbl3) @@ -138,7 +141,7 @@ gf_3vect_dot_prod_rvv: vle8.v v_gft3_hi, (x_tbl3) addi x_tbl3, x_tbl3, 16 -/* dest 1 */ + /* dest 1 */ vrgather.vv v26, v_gft1_lo, v_src_lo vrgather.vv v27, v_gft1_hi, v_src_hi vxor.vv v_dest1, v_dest1, v26 @@ -156,9 +159,6 @@ gf_3vect_dot_prod_rvv: vxor.vv v_dest3, v_dest3, v26 vxor.vv v_dest3, v_dest3, v27 - /* Move to next source vector */ - addi x_vec_i, x_vec_i, 1 - /* Check if we have processed all vectors */ blt x_vec_i, x_vec, .Lloop_rvv_vl_vects @@ -166,9 +166,9 @@ gf_3vect_dot_prod_rvv: vse8.v v_dest1, (x_dest1) vse8.v v_dest2, (x_dest2) vse8.v v_dest3, (x_dest3) - add x_dest1,x_dest1, a7 - add x_dest2,x_dest2, a7 - add x_dest3,x_dest3, a7 + add x_dest1, x_dest1, a7 + add x_dest2, x_dest2, a7 + add x_dest3, x_dest3, a7 add x_pos, x_pos, a7 j .Lloop_rvv_vl diff --git a/erasure_code/riscv64/gf_3vect_mad_rvv.S b/erasure_code/riscv64/gf_3vect_mad_rvv.S index 8d33471c..920a0217 100644 --- a/erasure_code/riscv64/gf_3vect_mad_rvv.S +++ b/erasure_code/riscv64/gf_3vect_mad_rvv.S @@ -75,7 +75,7 @@ gf_3vect_mad_rvv: li t4, 16 blt x_len, t4, .return_fail - vsetvli t5, x0, e8, m1 + vsetvli t5, x0, e8, m1, ta, ma /* Load table 1 */ slli t4, x_vec_i, 5 diff --git a/erasure_code/riscv64/gf_4vect_dot_prod_rvv.S b/erasure_code/riscv64/gf_4vect_dot_prod_rvv.S index ace146dc..0a627e56 100644 --- a/erasure_code/riscv64/gf_4vect_dot_prod_rvv.S +++ b/erasure_code/riscv64/gf_4vect_dot_prod_rvv.S @@ -91,10 +91,12 @@ gf_4vect_dot_prod_rvv: sd s2, 16(sp) sd s3, 24(sp) - vsetvli t0, x0, e8, m1 /* Set vector length to maximum */ - + vsetvli t0, x0, e8, m1, ta, ma li x_pos, 0 - slli t_offset, x_vec, 5 + + slli x_vec, x_vec, 3 + slli t_offset, x_vec, 2 + ld x_dest1, 0(x_dest) ld x_dest2, 8(x_dest) ld x_dest3, 16(x_dest) @@ -111,20 +113,21 @@ gf_4vect_dot_prod_rvv: vmv.v.i v_dest3, 0 vmv.v.i v_dest4, 0 + /* x_vec, number of source vectors (ie. data blocks) */ + li x_vec_i, 0 + + /* load source pointer */ + ld x_ptr, 0(x_src) + /* Reset table pointers */ mv x_tbl1, x_tbl add x_tbl2, x_tbl1, t_offset add x_tbl3, x_tbl2, t_offset add x_tbl4, x_tbl3, t_offset - /* Loop 2: x_vec, number of source vectors (ie. data blocks) */ - li x_vec_i, 0 .Lloop_rvv_vl_vects: /* Load source data */ - slli a6, x_vec_i, 3 - add a6,x_src,a6 - ld x_ptr, 0(a6) - add x_ptr,x_ptr,x_pos + add x_ptr, x_ptr, x_pos vle8.v v_src, (x_ptr) @@ -142,6 +145,10 @@ gf_4vect_dot_prod_rvv: vle8.v v_gft2_hi, (x_tbl2) addi x_tbl2, x_tbl2, 16 + /* Move to next source vector */ + addi x_vec_i, x_vec_i, 8 + add a6, x_src, x_vec_i + ld x_ptr, 0(a6) /* Load next gf_table's */ vle8.v v_gft3_lo, (x_tbl3) @@ -178,9 +185,6 @@ gf_4vect_dot_prod_rvv: vxor.vv v_dest4, v_dest4, v26 vxor.vv v_dest4, v_dest4, v27 - /* Move to next source vector */ - addi x_vec_i, x_vec_i, 1 - /* Check if we have processed all vectors */ blt x_vec_i, x_vec, .Lloop_rvv_vl_vects @@ -189,16 +193,16 @@ gf_4vect_dot_prod_rvv: vse8.v v_dest2, (x_dest2) vse8.v v_dest3, (x_dest3) vse8.v v_dest4, (x_dest4) - add x_dest1,x_dest1, t0 - add x_dest2,x_dest2, t0 - add x_dest3,x_dest3, t0 - add x_dest4,x_dest4, t0 + add x_dest1, x_dest1, t0 + add x_dest2, x_dest2, t0 + add x_dest3, x_dest3, t0 + add x_dest4, x_dest4, t0 /* Increment position */ add x_pos, x_pos, t0 j .Lloop_rvv_vl .return_pass: -/* restore callee-saved registers */ + /* restore callee-saved registers */ ld s0, 0(sp) ld s1, 8(sp) ld s2, 16(sp) diff --git a/erasure_code/riscv64/gf_4vect_mad_rvv.S b/erasure_code/riscv64/gf_4vect_mad_rvv.S index 48b35eae..3c98bc7f 100644 --- a/erasure_code/riscv64/gf_4vect_mad_rvv.S +++ b/erasure_code/riscv64/gf_4vect_mad_rvv.S @@ -79,7 +79,7 @@ gf_4vect_mad_rvv: li t5, 16 blt x_len, t5, .return_fail - vsetvli t6, x0, e8, m1 + vsetvli t6, x0, e8, m1, ta, ma /* load table 1 */ slli t5, x_vec_i, 5 diff --git a/erasure_code/riscv64/gf_5vect_dot_prod_rvv.S b/erasure_code/riscv64/gf_5vect_dot_prod_rvv.S index 0b5cf3ee..629ef314 100644 --- a/erasure_code/riscv64/gf_5vect_dot_prod_rvv.S +++ b/erasure_code/riscv64/gf_5vect_dot_prod_rvv.S @@ -95,8 +95,7 @@ gf_5vect_dot_prod_rvv: sd s4, 32(sp) sd s5, 40(sp) - vsetvli a5, x0, e8, m1 - + vsetvli a5, x0, e8, m1, ta, ma /* Initialize position */ li x_pos, 0 @@ -131,7 +130,7 @@ gf_5vect_dot_prod_rvv: .Llooprvv_vl_vects: /* Load source data */ slli a6, x_vec_i, 3 - add a6,x_src,a6 + add a6, x_src, a6 ld x_ptr, 0(a6) add x_ptr, x_ptr, x_pos vle8.v v_src, (x_ptr) @@ -204,7 +203,6 @@ gf_5vect_dot_prod_rvv: /* Check if we have processed all vectors */ blt x_vec_i, x_vec, .Llooprvv_vl_vects - vse8.v v_dest1, (x_dest1) vse8.v v_dest2, (x_dest2) vse8.v v_dest3, (x_dest3) @@ -212,11 +210,11 @@ gf_5vect_dot_prod_rvv: vse8.v v_dest5, (x_dest5) /* Store destination data */ - add x_dest1,x_dest1,a5 - add x_dest2,x_dest2,a5 - add x_dest3,x_dest3,a5 - add x_dest4,x_dest4,a5 - add x_dest5,x_dest5,a5 + add x_dest1, x_dest1, a5 + add x_dest2, x_dest2, a5 + add x_dest3, x_dest3, a5 + add x_dest4, x_dest4, a5 + add x_dest5, x_dest5, a5 /* Increment position */ add x_pos, x_pos, a5 diff --git a/erasure_code/riscv64/gf_5vect_mad_rvv.S b/erasure_code/riscv64/gf_5vect_mad_rvv.S index 57227ed2..b4f5954e 100644 --- a/erasure_code/riscv64/gf_5vect_mad_rvv.S +++ b/erasure_code/riscv64/gf_5vect_mad_rvv.S @@ -83,7 +83,7 @@ gf_5vect_mad_rvv: li t6, 16 blt x_len, t6, .return_fail - vsetvli a7, x0, e8, m1 + vsetvli a7, x0, e8, m1, ta, ma /* Load table 1 */ slli a6, x_vec_i, 5 diff --git a/erasure_code/riscv64/gf_6vect_dot_prod_rvv.S b/erasure_code/riscv64/gf_6vect_dot_prod_rvv.S index 6cc9a168..f5789fe8 100644 --- a/erasure_code/riscv64/gf_6vect_dot_prod_rvv.S +++ b/erasure_code/riscv64/gf_6vect_dot_prod_rvv.S @@ -102,11 +102,13 @@ gf_6vect_dot_prod_rvv: sd s7, 56(sp) li t0, 0x0F - vsetvli a5, x0, e8, m1 + vsetvli a5, x0, e8, m1, ta, ma /* initialize position */ li x_pos, 0 + slli x_vec, x_vec, 3 + /* load destination pointers */ ld x_dest1, 0(x14) # a4 is also x14 ld x_dest2, 8(x_dest) @@ -136,7 +138,7 @@ gf_6vect_dot_prod_rvv: /* initialize table pointers */ /* gf_tbl base = (x_tbl + dest_idx * x_vec * 32) */ mv x_tbl1, x_tbl - slli t0, x_vec, 5 + slli t0, x_vec, 2 add x_tbl2, x_tbl1, t0 add x_tbl3, x_tbl2, t0 add x_tbl4, x_tbl3, t0 @@ -145,14 +147,9 @@ gf_6vect_dot_prod_rvv: .Llooprvv_vl_vects: /* load source data */ - slli a6, x_vec_i, 3 - add a6,x_src,a6 - ld x_ptr, 0(a6) - add x_ptr,x_ptr,x_pos - + add x_ptr, x_ptr, x_pos vle8.v v_src, (x_ptr) - /* split 4-bit lo; 4-bit hi */ vand.vi v_src_lo, v_src, 0x0F vsrl.vi v_src_hi, v_src, 4 @@ -168,6 +165,11 @@ gf_6vect_dot_prod_rvv: vle8.v v_gft2_hi, (x_tbl2) addi x_tbl2, x_tbl2, 16 + /* load next source pointer */ + addi x_vec_i, x_vec_i, 8 + add a6, x_src, x_vec_i + ld x_ptr, 0(a6) + vle8.v v_gft3_lo, (x_tbl3) addi x_tbl3, x_tbl3, 16 vle8.v v_gft3_hi, (x_tbl3) @@ -188,7 +190,6 @@ gf_6vect_dot_prod_rvv: vle8.v v_gft6_hi, (x_tbl6) addi x_tbl6, x_tbl6, 16 - /* dest 1 */ vrgather.vv v26, v_gft1_lo, v_src_lo vrgather.vv v27, v_gft1_hi, v_src_hi @@ -225,10 +226,6 @@ gf_6vect_dot_prod_rvv: vxor.vv v_dest6, v_dest6, v26 vxor.vv v_dest6, v_dest6, v27 - - /* load next source pointer */ - addi x_vec_i, x_vec_i,1 - /* check if we have processed all vectors */ blt x_vec_i, x_vec, .Llooprvv_vl_vects @@ -240,12 +237,12 @@ gf_6vect_dot_prod_rvv: vse8.v v_dest5, (x_dest5) # x_dest5 vse8.v v_dest6, (x_dest6) # x_dest6 - add x_dest1,x_dest1, a5 - add x_dest2,x_dest2, a5 - add x_dest3,x_dest3, a5 - add x_dest4,x_dest4, a5 - add x_dest5,x_dest5, a5 - add x_dest6,x_dest6, a5 + add x_dest1, x_dest1, a5 + add x_dest2, x_dest2, a5 + add x_dest3, x_dest3, a5 + add x_dest4, x_dest4, a5 + add x_dest5, x_dest5, a5 + add x_dest6, x_dest6, a5 /* increment position */ add x_pos, x_pos, a5 diff --git a/erasure_code/riscv64/gf_6vect_mad_rvv.S b/erasure_code/riscv64/gf_6vect_mad_rvv.S index 95d4a666..dbe093a0 100644 --- a/erasure_code/riscv64/gf_6vect_mad_rvv.S +++ b/erasure_code/riscv64/gf_6vect_mad_rvv.S @@ -91,7 +91,7 @@ gf_6vect_mad_rvv: addi sp, sp, -16 sd s8, 0(sp) - vsetvli a6, x0, e8, m1 + vsetvli a6, x0, e8, m1, ta, ma /* Load table 1 */ slli s8, x_vec_i, 5 diff --git a/erasure_code/riscv64/gf_7vect_dot_prod_rvv.S b/erasure_code/riscv64/gf_7vect_dot_prod_rvv.S index d4cc1d72..bf668716 100644 --- a/erasure_code/riscv64/gf_7vect_dot_prod_rvv.S +++ b/erasure_code/riscv64/gf_7vect_dot_prod_rvv.S @@ -94,7 +94,6 @@ #define v_gft7_lo v23 #define v_gft7_hi v24 - gf_7vect_dot_prod_rvv: /* less than 16 bytes, return_fail */ li t0, 16 @@ -112,8 +111,7 @@ gf_7vect_dot_prod_rvv: sd s7, 56(sp) sd s8, 64(sp) - vsetvli t0, x0, e8, m1 - + vsetvli t0, x0, e8, m1, ta, ma /* initialize position */ li x_pos, 0 @@ -160,9 +158,9 @@ gf_7vect_dot_prod_rvv: .Llooprvv_vl_vects: /* load source data */ slli a5, x_vec_i, 3 - add a5,x_src,a5 + add a5, x_src, a5 ld x_ptr, 0(a5) - add x_ptr,x_ptr,x_pos + add x_ptr, x_ptr, x_pos vle8.v v_src, (x_ptr) @@ -206,7 +204,6 @@ gf_7vect_dot_prod_rvv: vle8.v v_gft7_hi, (x_tbl7) addi x_tbl7, x_tbl7, 16 - /* dest 1 */ vrgather.vv v26, v_gft1_lo, v_src_lo vrgather.vv v27, v_gft1_hi, v_src_hi @@ -243,7 +240,6 @@ gf_7vect_dot_prod_rvv: vxor.vv v_dest6, v_dest6, v26 vxor.vv v_dest6, v_dest6, v27 - /* GF multiplication and accumulation for dest7 */ vrgather.vv v26, v_gft7_lo, v_src_lo vrgather.vv v27, v_gft7_hi, v_src_hi @@ -263,13 +259,13 @@ gf_7vect_dot_prod_rvv: vse8.v v_dest6, (x_dest6) vse8.v v_dest7, (x_dest7) - add x_dest1,x_dest1, t0 - add x_dest2,x_dest2, t0 - add x_dest3,x_dest3, t0 - add x_dest4,x_dest4, t0 - add x_dest5,x_dest5, t0 - add x_dest6,x_dest6, t0 - add x_dest7,x_dest7, t0 + add x_dest1, x_dest1, t0 + add x_dest2, x_dest2, t0 + add x_dest3, x_dest3, t0 + add x_dest4, x_dest4, t0 + add x_dest5, x_dest5, t0 + add x_dest6, x_dest6, t0 + add x_dest7, x_dest7, t0 /* increment one vector length */ add x_pos, x_pos, t0 diff --git a/erasure_code/riscv64/gf_vect_dot_prod_rvv.S b/erasure_code/riscv64/gf_vect_dot_prod_rvv.S index 471b65ed..a4af6d7c 100644 --- a/erasure_code/riscv64/gf_vect_dot_prod_rvv.S +++ b/erasure_code/riscv64/gf_vect_dot_prod_rvv.S @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ######################################################################## - # RISC-V RVV implementation of gf_vect_dot_prod_rvv # Function: gf_vect_dot_prod_rvv @@ -62,8 +61,7 @@ gf_vect_dot_prod_rvv: li t4, 16 blt a0, t4, .return_fail - vsetvli t5, zero, e8, m1 # Set vector length to maximum - + vsetvli t5, zero, e8, m1, ta, ma # Set vector length to maximum # Initialize pos = 0 li t2, 0 diff --git a/erasure_code/riscv64/gf_vect_mad_rvv.S b/erasure_code/riscv64/gf_vect_mad_rvv.S index 2c9aeb86..bb900514 100644 --- a/erasure_code/riscv64/gf_vect_mad_rvv.S +++ b/erasure_code/riscv64/gf_vect_mad_rvv.S @@ -65,7 +65,7 @@ gf_vect_mad_rvv: li t1, 16 blt x_len, t1, .return_fail - vsetvli t2, x0, e8, m1 + vsetvli t2, x0, e8, m1, ta, ma /* x_tbl += x_vec_i * 2^5 */ slli t1, x_vec_i, 5 diff --git a/erasure_code/riscv64/gf_vect_mul_rvv.S b/erasure_code/riscv64/gf_vect_mul_rvv.S index 92a8982f..aa72d1d4 100644 --- a/erasure_code/riscv64/gf_vect_mul_rvv.S +++ b/erasure_code/riscv64/gf_vect_mul_rvv.S @@ -67,7 +67,7 @@ gf_vect_mul_rvv: andi x_tmp, x_len, 0x1F bnez x_tmp, .return_fail - vsetvli t6, x0, e8, m1 + vsetvli t6, x0, e8, m1, ta, ma /* Load pre-calculated constants into v_gft1_lo and v_gft1_hi */ vle8.v v_gft1_lo, (x_tbl) @@ -79,7 +79,7 @@ gf_vect_mul_rvv: .Llooprvv_vl: /* Load source data into v_src */ - add x_ptr,x_src,x_pos + add x_ptr, x_src, x_pos vle8.v v_src, (x_ptr) /* Split 4-bit lo and 4-bit hi */