diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 9981c437b7..c3f62b58fd 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -14,8 +14,8 @@ jobs: if: "github.repository == 'OpenMathLib/OpenBLAS'" runs-on: ubuntu-latest env: - xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618 - toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz + xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1749714096626 + toolchain_file_name: Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.1.0-20250522.tar.gz strategy: fail-fast: false matrix: @@ -77,7 +77,7 @@ jobs: run: | wget ${xuetie_toolchain}/${toolchain_file_name} tar -xvf ${toolchain_file_name} -C /opt - export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH" + export PATH="/opt/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.1.0/bin:$PATH" make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) diff --git a/Makefile.prebuild b/Makefile.prebuild index b6c8d552f9..68fe01fdc7 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -52,7 +52,7 @@ TARGET_FLAGS = -mips64r6 endif ifeq ($(TARGET), C910V) -TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d +TARGET_FLAGS = -march=rv64gc_zfh_xtheadc_xtheadvector -mabi=lp64d endif ifeq ($(TARGET), CK860FV) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 8fe734186b..02c6558e8a 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -1,6 +1,6 @@ ifeq ($(CORE), C910V) -CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static +CCOMMON_OPT += -march=rv64imafdc_zfh_xtheadc_xtheadvector -mabi=lp64d -mtune=c920 +FCOMMON_OPT += -march=rv64imafdc_zfh_xtheadc_xtheadvector -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d diff --git a/common_macro.h b/common_macro.h index f9c22089b3..477818ff69 100644 --- a/common_macro.h +++ b/common_macro.h @@ -2709,7 +2709,7 @@ #ifndef ASSEMBLER #if !defined(DYNAMIC_ARCH) \ && (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \ - || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA)) + || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA) || defined(ARCH_RISCV64)) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG bgemm_p; diff --git a/common_riscv64.h b/common_riscv64.h index ba638e8be5..404dab7f9d 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -93,13 +93,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ # include #endif -#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 ) -// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this -#define RISCV_0p10_INTRINSICS -#define RISCV_RVV(x) x -#else #define RISCV_RVV(x) __riscv_ ## x -#endif #if defined(C910V) || defined(RISCV64_ZVL256B) # if !defined(DOUBLE) diff --git a/kernel/riscv64/dsdot_vector.c b/kernel/riscv64/dsdot_vector.c index e972828b52..b6f0caebe6 100644 --- a/kernel/riscv64/dsdot_vector.c +++ b/kernel/riscv64/dsdot_vector.c @@ -37,114 +37,114 @@ double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vfloat32m2_t vx, vy; unsigned int gvl = 0; vfloat64m1_t v_res, v_z0; - gvl = vsetvlmax_e64m1(); - v_res = vfmv_v_f_f64m1(0, gvl); - v_z0 = vfmv_v_f_f64m1(0, gvl); + gvl = __riscv_vsetvlmax_e64m1(); + v_res = __riscv_vfmv_v_f_f64m1(0, gvl); + v_z0 = __riscv_vfmv_v_f_f64m1(0, gvl); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvl_e64m4(n); - vr = vfmv_v_f_f64m4(0, gvl); + gvl = __riscv_vsetvl_e64m4(n); + vr = __riscv_vfmv_v_f_f64m4(0, gvl); for(i=0,j=0; i 0){ - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } //tail if(j < n){ - gvl = vsetvl_e64m4(n-j); - vx = vle32_v_f32m2(&x[j], gvl); - vy = vle32_v_f32m2(&y[j], gvl); - vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); - //vr = vfdot_vv_f32m2(vx, vy, gvl); - vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + gvl = __riscv_vsetvl_e64m4(n-j); + vx = __riscv_vle32_v_f32m2(&x[j], gvl); + vy = __riscv_vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl); + //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl); + vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } }else if(inc_y == 1){ - gvl = vsetvl_e64m4(n); - vr = vfmv_v_f_f64m4(0, gvl); + gvl = __riscv_vsetvl_e64m4(n); + vr = __riscv_vfmv_v_f_f64m4(0, gvl); int stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } //tail if(j < n){ - gvl = vsetvl_e64m4(n-j); - vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); - vy = vle32_v_f32m2(&y[j], gvl); - vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); - //vr = vfdot_vv_f32m2(vx, vy, gvl); - vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + gvl = __riscv_vsetvl_e64m4(n-j); + vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = __riscv_vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl); + //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl); + vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } }else if(inc_x == 1){ - gvl = vsetvl_e64m4(n); - vr = vfmv_v_f_f64m4(0, gvl); + gvl = __riscv_vsetvl_e64m4(n); + vr = __riscv_vfmv_v_f_f64m4(0, gvl); int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } //tail if(j < n){ - gvl = vsetvl_e64m4(n-j); - vx = vle32_v_f32m2(&x[j], gvl); - vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); - vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); - //vr = vfdot_vv_f32m2(vx, vy, gvl); - vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + gvl = __riscv_vsetvl_e64m4(n-j); + vx = __riscv_vle32_v_f32m2(&x[j], gvl); + vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl); + //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl); + vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } }else{ - gvl = vsetvl_e64m4(n); - vr = vfmv_v_f_f64m4(0, gvl); + gvl = __riscv_vsetvl_e64m4(n); + vr = __riscv_vfmv_v_f_f64m4(0, gvl); int stride_x = inc_x * sizeof(FLOAT); int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } //tail if(j < n){ - gvl = vsetvl_e64m4(n-j); - vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); - vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); - vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); - //vr = vfdot_vv_f32m2(vx, vy, gvl); - vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); - v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); - dot += (double)vfmv_f_s_f64m1_f64(v_res); + gvl = __riscv_vsetvl_e64m4(n-j); + vx = __riscv_vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = __riscv_vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = __riscv_vfmv_v_f_f64m4(0, gvl); + //vr = __riscv_vfdot_vv_f32m2(vx, vy, gvl); + vr = __riscv_vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = __riscv_vfredusum_vs_f64m4_f64m1(vr, v_z0, gvl); + dot += (double)__riscv_vfmv_f_s_f64m1_f64(v_res); } }