@@ -3818,7 +3818,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
38183818 float sumf = 0;
38193819
38203820#if defined(__ARM_FEATURE_SVE)
3821- if (svcntb() == QK8_0) {
3821+ if (ggml_sve_cnt_b == QK8_0) {
38223822 const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
38233823 const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
38243824
@@ -5303,7 +5303,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
53035303 float sumf = 0;
53045304
53055305#if defined(__ARM_FEATURE_SVE)
5306- if (svcntb() == QK8_0) {
5306+ if (ggml_sve_cnt_b == QK8_0) {
53075307 svfloat32_t sumv0 = svdup_n_f32(0.0f);
53085308 svfloat32_t sumv1 = svdup_n_f32(0.0f);
53095309
@@ -6449,22 +6449,22 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
64496449 // compute mask for subtraction
64506450 vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
64516451 vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
6452- vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m (vmask_0, q3_0, 0x4, vl);
6452+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu (vmask_0, q3_0 , q3_0, 0x4, vl);
64536453 m <<= 1;
64546454
64556455 vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
64566456 vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
6457- vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m (vmask_1, q3_1, 0x4, vl);
6457+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu (vmask_1, q3_1 , q3_1, 0x4, vl);
64586458 m <<= 1;
64596459
64606460 vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
64616461 vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
6462- vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m (vmask_2, q3_2, 0x4, vl);
6462+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu (vmask_2, q3_2 , q3_2, 0x4, vl);
64636463 m <<= 1;
64646464
64656465 vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
64666466 vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
6467- vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m (vmask_3, q3_3, 0x4, vl);
6467+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu (vmask_3, q3_3 , q3_3, 0x4, vl);
64686468 m <<= 1;
64696469
64706470 // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
77207720 vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
77217721 vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
77227722 vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
7723- vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m (vmask_1, q5_a, 16, vl);
7723+ vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu (vmask_1, q5_a , q5_a, 16, vl);
77247724 m <<= 1;
77257725
77267726 vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
77277727 vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
77287728 vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
7729- vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m (vmask_2, q5_l, 16, vl);
7729+ vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu (vmask_2, q5_l , q5_l, 16, vl);
77307730 m <<= 1;
77317731
77327732 vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
0 commit comments