@@ -927,6 +927,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
927927 const block_q8_0 * GGML_RESTRICT y = vy ;
928928
929929 int ib = 0 ;
930+ float sumf = 0 ;
930931
931932#if defined(__AVX2__ )
932933 // Initialize accumulator with zeros
@@ -945,7 +946,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
945946 acc = _mm256_fmadd_ps ( d , q , acc );
946947 }
947948
948- * s = hsum_float_8 (acc );
949+ sumf = hsum_float_8 (acc );
949950#elif defined(__AVX__ )
950951 __m256 accum = _mm256_setzero_ps ();
951952
@@ -964,14 +965,19 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
964965 accum = _mm256_add_ps (_mm256_mul_ps (deltas , p ), accum );
965966 }
966967
967- * s = hsum_float_8 (accum );
968- #else
969- UNUSED (nb );
970- UNUSED (ib );
971- UNUSED (x );
972- UNUSED (y );
973- ggml_vec_dot_q8_0_q8_0_generic (n , s , bs , vx , bx , vy , by , nrc );
968+ sumf = hsum_float_8 (accum );
974969#endif
970+ for (; ib < nb ; ++ ib ) {
971+ int sumi = 0 ;
972+
973+ for (int j = 0 ; j < qk ; j ++ ) {
974+ sumi += x [ib ].qs [j ]* y [ib ].qs [j ];
975+ }
976+
977+ sumf += sumi * (GGML_CPU_FP16_TO_FP32 (x [ib ].d )* GGML_CPU_FP16_TO_FP32 (y [ib ].d ));
978+ }
979+
980+ * s = sumf ;
975981}
976982
977983void ggml_vec_dot_tq1_0_q8_K (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
0 commit comments