@@ -1792,7 +1792,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
17921792 const int8x16_t y1_l = vld1q_s8 (b_y1 -> qs );
17931793 const int8x16_t y1_h = vld1q_s8 (b_y1 -> qs + 16 );
17941794
1795- float32_t _scale [4 ] = { GGML_FP16_TO_FP32 (b_x0 -> d )* GGML_FP16_TO_FP32 (b_y0 -> d ),
1795+ float32_t _scale [4 ] = {
1796+ GGML_FP16_TO_FP32 (b_x0 -> d )* GGML_FP16_TO_FP32 (b_y0 -> d ),
17961797 GGML_FP16_TO_FP32 (b_x0 -> d )* GGML_FP16_TO_FP32 (b_y1 -> d ),
17971798 GGML_FP16_TO_FP32 (b_x1 -> d )* GGML_FP16_TO_FP32 (b_y0 -> d ),
17981799 GGML_FP16_TO_FP32 (b_x1 -> d )* GGML_FP16_TO_FP32 (b_y1 -> d )};
@@ -2357,10 +2358,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
23572358 const block_q8_1 * restrict b_y0 = & vy0 [i ];
23582359 const block_q8_1 * restrict b_y1 = & vy1 [i ];
23592360
2360- float32_t summs_t [4 ] = {GGML_FP16_TO_FP32 (b_x0 -> m ) * GGML_FP16_TO_FP32 (b_y0 -> s ),
2361+ float32_t summs_t [4 ] = {
2362+ GGML_FP16_TO_FP32 (b_x0 -> m ) * GGML_FP16_TO_FP32 (b_y0 -> s ),
23612363 GGML_FP16_TO_FP32 (b_x1 -> m ) * GGML_FP16_TO_FP32 (b_y0 -> s ),
23622364 GGML_FP16_TO_FP32 (b_x0 -> m ) * GGML_FP16_TO_FP32 (b_y1 -> s ),
23632365 GGML_FP16_TO_FP32 (b_x1 -> m ) * GGML_FP16_TO_FP32 (b_y1 -> s )};
2366+
23642367 summs0 = vaddq_f32 (summs0 , vld1q_f32 (summs_t ));
23652368
23662369 const uint8x16_t m4b = vdupq_n_u8 (0x0F );
@@ -2381,10 +2384,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
23812384 const int8x16_t y1_h = vld1q_s8 (b_y1 -> qs + 16 );
23822385
23832386 // mmla into int32x4_t
2384- float32_t _scale [4 ] = {GGML_FP16_TO_FP32 (b_x0 -> d )* b_y0 -> d ,
2385- GGML_FP16_TO_FP32 (b_x0 -> d )* b_y1 -> d ,
2386- GGML_FP16_TO_FP32 (b_x1 -> d )* b_y0 -> d ,
2387- GGML_FP16_TO_FP32 (b_x1 -> d )* b_y1 -> d };
2387+ float32_t _scale [4 ] = {
2388+ GGML_FP16_TO_FP32 (b_x0 -> d )* GGML_FP16_TO_FP32 (b_y0 -> d ),
2389+ GGML_FP16_TO_FP32 (b_x0 -> d )* GGML_FP16_TO_FP32 (b_y1 -> d ),
2390+ GGML_FP16_TO_FP32 (b_x1 -> d )* GGML_FP16_TO_FP32 (b_y0 -> d ),
2391+ GGML_FP16_TO_FP32 (b_x1 -> d )* GGML_FP16_TO_FP32 (b_y1 -> d )};
23882392 float32x4_t scale = vld1q_f32 (_scale );
23892393
23902394 int8x16_t l0 = vreinterpretq_s8_s64 (vzip1q_s64 (vreinterpretq_s64_s8 (x0_l ), vreinterpretq_s64_s8 (x1_l )));
0 commit comments