@@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
150150#endif 
151151}
152152#elif  defined(__AVX__ )
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
{
    // Each 16-bit lane holds 0000_abcd_0000_efgh; fold it to 0000_0000_abcd_efgh,
    // then narrow the two vectors into a single vector of packed-nibble bytes.
    const __m128i low_mask = _mm_set1_epi16( 0xFF );

    const __m128i lo1 = _mm_and_si128( low_mask, bytes1 );
    const __m128i hi1 = _mm_srli_epi16( _mm_andnot_si128( low_mask, bytes1 ), 4 );
    const __m128i lo2 = _mm_and_si128( low_mask, bytes2 );
    const __m128i hi2 = _mm_srli_epi16( _mm_andnot_si128( low_mask, bytes2 ), 4 );

    // Unsigned saturating pack: every lane is already <= 0xFF, so this only narrows.
    return _mm_packus_epi16( _mm_or_si128( lo1, hi1 ), _mm_or_si128( lo2, hi2 ) );
}
168+ 
169+ static  inline  __m128i  mul_add_epi8_sse (const  __m128i  x , const  __m128i  y ) {
170+     const  __m128i  ax  =  _mm_sign_epi8 (x , x );
171+     const  __m128i  sy  =  _mm_sign_epi8 (y , x );
172+     return  _mm_maddubs_epi16 (ax , sy );
173+ }
174+ 
153175// spread 32 bits to 32 bytes { 0x00, 0xFF } 
154176static  inline  __m256i  bytes_from_bits_32 (const  uint8_t  *  x ) {
155177    uint32_t  x32 ;
@@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
217239    return  sum_i16_pairs_float (doth , dotl );
218240}
219241
220- static  inline  __m128i  packNibbles ( __m128i  bytes1 , __m128i  bytes2  )
221- {
222-     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh 
223-     const  __m128i  lowByte  =  _mm_set1_epi16 ( 0xFF  );
224-     __m128i  high  =  _mm_andnot_si128 ( lowByte , bytes1  );
225-     __m128i  low  =  _mm_and_si128 ( lowByte , bytes1  );
226-     high  =  _mm_srli_epi16 ( high , 4  );
227-     bytes1  =  _mm_or_si128 ( low , high  );
228-     high  =  _mm_andnot_si128 ( lowByte , bytes2  );
229-     low  =  _mm_and_si128 ( lowByte , bytes2  );
230-     high  =  _mm_srli_epi16 ( high , 4  );
231-     bytes2  =  _mm_or_si128 ( low , high  );
242+ // larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors 
243+ static  inline  __m256  mul_sum_i8_quad_float (const  __m128i  x_1_0 , const  __m128i  x_1_1 , const  __m128i  x_2_0 , const  __m128i  x_2_1 ,
244+                                            const  __m128i  y_1_0 , const  __m128i  y_1_1 , const  __m128i  y_2_0 , const  __m128i  y_2_1 ) {
245+     const  __m128i  mone  =  _mm_set1_epi16 (1 );
232246
233-     return  _mm_packus_epi16 ( bytes1 , bytes2 );
247+     const  __m128i  p16_1_0  =  mul_add_epi8_sse (x_1_0 , y_1_0 );
248+     const  __m128i  p16_1_1  =  mul_add_epi8_sse (x_1_1 , y_1_1 );
249+     const  __m128i  p16_2_0  =  mul_add_epi8_sse (x_2_0 , y_2_0 );
250+     const  __m128i  p16_2_1  =  mul_add_epi8_sse (x_2_1 , y_2_1 );
251+     const  __m128i  p_1_0  =  _mm_madd_epi16 (p16_1_0 , mone );
252+     const  __m128i  p_1_1  =  _mm_madd_epi16 (p16_1_1 , mone );
253+     const  __m128i  p_2_0  =  _mm_madd_epi16 (p16_2_0 , mone );
254+     const  __m128i  p_2_1  =  _mm_madd_epi16 (p16_2_1 , mone );
255+     const  __m128i  p_1  =  _mm_add_epi32 (p_1_0 , p_1_1 );
256+     const  __m128i  p_2  =  _mm_add_epi32 (p_2_0 , p_2_1 );
257+     return  _mm256_cvtepi32_ps (MM256_SET_M128I (p_2 , p_1 ));
234258}
235259
236- static  inline  __m128i  mul_add_epi8_sse (const  __m128i  x , const  __m128i  y ) {
237-     const  __m128i  ax  =  _mm_sign_epi8 (x , x );
238-     const  __m128i  sy  =  _mm_sign_epi8 (y , x );
239-     return  _mm_maddubs_epi16 (ax , sy );
// quad fp16 delta calculation
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
    // GGML_FP16_TO_FP32 is faster than Intel F16C
    const __m128 delta0 = _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0));
    const __m128 delta1 = _mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1));
    return _mm256_set_m128(delta1, delta0);
}
241266#endif 
242267#elif  defined(__SSSE3__ )
@@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
20042029
20052030    sumf  =  hsum_float_8 (acc );
20062031#elif  defined(__AVX__ )
2007-     const  __m128i  mone  =  _mm_set1_epi16 (1 );
2008- 
2009-     __m256  accum1  =  _mm256_setzero_ps ();
2010-     __m256  accum2  =  _mm256_setzero_ps ();
2032+     __m256  accum  =  _mm256_setzero_ps ();
20112033    for  (; ib  +  1  <  nb ; ib  +=  2 ) {
20122034        const  __m128i  q4bits_1  =  _mm_loadu_si128 ((const  __m128i  * )x [ib  +  0 ].qs );
20132035        const  __m128i  q4bits_2  =  _mm_loadu_si128 ((const  __m128i  * )x [ib  +  1 ].qs );
@@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
20202042        const  __m128i  q4b_1_1  =  _mm_sub_epi8 (_mm_and_si128 (_mm_set1_epi8 (15 ), _mm_srli_epi16 (q4bits_1 , 4 )), _mm_set1_epi8 (8 ));
20212043        const  __m128i  q4b_2_0  =  _mm_sub_epi8 (_mm_and_si128 (_mm_set1_epi8 (15 ), q4bits_2 ), _mm_set1_epi8 (8 ));
20222044        const  __m128i  q4b_2_1  =  _mm_sub_epi8 (_mm_and_si128 (_mm_set1_epi8 (15 ), _mm_srli_epi16 (q4bits_2 , 4 )), _mm_set1_epi8 (8 ));
2045+ 
20232046        const  __m128i  p16_1_0  =  mul_add_epi8_sse (q4b_1_0 , q8b_1_0 );
20242047        const  __m128i  p16_1_1  =  mul_add_epi8_sse (q4b_1_1 , q8b_1_1 );
20252048        const  __m128i  p16_2_0  =  mul_add_epi8_sse (q4b_2_0 , q8b_2_0 );
20262049        const  __m128i  p16_2_1  =  mul_add_epi8_sse (q4b_2_1 , q8b_2_1 );
2027-         const  __m128i  p_1_0  =  _mm_madd_epi16 (p16_1_0 , mone );
2028-         const  __m128i  p_1_1  =  _mm_madd_epi16 (p16_1_1 , mone );
2029-         const  __m128i  p_2_0  =  _mm_madd_epi16 (p16_2_0 , mone );
2030-         const  __m128i  p_2_1  =  _mm_madd_epi16 (p16_2_1 , mone );
2031-         accum1  =  _mm256_add_ps (_mm256_mul_ps (_mm256_set1_ps (GGML_FP16_TO_FP32 (y [ib  +  0 ].d )* GGML_FP16_TO_FP32 (x [ib  +  0 ].d )),
2032-                 _mm256_cvtepi32_ps (MM256_SET_M128I (p_1_1 , p_1_0 ))), accum1 );
2033-         accum2  =  _mm256_add_ps (_mm256_mul_ps (_mm256_set1_ps (GGML_FP16_TO_FP32 (y [ib  +  1 ].d )* GGML_FP16_TO_FP32 (x [ib  +  1 ].d )),
2034-                 _mm256_cvtepi32_ps (MM256_SET_M128I (p_2_1 , p_2_0 ))), accum2 );
2050+         const  __m128i  p_1  =  _mm_add_epi16 (p16_1_0 , p16_1_1 );
2051+         const  __m128i  p_2  =  _mm_add_epi16 (p16_2_0 , p16_2_1 );
2052+         const  __m256  p  =   sum_i16_pairs_float (p_2 , p_1 );
2053+ 
2054+         const  __m256  deltas  =  quad_fp16_delta_float (x [ib ].d , y [ib ].d , x [ib  +  1 ].d , y [ib  +  1 ].d );
2055+         accum  =  _mm256_add_ps (_mm256_mul_ps (deltas , p ), accum );
20352056    }
20362057
2037-     sumf  =  hsum_float_8 (_mm256_add_ps ( accum1 ,  accum2 ) );
2058+     sumf  =  hsum_float_8 (accum );
20382059#elif  defined(__SSSE3__ )
20392060    // set constants 
20402061    const  __m128i  lowMask  =  _mm_set1_epi8 (0xF );
@@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
35353556    }
35363557
35373558    sumf  =  vaddvq_f32 (sumv0 ) +  vaddvq_f32 (sumv1 );
3538- #elif  defined(__AVX2__ )  ||  defined( __AVX__ ) 
3559+ #elif  defined(__AVX2__ )
35393560    // Initialize accumulator with zeros 
35403561    __m256  acc  =  _mm256_setzero_ps ();
35413562
@@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
35493570        const  __m256  q  =  mul_sum_i8_pairs_float (qx , qy );
35503571
35513572        // Multiply q with scale and accumulate 
3552- #if  defined(__AVX2__ )
35533573        acc  =  _mm256_fmadd_ps ( d , q , acc  );
3554- #else 
3555-         acc  =  _mm256_add_ps ( _mm256_mul_ps ( d , q  ), acc  );
3556- #endif 
35573574    }
35583575
35593576    sumf  =  hsum_float_8 (acc );
3577+ #elif  defined(__AVX__ )
3578+     __m256  accum  =  _mm256_setzero_ps ();
3579+ 
3580+     for  (; ib  +  1  <  nb ; ib  +=  2 ) {
3581+         const  __m128i  qx_1_0  =  _mm_loadu_si128 ((const  __m128i  * )x [ib ].qs );
3582+         const  __m128i  qx_1_1  =  _mm_loadu_si128 ((const  __m128i  * )x [ib ].qs  +  1 );
3583+         const  __m128i  qx_2_0  =  _mm_loadu_si128 ((const  __m128i  * )x [ib  +  1 ].qs );
3584+         const  __m128i  qx_2_1  =  _mm_loadu_si128 ((const  __m128i  * )x [ib  +  1 ].qs  +  1 );
3585+         const  __m128i  qy_1_0  =  _mm_loadu_si128 ((const  __m128i  * )y [ib ].qs );
3586+         const  __m128i  qy_1_1  =  _mm_loadu_si128 ((const  __m128i  * )y [ib ].qs  +  1 );
3587+         const  __m128i  qy_2_0  =  _mm_loadu_si128 ((const  __m128i  * )y [ib  +  1 ].qs );
3588+         const  __m128i  qy_2_1  =  _mm_loadu_si128 ((const  __m128i  * )y [ib  +  1 ].qs  +  1 );
3589+ 
3590+         const  __m256  p  =  mul_sum_i8_quad_float (qx_1_0 , qx_1_1 , qx_2_0 , qx_2_1 , qy_1_0 , qy_1_1 , qy_2_0 , qy_2_1 );
3591+         const  __m256  deltas  =  quad_fp16_delta_float (x [ib ].d , y [ib ].d , x [ib  +  1 ].d , y [ib  +  1 ].d );
3592+         accum  =  _mm256_add_ps (_mm256_mul_ps (deltas , p ), accum );
3593+     }
3594+ 
3595+     sumf  =  hsum_float_8 (accum );
35603596#elif  defined(__riscv_v_intrinsic )
35613597    size_t  vl  =  __riscv_vsetvl_e8m1 (qk );
35623598
@@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
1032210358#elif  defined __AVX__ 
1032310359    const  __m128i  values128  =  _mm_loadu_si128 ((const  __m128i * )kvalues_iq4nl );
1032410360    const  __m128i  m4b   =  _mm_set1_epi8 (0x0f );
10325-     const  __m128i  mone  =  _mm_set1_epi16 (1 );
1032610361
10327-     __m256  accum1  =  _mm256_setzero_ps ();
10328-     __m256  accum2  =  _mm256_setzero_ps ();
10362+     __m256  accum  =  _mm256_setzero_ps ();
1032910363    for  (; ib  +  1  <  nb ; ib  +=  2 ) {
1033010364        const  __m128i  q4bits_1  =  _mm_loadu_si128 ((const  __m128i  * )x [ib  +  0 ].qs );
1033110365        const  __m128i  q4bits_2  =  _mm_loadu_si128 ((const  __m128i  * )x [ib  +  1 ].qs );
@@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
1033810372        const  __m128i  q4b_1_1  =  _mm_shuffle_epi8 (values128 , _mm_and_si128 (_mm_srli_epi16 (q4bits_1 , 4 ), m4b ));
1033910373        const  __m128i  q4b_2_0  =  _mm_shuffle_epi8 (values128 , _mm_and_si128 (q4bits_2 , m4b ));
1034010374        const  __m128i  q4b_2_1  =  _mm_shuffle_epi8 (values128 , _mm_and_si128 (_mm_srli_epi16 (q4bits_2 , 4 ), m4b ));
10341-         const  __m128i  p16_1_0  =  mul_add_epi8_sse (q4b_1_0 , q8b_1_0 );
10342-         const  __m128i  p16_1_1  =  mul_add_epi8_sse (q4b_1_1 , q8b_1_1 );
10343-         const  __m128i  p16_2_0  =  mul_add_epi8_sse (q4b_2_0 , q8b_2_0 );
10344-         const  __m128i  p16_2_1  =  mul_add_epi8_sse (q4b_2_1 , q8b_2_1 );
10345-         const  __m128i  p_1_0  =  _mm_madd_epi16 (p16_1_0 , mone );
10346-         const  __m128i  p_1_1  =  _mm_madd_epi16 (p16_1_1 , mone );
10347-         const  __m128i  p_2_0  =  _mm_madd_epi16 (p16_2_0 , mone );
10348-         const  __m128i  p_2_1  =  _mm_madd_epi16 (p16_2_1 , mone );
10349-         accum1  =  _mm256_add_ps (_mm256_mul_ps (_mm256_set1_ps (GGML_FP16_TO_FP32 (y [ib  +  0 ].d )* GGML_FP16_TO_FP32 (x [ib  +  0 ].d )),
10350-                 _mm256_cvtepi32_ps (MM256_SET_M128I (p_1_1 , p_1_0 ))), accum1 );
10351-         accum2  =  _mm256_add_ps (_mm256_mul_ps (_mm256_set1_ps (GGML_FP16_TO_FP32 (y [ib  +  1 ].d )* GGML_FP16_TO_FP32 (x [ib  +  1 ].d )),
10352-                 _mm256_cvtepi32_ps (MM256_SET_M128I (p_2_1 , p_2_0 ))), accum2 );
10375+ 
10376+         const  __m256  p  =  mul_sum_i8_quad_float (q4b_1_0 , q4b_1_1 , q4b_2_0 , q4b_2_1 , q8b_1_0 , q8b_1_1 , q8b_2_0 , q8b_2_1 );
10377+         const  __m256  deltas  =  quad_fp16_delta_float (x [ib ].d , y [ib ].d , x [ib  +  1 ].d , y [ib  +  1 ].d );
10378+         accum  =  _mm256_add_ps (_mm256_mul_ps (deltas , p ), accum );
1035310379    }
1035410380
10355-     sumf  =  hsum_float_8 (_mm256_add_ps ( accum1 ,  accum2 ) );
10381+     sumf  =  hsum_float_8 (accum );
1035610382
1035710383#elif  defined(__POWER9_VECTOR__ )
1035810384    const  vector  signed  char  lowMask  =  vec_splats ((signed char  )0xF );
0 commit comments