@@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
 #endif
 }
 #elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2 );
+}
+
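+// Multiply signed int8 vectors into 16-bit lanes: _mm_maddubs_epi16 requires an
+// unsigned first operand, so take |x| and move x's sign onto y via _mm_sign_epi8.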
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     return sum_i16_pairs_float(doth, dotl);
 }
 
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
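+// each block's two 16-bit product vectors are widened to 32-bit lanes with
+// _mm_madd_epi16 against ones before being summed, so int16 overflow is avoided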
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i mone = _mm_set1_epi16(1);
 
-    return _mm_packus_epi16( bytes1, bytes2 );
+    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
 }
 
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
-    const __m128i ax = _mm_sign_epi8(x, x);
-    const __m128i sy = _mm_sign_epi8(y, x);
-    return _mm_maddubs_epi16(ax, sy);
+// quad fp16 delta calculation
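+// broadcasts x0*y0 into the low 128 bits and x1*y1 into the high 128 bits,
+// matching the {block ib+1 | block ib} lane order of mul_sum_i8_quad_float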
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
 }
 #endif
 #elif defined(__SSSE3__)
@@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     sumf = hsum_float_8(acc);
 #elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
         const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
         const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
         const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
         const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
         const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
         const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
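+        // q4 values are in [-8, 7] after the bias, so each maddubs lane is at
+        // most 2*8*128 and the 16-bit sums below cannot overflow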
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        const __m256 p = sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 #elif defined(__SSSE3__)
     // set constants
     const __m128i lowMask = _mm_set1_epi8(0xF);
@@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
+#elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
-#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
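+        // q8*q8 products can reach 2*127*128 per 16-bit lane, so the helper
+        // widens to 32 bits with _mm_madd_epi16 before combining partial sums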
+        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
 #elif defined(__riscv_v_intrinsic)
     size_t vl = __riscv_vsetvl_e8m1(qk);
 
@@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 #elif defined __AVX__
     const __m128i values128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
     const __m128i m4b = _mm_set1_epi8(0x0f);
-    const __m128i mone = _mm_set1_epi16(1);
 
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
         const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
         const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
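+        // iq4nl codebook values use nearly the full int8 range (down to -127),
+        // so reuse the 32-bit widening helper instead of 16-bit accumulation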
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);