@@ -9104,10 +9104,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
91049104
91059105#elif defined __AVX__
91069106
9107- const __m128i m4 = _mm_set1_epi8(0xF);
9108- const __m128i m3 = _mm_set1_epi8(3);
9109- const __m128i m32s = _mm_set1_epi8(32);
91109107 const __m128i m2 = _mm_set1_epi8(2);
9108+ const __m128i m3 = _mm_set1_epi8(3);
9109+ const __m128i m15 = _mm_set1_epi8(15);
9110+ const __m128i m32 = _mm_set1_epi8(32);
91119111
91129112 __m256 acc = _mm256_setzero_ps();
91139113
@@ -9133,26 +9133,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
91339133
91349134 const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
91359135 const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
9136- const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16( q4bitsH_0, 2), m3 ), 4 );
9137- const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16( q4bitsH_1, 2), m3 ), 4 );
9138- const __m128i q4h_4 = _mm_slli_epi16( _mm_and_si128(_mm_srli_epi16( q4bitsH_0, 4), m3), 4 );
9139- const __m128i q4h_5 = _mm_slli_epi16( _mm_and_si128(_mm_srli_epi16( q4bitsH_1, 4), m3), 4 );
9140- const __m128i q4h_6 = _mm_slli_epi16 (_mm_and_si128(_mm_srli_epi16( q4bitsH_0, 6), m3 ), 4 );
9141- const __m128i q4h_7 = _mm_slli_epi16 (_mm_and_si128(_mm_srli_epi16( q4bitsH_1, 6), m3 ), 4 );
9136+ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x0C) ), 2 );
9137+ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x0C) ), 2 );
9138+ const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x30) );
9139+ const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x30) );
9140+ const __m128i q4h_6 = _mm_srli_epi16 (_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0xC0) ), 2 );
9141+ const __m128i q4h_7 = _mm_srli_epi16 (_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0xC0) ), 2 );
91429142
91439143 const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91449144 const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91459145 const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91469146 const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91479147
9148- const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m4 ), q4h_0), m32s );
9149- const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m4 ), q4h_1), m32s );
9150- const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m4 ), q4h_2), m32s );
9151- const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m4 ), q4h_3), m32s );
9152- const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4 ), q4h_4), m32s );
9153- const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4 ), q4h_5), m32s );
9154- const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4 ), q4h_6), m32s );
9155- const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4 ), q4h_7), m32s );
9148+ const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m15 ), q4h_0), m32 );
9149+ const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m15 ), q4h_1), m32 );
9150+ const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m15 ), q4h_2), m32 );
9151+ const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m15 ), q4h_3), m32 );
9152+ const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15 ), q4h_4), m32 );
9153+ const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15 ), q4h_5), m32 );
9154+ const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15 ), q4h_6), m32 );
9155+ const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15 ), q4h_7), m32 );
91569156
91579157 const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
91589158 const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
0 commit comments