Skip to content

Commit a420e4c

Browse files
committed
optimize bit fiddling
1 parent 0b75215 commit a420e4c

File tree

1 file changed

+17 -17 lines changed

1 file changed

+17 -17 lines changed

ggml/src/ggml-quants.c

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9104,10 +9104,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
91049104

91059105
#elif defined __AVX__
91069106

9107-
const __m128i m4 = _mm_set1_epi8(0xF);
9108-
const __m128i m3 = _mm_set1_epi8(3);
9109-
const __m128i m32s = _mm_set1_epi8(32);
91109107
const __m128i m2 = _mm_set1_epi8(2);
9108+
const __m128i m3 = _mm_set1_epi8(3);
9109+
const __m128i m15 = _mm_set1_epi8(15);
9110+
const __m128i m32 = _mm_set1_epi8(32);
91119111

91129112
__m256 acc = _mm256_setzero_ps();
91139113

@@ -9133,26 +9133,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
91339133

91349134
const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
91359135
const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
9136-
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
9137-
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
9138-
const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
9139-
const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
9140-
const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
9141-
const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
9136+
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x0C)), 2);
9137+
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x0C)), 2);
9138+
const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x30));
9139+
const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x30));
9140+
const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0xC0)), 2);
9141+
const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0xC0)), 2);
91429142

91439143
const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91449144
const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91459145
const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91469146
const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
91479147

9148-
const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0), m32s);
9149-
const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1), m32s);
9150-
const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2), m32s);
9151-
const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3), m32s);
9152-
const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4), m32s);
9153-
const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5), m32s);
9154-
const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6), m32s);
9155-
const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7), m32s);
9148+
const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0), m32);
9149+
const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1), m32);
9150+
const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2), m32);
9151+
const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3), m32);
9152+
const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4), m32);
9153+
const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5), m32);
9154+
const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6), m32);
9155+
const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7), m32);
91569156

91579157
const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
91589158
const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;

0 commit comments

Comments (0)