
Commit f4202c8

ikawrakow and Iwan Kawrakow authored
Fix repacked legacy quants (#951)
* Fix q5_0_r4

  The issue was in the tail part. As almost all models have tensor rows that are a multiple of 128, that part was never triggered in testing. But the gpt-oss models have an embedding size of 2880, so we end up there and trigger the bug.

* Fix q6_0_r4

  Same fix as q5_0_r4

* Fix q4_0_r8

* Fix q5_0_r4 and q6_0_r4 also on Zen4

* Fix q4_0_r8 also on Zen4

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
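The row-size arithmetic behind this explanation can be illustrated with a short standalone sketch. This is not part of the commit; it only assumes the 32-element quant block size (nb = n / QK4_NL in the kernels below) and the 4-block main loop visible in the hunks.

// Sketch: why a row size of 2880 reaches the tail loop while the row sizes
// used in testing (multiples of 128) never do. Assumes 32 elements per
// quant block, as for the legacy quants touched by this commit.
#include <cstdio>
#include <initializer_list>

int main() {
    constexpr int kBlockSize = 32;                 // elements per quant block
    for (int n : {4096, 2880}) {                   // 4096 % 128 == 0; gpt-oss embedding size is 2880
        int nb          = n / kBlockSize;          // blocks per row
        int main_blocks = 4 * (nb / 4);            // handled by the unrolled 4-block main loop
        int tail_blocks = nb - main_blocks;        // handled by: for (int ib = 4*(nb/4); ib < nb; ++ib)
        std::printf("n = %4d: nb = %3d, tail blocks = %d\n", n, nb, tail_blocks);
    }
    // Output:
    //   n = 4096: nb = 128, tail blocks = 0   -> tail loop never runs
    //   n = 2880: nb =  90, tail blocks = 2   -> tail loop runs and exposed the bug
    return 0;
}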
1 parent 38abd0e commit f4202c8

File tree

1 file changed, +29 -30 lines changed


ggml/src/iqk/iqk_gemm_legacy_quants.cpp

Lines changed: 29 additions & 30 deletions
@@ -1003,11 +1003,10 @@ inline __m256i accum_q4_0_quants(const __m256i * v, const int8_t * qs) {
 template <int nrc_y>
 static void mul_mat_q4_0_r8_q8_2_avx2(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
     GGML_ASSERT(nrc_x%8 == 0);
-    Q8<nrc_y, block_q8_1_x4> q8(info);
+    Q8<nrc_y, block_q8_2_x4> q8(info);
     auto m4 = _mm256_set1_epi8(0xf);
     int nb = n / QK4_NL;
     __m256i v[8];
-    GGML_ASSERT(nb%4 == 0);
     if constexpr (nrc_y == 1) {
         union { __m256 vec; float val[8]; } helper;
         for (int ix = 0; ix < nrc_x; ix += 8) {
@@ -1026,14 +1025,14 @@ static void mul_mat_q4_0_r8_q8_2_avx2(int n, const void * vx, size_t bx, const D
                 }
             }
             for (int ib = 4*(nb/4); ib < nb; ++ib) {
-                auto qy = (const block_q8_1 *)q8.y[0];
+                auto qy = (const block_q8_2 *)q8.y[0];
                 auto scales = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)iq4[ib].d));
                 prepare_q4_0_quants_avx2(iq4[ib].qs, v, m4);
                 auto sumi = accum_q4_0_quants(v, qy[ib].qs);
-                ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-                auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+                auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+                auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
                 acc1 = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc1);
-                acc2 = _mm256_fmadd_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(s)), acc2);
+                acc2 = _mm256_fmadd_ps(scales, _mm256_set1_ps(m8), acc2);
             }
             acc1 = _mm256_fmadd_ps(acc2, _mm256_set1_ps(-8.f), acc1);
             info.store(ix, 0, acc1);
@@ -1077,12 +1076,12 @@ static void mul_mat_q4_0_r8_q8_2_avx2(int n, const void * vx, size_t bx, const D
             auto scales_m = _mm256_mul_ps(scales, _mm256_set1_ps(-8.f));
             prepare_q4_0_quants_avx2(iq4[ib].qs, v, m4);
             for (int iy = 0; iy < nrc_y; ++iy) {
-                auto qy = (const block_q8_1 *)q8.y[iy];
+                auto qy = (const block_q8_2 *)q8.y[iy];
                 auto sumi = accum_q4_0_quants(v, qy[ib].qs);
-                ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-                auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+                auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+                auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
                 acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
-                acc[iy] = _mm256_fmadd_ps(scales_m, _mm256_set1_ps(GGML_BF16_TO_FP32(s)), acc[iy]);
+                acc[iy] = _mm256_fmadd_ps(scales_m, _mm256_set1_ps(m8), acc[iy]);
             }
         }
         for (int iy = 0; iy < nrc_y; ++iy) {
@@ -1101,7 +1100,7 @@ static void mul_mat_q4_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
         return;
     }
     GGML_ASSERT(nrc_x%16 == 0);
-    Q8<nrc_y, block_q8_1_x4> q8(info);
+    Q8<nrc_y, block_q8_2_x4> q8(info);
     auto m4 = _mm512_set1_epi8(0xf);
     int nb = n / QK4_NL;
     __m512 acc[2*nrc_y] = {};
@@ -1159,10 +1158,10 @@ static void mul_mat_q4_0_r8_q8_2(int n, const void * vx, size_t bx, const DataIn
         for (int iy = 0; iy < nrc_y; ++iy) {
             auto qy = (const block_q8_1 *)q8.y[iy];
             auto sumi = dot(qy[ib].qs);
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto dy = _mm512_set1_ps(d8);
             acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]);
-            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]);
+            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(m8), acc[2*iy+1]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {
@@ -1245,12 +1244,12 @@ static void mul_mat_q5_0_r4_q8_2_avx2(int n, const void * vx, size_t bx, const D
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq5[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
             acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
-            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-8.f*GGML_BF16_TO_FP32(s)), acc[iy]);
+            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-8.f*m8), acc[iy]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {
@@ -1325,12 +1324,12 @@ static void mul_mat_q5_0_r4_q8_2(int n, const void * vx, size_t bx, const DataIn
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq5l[ib], iq5h[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto dy = _mm512_set1_ps(d8);
             acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]);
-            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]);
+            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(m8), acc[2*iy+1]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {
@@ -1415,12 +1414,12 @@ static void mul_mat_q6_0_r4_q8_2_avx2(int n, const void * vx, size_t bx, const D
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq6[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(GGML_BF16_TO_FP32(d)));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto d4d8 = _mm256_mul_ps(scales, _mm256_set1_ps(d8));
             acc[iy] = _mm256_fmadd_ps(d4d8, _mm256_cvtepi32_ps(sumi), acc[iy]);
-            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-16.f*GGML_BF16_TO_FP32(s)), acc[iy]);
+            acc[iy] = _mm256_fmadd_ps(scales, _mm256_set1_ps(-16.f*m8), acc[iy]);
         }
     }

@@ -1495,12 +1494,12 @@ static void mul_mat_q6_0_r4_q8_2(int n, const void * vx, size_t bx, const DataIn
     for (int ib = 4*(nb/4); ib < nb; ++ib) {
         auto scales = prepare(iq6l[ib], iq6h[ib]);
         for (int iy = 0; iy < nrc_y; ++iy) {
-            auto qy = (const block_q8_1 *)q8.y[iy];
+            auto qy = (const block_q8_2 *)q8.y[iy];
             auto sumi = dot(_mm256_loadu_si256((const __m256i*)qy[ib].qs));
-            ggml_bf16_t d{qy[ib].d}, s{qy[ib].s};
-            auto dy = _mm512_set1_ps(GGML_BF16_TO_FP32(d));
+            auto [d8, m8] = ScaleHelperQ8_2::prepare1(qy + ib);
+            auto dy = _mm512_set1_ps(d8);
             acc[2*iy+0] = _mm512_fmadd_ps(_mm512_mul_ps(scales, dy), _mm512_cvtepi32_ps(sumi), acc[2*iy+0]);
-            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(GGML_BF16_TO_FP32(s)), acc[2*iy+1]);
+            acc[2*iy+1] = _mm512_fmadd_ps(scales, _mm512_set1_ps(m8), acc[2*iy+1]);
         }
     }
     for (int iy = 0; iy < nrc_y; ++iy) {
