Skip to content

Commit 62f82d6

Browse files
Iwan Kawrakow authored and Nexesenex committed
quantization tweaks
iq3_ks quantization tweaks; minor iq3_k tweak; q2_K tweaks; q3_K tweaks; q4_K tweaks; q5_K tweaks
1 parent 7949a68 commit 62f82d6

File tree

1 file changed

+117
-8
lines changed

1 file changed

+117
-8
lines changed

ggml/src/ggml-quants.c

Lines changed: 117 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1176,7 +1176,7 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G
11761176
memset(sw, 0, QK_K/16*sizeof(float));
11771177
float sumx2 = 0;
11781178
for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
1179-
float sigma2 = 0.75f*sumx2/QK_K;
1179+
float sigma2 = 0.5f*sumx2/QK_K;
11801180
for (int j = 0; j < QK_K/16; ++j) {
11811181
const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j;
11821182
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
@@ -1191,6 +1191,30 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G
11911191
y[i].dmin = GGML_FP32_TO_FP16(mm);
11921192

11931193
for (int j = 0; j < QK_K/16; ++j) {
1194+
const float * restrict qw = quant_weights + QK_K * i + 16*j;
1195+
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
1196+
int lmin = MAX(Ls[j]-1, 0);
1197+
int lmax = MIN(Ls[j]+1,15);
1198+
int mmin = MAX(Lm[j]-1, 0);
1199+
int mmax = MIN(Lm[j]+1,15);
1200+
float best_score = INFINITY;
1201+
for (int il = lmin; il <= lmax; ++il) {
1202+
float d = dm*il;
1203+
float id = d ? 1/d : 0.f;
1204+
for (int im = mmin; im <= mmax; ++im) {
1205+
float m = mm*im;
1206+
float score = 0;
1207+
for (int ii = 0; ii < 16; ++ii) {
1208+
int q = nearest_int((x[16*j + ii] + m)*id);
1209+
q = MAX(0, MIN(3, q));
1210+
float diff = d*q - m - x[16*j + ii];
1211+
score += weight[ii] * diff * diff;
1212+
}
1213+
if (score < best_score) {
1214+
best_score = score; Ls[j] = il; Lm[j] = im;
1215+
}
1216+
}
1217+
}
11941218
float d = dm*Ls[j];
11951219
float m = mm*Lm[j];
11961220
float id = d ? 1/d : 0.f;
@@ -1393,6 +1417,30 @@ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * G
13931417

13941418
float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw);
13951419
for (int j = 0; j < QK_K/16; ++j) {
1420+
// Somehow this does not help
1421+
//if (quant_weights) {
1422+
// const float * qw = quant_weights + QK_K * i + 16*j;
1423+
// for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
1424+
//} else {
1425+
// for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
1426+
//}
1427+
//int lmin = MAX( 0, Ls[j]-1);
1428+
//int lmax = MIN(63, Ls[j]+1);
1429+
//float best_score = INFINITY;
1430+
//for (int ls = lmin; ls <= lmax; ++ls) {
1431+
// float dl = d_block * (ls - 32);
1432+
// float idl = dl ? 1/dl : 0.f;
1433+
// float score = 0;
1434+
// for (int ii = 0; ii < 16; ++ii) {
1435+
// int q = nearest_int(idl*x[16*j + ii]);
1436+
// q = MAX(-4, MIN(3, q));
1437+
// float diff = dl*q - x[16*j + ii];
1438+
// score += weight[ii] * diff * diff;
1439+
// }
1440+
// if (score < best_score) {
1441+
// best_score = score; Ls[j] = ls;
1442+
// }
1443+
//}
13961444
int l = Ls[j];
13971445
if (j < 8) {
13981446
y[i].scales[j] = l & 0xF;
@@ -1408,7 +1456,8 @@ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * G
14081456
for (int j = 0; j < QK_K/16; ++j) {
14091457
sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
14101458
sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
1411-
float d = GGML_FP16_TO_FP32(y[i].d) * sc;
1459+
//float d = GGML_FP16_TO_FP32(y[i].d) * sc;
1460+
float d = d_block * sc;
14121461
if (!d) {
14131462
continue;
14141463
}
@@ -1438,6 +1487,8 @@ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * G
14381487
}
14391488
}
14401489

1490+
y[i].d = GGML_FP32_TO_FP16(1.015f*d_block);
1491+
14411492
x += QK_K;
14421493
}
14431494
}
@@ -1592,6 +1643,35 @@ static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * G
15921643
float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
15931644
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
15941645
for (int j = 0; j < QK_K/32; ++j) {
1646+
if (quant_weights) {
1647+
const float * qw = quant_weights + QK_K*i + 32*j;
1648+
for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
1649+
} else {
1650+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
1651+
}
1652+
int lmin = MAX( 0, Ls[j] - 1);
1653+
int lmax = MIN(63, Ls[j] + 1);
1654+
int mmin = MAX( 0, Lm[j] - 1);
1655+
int mmax = MIN(63, Lm[j] + 1);
1656+
float best_score = INFINITY;
1657+
for (int il = lmin; il <= lmax; ++il) {
1658+
float dl = d_block * il;
1659+
float idl = dl ? 1/dl : 0.f;
1660+
for (int im = mmin; im <= mmax; ++im) {
1661+
float dm = m_block * im;
1662+
float score = 0;
1663+
for (int ii = 0; ii < 32; ++ii) {
1664+
int q = nearest_int((x[32*j + ii] + dm)*idl);
1665+
q = MAX(0, MIN(15, q));
1666+
float diff = dl * q - dm - x[32*j + ii];
1667+
score += weights[ii] * diff * diff;
1668+
}
1669+
if (score < best_score) {
1670+
best_score = score;
1671+
Ls[j] = il; Lm[j] = im;
1672+
}
1673+
}
1674+
}
15951675
uint8_t ls = Ls[j];
15961676
uint8_t lm = Lm[j];
15971677
if (j < 4) {
@@ -1609,9 +1689,11 @@ static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * G
16091689
uint8_t sc, m;
16101690
for (int j = 0; j < QK_K/32; ++j) {
16111691
get_scale_min_k4(j, y[i].scales, &sc, &m);
1612-
const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
1692+
//const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
1693+
const float d = d_block * sc;
16131694
if (!d) continue;
1614-
const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
1695+
//const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
1696+
const float dm = m_block * m;
16151697
for (int ii = 0; ii < 32; ++ii) {
16161698
int l = nearest_int((x[32*j + ii] + dm)/d);
16171699
l = MAX(0, MIN(15, l));
@@ -1799,10 +1881,37 @@ static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * G
17991881
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
18001882

18011883
for (int j = 0; j < QK_K/32; ++j) {
1884+
if (quant_weights) {
1885+
const float * qw = quant_weights + QK_K*i + 32*j;
1886+
for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
1887+
} else {
1888+
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
1889+
}
1890+
int lmin = MAX( 0, Ls[j] - 2);
1891+
int lmax = MIN(63, Ls[j] + 2);
1892+
int mmin = MAX( 0, Lm[j] - 2);
1893+
int mmax = MIN(63, Lm[j] + 2);
1894+
float best_score = INFINITY;
1895+
for (int il = lmin; il <= lmax; ++il) {
1896+
float dl = d_block * il;
1897+
float idl = dl ? 1/dl : 0.f;
1898+
for (int im = mmin; im <= mmax; ++im) {
1899+
float dm = m_block * im;
1900+
float score = 0;
1901+
for (int ii = 0; ii < 32; ++ii) {
1902+
int q = nearest_int((x[32*j + ii] + dm)*idl);
1903+
q = MAX(0, MIN(31, q));
1904+
float diff = dl * q - dm - x[32*j + ii];
1905+
score += weights[ii] * diff * diff;
1906+
}
1907+
if (score < best_score) {
1908+
best_score = score;
1909+
Ls[j] = il; Lm[j] = im;
1910+
}
1911+
}
1912+
}
18021913
uint8_t ls = Ls[j];
18031914
uint8_t lm = Lm[j];
1804-
ls = MIN(63, ls);
1805-
lm = MIN(63, lm);
18061915
if (j < 4) {
18071916
y[i].scales[j] = ls;
18081917
y[i].scales[j+4] = lm;
@@ -1818,9 +1927,9 @@ static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * G
18181927
uint8_t sc, m;
18191928
for (int j = 0; j < QK_K/32; ++j) {
18201929
get_scale_min_k4(j, y[i].scales, &sc, &m);
1821-
const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
1930+
const float d = d_block * sc;
18221931
if (!d) continue;
1823-
const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
1932+
const float dm = m_block * m;
18241933
for (int ii = 0; ii < 32; ++ii) {
18251934
int l = nearest_int((x[32*j + ii] + dm)/d);
18261935
l = MAX(0, MIN(31, l));

0 commit comments

Comments
 (0)