@@ -1176,7 +1176,7 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G
11761176 memset (sw , 0 , QK_K /16 * sizeof (float ));
11771177 float sumx2 = 0 ;
11781178 for (int j = 0 ; j < QK_K ; ++ j ) sumx2 += x [j ]* x [j ];
1179- float sigma2 = 0.75f * sumx2 /QK_K ;
1179+ float sigma2 = 0.5f * sumx2 /QK_K ;
11801180 for (int j = 0 ; j < QK_K /16 ; ++ j ) {
11811181 const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16 * j ;
11821182 for (int l = 0 ; l < 16 ; ++ l ) weight [l ] = qw [l ] * sqrtf (sigma2 + x [16 * j + l ]* x [16 * j + l ]);
@@ -1191,6 +1191,30 @@ static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * G
11911191 y [i ].dmin = GGML_FP32_TO_FP16 (mm );
11921192
11931193 for (int j = 0 ; j < QK_K /16 ; ++ j ) {
1194+ const float * restrict qw = quant_weights + QK_K * i + 16 * j ;
1195+ for (int l = 0 ; l < 16 ; ++ l ) weight [l ] = qw [l ] * sqrtf (sigma2 + x [16 * j + l ]* x [16 * j + l ]);
1196+ int lmin = MAX (Ls [j ]- 1 , 0 );
1197+ int lmax = MIN (Ls [j ]+ 1 ,15 );
1198+ int mmin = MAX (Lm [j ]- 1 , 0 );
1199+ int mmax = MIN (Lm [j ]+ 1 ,15 );
1200+ float best_score = INFINITY ;
1201+ for (int il = lmin ; il <= lmax ; ++ il ) {
1202+ float d = dm * il ;
1203+ float id = d ? 1 /d : 0.f ;
1204+ for (int im = mmin ; im <= mmax ; ++ im ) {
1205+ float m = mm * im ;
1206+ float score = 0 ;
1207+ for (int ii = 0 ; ii < 16 ; ++ ii ) {
1208+ int q = nearest_int ((x [16 * j + ii ] + m )* id );
1209+ q = MAX (0 , MIN (3 , q ));
1210+ float diff = d * q - m - x [16 * j + ii ];
1211+ score += weight [ii ] * diff * diff ;
1212+ }
1213+ if (score < best_score ) {
1214+ best_score = score ; Ls [j ] = il ; Lm [j ] = im ;
1215+ }
1216+ }
1217+ }
11941218 float d = dm * Ls [j ];
11951219 float m = mm * Lm [j ];
11961220 float id = d ? 1 /d : 0.f ;
@@ -1393,6 +1417,30 @@ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * G
13931417
13941418 float d_block = make_qx_quants (QK_K /16 , 32 , scales , Ls , 1 , sw );
13951419 for (int j = 0 ; j < QK_K /16 ; ++ j ) {
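1420+ // Disabled experiment (kept below for reference): re-searching each sub-block scale over Ls[j]-1..Ls[j]+1 by weighted squared error did not help.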
1421+ //if (quant_weights) {
1422+ // const float * qw = quant_weights + QK_K * i + 16*j;
1423+ // for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
1424+ //} else {
1425+ // for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
1426+ //}
1427+ //int lmin = MAX( 0, Ls[j]-1);
1428+ //int lmax = MIN(63, Ls[j]+1);
1429+ //float best_score = INFINITY;
1430+ //for (int ls = lmin; ls <= lmax; ++ls) {
1431+ // float dl = d_block * (ls - 32);
1432+ // float idl = dl ? 1/dl : 0.f;
1433+ // float score = 0;
1434+ // for (int ii = 0; ii < 16; ++ii) {
1435+ // int q = nearest_int(idl*x[16*j + ii]);
1436+ // q = MAX(-4, MIN(3, q));
1437+ // float diff = dl*q - x[16*j + ii];
1438+ // score += weight[ii] * diff * diff;
1439+ // }
1440+ // if (score < best_score) {
1441+ // best_score = score; Ls[j] = ls;
1442+ // }
1443+ //}
13961444 int l = Ls [j ];
13971445 if (j < 8 ) {
13981446 y [i ].scales [j ] = l & 0xF ;
@@ -1408,7 +1456,8 @@ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * G
14081456 for (int j = 0 ; j < QK_K /16 ; ++ j ) {
14091457 sc = j < 8 ? y [i ].scales [j ] & 0xF : y [i ].scales [j - 8 ] >> 4 ;
14101458 sc = (sc | (((y [i ].scales [8 + j %4 ] >> (2 * (j /4 ))) & 3 ) << 4 )) - 32 ;
1411- float d = GGML_FP16_TO_FP32 (y [i ].d ) * sc ;
1459+ //float d = GGML_FP16_TO_FP32(y[i].d) * sc;
1460+ float d = d_block * sc ;
14121461 if (!d ) {
14131462 continue ;
14141463 }
@@ -1438,6 +1487,8 @@ static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * G
14381487 }
14391488 }
14401489
1490+ y [i ].d = GGML_FP32_TO_FP16 (1.015f * d_block );
1491+
14411492 x += QK_K ;
14421493 }
14431494}
@@ -1592,6 +1643,35 @@ static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * G
15921643 float d_block = make_qp_quants (QK_K /32 , 63 , scales , Ls , sw );
15931644 float m_block = make_qp_quants (QK_K /32 , 63 , mins , Lm , sw );
15941645 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
1646+ if (quant_weights ) {
1647+ const float * qw = quant_weights + QK_K * i + 32 * j ;
1648+ for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = qw [l ] * sqrtf (sigma2 + x [32 * j + l ]* x [32 * j + l ]);
1649+ } else {
1650+ for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = av_x + fabsf (x [32 * j + l ]);
1651+ }
1652+ int lmin = MAX ( 0 , Ls [j ] - 1 );
1653+ int lmax = MIN (63 , Ls [j ] + 1 );
1654+ int mmin = MAX ( 0 , Lm [j ] - 1 );
1655+ int mmax = MIN (63 , Lm [j ] + 1 );
1656+ float best_score = INFINITY ;
1657+ for (int il = lmin ; il <= lmax ; ++ il ) {
1658+ float dl = d_block * il ;
1659+ float idl = dl ? 1 /dl : 0.f ;
1660+ for (int im = mmin ; im <= mmax ; ++ im ) {
1661+ float dm = m_block * im ;
1662+ float score = 0 ;
1663+ for (int ii = 0 ; ii < 32 ; ++ ii ) {
1664+ int q = nearest_int ((x [32 * j + ii ] + dm )* idl );
1665+ q = MAX (0 , MIN (15 , q ));
1666+ float diff = dl * q - dm - x [32 * j + ii ];
1667+ score += weights [ii ] * diff * diff ;
1668+ }
1669+ if (score < best_score ) {
1670+ best_score = score ;
1671+ Ls [j ] = il ; Lm [j ] = im ;
1672+ }
1673+ }
1674+ }
15951675 uint8_t ls = Ls [j ];
15961676 uint8_t lm = Lm [j ];
15971677 if (j < 4 ) {
@@ -1609,9 +1689,11 @@ static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * G
16091689 uint8_t sc , m ;
16101690 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
16111691 get_scale_min_k4 (j , y [i ].scales , & sc , & m );
1612- const float d = GGML_FP16_TO_FP32 (y [i ].d ) * sc ;
1692+ //const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
1693+ const float d = d_block * sc ;
16131694 if (!d ) continue ;
1614- const float dm = GGML_FP16_TO_FP32 (y [i ].dmin ) * m ;
1695+ //const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
1696+ const float dm = m_block * m ;
16151697 for (int ii = 0 ; ii < 32 ; ++ ii ) {
16161698 int l = nearest_int ((x [32 * j + ii ] + dm )/d );
16171699 l = MAX (0 , MIN (15 , l ));
@@ -1799,10 +1881,37 @@ static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * G
17991881 float m_block = make_qp_quants (QK_K /32 , 63 , mins , Lm , sw );
18001882
18011883 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
1884+ if (quant_weights ) {
1885+ const float * qw = quant_weights + QK_K * i + 32 * j ;
1886+ for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = qw [l ] * sqrtf (sigma2 + x [32 * j + l ]* x [32 * j + l ]);
1887+ } else {
1888+ for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = av_x + fabsf (x [32 * j + l ]);
1889+ }
1890+ int lmin = MAX ( 0 , Ls [j ] - 2 );
1891+ int lmax = MIN (63 , Ls [j ] + 2 );
1892+ int mmin = MAX ( 0 , Lm [j ] - 2 );
1893+ int mmax = MIN (63 , Lm [j ] + 2 );
1894+ float best_score = INFINITY ;
1895+ for (int il = lmin ; il <= lmax ; ++ il ) {
1896+ float dl = d_block * il ;
1897+ float idl = dl ? 1 /dl : 0.f ;
1898+ for (int im = mmin ; im <= mmax ; ++ im ) {
1899+ float dm = m_block * im ;
1900+ float score = 0 ;
1901+ for (int ii = 0 ; ii < 32 ; ++ ii ) {
1902+ int q = nearest_int ((x [32 * j + ii ] + dm )* idl );
1903+ q = MAX (0 , MIN (31 , q ));
1904+ float diff = dl * q - dm - x [32 * j + ii ];
1905+ score += weights [ii ] * diff * diff ;
1906+ }
1907+ if (score < best_score ) {
1908+ best_score = score ;
1909+ Ls [j ] = il ; Lm [j ] = im ;
1910+ }
1911+ }
1912+ }
18021913 uint8_t ls = Ls [j ];
18031914 uint8_t lm = Lm [j ];
1804- ls = MIN (63 , ls );
1805- lm = MIN (63 , lm );
18061915 if (j < 4 ) {
18071916 y [i ].scales [j ] = ls ;
18081917 y [i ].scales [j + 4 ] = lm ;
@@ -1818,9 +1927,9 @@ static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * G
18181927 uint8_t sc , m ;
18191928 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
18201929 get_scale_min_k4 (j , y [i ].scales , & sc , & m );
1821- const float d = GGML_FP16_TO_FP32 ( y [ i ]. d ) * sc ;
1930+ const float d = d_block * sc ;
18221931 if (!d ) continue ;
1823- const float dm = GGML_FP16_TO_FP32 ( y [ i ]. dmin ) * m ;
1932+ const float dm = m_block * m ;
18241933 for (int ii = 0 ; ii < 32 ; ++ ii ) {
18251934 int l = nearest_int ((x [32 * j + ii ] + dm )/d );
18261935 l = MAX (0 , MIN (31 , l ));
0 commit comments