@@ -2381,45 +2381,38 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
23812381
23822382 uint8_t L [QK_K ];
23832383 uint8_t Laux [32 ];
2384+ uint8_t Ls [QK_K /32 ];
2385+ uint8_t Lm [QK_K /32 ];
23842386 float weights [32 ];
2385- float mins [QK_K /32 ];
2386- float scales [QK_K /32 ];
2387+ float sw [QK_K /32 ];
2388+ float mins [QK_K /32 ];
2389+ float scales [QK_K /32 ];
23872390
23882391 for (int i = 0 ; i < nb ; i ++ ) {
23892392
23902393 float sum_x2 = 0 ;
23912394 for (int l = 0 ; l < QK_K ; ++ l ) sum_x2 += x [l ] * x [l ];
2392- float sigma2 = sum_x2 /QK_K ;
2395+ float sigma2 = 2 * sum_x2 /QK_K ;
23932396 float av_x = sqrtf (sigma2 );
23942397
2395- float max_scale = 0 ; // as we are deducting the min, scales are always positive
2396- float max_min = 0 ;
23972398 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
23982399 if (quant_weights ) {
23992400 const float * qw = quant_weights + QK_K * i + 32 * j ;
24002401 for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = qw [l ] * sqrtf (sigma2 + x [32 * j + l ]* x [32 * j + l ]);
24012402 } else {
24022403 for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = av_x + fabsf (x [32 * j + l ]);
24032404 }
2405+ float sumw = 0 ;
2406+ for (int l = 0 ; l < 32 ; ++ l ) sumw += weights [l ];
2407+ sw [j ] = sumw ;
24042408 scales [j ] = make_qkx3_quants (32 , 15 , x + 32 * j , weights , L + 32 * j , & mins [j ], Laux , -0.9f , 0.05f , 36 , false);
2405- //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
2406- float scale = scales [j ];
2407- if (scale > max_scale ) {
2408- max_scale = scale ;
2409- }
2410- float min = mins [j ];
2411- if (min > max_min ) {
2412- max_min = min ;
2413- }
24142409 }
24152410
2416- float inv_scale = max_scale > 0 ? 63.f / max_scale : 0.f ;
2417- float inv_min = max_min > 0 ? 63.f / max_min : 0.f ;
2411+ float d_block = make_qp_quants ( QK_K / 32 , 63 , scales , Ls , sw ) ;
2412+ float m_block = make_qp_quants ( QK_K / 32 , 63 , mins , Lm , sw ) ;
24182413 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
2419- uint8_t ls = nearest_int (inv_scale * scales [j ]);
2420- uint8_t lm = nearest_int (inv_min * mins [j ]);
2421- ls = MIN (63 , ls );
2422- lm = MIN (63 , lm );
2414+ uint8_t ls = Ls [j ];
2415+ uint8_t lm = Lm [j ];
24232416 if (j < 4 ) {
24242417 y [i ].scales [j ] = ls ;
24252418 y [i ].scales [j + 4 ] = lm ;
@@ -2429,8 +2422,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
24292422 y [i ].scales [j - 0 ] |= ((lm >> 4 ) << 6 );
24302423 }
24312424 }
2432- y [i ].d = GGML_FP32_TO_FP16 (max_scale / 63.f );
2433- y [i ].dmin = GGML_FP32_TO_FP16 (max_min / 63.f );
2425+ y [i ].d = GGML_FP32_TO_FP16 (d_block );
2426+ y [i ].dmin = GGML_FP32_TO_FP16 (m_block );
24342427
24352428 uint8_t sc , m ;
24362429 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
@@ -2688,43 +2681,41 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
26882681 const int nb = n_per_row / QK_K ;
26892682
26902683 uint8_t L [QK_K ];
2691- float mins [QK_K /32 ];
2692- float scales [QK_K /32 ];
2693- float weights [32 ];
26942684 uint8_t Laux [32 ];
2685+ uint8_t Ls [QK_K /32 ];
2686+ uint8_t Lm [QK_K /32 ];
2687+ float mins [QK_K /32 ];
2688+ float scales [QK_K /32 ];
2689+ float sw [QK_K /32 ];
2690+ float weights [32 ];
26952691
26962692 for (int i = 0 ; i < nb ; i ++ ) {
26972693
26982694 float sum_x2 = 0 ;
26992695 for (int l = 0 ; l < QK_K ; ++ l ) sum_x2 += x [l ] * x [l ];
2700- float sigma2 = sum_x2 /QK_K ;
2696+ float sigma2 = 2 * sum_x2 /QK_K ;
27012697 float av_x = sqrtf (sigma2 );
27022698
2703- float max_scale = 0 ; // as we are deducting the min, scales are always positive
2704- float max_min = 0 ;
27052699 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
27062700 if (quant_weights ) {
27072701 const float * qw = quant_weights + QK_K * i + 32 * j ;
27082702 for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = qw [l ] * sqrtf (sigma2 + x [32 * j + l ]* x [32 * j + l ]);
27092703 } else {
27102704 for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = av_x + fabsf (x [32 * j + l ]);
27112705 }
2706+ float sumw = 0 ;
2707+ for (int l = 0 ; l < 32 ; ++ l ) sumw += weights [l ];
2708+ sw [j ] = sumw ;
2709+
27122710 scales [j ] = make_qkx3_quants (32 , 31 , x + 32 * j , weights , L + 32 * j , & mins [j ], Laux , -0.9f , 0.05f , 36 , false);
2713- float scale = scales [j ];
2714- if (scale > max_scale ) {
2715- max_scale = scale ;
2716- }
2717- float min = mins [j ];
2718- if (min > max_min ) {
2719- max_min = min ;
2720- }
27212711 }
27222712
2723- float inv_scale = max_scale > 0 ? 63.f /max_scale : 0.f ;
2724- float inv_min = max_min > 0 ? 63.f /max_min : 0.f ;
2713+ float d_block = make_qp_quants (QK_K /32 , 63 , scales , Ls , sw );
2714+ float m_block = make_qp_quants (QK_K /32 , 63 , mins , Lm , sw );
2715+
27252716 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
2726- uint8_t ls = nearest_int ( inv_scale * scales [j ]) ;
2727- uint8_t lm = nearest_int ( inv_min * mins [j ]) ;
2717+ uint8_t ls = Ls [j ];
2718+ uint8_t lm = Lm [j ];
27282719 ls = MIN (63 , ls );
27292720 lm = MIN (63 , lm );
27302721 if (j < 4 ) {
@@ -2736,8 +2727,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
27362727 y [i ].scales [j - 0 ] |= ((lm >> 4 ) << 6 );
27372728 }
27382729 }
2739- y [i ].d = GGML_FP32_TO_FP16 (max_scale / 63.f );
2740- y [i ].dmin = GGML_FP32_TO_FP16 (max_min / 63.f );
2730+ y [i ].d = GGML_FP32_TO_FP16 (d_block );
2731+ y [i ].dmin = GGML_FP32_TO_FP16 (m_block );
27412732
27422733 uint8_t sc , m ;
27432734 for (int j = 0 ; j < QK_K /32 ; ++ j ) {
0 commit comments