@@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
596596 return new_size;
597597}
598598
599- // Returns per-tensor type overrides to meet target BPW at lowest error
599+ // Returns tensor type overrides to meet a global bpw target
600600static std::unordered_map<std::string, ggml_type> target_bpw_type (
601601 llama_model_loader & ml,
602602 std::vector<no_init<uint8_t >> & buffer,
@@ -650,6 +650,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
650650 };
651651
652652 constexpr double epsilon = 1e-12 ;
653+ constexpr double infinity = std::numeric_limits<double >::infinity ();
653654
654655 auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
655656 const int64_t n_per_row = t->ne [0 ];
@@ -680,7 +681,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
680681
681682 auto name_tn = LLM_TN (model.arch );
682683 auto can_quantize = [&](const ggml_tensor * t) -> bool {
683- // This list should be kept in sync with llama_tensor_quantize_impl()
684+ // This list should be kept in sync with llama_tensor_quantize_impl() to avoid drift
684685 const std::string name = ggml_get_name (t);
685686 bool q = name.rfind (" weight" ) == name.size () - 6 ;
686687 q &= ggml_n_dims (t) >= 2 ;
@@ -730,9 +731,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
730731 const int64_t ne2 = t->ne [2 ] > 0 ? t->ne [2 ] : 1 ;
731732
732733 const size_t sample_element_count = f32_sample.size ();
733- const size_t sample_row_count = sample_element_count / (size_t )n_per_row;
734+ const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t )n_per_row : 0 ;
734735 if (sample_row_count == 0 ) { return 0.0 ; }
735736
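            // Sanity check: the per-slice sample counts must add up to the number of sampled rows;
            // if they do not, return an infinite error so this candidate type is effectively rejected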
737+ size_t expected_rows = 0 ;
738+ for (int64_t s = 0 ; s < ne2; ++s) {
739+ expected_rows += (size_t )sample_rows_per_slice[s];
740+ }
741+ if (expected_rows != sample_row_count) { return infinity; }
742+
736743 const size_t row_sz = ggml_row_size (quant_type, n_per_row);
737744 const size_t buffer_sz = row_sz * sample_row_count;
738745
@@ -750,15 +757,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
750757 const float * activations = activations_sample + s * n_per_row;
751758 double denom = 0.0 ;
752759 for (int64_t j = 0 ; j < n_per_row; ++j) {
760+                    const double w = values ? std::max (0.0f, values[j]) : 1.0 ;
753761 const double a = activations[j];
754- const double w = values ? values[j] : 1.0 ;
755762 denom += w * a * a;
756763 }
757764 bias_denominator_per_slice[s] = denom;
758765 }
759766 }
760767
761-            // Compute per-row squared norms with weighting (if values are provided)
768+            // Per-row squared norms with weighting
762769 std::vector<double > row_sq_norm (sample_row_count, 0.0 );
763770 {
764771 size_t offset = 0 ;
@@ -768,15 +775,14 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
768775 if (rs == 0 ) { continue ; }
769776
770777 const float * values = has_values ? values_sample + s * n_per_row : nullptr ;
771-
772778 for (int64_t r = 0 ; r < rs; ++r, ++row_idx) {
773779 const float * x = f32_sample.data () + offset;
774780 double rsn = 0.0 ;
775781 if (values) {
776782 for (int64_t j = 0 ; j < n_per_row; ++j) {
777- const double v = values[j];
783+                        const double w = std::max (0.0f, values[j]);
778784 const double xx = x[j];
779- rsn += v * xx * xx;
785+ rsn += w * xx * xx;
780786 }
781787 } else {
782788 for (int64_t j = 0 ; j < n_per_row; ++j) {
@@ -790,7 +796,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
790796 }
791797 }
792798
793- // Quantize sampled rows slice-by-slice into quantized_buffer
799+ // Quantize sampled rows per slice -> quantized_buffer
794800 {
795801 size_t q_offset = 0 ;
796802 size_t f_offset = 0 ;
@@ -800,70 +806,66 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
800806
801807 const float * value = has_values ? values_sample + slice * n_per_row : nullptr ;
802808 (void )ggml_quantize_chunk (quant_type, f32_sample.data () + f_offset, quantized_buffer.data () + q_offset, 0 , rs, n_per_row, value);
803-
804809 q_offset += row_sz * (size_t )rs;
805810 f_offset += (size_t )rs * (size_t )n_per_row;
806811 }
807812 }
808813
809- // Dequantize into dequantized_buffer
814+ // quantized_buffer -> dequantized_buffer
810815 {
811816 const ggml_type_traits * traits = ggml_get_type_traits (quant_type);
812- auto row_to_float = [&](size_t r) {
813- uint8_t * src = quantized_buffer.data () + r * row_sz;
814- float * dst = dequantized_buffer.data () + r * (size_t )n_per_row;
815- if (quant_type == GGML_TYPE_F16) {
816- ggml_fp16_to_fp32_row ((const ggml_fp16_t *)src, dst, (int )n_per_row);
817- } else if (quant_type == GGML_TYPE_BF16) {
818- ggml_bf16_to_fp32_row ((const ggml_bf16_t *)src, dst, (int )n_per_row);
819- } else {
820- if (!traits || !traits->to_float ) {
821- LLAMA_LOG_WARN (" %s: unsupported quantization type %s\n " , __func__, ggml_type_name (quant_type));
822- return false ;
817+
818+ const bool is_fp16 = quant_type == GGML_TYPE_F16;
819+ const bool is_bf16 = quant_type == GGML_TYPE_BF16;
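            // Fast path: convert all sampled rows with a single to_float call; FP16/BF16 fall back to
            // per-row conversion, and a type without a to_float handler yields an infinite error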
820+ if (!is_fp16 && !is_bf16 && traits && traits->to_float ) {
821+ traits->to_float (quantized_buffer.data (), dequantized_buffer.data (), (int )(sample_row_count * (size_t )n_per_row));
822+ } else {
823+ for (size_t r = 0 ; r < sample_row_count; ++r) {
824+ uint8_t * src = quantized_buffer.data () + r * row_sz;
825+ float * dst = dequantized_buffer.data () + r * (size_t ) n_per_row;
826+ if (is_fp16) {
827+ ggml_fp16_to_fp32_row ((const ggml_fp16_t *) src, dst, (int )n_per_row);
828+ } else if (is_bf16) {
829+ ggml_bf16_to_fp32_row ((const ggml_bf16_t *) src, dst, (int )n_per_row);
830+ } else {
831+ if (!traits || !traits->to_float ) { return infinity; }
832+ traits->to_float (src, dst, (int )n_per_row);
823833 }
824- traits->to_float (src, dst, (int )n_per_row);
825834 }
826-
827- return true ;
828- };
829-
830- for (size_t r = 0 ; r < sample_row_count; ++r) {
831- if (!row_to_float (r)) { return 1e35 ; }
832835 }
833836 }
834837
835838 // Compute error
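            // Per sampled row: err = (sum(w*e^2) + bias_lambda * (sum(w*e*a))^2 / (sum(w*a^2) + eps)) / (sum(w*x^2) + eps),
            // with e = dequantized - original and w = 1 when no imatrix values are available;
            // slice sums are then scaled from the sampled rows up to the full row count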
836839 size_t offset = 0 ;
837840 size_t row_idx = 0 ;
838841 double total_err = 0.0 ;
842+
839843 for (int64_t slice = 0 ; slice < ne2; ++slice) {
840844 const int64_t rs = sample_rows_per_slice[slice];
841845 if (rs == 0 ) { continue ; }
842846
843847 const float * values = has_values ? values_sample + slice * n_per_row : nullptr ;
844848 const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr ;
845849 const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0 ;
846-
847850 double slice_err = 0.0 ;
848-
849851 for (int64_t r = 0 ; r < rs; ++r, ++row_idx) {
850852 const float * x = f32_sample.data () + offset;
851853 const float * y = dequantized_buffer.data () + offset;
852854 double weighted_mse = 0.0 ;
853855 double bias_num = 0.0 ;
854856 if (values && activations) {
855857 for (int64_t j = 0 ; j < n_per_row; ++j) {
856- const double v = values[j];
858+                        const double w = std::max (0.0f, values[j]);
857859 const double e = y[j] - x[j];
858860 const double a = activations[j];
859- weighted_mse += v * e * e;
860- bias_num += v * e * a;
861+ weighted_mse += w * e * e;
862+ bias_num += w * e * a;
861863 }
862864 } else if (values) {
863865 for (int64_t j = 0 ; j < n_per_row; ++j) {
864- const double v = values[j];
866+                        const double w = std::max (0.0f, values[j]);
865867 const double e = y[j] - x[j];
866- weighted_mse += v * e * e;
868+ weighted_mse += w * e * e;
867869 }
868870 } else if (activations) {
869871 for (int64_t j = 0 ; j < n_per_row; ++j) {
@@ -881,26 +883,28 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
881883
882884 double err_num = weighted_mse;
883885 if (activations && bias_lambda != 0 .0f ) {
884- const double proj = bias_num * bias_num / (bias_denom + epsilon);
885- err_num += (double )bias_lambda * proj;
886+ if (bias_denom > 0.0 ) {
887+ const double proj = bias_num * bias_num / (bias_denom + epsilon);
888+ err_num += bias_lambda * proj;
889+ }
886890 }
887891
888- const double err_den = row_sq_norm[row_idx] + epsilon;
889- slice_err += err_num / err_den ;
892+ const double denom = row_sq_norm[row_idx] + epsilon;
893+ slice_err += err_num / denom ;
890894 offset += (size_t )n_per_row;
891895 }
892896
893897 const double scale_rows = (double )nrows / std::max (1.0 , (double )rs);
894898 total_err += slice_err * scale_rows;
899+ if (!std::isfinite (total_err)) { return infinity; }
895900 }
896901
897- return std::isfinite (total_err) ? total_err : 1e35 ;
902+ return std::isfinite (total_err) ? total_err : infinity ;
898903 };
899904
905+ // Scaling factor to increase lambda when activations are concentrated
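    // dominance = ||sqrt(w).*a||_2 / (sqrt(sum(w)) * RMS(a)), clamped to [0.5, 2.0]; w = 1 without values, returns 1.0 without activations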
900906 auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) {
901907 if (!activations) { return 1 .0f ; }
902- // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v)))
903- // If no values, use v=1
904908 double sum_v = 0.0 ;
905909 double sum_aw2 = 0.0 ;
906910 double sum_a2 = 0.0 ;
@@ -915,24 +919,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
915919 const double denom = std::sqrt (std::max (epsilon, sum_v)) * std::max (epsilon, rms_a);
916920 const double scale = denom > 0.0 ? std::sqrt (sum_aw2) / denom : 1.0 ;
917921
918- // Clamp to a reasonable range
919922 return (float )std::clamp (scale, 0.5 , 2.0 );
920923 };
921924
922- // Returns an adaptive lambda for this tensor using a small probe set
923- // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE
924- // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger
925+    // Higher precision, but takes much longer to compute
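    // Quantizes a small probe set of mid-range types (Q4_K/Q3_K/Q5_K when compatible), takes the median of the
    // per-probe ratios as lambda, then applies directional_scale and clamps the result to [0, 8]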
925926 auto precise_lambda = [&](const ggml_tensor * t,
926927 const std::vector<float > & f32_sample,
927928 const std::vector<int64_t > & sample_rows_per_slice,
928929 const float * values,
929930 const float * activations,
930931 const std::vector<ggml_type> & compatible_candidates) -> float
931932 {
932- // No activations => no projection term
933933 if (!activations) { return 0 .0f ; }
934934
935- // pick a tiny probe set: try to spread around mid-range types
936935 std::vector<ggml_type> probes;
937936 probes.reserve (3 );
938937 auto push_if = [&](const ggml_type tiny) {
@@ -941,7 +940,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
941940 }
942941 };
943942
944- // Prefer family-consistent probes; fall back to whatever exists
945943 push_if (GGML_TYPE_Q4_K);
946944 push_if (GGML_TYPE_Q3_K);
947945 push_if (GGML_TYPE_Q5_K);
@@ -953,19 +951,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
953951 }
954952 if (probes.empty ()) { return 0 .0f ; }
955953
956- // Scratch buffers (reused)
954+ // Scratch buffers
957955 const int64_t n_per_row = t->ne [0 ];
958956 const size_t total_sampled_rows = f32_sample.size () / n_per_row;
959957 size_t max_row_sz = 0 ;
960958 for (auto pt : probes) {
961959 max_row_sz = std::max (max_row_sz, ggml_row_size (pt, n_per_row));
962960 }
961+
963962 std::vector<uint8_t > quantized_buffer (max_row_sz * total_sampled_rows);
964963 std::vector<float > dequantized_buffer (f32_sample.size ());
965-
966964 std::vector<double > ratios;
967965 ratios.reserve (probes.size ());
968-
969966 for (const auto pt : probes) {
970967 // err at lambda=0 => pure weighted MSE part
971968 double err0 = estimate_error (t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0 .0f );
@@ -984,17 +981,17 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
984981 std::nth_element (ratios.begin (), ratios.begin () + ratios.size () / 2 , ratios.end ());
985982 double lambda = ratios[ratios.size () / 2 ];
986983
987- // activations directional scale
988984 const float scale = directional_scale (values, activations, n_per_row);
989985 lambda *= scale;
990-
991- // clamp to safe range
992986 lambda = std::clamp (lambda, 0.0 , 8.0 );
987+
993988 return (float )lambda;
994989 };
995990
991+    // Faster to compute but less precise; the best option for the vast majority of models
996992 auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) {
997993 if (!activations) { return 0 .0f ; }
994+
998995 double s = 0.0 ;
999996 double s2 = 0.0 ;
1000997 for (int64_t j = 0 ; j < n_per_row; ++j) {
@@ -1004,17 +1001,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
10041001 s += aw2;
10051002 s2 += aw2 * aw2;
10061003 }
1004+
10071005 if (s2 <= 0.0 ) { return 0 .0f ; }
10081006 const auto d = (double )n_per_row;
1009- // const double p = s * s / (d * s2 + epsilon);
1010- // const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0);
1011- // Map p in (0,1] to lambda in [0,8] decreasing
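            // base = 1 - (sum(w*a^2))^2 / (n * sum((w*a^2)^2)): ~0 when the weighted activation energy is spread
            // evenly across the row, ~1 when it is concentrated in a few channels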
10121007 double base = 1.0 - s * s / (d * s2 + epsilon);
10131008 base = std::clamp (base, 0.0 , 1.0 );
10141009
1015- // activations directional scale
10161010 const double scale = directional_scale (values, activations, n_per_row);
1017- // clamp to safe range
10181011 const double lambda = std::clamp (base * scale, 0.0 , 1.0 ) * 8.0 ;
10191012
10201013 return (float )lambda;
@@ -1036,13 +1029,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
10361029 }
10371030 ml.load_data_for (t);
10381031
1039- // Dequantize only sampled rows into f32_sample
1032+ // Dequantize sampled rows into f32_sample
10401033 const int64_t n_per_row = t->ne [0 ];
10411034 const int64_t nrows_total = t->ne [1 ];
10421035 const int64_t ne2 = t->ne [2 ] > 0 ? t->ne [2 ] : 1 ;
10431036
1044- // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute
1045- constexpr int sample_rows_per_expert = 384 ;
1037+        // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take much longer to compute
1038+ constexpr int sample_rows_per_expert = 256 ;
10461039 std::vector<float > f32_sample;
10471040 f32_sample.reserve ((size_t )ne2 * (size_t )std::min<int64_t >(nrows_total, sample_rows_per_expert) * (size_t )n_per_row);
10481041
@@ -1096,6 +1089,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
10961089 const std::string key = remap_imatrix (tensor_name, mapped);
10971090 const auto it = m->find (key);
10981091 if (it == m->end ()) { return {nullptr , 0 }; }
1092+
10991093 return { it->second .data (), it->second .size () };
11001094 };
11011095
@@ -1104,7 +1098,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
11041098 const size_t want = (size_t )ne2 * (size_t )n_per_row;
11051099 dst.clear ();
11061100 if (!src || src_sz == 0 ) { return ; }
1107-
11081101 if (src_sz == want) {
11091102 dst.resize (want);
11101103 std::memcpy (dst.data (), src, want * sizeof (float ));
@@ -1160,7 +1153,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
11601153 std::sort (compatible_candidates.begin (), compatible_candidates.end ());
11611154 compatible_candidates.erase (std::unique (compatible_candidates.begin (), compatible_candidates.end ()), compatible_candidates.end ());
11621155
1163- // Compute adaptive bias_lambda for this tensor
1156+ // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE.
1157+        // Larger values favour quantisation types that produce smaller bias even if the MSE is slightly larger
11641158 float bias_lambda = 0 .0f ;
11651159 {
11661160 const float * values = values_sample.empty () ? nullptr : values_sample.data ();