@@ -868,8 +868,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t row_idx = 0;
         double total_mse = 0.0;
         double total_proj = 0.0;
+        double total_bias = 0.0;
         for (int64_t slice = 0; slice < ne2; ++slice) {
-            const int64_t rs = sample_rows_per_slice[slice];
+            const int64_t rs = rows_sample[slice];
             if (rs == 0) { continue; }

             const float * values = has_values ? values_sample + slice * n_per_row : nullptr;
@@ -918,21 +919,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     }
                     row_proj_norm.push_back(p_norm);
                 }
+
                 offset += (size_t)n_per_row;
             }

             // Trimmed sum to avoid outlier rows dominating the results
             auto trimmed_sum = [&](std::vector<double> & v) -> double {
                 if (v.empty()) { return 0.0; }
+
                 const int64_t n = (int64_t)v.size();
                 if (n < 50) {
                     double s = 0.0;
                     for (const double z : v) { s += z; }
+
                     return s;
                 }

-                int64_t k = (int64_t) std::floor(0.02 * (double)n);      // trim 2% on each side
-                k = std::max<int64_t>(0, std::min<int64_t>(k, n / 32));  // but not more than 3.125%
+                int64_t k = (int64_t)std::floor(0.02 * (double)n);       // trim 2% each side
+                k = std::max<int64_t>(0, std::min<int64_t>(k, n / 32));  // cap at ~3.125%
                 std::nth_element(v.begin(), v.begin() + k, v.end());
                 std::nth_element(v.begin() + k, v.begin() + (n - k), v.end());
                 double s = 0.0;
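
The two `nth_element` calls above implement a classic trimmed sum: partition the `k` smallest values to the front and the `k` largest to the back, then sum only the middle. A minimal standalone sketch of the same idea, for context (the function name is illustrative, not the PR's code):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Sum v while ignoring the k smallest and k largest entries
// (k = 2% of n, capped at n/32). Takes v by value so the caller's
// data is not reordered by the partitioning.
static double trimmed_sum_sketch(std::vector<double> v) {
    const int64_t n = (int64_t)v.size();
    if (n == 0) { return 0.0; }
    int64_t k = (int64_t)(0.02 * (double)n);                       // trim 2% per side
    k = std::min(k, n / 32);                                       // cap at ~3.125% per side
    std::nth_element(v.begin(), v.begin() + k, v.end());           // k smallest at the front
    std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); // k largest at the back
    double s = 0.0;
    for (int64_t i = k; i < n - k; ++i) { s += v[i]; }             // sum the untrimmed middle
    return s;
}
```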
@@ -944,11 +948,17 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             };

             const double scale_rows = (double)nrows / std::max(1.0, (double)rs);
+            const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows;
+            const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0;

-            total_mse += trimmed_sum(row_mse_norm) * scale_rows;
-            if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; }
+            total_mse += slice_mse;
+            total_proj += slice_proj;

-            if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) {
+            // Use the per-slice lambda if provided, otherwise fall back to the tensor-wide scalar
+            const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda;
+            total_bias += bl * slice_proj;
+
+            if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) {
                 if (out_mse) { *out_mse = infinity; }
                 if (out_proj) { *out_proj = 0.0; }

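
To make the new scoring explicit: with a per-slice lambda array, each slice's bias projection is weighted individually before being added to the MSE, whereas the scalar path multiplies the pooled projection by one tensor-wide lambda, so the two scores diverge when lambdas vary across slices (experts). A toy illustration with made-up numbers:

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical per-slice outputs of the inner loop above.
    const std::vector<double> slice_mse    = {0.10, 0.20};
    const std::vector<double> slice_proj   = {0.05, 0.40};
    const std::vector<double> slice_lambda = {0.5,  4.0};  // second slice is more concentrated

    double total_mse = 0.0, total_proj = 0.0, total_bias = 0.0;
    for (size_t s = 0; s < slice_mse.size(); ++s) {
        total_mse  += slice_mse[s];
        total_proj += slice_proj[s];
        total_bias += slice_lambda[s] * slice_proj[s];     // per-slice weighting
    }
    const double tensor_lambda = (0.5 + 4.0) / 2.0;        // scalar fallback: mean of slice lambdas

    printf("per-slice score: %.4f\n", total_mse + total_bias);                 // 0.30 + 1.6250 = 1.9250
    printf("scalar score   : %.4f\n", total_mse + tensor_lambda * total_proj); // 0.30 + 1.0125 = 1.3125
    return 0;
}
```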
@@ -959,133 +969,76 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (out_mse) { *out_mse = total_mse; }
         if (out_proj) { *out_proj = total_proj; }

-        const double total_err = total_mse + bias_lambda * total_proj;
+        const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj;
+
         return std::isfinite(total_err) ? total_err : infinity;
     };

-    // Higher precision but longer to compute
-    auto precise_lambda = [&](const ggml_tensor * t,
-                              const std::vector<float> & f32_sample,
-                              const std::vector<int64_t> & sample_rows_per_slice,
-                              const float * values,
-                              const float * activations,
-                              const std::vector<ggml_type> & compatible_candidates) -> float
+    // Returns one lambda per slice, or 0.0 when there are no activations
+    auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector<float>
     {
-        if (!activations) { return 0.0f; }
-
-        std::vector<ggml_type> probes;
-        probes.reserve(3);
-        auto push_if = [&](const ggml_type tiny) {
-            if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) {
-                probes.push_back(tiny);
-            }
-        };
-
-        push_if(GGML_TYPE_Q3_K);
-        push_if(GGML_TYPE_Q4_K);
-        push_if(GGML_TYPE_Q5_K);
-        if (probes.empty() && !compatible_candidates.empty()) {
-            probes.push_back(compatible_candidates[compatible_candidates.size() / 2]);
-        }
-        if (probes.size() == 1 && compatible_candidates.size() >= 2) {
-            probes.push_back(compatible_candidates.front());
-        }
-        if (probes.empty()) { return 0.0f; }
-
-        // Scratch buffers
-        const int64_t n_per_row = t->ne[0];
-        const size_t total_sampled_rows = f32_sample.size() / n_per_row;
-        size_t max_row_sz = 0;
-        for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row));
-
-        std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
-        std::vector<float> dequantized_buffer(f32_sample.size());
-
-        std::vector<double> ratios;
-        ratios.reserve(probes.size());
-        for (const auto pt : probes) {
-            double m = 0.0;
-            double p = 0.0;
-            (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p);
-            if (p > epsilon && std::isfinite(m) && std::isfinite(p)) {
-                ratios.push_back(m / p);
-            }
-        }
-
-        if (ratios.empty()) { return 0.0f; }
-
-        std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end());
-        const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0);
-
-        return (float)lambda;
-    };
-
-    // Faster to compute but may yield lower precision. Best option for the vast majority of cases
-    auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) {
-        if (!activations) { return 0.0f; }
-
-        double accum = 0.0;
-        int ns = 0;
+        std::vector<float> lambdas(std::max<int64_t>(1, ne2), 0.0f);
+        if (!activations) { return lambdas; }

         for (int64_t s = 0; s < std::max<int64_t>(1, ne2); ++s) {
             const float * v = values ? values + s * n_per_row : nullptr;
             const float * a = activations + s * n_per_row;
-
             double s1 = 0.0;
             double s2 = 0.0;
             for (int64_t j = 0; j < n_per_row; ++j) {
-                const double w = v ? std::max(0.0f, v[j]) : 1.0;
+                const double w  = v ? std::max(0.0f, v[j]) : 1.0;
                 const double aw = std::sqrt(w) * a[j];
                 const double aw2 = aw * aw;
                 s1 += aw2;
                 s2 += aw2 * aw2;
             }

+            float l = 0.0f;
             if (s1 > 0.0) {
-                const double n = (double)n_per_row;
-                double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n);
+                const auto n = (double)n_per_row;
+                const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n);
                 double lambda = 8.0 * (c / (c + 1.0));
-                accum += std::clamp(lambda, 0.0, 8.0);
-                ++ns;
+                l = (float)std::clamp(lambda, 0.0, 12.0);
            }
-        }

-        if (ns == 0) { return 0.0f; }
+            lambdas[(size_t)s] = l;
+        }

-        return (float)(accum / ns);
+        return lambdas;
     };

     std::vector<tensor_info> all;
     all.reserve(tensors.size());
     for (const auto * tw : tensors) {
         std::vector<std::thread> workers;
         workers.reserve(std::max(1, nthread));
-        ggml_tensor * t = tw->tensor;
-        const std::string name = ggml_get_name(t);
-        if (!can_quantize(t)) { continue; }
+        ggml_tensor * tensor = tw->tensor;
+        const std::string name = ggml_get_name(tensor);
+        if (!can_quantize(tensor)) { continue; }

-        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
+        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor));
         if (!ml.use_mmap) {
-            if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); }
-            t->data = buffer.data();
+            if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); }
+            tensor->data = buffer.data();
         }
-        ml.load_data_for(t);
+
+        ml.load_data_for(tensor);

         // Dequantize sampled rows into f32_sample
-        const int64_t n_per_row = t->ne[0];
-        const int64_t nrows_total = t->ne[1];
-        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+        const int64_t n_per_row = tensor->ne[0];
+        const int64_t nrows_total = tensor->ne[1];
+        const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1;

-        // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute
-        const int sample_rows_per_expert = activations_data ? 512 : 256;
+        // Larger rows_sample_per_expert values may result in more accurate error estimates, but will take much longer to compute
+        const int rows_sample_per_expert = activations_data ? 512 : 256;
         std::vector<float> f32_sample;
-        f32_sample.reserve((size_t)ne2 * (size_t)std::min<int64_t>(nrows_total, sample_rows_per_expert) * (size_t)n_per_row);
+        f32_sample.reserve((size_t)ne2 * (size_t)std::min<int64_t>(nrows_total, rows_sample_per_expert) * (size_t)n_per_row);

-        std::vector<int64_t> sample_rows_per_slice(ne2, 0);
-        const int64_t sample_rows_max = std::max<int64_t>(1, std::min<int64_t>(nrows_total, sample_rows_per_expert));
-        const int64_t stride = std::max<int64_t>(1, nrows_total / sample_rows_max);
+        std::vector<int64_t> rows_sample(ne2, 0);
+        const int64_t rows_sample_max = std::max<int64_t>(1, std::min<int64_t>(nrows_total, rows_sample_per_expert));
+        const int64_t stride = std::max<int64_t>(1, nrows_total / rows_sample_max);
         std::vector<float> row_buffer(n_per_row);
-        const ggml_type src_type = t->type;
+        const ggml_type src_type = tensor->type;
         const ggml_type_traits * src_traits = ggml_get_type_traits(src_type);
         const bool src_is_quant = ggml_is_quantized(src_type);
         const size_t src_row_sz = ggml_row_size(src_type, n_per_row);
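
The diff above keeps only the closed-form lambda estimate (the old `fast_lambda`), now returning one value per slice. The statistic it computes is a concentration measure: with x_j = w_j * a_j^2, the ratio s2/s1^2 equals 1/n for a perfectly flat row and approaches 1 when a single channel dominates. A self-contained sketch under those definitions (the function name and epsilon value are illustrative):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Per-row lambda from the spread of weighted activation energy.
// With x_j = w_j * a_j^2: s1 = sum(x_j), s2 = sum(x_j^2). For a flat row
// s2/s1^2 == 1/n, for one dominant channel it approaches 1, so
// c = s2/s1^2 - 1/n grows with concentration and c/(c+1) maps it to [0, 1).
static float lambda_sketch(const std::vector<float> & w, const std::vector<float> & a) {
    const double eps = 1e-12;  // stand-in for the function's epsilon
    double s1 = 0.0, s2 = 0.0;
    for (size_t j = 0; j < a.size(); ++j) {
        const double x = std::max(0.0f, w[j]) * (double)a[j] * (double)a[j];
        s1 += x;
        s2 += x * x;
    }
    if (s1 <= 0.0) { return 0.0f; }
    const double n = (double)a.size();
    const double c = std::max(0.0, s2 / (s1 * s1 + eps) - 1.0 / n);
    return (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0);
}
```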
@@ -1199,23 +1152,20 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(

         // Adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE.
         // Larger values favour quantisation types that produce smaller bias even if the MSE is slightly bigger
-        float bias_lambda = 0.0f;
-        {
-            const float * values = values_sample.empty() ? nullptr : values_sample.data();
-            const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-            if (params->bpw_bias == 1) {
-                bias_lambda = fast_lambda(values, activations, n_per_row, ne2);
-            } else if (params->bpw_bias == 2) {
-                bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates);
-            }
-        }
-
-        // Now evaluate candidates
-        std::vector<candidate_types> eval_candidates(compatible_candidates.size());
+        float tensor_lambda = 0.0f;
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
         const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
+        auto lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+        double acc = 0.0;
+        int ns = 0;
+        for (float l : lambdas) { acc += l; ++ns; }
+        tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+
+        // Evaluate candidates
+        std::vector<candidate_types> eval_candidates(compatible_candidates.size());
         std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
         std::vector<float> dequantised_buffer(f32_sample.size());
+        const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
         int n_eval_threads = std::max(1, std::min<int>(nthread, (int)compatible_candidates.size()));
         std::atomic<size_t> cidx{0};
         std::vector<std::thread> eval_workers;
@@ -1476,7 +1426,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         int best_j = -1;
         double best_ratio = -1.0;
         size_t best_delta = 0;
-
         for (int i = 0; i < (int)all.size(); ++i) {
             const auto & ti = all[i];
             if (ti.choice >= (int)ti.candidate.size() - 1) {
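
The fragment above is the head of the upgrade-selection loop; judging by the `best_ratio`/`best_delta` names and the skip when a tensor is already at its last candidate, it greedily picks the tensor whose next candidate type buys the largest error reduction per extra byte. A rough sketch of a loop with that shape, under those assumptions (all types and fields here are stand-ins, not the PR's definitions):

```cpp
#include <cstddef>
#include <vector>

struct cand  { double err; size_t bytes; };              // candidates sorted by increasing size
struct tinfo { std::vector<cand> candidate; int choice; };

// Repeatedly upgrade the tensor with the best error-drop per added byte,
// while the upgrade still fits into the remaining byte budget.
static void greedy_upgrade(std::vector<tinfo> & all, size_t budget) {
    for (;;) {
        int best_i = -1;
        double best_ratio = -1.0;
        size_t best_delta = 0;
        for (int i = 0; i < (int)all.size(); ++i) {
            const tinfo & ti = all[i];
            if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }  // already at the top type
            const cand & cur = ti.candidate[ti.choice];
            const cand & nxt = ti.candidate[ti.choice + 1];
            const size_t delta = nxt.bytes - cur.bytes;
            if (delta == 0 || delta > budget) { continue; }               // unaffordable upgrade
            const double ratio = (cur.err - nxt.err) / (double)delta;     // error saved per byte
            if (ratio > best_ratio) { best_ratio = ratio; best_i = i; best_delta = delta; }
        }
        if (best_i < 0) { break; }  // nothing affordable or beneficial remains
        all[best_i].choice += 1;
        budget -= best_delta;
    }
}
```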