@@ -722,7 +722,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const float * values_sample,
         const float * activations_sample,
         std::vector<uint8_t> & quantized_buffer,
-        std::vector<float> & dequantized_buffer) -> double
+        std::vector<float> & dequantized_buffer,
+        float bias_lambda) -> double
     {
         const int64_t n_per_row = t->ne[0];
         const int64_t nrows = t->ne[1];
@@ -878,10 +879,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             }
         }

-        // bias_lambda adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE
-        // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger
-        constexpr float bias_lambda = 1.5f;
-        constexpr double epsilon = 1e-12;
         double err_num = weighted_mse;
         if (activations && bias_lambda != 0.0f) {
             const double proj = bias_num * bias_num / (bias_denom + epsilon);
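The per-tensor bias_lambda now arrives as a parameter (see the new estimate_error signature above), and epsilon is presumably defined at a wider scope after this change, since the context line above still uses it. For orientation, a hedged sketch of how the bias penalty plausibly folds into the error score, using only names visible in this hunk; the final accumulation line is an assumption, as it falls outside the hunk:

    double err_num = weighted_mse;                                         // activation-weighted MSE
    if (activations && bias_lambda != 0.0f) {
        const double proj = bias_num * bias_num / (bias_denom + epsilon);  // squared systematic bias, normalised
        err_num += bias_lambda * proj;                                     // assumed: larger lambda penalises bias harder
    }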
@@ -1163,6 +1160,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::sort(compatible_candidates.begin(), compatible_candidates.end());
         compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end());

+        // Compute adaptive bias_lambda for this tensor
+        float bias_lambda = 0.0f;
+        {
+            const float * values = values_sample.empty() ? nullptr : values_sample.data();
+            const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
+            bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) :
+                                                   fast_lambda(values, activations, n_per_row);
+        }
+
         // Now evaluate candidates
         std::vector<candidate_types> eval_candidates(compatible_candidates.size());
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
@@ -1186,7 +1192,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 const ggml_type tt = compatible_candidates[i];
                 const auto bpw = (float) tensor_bpw(t, tt);
                 const size_t bytes = tensor_bytes(t, tt);
-                const auto err = (float) estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer);
+                const auto err = (float) estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda);
                 eval_candidates[i] = candidate_types{ tt, bpw, bytes, err };
             }
         });
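Taken together with the previous hunk, the candidate evaluation now computes one adaptive bias_lambda per tensor and reuses it for every compatible quantization type, so the per-candidate errors stay comparable. A condensed, serial sketch of that flow, assuming only the names from the diff (the real code runs this loop across worker threads with the thread-local tl_* buffers):

    float bias_lambda = params->precise_lambda
        ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates)
        : fast_lambda(values, activations, n_per_row);

    for (const ggml_type tt : compatible_candidates) {
        const auto   bpw   = (float) tensor_bpw(t, tt);  // cost in bits per weight
        const size_t bytes = tensor_bytes(t, tt);        // cost in bytes
        const auto   err   = (float) estimate_error(t, tt, f32_sample, sample_rows_per_slice,
                                                    values, activations,
                                                    tl_quantized_buffer, tl_dequantised_buffer, bias_lambda);
        // collect { tt, bpw, bytes, err } for the later bpw budget allocation
    }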
@@ -1301,7 +1307,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     };

     auto recompute_best_upgrade = [&]() -> upgrade {
-        const double eps = 1e-12;
         upgrade best{ -1, -1, 0.0, 0, -1.0 };
         for (int i = 0; i < (int) all.size(); ++i) {
             const auto & ti = all[i];
@@ -1653,10 +1658,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     if (params->target_bpw != -1.0f && !params->only_copy) {
         if (params->imatrix) {
             if (params->activations) {
-                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__);
+                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ", __func__);
             } else {
-                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
+                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__);
             }
+            LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? "precise lambda (slow)" : "fast lambda");
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
             bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
         } else {
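Because the first two calls now end with "- " instead of "\n", they rely on the follow-up LLAMA_LOG_INFO to finish the line; with the default log callback, which appends nothing of its own, the combined output should read roughly:

    llama_model_quantize_impl: imatrix with activations provided, target bpw quantization will be more accurate - using fast lambda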
@@ -1966,7 +1972,8 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.kv_overrides   =*/ nullptr,
         /*.tensor_type    =*/ nullptr,
         /*.prune_layers   =*/ nullptr,
-        /*.target_bpw     =*/ -1.0f
+        /*.target_bpw     =*/ -1.0f,
+        /*.precise_lambda =*/ false
     };

     return result;
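With precise_lambda defaulting to false, existing callers keep the fast heuristic unless they opt in. A minimal usage sketch against the public API; the file names and the 4.25 bpw target are illustrative only:

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.target_bpw     = 4.25f;  // request a ~4.25 bits-per-weight mix
    qparams.precise_lambda = true;   // per-tensor lambda search: slower, more accurate
    llama_model_quantize("model-f16.gguf", "model-bpw.gguf", &qparams);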