From ba7335efb363515052a5f8aa755e4a5cd1250150 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 09:54:29 +0100
Subject: [PATCH 001/148] Refactor variable name

---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/llama.h b/include/llama.h
index 545e957e5f5..b17e8f33533 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -354,6 +354,7 @@ extern "C" {
         bool pure;           // quantize all tensors to the default type
         bool keep_split;     // quantize to the same number of shards
         void * imatrix;      // pointer to importance matrix data
+        void * activations;  // pointer to activations data
         void * kv_overrides; // pointer to vector containing overrides
         void * tensor_types; // pointer to vector containing tensor types
         void * prune_layers; // pointer to vector containing layer indices to prune

From 4d9491141b591d31f7fb91940ef4b1cf41bf94f6 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:43:21 +0100
Subject: [PATCH 002/148] Add target_bpw parameter

---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/llama.h b/include/llama.h
index b17e8f33533..f44e2383d0e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -358,6 +358,7 @@ extern "C" {
         void * kv_overrides; // pointer to vector containing overrides
         void * tensor_types; // pointer to vector containing tensor types
         void * prune_layers; // pointer to vector containing layer indices to prune
+        float target_bpw;    // target bits per weight (bpw)
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {

From cfec4048abc478cd2769d1908e3ecc53ad2f28bd Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:43:51 +0100
Subject: [PATCH 003/148] Update usage

---
 tools/quantize/quantize.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 470dc3d916b..b2d62f1490d 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -132,6 +132,7 @@ static void usage(const char * executable) {
     printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
     printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
     printf("      Advanced option to remove all tensors from the given layers\n");
+    printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
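Taken together, patches 001-002 expose the new knobs as plain struct members of llama_model_quantize_params. A minimal caller sketch against the API as declared above (illustrative only: file names are placeholders, and filling the two maps is what the tools/quantize changes in the later patches do):

    #include "llama.h"
    #include <string>
    #include <unordered_map>
    #include <vector>

    // per-column statistics keyed by tensor name, filled by the caller
    // (e.g. from an imatrix file, as in the quantize tool patches below)
    std::unordered_map<std::string, std::vector<float>> values;       // E[a^2]
    std::unordered_map<std::string, std::vector<float>> activations;  // E[a]

    llama_model_quantize_params qp = llama_model_quantize_default_params();
    qp.imatrix     = &values;
    qp.activations = &activations;
    qp.target_bpw  = 4.25f; // -1.0f (the default, see patch 010) leaves the search off
    llama_model_quantize("model-f16.gguf", "model-target-bpw.gguf", &qp);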
From 5e85fb3ff34c5253c3dfa51eb5b9b9bfd6aaaaea Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:46:36 +0100
Subject: [PATCH 004/148] Add parse_target_bpw()

---
 tools/quantize/quantize.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index b2d62f1490d..afd2edb156e 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -441,6 +441,27 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
     return true;
 }
 
+static bool parse_target_bpw(const char * data, float & target_bpw) {
+    if (!data) {
+        printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__);
+        return false;
+    }
+
+    try {
+        target_bpw = std::stof(data);
+        if (target_bpw < 0.0f || target_bpw > 8.0f) {
+            printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__);
+            return false;
+        }
+    }
+    catch (const std::exception & e) {
+        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data);
+        return false;
+    }
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);

From e6d55dc47b42054dcef4a72145cfffb3cb26bd0f Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:49:01 +0100
Subject: [PATCH 005/148] Load activations

---
 tools/quantize/quantize.cpp | 46 ++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index afd2edb156e..3d07abd2d0a 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -247,56 +247,69 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    const std::string sums_suffix{ ".in_sum2" };
+    const std::string sums2_suffix{ ".in_sum2" };
     const std::string counts_suffix{ ".counts" };
+    const std::string sums_suffix{ ".in_sum" };
 
-    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+    std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
     for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
         std::string name = cur->name;
 
         if (name.empty()) {
             continue;
         }
 
-        if (string_remove_suffix(name, sums_suffix)) {
+        if (string_remove_suffix(name, sums2_suffix)) {
             // in_sum2
-            sums_counts_for[std::move(name)].first = cur;
+            std::get<0>(sums_counts_for[std::move(name)]) = cur;
         } else if (string_remove_suffix(name, counts_suffix)) {
             // counts
-            sums_counts_for[std::move(name)].second = cur;
+            std::get<1>(sums_counts_for[std::move(name)]) = cur;
+        } else if (string_remove_suffix(name, sums_suffix)) {
+            // in_sum
+            std::get<2>(sums_counts_for[std::move(name)]) = cur;
+        }
+        else {
-        } else {
             // ignore other tensors
         }
     }
 
     for (const auto & sc : sums_counts_for) {
         const std::string & name = sc.first;
-        const struct ggml_tensor * sums   = sc.second.first;
-        const struct ggml_tensor * counts = sc.second.second;
+        const struct ggml_tensor * sums   = std::get<2>(sc.second);
+        const struct ggml_tensor * sums2  = std::get<0>(sc.second);
+        const struct ggml_tensor * counts = std::get<1>(sc.second);
 
-        if (!sums || !counts) {
+        // check that sums, sums2 and counts have the same shape
+        if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) {
             fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
             gguf_free(ctx_gguf);
             ggml_free(ctx);
             exit(1);
         }
 
-        const int64_t ne0 = sums->ne[0];
-        const int64_t ne1 = sums->ne[1];
+        const int64_t ne0 = sums2->ne[0];
+        const int64_t ne1 = sums2->ne[1];
 
-        auto & e = imatrix_data[name];
-        e.resize(ggml_nelements(sums));
+        auto & activations = activations_data[name];
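+        // values: per-column mean of squared activations E[a^2], from *.in_sum2 / counts
+        // activations: per-column mean activation E[a], from *.in_sum / counts (optional)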
+        auto & values = values_data[name];
+        if (sums) {
+            activations.resize(ggml_nelements(sums));
+        }
+        values.resize(ggml_nelements(sums2));
 
         float max_count = 0.0f;
         for (int64_t j = 0; j < ne1; ++j) {
             const float count = ((const float *) counts->data)[j];
             if (count > 0.0f) {
                 for (int64_t i = 0; i < ne0; ++i) {
-                    e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
+                    values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count;
+                    if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; }
                 }
             } else {
                 // Partial imatrix data, this tensor never got any input during calibration
                 for (int64_t i = 0; i < ne0; ++i) {
-                    e[j*ne0 + i] = 1;
+                    values[j*ne0 + i] = 1;
+                    if (sums) { activations[j*ne0 + i] = 0; }
                 }
             }
             if (count > max_count) {
@@ -304,7 +317,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {

From 10758dfd1ebcd9e6864be0cb6b24ff2df9050526 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:50:37 +0100
Subject: [PATCH 006/148] Populate activations_data with imatrix activations
 if present

---
 tools/quantize/quantize.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 3d07abd2d0a..c2a4767fc9e 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -561,10 +561,11 @@ int main(int argc, char ** argv) {
     }
 
     std::vector<std::string> imatrix_datasets;
-    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
-    if (!imatrix_data.empty()) {
-        params.imatrix = &imatrix_data;
+    std::unordered_map<std::string, std::vector<float>> values_data;
+    std::unordered_map<std::string, std::vector<float>> activations_data;
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data);
+    if (!values_data.empty()) {
+        params.imatrix = &values_data;
         {
             llama_model_kv_override kvo;
             std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);

From 0edbf0c176236b795d8707504388052839556b67 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:51:58 +0100
Subject: [PATCH 007/148] Process activations

---
 tools/quantize/quantize.cpp | 51 +++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index c2a4767fc9e..2c45adab751 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -215,7 +215,10 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
 }
 
-static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file,
+    std::vector<std::string> & imatrix_datasets,
+    std::unordered_map<std::string, std::vector<float>> & values_data,
+    std::unordered_map<std::string, std::vector<float>> & activations_data) {
 
     struct ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
 
 static int prepare_imatrix(const std::string & imatrix_file,
         std::vector<std::string> & imatrix_dataset,
         const std::vector<std::string> & included_weights,
         const std::vector<std::string> & excluded_weights,
-        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+        std::unordered_map<std::string, std::vector<float>> & values_data,
+        std::unordered_map<std::string, std::vector<float>> & activations_data) {
     int m_last_call = -1;
     if (!imatrix_file.empty()) {
-        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data);
     }
-    if (imatrix_data.empty()) {
+    if (values_data.empty()) {
         return m_last_call;
     }
     if (!excluded_weights.empty()) {
         for (const auto & name : excluded_weights) {
-            for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
+            for (auto it = values_data.begin(); it != values_data.end();) {
                 auto pos = it->first.find(name);
                 if (pos != std::string::npos) {
-                    it = imatrix_data.erase(it);
+                    it = values_data.erase(it);
                 } else {
                     ++it;
                 }
             }
+            for (auto at = activations_data.begin(); at != activations_data.end();) {
+                auto pos = at->first.find(name);
+                if (pos != std::string::npos) {
+                    at = activations_data.erase(at);
+                } else {
+                    ++at;
+                }
+            }
         }
     }
     if (!included_weights.empty()) {
-        std::unordered_map<std::string, std::vector<float>> tmp;
+        std::unordered_map<std::string, std::vector<float>> tmp_values;
+        std::unordered_map<std::string, std::vector<float>> tmp_activations;
         for (const auto & name : included_weights) {
-            for (auto & e : imatrix_data) {
+            for (auto & e : values_data) {
                 auto pos = e.first.find(name);
                 if (pos != std::string::npos) {
-                    tmp.emplace(std::move(e));
+                    tmp_values.emplace(std::move(e));
+                }
+            }
+            for (auto & a : activations_data) {
+                auto pos = a.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp_activations.emplace(std::move(a));
                 }
             }
         }
-        imatrix_data = std::move(tmp);
+        values_data = std::move(tmp_values);
+        activations_data = std::move(tmp_activations);
+    }
+    if (!values_data.empty()) {
+        printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size()));
     }
-    if (!imatrix_data.empty()) {
-        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    if (!activations_data.empty()) {
+        printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size()));
     }
     return m_last_call;
 }

From e8774744584689db682866b71121597fe4d35c84 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:54:02 +0100
Subject: [PATCH 008/148] Process target_bpw parameter

---
 tools/quantize/quantize.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 2c45adab751..5331dec80ca 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -512,6 +512,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<tensor_quantization> tensor_types;
     std::vector<int> prune_layers;
+    float target_bpw = -1.0f;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -538,6 +539,10 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--target-bpw") == 0) {
+            if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
             if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                 usage(argv[0]);

From 1b3d5b574414ffc03c5d575ef470c74f4e509a80 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:56:02 +0100
Subject: [PATCH 009/148] Populate params

---
 tools/quantize/quantize.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 5331dec80ca..86a96cdfcca 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
             llama_model_kv_override kvo;
             std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
             kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.val_i64 = imatrix_data.size();
+            kvo.val_i64 = values_data.size();
             kv_overrides.emplace_back(std::move(kvo));
         }
 
@@ -628,6 +628,9 @@ int main(int argc, char ** argv) {
             kv_overrides.emplace_back(std::move(kvo));
         }
     }
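+    // forward the mean-activation table through the opaque pointer, mirroring params.imatrix above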
+    if (!activations_data.empty()) {
+        params.activations = &activations_data;
+    }
     if (!kv_overrides.empty()) {
         kv_overrides.emplace_back();
         kv_overrides.back().key[0] = 0;
@@ -639,6 +642,9 @@ int main(int argc, char ** argv) {
     if (!prune_layers.empty()) {
         params.prune_layers = &prune_layers;
     }
+    if (target_bpw != -1.0f) {
+        params.target_bpw = target_bpw;
+    }
 
     llama_backend_init();
 
@@ -701,7 +707,7 @@ int main(int argc, char ** argv) {
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-        params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+        params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && values_data.empty()) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "==========================================================================================================\n\n\n");

From a22a9deeeeb51e6f647bb185301b9874538d0324 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:57:44 +0100
Subject: [PATCH 010/148] Refactor variable and add target_bpw

---
 src/llama-quant.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1d0361cc166..2e1ca7216e9 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1062,9 +1062,11 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.pure                 =*/ false,
         /*.keep_split           =*/ false,
         /*.imatrix              =*/ nullptr,
+        /*.activations          =*/ nullptr,
         /*.kv_overrides         =*/ nullptr,
         /*.tensor_type          =*/ nullptr,
-        /*.prune_layers         =*/ nullptr
+        /*.prune_layers         =*/ nullptr,
+        /*.target_bpw           =*/ -1.0f
     };
 
     return result;

From c96b8eef949b479d505b63788d2c214e4221abcb Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:00:05 +0100
Subject: [PATCH 011/148] Add fallback_type enum

---
 src/llama-quant.cpp | 26 ++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 2e1ca7216e9..b2879bc8470 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -19,6 +19,32 @@ struct tensor_quantization {
     ggml_type quant = GGML_TYPE_COUNT;
 };
 
+static enum ggml_type fallback_type(const enum ggml_type new_type) {
+    switch (new_type) {
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+            return GGML_TYPE_Q4_0; // symmetric-ish fallback
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_IQ4_XS:
+            return GGML_TYPE_IQ4_NL;
+        case GGML_TYPE_Q4_K:
+            return GGML_TYPE_Q5_0;
+        case GGML_TYPE_Q5_K:
+            return GGML_TYPE_Q5_1;
+        case GGML_TYPE_Q6_K:
+            return GGML_TYPE_Q8_0;
+        default:
+            return new_type;
+    }
+}
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
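The fallback chain above only matters for tensors whose row width is incompatible with a candidate's block size; its intended effect is easiest to see with concrete values (illustrative expectations against the switch above, not a test in the series):

    // Q4_K needs rows divisible by its 256-wide superblock; when they are not,
    // the compatibility check added in a later patch walks this chain instead:
    ggml_type t;
    t = fallback_type(GGML_TYPE_Q4_K);  // -> GGML_TYPE_Q5_0 (32-wide blocks)
    t = fallback_type(GGML_TYPE_IQ3_S); // -> GGML_TYPE_IQ4_NL
    t = fallback_type(GGML_TYPE_Q8_0);  // default case: type returned unchanged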
From 9adae08789aefeb945b55858afbdf047e818147f Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:00:50 +0100
Subject: [PATCH 012/148] Add is_iq()

---
 src/llama-quant.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index b2879bc8470..1e837a7d41c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -19,6 +19,22 @@ struct tensor_quantization {
     ggml_type quant = GGML_TYPE_COUNT;
 };
 
+static bool is_iq(const enum ggml_type t) {
+    switch (t) {
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+            return true;
+        default:
+            return false;
+    }
+}
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
         case GGML_TYPE_TQ1_0:

From 017945a3b20726dc000da1245ecdbf539a7ba0cf Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:03:52 +0100
Subject: [PATCH 013/148] Validate if imatrix contains activations

---
 src/llama-quant.cpp | 48 ++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1e837a7d41c..fdda5d35a10 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -35,6 +35,7 @@ static bool is_iq(const enum ggml_type t) {
             return false;
     }
 }
+
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
@@ -61,6 +62,7 @@ static enum ggml_type fallback_type(const enum ggml_type new_type) {
             return new_type;
     }
 }
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -131,10 +133,11 @@ struct quantize_state_impl {
     int i_ffn_gate = 0;
     int i_ffn_up   = 0;
 
-    int n_k_quantized = 0;
-    int n_fallback    = 0;
+    int n_k_quantized = 0;
+    int n_fallback    = 0;
 
-    bool has_imatrix = false;
+    bool has_imatrix     = false;
+    bool has_activations = false;
 
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
@@ -652,14 +655,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     if (params->only_copy) {
         ftype = ml.ftype;
     }
-    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
+    const std::unordered_map<std::string, std::vector<float>> * values_data = nullptr;
+    const std::unordered_map<std::string, std::vector<float>> * activations_data = nullptr;
     if (params->imatrix) {
-        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
-        if (imatrix_data) {
-            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+        values_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+        if (values_data) {
+            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size()));
             qs.has_imatrix = true;
             // check imatrix for nans or infs
-            for (const auto & kv : *imatrix_data) {
+            for (const auto & kv : *values_data) {
                 for (float f : kv.second) {
                     if (!std::isfinite(f)) {
                         throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
                     }
                 }
             }
         }
     }
+    if (params->activations) {
+        activations_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->activations);
+        if (activations_data) {
+            LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size()));
+            qs.has_activations = true;
+            // check activations for nans or infs
+            for (const auto & kv : *activations_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("activations contain non-finite value %f\n", f));
+                    }
+                }
+            }
+        }
+    }
 
-    const size_t align = GGUF_DEFAULT_ALIGNMENT;
     gguf_context_ptr ctx_out { gguf_init_empty() };
 
     std::vector<int> prune_list = {};
@@ -846,6 +864,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
     for (const auto * it : tensors) {
+        const size_t align = GGUF_DEFAULT_ALIGNMENT;
         const auto & weight = *it;
         ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
@@ -864,10 +883,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, ml.n_tensors,
-               ggml_get_name(tensor),
-               llama_format_tensor_shape(tensor).c_str(),
-               ggml_type_name(tensor->type));
+               ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type));
 
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
@@ -967,9 +983,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
             const int64_t nelements = ggml_nelements(tensor);
 
             const float * imatrix = nullptr;
-            if (imatrix_data) {
-                auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
-                if (it == imatrix_data->end()) {
+            if (values_data) {
+                auto it = values_data->find(remap_imatrix(tensor->name, mapped));
+                if (it == values_data->end()) {
                     LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                 } else {
                     if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {

From 92f49ab39949221ff84b4f70d4528e4f5f43db93 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:05:01 +0100
Subject: [PATCH 014/148] Add target_bpw_type() logic

---
 src/llama-quant.cpp | 482 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 482 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fdda5d35a10..1e24303c528 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -575,6 +575,488 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }
 
+// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality.
+// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert
+// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert
+// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2])
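+// Per sampled row, the error proxy computed below is
+//   err = (1/n) * sum_j e_j^2 * E[a_j^2] + bias_lambda * |(1/n) * sum_j e_j * E[a_j]|
+// with e_j = dequant_j - orig_j; E[a_j^2] is taken as 1 when no imatrix entry exists,
+// and the bias term drops out when no mean activations are available.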
+static std::unordered_map<std::string, ggml_type> target_bpw_type(
+    llama_model_loader & ml,
+    std::vector<no_init<uint8_t>> & read_data,
+    const llama_model & model,
+    const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
+    const std::map<int, int> & mapped,
+    const std::unordered_map<std::string, std::vector<float>> * values_data,
+    const std::unordered_map<std::string, std::vector<float>> * activations_data,
+    float target_bpw,
+    int nthread,
+    int sample_rows_per_expert = 128,
+    float bias_lambda = 1.0
+) {
+    struct candidate_types {
+        ggml_type type;
+        float bpw;
+        size_t bytes;
+        float error; // lower is better
+    };
+
+    struct tensor_info {
+        const llama_model_loader::llama_tensor_weight * w;
+        std::vector<candidate_types> candidate; // sorted by bpw ascending
+        int choice = -1;                        // index into cand
+        float min_bpw = 0.0;
+        float max_bpw = 0.0;
+        size_t n_elements = 0;
+    };
+
+    auto name_tn = LLM_TN(model.arch);
+
+    // The candidate types we consider; adjust as needed
+    const ggml_type base_candidates[] = {
+        // Model's
+        GGML_TYPE_IQ1_S,
+        GGML_TYPE_IQ1_M,
+        GGML_TYPE_IQ2_XXS,
+        GGML_TYPE_IQ2_XS,
+        GGML_TYPE_IQ2_S,
+        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_S,
+        GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
+        GGML_TYPE_Q2_K,
+        GGML_TYPE_Q3_K,
+        GGML_TYPE_Q4_0,
+        GGML_TYPE_Q4_1,
+        GGML_TYPE_Q4_K,
+        GGML_TYPE_Q5_0,
+        GGML_TYPE_Q5_1,
+        GGML_TYPE_Q5_K,
+        GGML_TYPE_Q6_K,
+        GGML_TYPE_Q8_0
+    };
+
+    auto can_quantize = [&](const ggml_tensor * t) -> bool {
+        const std::string name = ggml_get_name(t);
+        bool q = name.rfind("weight") == name.size() - 6;
+        q &= (ggml_n_dims(t) >= 2);
+        q &= name.find("_norm.weight") == std::string::npos;
+        //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight");
+        //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight");
+        q &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        q &= name.find("altup") == std::string::npos;
+        q &= name.find("laurel") == std::string::npos;
+        q &= name.find("per_layer_model_proj") == std::string::npos;
+        q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight");
+        q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight");
+        q &= name.find("ssm_conv1d.weight") == std::string::npos;
+        q &= name.find("shortconv.conv.weight") == std::string::npos;
+        q &= name.find("time_mix_first.weight") == std::string::npos;
+        q &= name.find("time_mix_w0.weight") == std::string::npos;
+        q &= name.find("time_mix_w1.weight") == std::string::npos;
+        q &= name.find("time_mix_w2.weight") == std::string::npos;
+        q &= name.find("time_mix_v0.weight") == std::string::npos;
+        q &= name.find("time_mix_v1.weight") == std::string::npos;
+        q &= name.find("time_mix_v2.weight") == std::string::npos;
+        q &= name.find("time_mix_a0.weight") == std::string::npos;
+        q &= name.find("time_mix_a1.weight") == std::string::npos;
+        q &= name.find("time_mix_a2.weight") == std::string::npos;
+        q &= name.find("time_mix_g1.weight") == std::string::npos;
+        q &= name.find("time_mix_g2.weight") == std::string::npos;
+        q &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        q &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+        q &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
+        q &= name.find("attn_rel_b.weight") == std::string::npos;
+        return q;
+    };
+
+    auto get_values = [&](const std::string & tensor_name) -> const float * {
+        if (!values_data) { return nullptr; }
+        const auto it = values_data->find(remap_imatrix(tensor_name, mapped));
+        if (it == values_data->end()) { return nullptr; }
+        return it->second.data();
+    };
+
+    auto get_activations = [&](const std::string & tensor_name) -> const float * {
+        if (!activations_data) { return nullptr; }
+        const auto it = activations_data->find(remap_imatrix(tensor_name, mapped));
+        if (it == activations_data->end()) { return nullptr; }
+        return it->second.data();
+    };
+
+    auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
+        const int64_t n_per_row = t->ne[0];
+        const int64_t nrows = t->ne[1];
+        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+        const size_t row_sz = ggml_row_size(typ, n_per_row);
+        return (size_t)ne2 * (size_t)nrows * row_sz;
+    };
+
+    auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double {
+        const int64_t nelem = ggml_nelements(t);
+        const size_t bytes = total_bytes(t, typ);
+        return bytes * 8.0 / nelem;
+    };
+
+    auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool {
+        const int64_t n_per_row = t->ne[0];
+        const int64_t blck = ggml_blck_size(typ);
+        if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc
+        return n_per_row % blck == 0;
+    };
+
+    auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
+        if (is_compatible(t, typ)) { return typ; }
+        ggml_type fb = fallback_type(typ);
+        if (is_compatible(t, fb)) { return fb; }
+        return GGML_TYPE_F16; // final guard
+    };
+
+    // Estimate error for a given type using a sampled subset of rows.
+    // Uses both imatrix (E[a^2]) and activations (E[a]) if available.
+    auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double {
+        const int64_t n_per_row = t->ne[0];
+        const int64_t nrows = t->ne[1];
+        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+
+        const ggml_type_traits * traits = ggml_get_type_traits(typ);
+        if (!traits || !traits->to_float) {
+            // cannot dequantize candidate -> assign very high error
+            return 1e35f;
+        }
+
+        // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly
+        const int64_t rows_per_expert = nrows;
+        const int64_t sample_rows = std::max<int64_t>(1, std::min<int64_t>(rows_per_expert, sample_rows_per_expert));
+        const int64_t stride = std::max<int64_t>(1, rows_per_expert / sample_rows);
+
+        const size_t row_sz = ggml_row_size(typ, n_per_row);
+        std::vector<uint8_t> qbuf(row_sz * sample_rows);
+        std::vector<float> f32_sample(sample_rows * n_per_row);
+        std::vector<float> deq(sample_rows * n_per_row);
+
+        float total_err = 0.0;
+
+        for (int64_t i03 = 0; i03 < ne2; ++i03) {
+            const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr;
+            const float * activation = activations_all ? (activations_all + i03 * n_per_row) : nullptr;
+
+            // Assemble sampled rows into contiguous f32_sample
+            int64_t rs = 0;
+            for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) {
+                const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row;
+                std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row);
+                ++rs;
+            }
+            if (rs == 0) { continue; }
+
+            // Quantize sampled rows in one chunk; pass the imatrix for this expert slice
+            const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value);
+            (void)got; // not strictly needed here
+
+            // Dequantize
+            traits->to_float(qbuf.data(), deq.data(), rs * n_per_row);
+
+            // Compute error proxy per sampled row
+            for (int64_t s = 0; s < rs; ++s) {
+                const float * xs = f32_sample.data() + s * n_per_row;
+                const float * ys = deq.data() + s * n_per_row;
+
+                float mse_w = 0.0;
+                float bias = 0.0;
+                float bias_sum = 0.0;
+
+                if (value) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const float e = ys[j] - xs[j];
+                        mse_w += e * e * value[j];
+                        if (activation) {
+                            bias_sum += e * activation[j];
+                        }
+                    }
+                } else {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const float e = ys[j] - xs[j];
+                        mse_w += e*e;
+                        if (activation) {
+                            bias_sum += e * activation[j];
+                        }
+                    }
+                }
+
+                if (activation) {
+                    bias = std::abs(bias_sum);
+                }
+
+                // Normalize by n_per_row to get a per-row average scale
+                float row_err = mse_w / std::max<int64_t>(1, n_per_row);
+                if (bias_lambda != 0.0) {
+                    row_err += bias_lambda * (bias / std::max<int64_t>(1, n_per_row));
+                }
+
+                total_err += row_err;
+            }
+
+            // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor
+            const float scale_rows = rows_per_expert / std::max<int64_t>(1, rs);
+            total_err *= scale_rows;
+        }
+
+        return total_err;
+    };
+
+    // Produce per-tensor candidate lists
+    std::vector<tensor_info> all;
+    all.reserve(tensors.size());
+
+    for (const auto * tw : tensors) {
+        // Temporary workers for dequantization
+        std::vector<std::thread> workers;
+        workers.reserve(std::max(1, nthread));
+
+        ggml_tensor * t = tw->tensor;
+        const std::string name = ggml_get_name(t);
+
+        if (!can_quantize(t)) {
+            continue;
+        }
+
+        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(t)) {
+                read_data.resize(ggml_nbytes(t));
+            }
+            t->data = read_data.data();
+        }
+        ml.load_data_for(t);
+
+        // Prepare f32 weights for error estimates
+        const int64_t nelem = ggml_nelements(t);
+        std::vector<no_init<float>> f32_conv_buf;
+        float * f32_data = nullptr;
+
+        if (t->type == GGML_TYPE_F32) {
+            f32_data = (float *)t->data;
+        } else {
+            llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread);
+            f32_data = (float *)f32_conv_buf.data();
+        }
+
+        const float * values = get_values(name);
+        const float * activations = get_activations(name);
+
+        tensor_info info;
+        info.w = tw;
+        info.n_elements = nelem;
+
+        // Candidate build with compatibility handling and availability checks
+        for (ggml_type ts_type : base_candidates) {
+            // Skip IQ* without imatrix
+            if (is_iq(ts_type) && !values) { continue; }
+            ggml_type tt = make_compatible(t, ts_type);
+            // After fallback, if still incompatible, skip
+            if (!is_compatible(t, tt)) { continue; }
+
+            // Compute bpw and bytes
+            auto bpw = (float)tensor_bpw(t, tt);
+            size_t bytes = total_bytes(t, tt);
+
+            // Estimate error
+            auto err = (float)estimate_error(t, f32_data, tt, values, activations);
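+            // one candidate entry per admissible type; the budget search below keeps exactly one per tensor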
+            info.candidate.push_back(candidate_types{tt, bpw, bytes, err});
+        }
+
+        if (info.candidate.empty()) {
+            // as a last resort, keep original type
+            float bpw = ggml_nbytes(t) * 8.0f / nelem;
+            info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0});
+        }
+
+        // Sort by bpw ascending
+        std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) {
+            if (a.bpw != b.bpw) { return a.bpw < b.bpw; }
+            if (a.error != b.error) { return a.error < b.error; }
+            return a.bytes < b.bytes;
+        });
+
+        // collapse candidates with identical storage size (bytes)
+        {
+            std::vector<candidate_types> uniq;
+            uniq.reserve(info.candidate.size());
+
+            for (size_t i = 0; i < info.candidate.size(); ) {
+                size_t j = i + 1;
+                candidate_types best = info.candidate[i];
+                // group same-byte entries, keep the one with the lowest error
+                while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) {
+                    if (info.candidate[j].error < best.error) { best = info.candidate[j]; }
+                    ++j;
+                }
+                uniq.push_back(best);
+                i = j;
+            }
+            info.candidate.swap(uniq);
+        }
+
+        // Initialize choice at the smallest bpw candidate
+        info.choice = 0;
+        info.min_bpw = info.candidate.front().bpw;
+        info.max_bpw = info.candidate.back().bpw;
+
+        all.push_back(std::move(info));
+    }
+
+    if (all.empty()) { return {}; }
+
+    // Greedy allocation from minimum bpw upward to reach target_bpw
+    // Start with minimal bpw assignment
+    auto current_total_bytes = [&]() -> size_t {
+        size_t b = 0;
+        for (const auto & ti : all) {
+            b += ti.candidate[ti.choice].bytes;
+        }
+        return b;
+    };
+
+    auto total_weights = [&]() -> size_t {
+        size_t w = 0;
+        for (const auto & ti : all) {
+            w += ti.n_elements;
+        }
+        return w;
+    };
+
+    const size_t tw = total_weights();
+    auto current_bpw = [&]() -> double {
+        return (double)current_total_bytes() * 8.0f / (double)tw;
+    };
+
+    // Precompute current bpw
+    double bpw_now = current_bpw();
+
+    // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw)
+    if (bpw_now >= target_bpw) {
+        std::unordered_map<std::string, ggml_type> overrides;
+        for (const auto & ti : all) {
+            overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type;
+        }
+        return overrides;
+    }
+
+    struct upgrade {
+        int idx;            // tensor index
+        int next;           // next candidate index (strictly larger bytes)
+        double err;         // error reduction
+        size_t delta_bytes; // increase in bytes
+        double ratio;       // err per added bit
+    };
+
+    // Find next strictly-larger candidate index for a tensor
+    auto next_distinct_idx = [&](const tensor_info &ti) -> int {
+        const auto &cand = ti.candidate;
+        const auto &cur = cand[ti.choice];
+        int j = ti.choice + 1;
+        while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j;
+        return j < (int)cand.size() ? j : -1;
+    };
+
+    auto recompute_best_upgrade = [&]() -> upgrade {
+        const double eps = 1e-12;
+        upgrade best{-1, -1, 0.0, 0, -1.0};
+        for (int i = 0; i < (int)all.size(); ++i) {
+            const auto &ti = all[i];
+            if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }
+
+            int j = next_distinct_idx(ti);
+            if (j < 0) { continue; } // no larger-size candidate remains
+
+            const auto &cur = ti.candidate[ti.choice];
+            const auto &nxt = ti.candidate[j];
+
+            size_t delta_bytes = nxt.bytes - cur.bytes;
+            if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe
+
+            double err = (double)cur.error - (double)nxt.error;
+            err = std::max(err, 0.0); // do not penalize due to sampling noise
+
+            double ratio = err / (double)(delta_bytes * 8ull);
+            if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) {
+                best = upgrade{i, j, err, delta_bytes, ratio};
+            }
+        }
+        return best;
+    };
+
+    while (true) {
+        upgrade up = recompute_best_upgrade();
+        if (up.idx < 0) { break; }
+
+        size_t now_bytes = current_total_bytes();
+        size_t next_bytes = now_bytes + up.delta_bytes;
+        double bpw_next = (double)next_bytes * 8.0 / (double)tw;
+
+        if (bpw_next <= (double)target_bpw + 1e-12) {
+            all[up.idx].choice = up.next;
+            bpw_now = bpw_next;
+        } else {
+            break;
+        }
+    }
+
+    // We might still be below target but taking any single upgrade overshoots.
+    {
+        double under_gap = (double)target_bpw - bpw_now;
+
+        upgrade best_over{-1, -1, 0.0, 0, -1.0};
+        double best_over_gap = 1e300;
+
+        size_t now_bytes = current_total_bytes();
+
+        for (int i = 0; i < (int)all.size(); ++i) {
+            const auto &ti = all[i];
+            if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }
+
+            int j = next_distinct_idx(ti);
+            if (j < 0) { continue; }
+
+            const auto &cur = ti.candidate[ti.choice];
+            const auto &nxt = ti.candidate[j];
+
+            size_t delta_bytes = nxt.bytes - cur.bytes;
+            if (delta_bytes == 0) { continue; }
+
+            size_t over_bytes = now_bytes + delta_bytes;
+            double bpw_over = (double)over_bytes * 8.0 / (double)tw;
+
+            double over_gap = std::abs(bpw_over - (double)target_bpw);
+
+            double err = (double)cur.error - (double)nxt.error;
+            if (err < 0.0) { err = 0.0; }
+            double ratio = err / (double)(delta_bytes * 8ull);
+
+            if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) {
+                best_over_gap = over_gap;
+                best_over = upgrade{i, j, err, delta_bytes, ratio};
+            }
+        }
+
+        if (best_over.idx >= 0) {
+            if (best_over_gap < under_gap) {
+                all[best_over.idx].choice = best_over.next;
+            }
+        }
+    }
+
+    // Build the override map
+    std::unordered_map<std::string, ggml_type> overrides;
+    LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw);
+    for (const auto & ti : all) {
+        LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n",
+            __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error);
+        overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type;
+    }
+    return overrides;
+}
+
 static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;

From 1187f6aa9eb4cf7a3bf3945d0ecd292a49c03efa Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:07:03 +0100
Subject: [PATCH 015/148] Implement bpw_overrides call

---
 src/llama-quant.cpp | 9 +++++++++
 1
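Patch 014's allocation is a classic greedy knapsack: every tensor starts at its cheapest candidate, and the upgrade with the best error reduction per added bit is taken while the bpw budget holds. Stripped of the llama.cpp plumbing, the same rule looks like this (a standalone sketch with toy candidates; the final overshoot pass of the patch is omitted):

    #include <cstdio>
    #include <vector>

    struct Cand   { double bpw; double err; };                  // one quant type option
    struct Tensor { size_t n; std::vector<Cand> c; size_t pick = 0; };

    int main() {
        // two fake tensors, candidates sorted by bpw ascending as in the patch
        std::vector<Tensor> ts = {
            { 1000, {{2.6, 9.0}, {3.4, 4.0}, {4.5, 1.0}} },
            { 3000, {{2.6, 3.0}, {3.4, 2.5}, {4.5, 0.5}} },
        };
        const double target_bpw = 3.6;
        size_t total_n = 0; for (auto & t : ts) total_n += t.n;

        while (true) {
            int best = -1; double best_ratio = -1;
            double bits = 0; for (auto & u : ts) bits += u.c[u.pick].bpw * u.n;
            for (int i = 0; i < (int)ts.size(); ++i) {
                auto & t = ts[i];
                if (t.pick + 1 >= t.c.size()) continue;
                const double dbits = (t.c[t.pick + 1].bpw - t.c[t.pick].bpw) * t.n;
                const double derr  = t.c[t.pick].err - t.c[t.pick + 1].err;
                if ((bits + dbits) / total_n > target_bpw) continue; // stays in budget?
                const double ratio = derr / dbits;                   // error saved per extra bit
                if (ratio > best_ratio) { best_ratio = ratio; best = i; }
            }
            if (best < 0) break;
            ++ts[best].pick;
        }
        double bits = 0; for (auto & t : ts) bits += t.c[t.pick].bpw * t.n;
        printf("final mix: %.3f bpw (picks: %zu, %zu)\n", bits / total_n, ts[0].pick, ts[1].pick);
    }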
 file changed, 9 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1e24303c528..b0b3be76cad 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1314,6 +1314,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         }
     }
 
+    std::unordered_map<std::string, ggml_type> bpw_overrides = {};
+    if (params->target_bpw != -1.0f) {
+        LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw);
+        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread);
+    }
+
     int cur_split = -1;
     std::ofstream fout;
     auto close_ofstream = [&]() {
@@ -1430,6 +1436,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
             if (!params->pure && ggml_is_quantized(default_type)) {
                 int fallback = qs.n_fallback;
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // get bpw override
+                const auto override = bpw_overrides.find(name);
+                if (override != bpw_overrides.end()) { new_type = override->second; }
                 // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
                 if (params->tensor_types && qs.n_fallback - fallback == 0) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);

From 5aceb9e3ae016ed057a0963934c53203b74ad3c5 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 22:29:27 +0100
Subject: [PATCH 016/148] Refactor variable names

---
 src/llama-quant.cpp | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index b0b3be76cad..5af70c1c9b8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -575,13 +575,13 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }
 
-// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality.
-// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert
-// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert
-// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2])
+// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl
+// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute
+// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight
+// for bias and error, 2.0 means twice as much weight for bias
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
     llama_model_loader & ml,
-    std::vector<no_init<uint8_t>> & read_data,
+    std::vector<no_init<uint8_t>> & buffer,
     const llama_model & model,
     const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
     const std::map<int, int> & mapped,
@@ -735,24 +735,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         float total_err = 0.0;
 
-        for (int64_t i03 = 0; i03 < ne2; ++i03) {
-            const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr;
-            const float * activation = activations_all ? (activations_all + i03 * n_per_row) : nullptr;
+        for (int64_t slice = 0; slice < ne2; ++slice) {
+            const float * value = values_all ? (values_all + slice * n_per_row) : nullptr;
+            const float * activation = activations_all ? (activations_all + slice * n_per_row) : nullptr;
 
-            // Assemble sampled rows into contiguous f32_sample
             int64_t rs = 0;
             for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) {
-                const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row;
+                const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row;
                 std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row);
                 ++rs;
             }
             if (rs == 0) { continue; }
 
-            // Quantize sampled rows in one chunk; pass the imatrix for this expert slice
             const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value);
-            (void)got; // not strictly needed here
+            (void)got;
 
-            // Dequantize
             traits->to_float(qbuf.data(), deq.data(), rs * n_per_row);
 
             // Compute error proxy per sampled row
@@ -821,10 +818,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
         if (!ml.use_mmap) {
-            if (read_data.size() < ggml_nbytes(t)) {
-                read_data.resize(ggml_nbytes(t));
-            }
-            t->data = read_data.data();
+            if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); }
+            t->data = buffer.data();
         }
         ml.load_data_for(t);

From ee05d6bc0b250a7c19b9dedf504163509ef736f8 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 22:32:53 +0100
Subject: [PATCH 017/148] Update comments

---
 src/llama-quant.cpp | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5af70c1c9b8..546f6b438c7 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -596,13 +596,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     struct candidate_types {
         ggml_type type;
         float bpw;
         size_t bytes;
-        float error; // lower is better
+        float error;
     };
 
     struct tensor_info {
         const llama_model_loader::llama_tensor_weight * w;
-        std::vector<candidate_types> candidate; // sorted by bpw ascending
-        int choice = -1;                        // index into cand
+        std::vector<candidate_types> candidate;
+        int choice = -1;
         float min_bpw = 0.0;
         float max_bpw = 0.0;
         size_t n_elements = 0;
     };
 
     auto name_tn = LLM_TN(model.arch);
 
-    // The candidate types we consider; adjust as needed
     const ggml_type base_candidates[] = {
         // Model's
         GGML_TYPE_IQ1_S,
@@ -639,8 +638,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         bool q = name.rfind("weight") == name.size() - 6;
         q &= (ggml_n_dims(t) >= 2);
         q &= name.find("_norm.weight") == std::string::npos;
-        //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight");
-        //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight");
         q &= name.find("ffn_gate_inp.weight") == std::string::npos;
         q &= name.find("altup") == std::string::npos;
         q &= name.find("laurel") == std::string::npos;
@@ -719,7 +716,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const ggml_type_traits * traits = ggml_get_type_traits(typ);
         if (!traits || !traits->to_float) {
-            // cannot dequantize candidate -> assign very high error
+            // Cannot dequantize candidate -> assign very high error
             return 1e35f;
         }
@@ -842,12 +839,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.w = tw;
         info.n_elements = nelem;
 
-        // Candidate build with compatibility handling and availability checks
+        // Build per-tensor candidate list
         for (ggml_type ts_type : base_candidates) {
-            // Skip IQ* without imatrix
             if (is_iq(ts_type) && !values) { continue; }
             ggml_type tt = make_compatible(t, ts_type);
-            // After fallback, if still incompatible, skip
             if (!is_compatible(t, tt)) { continue; }
 
             // Compute bpw and bytes
@@ -861,19 +856,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         if (info.candidate.empty()) {
-            // as a last resort, keep original type
+            // As a last resort, keep original type
             float bpw = ggml_nbytes(t) * 8.0f / nelem;
             info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0});
         }
 
-        // Sort by bpw ascending
         std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) {
             if (a.bpw != b.bpw) { return a.bpw < b.bpw; }
             if (a.error != b.error) { return a.error < b.error; }
             return a.bytes < b.bytes;
         });
 
-        // collapse candidates with identical storage size (bytes)
+        // Collapse candidates with identical storage size (bytes)
         {
             std::vector<candidate_types> uniq;
             uniq.reserve(info.candidate.size());
@@ -903,7 +897,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     if (all.empty()) { return {}; }
 
     // Greedy allocation from minimum bpw upward to reach target_bpw
-    // Start with minimal bpw assignment
     auto current_total_bytes = [&]() -> size_t {
         size_t b = 0;
         for (const auto & ti : all) {
             b += ti.candidate[ti.choice].bytes;
         }
         return b;
     };
@@ -938,11 +931,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     struct upgrade {
-        int idx;            // tensor index
-        int next;           // next candidate index (strictly larger bytes)
-        double err;         // error reduction
-        size_t delta_bytes; // increase in bytes
-        double ratio;       // err per added bit
+        int idx;
+        int next;
+        double err;
+        size_t delta_bytes;
+        double ratio;
     };
 
     // Find next strictly-larger candidate index for a tensor
     auto next_distinct_idx = [&](const tensor_info &ti) -> int {
         const auto &cand = ti.candidate;
         const auto &cur = cand[ti.choice];
         int j = ti.choice + 1;
         while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j;
         return j < (int)cand.size() ? j : -1;
     };
@@ -998,6 +991,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     }
 
     // We might still be below target but taking any single upgrade overshoots.
+    // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio.
     {
         double under_gap = (double)target_bpw - bpw_now;

From f22b3097eb144a913d02fbb445cbdb9b97e91859 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 22:34:01 +0100
Subject: [PATCH 018/148] Avoid division by zero if truncation occurs

---
 src/llama-quant.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 546f6b438c7..3911eba43b6 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -790,28 +790,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor
-        const float scale_rows = rows_per_expert / std::max<int64_t>(1, rs);
+        const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs);
         total_err *= scale_rows;
     }
 
     return total_err;
     };
 
-    // Produce per-tensor candidate lists
     std::vector<tensor_info> all;
     all.reserve(tensors.size());
 
     for (const auto * tw : tensors) {
-        // Temporary workers for dequantization
         std::vector<std::thread> workers;
         workers.reserve(std::max(1, nthread));
 
         ggml_tensor * t = tw->tensor;
         const std::string name = ggml_get_name(t);
 
-        if (!can_quantize(t)) {
-            continue;
-        }
+        if (!can_quantize(t)) { continue; }
 
         LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
         if (!ml.use_mmap) {
             if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); }
             t->data = buffer.data();
         }
         ml.load_data_for(t);
 
-        // Prepare f32 weights for error estimates
         const int64_t nelem = ggml_nelements(t);
         std::vector<no_init<float>> f32_conv_buf;
         float * f32_data = nullptr;
@@ -955,13 +950,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }
 
             int j = next_distinct_idx(ti);
-            if (j < 0) { continue; } // no larger-size candidate remains
+            if (j < 0) { continue; }
 
             const auto &cur = ti.candidate[ti.choice];
             const auto &nxt = ti.candidate[j];
 
             size_t delta_bytes = nxt.bytes - cur.bytes;
-            if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe
+            if (delta_bytes == 0) { continue; }
 
             double err = (double)cur.error - (double)nxt.error;
             err = std::max(err, 0.0); // do not penalize due to sampling noise

From 936294f6afb10aea69ac5ae85fcc29313b49cd9e Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 23:31:22 +0100
Subject: [PATCH 019/148] Increase precision for error calculation

---
 src/llama-quant.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3911eba43b6..a4a10da062b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -730,7 +730,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> f32_sample(sample_rows * n_per_row);
         std::vector<float> deq(sample_rows * n_per_row);
 
-        float total_err = 0.0;
+        double total_err = 0.0;
 
         for (int64_t slice = 0; slice < ne2; ++slice) {
             const float * value = values_all ? (values_all + slice * n_per_row) : nullptr;
@@ -754,9 +754,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 const float * xs = f32_sample.data() + s * n_per_row;
                 const float * ys = deq.data() + s * n_per_row;
 
-                float mse_w = 0.0;
-                float bias = 0.0;
-                float bias_sum = 0.0;
+                double mse_w = 0.0;
+                double bias = 0.0;
+                double bias_sum = 0.0;
 
                 if (value) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
                         const float e = ys[j] - xs[j];
                         mse_w += e * e * value[j];
                         if (activation) {
                             bias_sum += e * activation[j];
                         }
                     }
                 } else {
                     for (int64_t j = 0; j < n_per_row; ++j) {
                         const float e = ys[j] - xs[j];
-                        mse_w += e*e;
+                        mse_w += e * e;
                         if (activation) {
                             bias_sum += e * activation[j];
                         }
                     }
                 }
 
-                if (activation) {
-                    bias = std::abs(bias_sum);
-                }
+                if (activation) { bias = std::abs(bias_sum); }
 
                 // Normalize by n_per_row to get a per-row average scale
-                float row_err = mse_w / std::max<int64_t>(1, n_per_row);
+                double row_err = mse_w / std::max<int64_t>(1, n_per_row);
                 if (bias_lambda != 0.0) {
                     row_err += bias_lambda * (bias / std::max<int64_t>(1, n_per_row));
                 }
 
                 total_err += row_err;
             }
 
             // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor
-            const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs);
+            const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs);
             total_err *= scale_rows;
         }
 
-        return total_err;
+        return std::isfinite(total_err) ? total_err : 1e35;
     };

From 5cd69a6809c56922e1b973ce900f3680c28a5117 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 20 Aug 2025 09:41:39 +0100
Subject: [PATCH 020/148] Add F16/BF16 type

---
 src/llama-quant.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a4a10da062b..5522fe39d28 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -630,7 +630,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_Q5_1,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
-        GGML_TYPE_Q8_0
+        GGML_TYPE_Q8_0,
+// TODO: find better way to handle F16/BF16
+#ifdef GGML_USE_METAL
+        GGML_TYPE_F16
+#else
+        GGML_TYPE_BF16
+#endif
     };

From 69586e212e76849fcdff17e68e8023b91025b415 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 20 Aug 2025 13:23:11 +0100
Subject: [PATCH 021/148] Add F16/BF16 type

---
 tools/quantize/quantize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 86a96cdfcca..b907008cb4f 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -132,7 +132,7 @@ static void usage(const char * executable) {
     printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
     printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
     printf("      Advanced option to remove all tensors from the given layers\n");
-    printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n");
+    printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -486,13 +486,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
 
     try {
         target_bpw = std::stof(data);
-        if (target_bpw < 0.0f || target_bpw > 8.0f) {
-            printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__);
+        if (target_bpw < 0.0f || target_bpw > 16.0f) {
+            printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__);
             return false;
         }
     }
     catch (const std::exception & e) {
-        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data);
+        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data);
         return false;
     }

From 29b2dc3ec0ddefde21394007649df6c268ebca3d Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 20 Aug 2025 13:27:01 +0100
Subject: [PATCH 022/148] Do not mix K and IQ quants

---
 src/llama-quant.cpp | 62 +++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5522fe39d28..9dc903874fb 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -36,6 +36,26 @@ static bool is_iq(const enum ggml_type t) {
     }
 }
 
+static bool is_iq(const enum llama_ftype t) {
+    switch (t) {
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
         case GGML_TYPE_TQ1_0:
@@ -587,7 +607,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     const std::map<int, int> & mapped,
     const std::unordered_map<std::string, std::vector<float>> * values_data,
     const std::unordered_map<std::string, std::vector<float>> * activations_data,
-    float target_bpw,
+    const llama_model_quantize_params * params,
     int nthread,
     int sample_rows_per_expert = 128,
     float bias_lambda = 1.0
@@ -608,19 +628,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
-    auto name_tn = LLM_TN(model.arch);
-
-    const ggml_type base_candidates[] = {
-        // Model's
-        GGML_TYPE_IQ1_S,
-        GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ2_XXS,
-        GGML_TYPE_IQ2_XS,
-        GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS,
-        GGML_TYPE_IQ3_S,
-        GGML_TYPE_IQ4_XS,
-        GGML_TYPE_IQ4_NL,
+    const ggml_type k_candidates[] = {
         GGML_TYPE_Q2_K,
         GGML_TYPE_Q3_K,
         GGML_TYPE_Q4_0,
         GGML_TYPE_Q4_1,
         GGML_TYPE_Q4_K,
         GGML_TYPE_Q5_0,
         GGML_TYPE_Q5_1,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
         GGML_TYPE_Q8_0,
 // TODO: find better way to handle F16/BF16
 #ifdef GGML_USE_METAL
         GGML_TYPE_F16
 #else
         GGML_TYPE_BF16
 #endif
     };
 
+    const ggml_type iq_candidates[] = {
+        GGML_TYPE_IQ1_S,
+        GGML_TYPE_IQ1_M,
+        GGML_TYPE_IQ2_XXS,
+        GGML_TYPE_IQ2_XS,
+        GGML_TYPE_IQ2_S,
+        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_S,
+        GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
+    };
+
+    auto name_tn = LLM_TN(model.arch);
+    float target_bpw = params->target_bpw;
+
     auto can_quantize = [&](const ggml_tensor * t) -> bool {
@@ -838,8 +861,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.w = tw;
         info.n_elements = nelem;
 
+        std::vector<ggml_type> quant_candidates;
+        if (is_iq(params->ftype)) {
+            quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates));
+        } else {
+            quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates));
+        }
+
         // Build per-tensor candidate list
-        for (ggml_type ts_type : base_candidates) {
+        for (ggml_type ts_type : quant_candidates) {
             if (is_iq(ts_type) && !values) { continue; }
             ggml_type tt = make_compatible(t, ts_type);
             if (!is_compatible(t, tt)) { continue; }
@@ -1305,7 +1335,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     std::unordered_map<std::string, ggml_type> bpw_overrides = {};
     if (params->target_bpw != -1.0f) {
         LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread);
values_data, activations_data, params->target_bpw, nthread); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } int cur_split = -1; From 43caadf783a4bae41011e3b9aca5bbe79185a7a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:24:48 +0100 Subject: [PATCH 023/148] Add better fallbacks for IQ mixes --- src/llama-quant.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9dc903874fb..c412191c8f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -657,6 +657,12 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, + // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto name_tn = LLM_TN(model.arch); From 52da4a4f8c28d063378d54dd806da03614251e76 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:05 +0100 Subject: [PATCH 024/148] Skip if output.weight or type is COPY --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c412191c8f3..786adfe547b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,6 +697,9 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; + q &= params->quantize_output_tensor || name != "output.weight"; + q &= !params->only_copy; + return q; }; From 3f0118d6029450955c43cd84109bdfc36a8cecd3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:37 +0100 Subject: [PATCH 025/148] Fix bias lambda bug --- src/llama-quant.cpp | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 786adfe547b..44cf9e30e3c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -782,52 +782,47 @@ static std::unordered_map target_bpw_type( } if (rs == 0) { continue; } - const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - (void)got; - + // Quantize sample rows and dequantize back + (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); - // Compute error proxy per sampled row + // Compute error proxy per sampled slice + double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { const float * xs = f32_sample.data() + s * n_per_row; const float * ys = deq.data() + s * n_per_row; double mse_w = 0.0; - double bias = 0.0; double bias_sum = 0.0; if (value) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e * value[j]; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } - if (activation) { bias = std::abs(bias_sum); } - // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (bias_lambda != 0.0) { - row_err += bias_lambda * (bias / std::max(1, n_per_row)); + if (activation && 
bias_lambda != 0.0) { + // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) + const double bias = std::abs(bias_sum) / std::max(1, n_per_row); + row_err += bias_lambda * bias; } - total_err += row_err; + slice_err += row_err; } - // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); - total_err *= scale_rows; + // Scale the slice contribution by the sampling factor + const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; @@ -1002,7 +997,7 @@ static std::unordered_map target_bpw_type( if (delta_bytes == 0) { continue; } double err = (double)cur.error - (double)nxt.error; - err = std::max(err, 0.0); // do not penalize due to sampling noise + err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { From b0b33b7ccbc5880e6ac5206ea309ee328e685c08 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 20:58:26 +0100 Subject: [PATCH 026/148] Optimise tensor sampling --- src/llama-quant.cpp | 197 ++++++++++++++++++++++++++------------------ 1 file changed, 119 insertions(+), 78 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44cf9e30e3c..830bf915cfc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -609,7 +609,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 128, + int sample_rows_per_expert = 256, float bias_lambda = 1.0 ) { struct candidate_types { @@ -671,7 +671,7 @@ static std::unordered_map target_bpw_type( auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; - q &= (ggml_n_dims(t) >= 2); + q &= ggml_n_dims(t) >= 2; q &= name.find("_norm.weight") == std::string::npos; q &= name.find("ffn_gate_inp.weight") == std::string::npos; q &= name.find("altup") == std::string::npos; @@ -719,9 +719,9 @@ static std::unordered_map target_bpw_type( auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); return (size_t)ne2 * (size_t)nrows * row_sz; }; @@ -734,7 +734,7 @@ static std::unordered_map target_bpw_type( auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc + if (blck <= 1) { return true; } return n_per_row % blck == 0; }; @@ -742,15 +742,20 @@ static std::unordered_map target_bpw_type( if (is_compatible(t, typ)) { return typ; } ggml_type fb = fallback_type(typ); if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; // final guard + return GGML_TYPE_F16; }; - // Estimate error for a given type using a sampled subset of rows. - // Uses both imatrix (E[a^2]) and activations (E[a]) if available. 
- auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double { + // Estimate error for a given type using a sampled subset of rows + auto estimate_error = [&](const ggml_tensor * t, + const ggml_type typ, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const std::vector & values_sample, + const std::vector & activations_sample) -> double + { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { @@ -758,70 +763,73 @@ static std::unordered_map target_bpw_type( return 1e35f; } - // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly - const int64_t rows_per_expert = nrows; - const int64_t sample_rows = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows); - - const size_t row_sz = ggml_row_size(typ, n_per_row); - std::vector qbuf(row_sz * sample_rows); - std::vector f32_sample(sample_rows * n_per_row); - std::vector deq(sample_rows * n_per_row); + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + if (total_sampled_rows == 0) { return 0.0; } - double total_err = 0.0; + const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; + std::vector qbuf(qbuf_size); + std::vector deq(f32_sample.size()); + // Quantize all sampled rows at once and dequantize back + size_t qbuf_offset = 0; + size_t f32_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; - const float * activation = activations_all ? (activations_all + slice * n_per_row) : nullptr; - - int64_t rs = 0; - for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { - const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; - std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); - ++rs; - } + const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - // Quantize sample rows and dequantize back - (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); + const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); + qbuf_offset += ggml_row_size(typ, n_per_row) * rs; + f32_offset += rs * n_per_row; + } + + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + + double total_err = 0.0; + size_t sample_offset = 0; + + for (int64_t slice = 0; slice < ne2; ++slice) { + const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * activation_slice = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; + const int64_t rs = sample_rows_per_slice[slice]; - // Compute error proxy per sampled slice double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + s * n_per_row; - const float * ys = deq.data() + s * n_per_row; + const float * xs = f32_sample.data() + sample_offset; + const float * ys = deq.data() + sample_offset; - double mse_w = 0.0; + double mse_w = 0.0; double bias_sum = 0.0; - if (value) { + if (value_slice) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; - mse_w += e * e * value[j]; - if (activation) { bias_sum += e * activation[j]; } + mse_w += e * e * value_slice[j]; + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { bias_sum += e * activation[j]; } + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (activation && bias_lambda != 0.0) { + if (activation_slice && bias_lambda != 0.0) { // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) const double bias = std::abs(bias_sum) / std::max(1, n_per_row); row_err += bias_lambda * bias; } slice_err += row_err; + sample_offset += n_per_row; } // Scale the slice contribution by the sampling factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + const double rows_per_expert = (double) nrows; + const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); total_err += slice_err * scale_rows; } @@ -858,8 +866,40 @@ static std::unordered_map target_bpw_type( f32_data = (float *)f32_conv_buf.data(); } - const float * values = get_values(name); - const float * activations = get_activations(name); + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + + // Sample the tensor rows once, before looping through quantization candidates. + const int64_t n_per_row = t->ne[0]; + const int64_t nrows_total = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; + const int64_t rows_per_expert = nrows_total; + const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); + const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + + std::vector f32_sample; + std::vector values_sample; + std::vector activations_sample; + std::vector sample_rows_per_slice(ne2); + + for (int64_t slice = 0; slice < ne2; ++slice) { + int64_t current_sampled_rows = 0; + for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + current_sampled_rows++; + } + sample_rows_per_slice[slice] = current_sampled_rows; + } + + if (values_all) { + values_sample.resize(ne2 * n_per_row); + std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + } + if (activations_all) { + activations_sample.resize(ne2 * n_per_row); + std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + } tensor_info info; info.w = tw; @@ -874,7 +914,7 @@ static std::unordered_map target_bpw_type( // Build per-tensor candidate list for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values) { continue; } + if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } @@ -882,19 +922,18 @@ static std::unordered_map target_bpw_type( auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - // Estimate error - auto err = (float)estimate_error(t, f32_data, tt, values, activations); - - info.candidate.push_back(candidate_types{tt, bpw, bytes, err}); + // Estimate error using the pre-sampled data + auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } if (info.candidate.empty()) { // As a last resort, keep original type float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); + info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } return a.bytes < b.bytes; @@ -905,7 +944,7 @@ static std::unordered_map target_bpw_type( std::vector uniq; uniq.reserve(info.candidate.size()); - for (size_t i = 0; i < info.candidate.size(); ) { + for (size_t i = 0; i < info.candidate.size();) { size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error @@ -972,36 +1011,39 @@ static std::unordered_map target_bpw_type( }; // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info &ti) -> int { - const auto &cand = ti.candidate; - const auto &cur = cand[ti.choice]; + auto next_distinct_idx = [&](const tensor_info & ti) -> int { + const auto & cand = ti.candidate; + const auto & cur = cand[ti.choice]; int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j; + while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { + ++j; + } + 
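+        // e.g. with candidate byte sizes {100, 100, 120} and choice == 0, this returns index 2,
+        // so an accepted upgrade always strictly increases the tensor's byte footprint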
return j < (int)cand.size() ? j : -1; }; auto recompute_best_upgrade = [&]() -> upgrade { const double eps = 1e-12; - upgrade best{-1, -1, 0.0, 0, -1.0}; - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + upgrade best{ -1, -1, 0.0, 0, -1.0 }; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - int j = next_distinct_idx(ti); + const int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; + const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { - best = upgrade{i, j, err, delta_bytes, ratio}; + best = upgrade{ i, j, err, delta_bytes, ratio }; } } return best; @@ -1014,8 +1056,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - - if (bpw_next <= (double)target_bpw + 1e-12) { + if (bpw_next <= target_bpw + 1e-12) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1026,22 +1067,22 @@ static std::unordered_map target_bpw_type( // We might still be below target but taking any single upgrade overshoots. // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
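        // e.g. with bpw_now = 3.95 and target_bpw = 4.00, upgrades landing at 4.03 and 4.10
        // both overshoot; the 4.03 one wins on gap, and ties fall back to the better ratio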
{ - double under_gap = (double)target_bpw - bpw_now; + double under_gap = target_bpw - bpw_now; - upgrade best_over{-1, -1, 0.0, 0, -1.0}; - double best_over_gap = 1e300; + upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; + double best_over_gap = 1e300; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } @@ -1051,13 +1092,13 @@ static std::unordered_map target_bpw_type( double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; - best_over = upgrade{i, j, err, delta_bytes, ratio}; + best_over = upgrade{ i, j, err, delta_bytes, ratio }; } } From 35ad0fc4addf92e9dc0700a88004962731f3c9e0 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 23:27:20 +0100 Subject: [PATCH 027/148] Improve error estimation using weighted MSE --- src/llama-quant.cpp | 62 +++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 830bf915cfc..f5fa309c444 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -783,14 +783,26 @@ static std::unordered_map target_bpw_type( f32_offset += rs * n_per_row; } - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + if (typ == GGML_TYPE_F16) { + const auto *const src = (const ggml_fp16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else if (typ == GGML_TYPE_BF16) { + const auto *const src = (const ggml_bf16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else { + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + } double total_err = 0.0; size_t sample_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * activation_slice = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; + const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * act = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; double slice_err = 0.0; @@ -799,37 +811,37 @@ static std::unordered_map target_bpw_type( const float * ys = deq.data() + sample_offset; double mse_w = 0.0; - double bias_sum = 0.0; - - if (value_slice) { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e * value_slice[j]; - if (activation_slice) { bias_sum += e * activation_slice[j]; } - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e; - if (activation_slice) { bias_sum += e * activation_slice[j]; } + double x2_w = 0.0; + double bias_num = 0.0; + double bias_den = 0.0; + + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = ys[j] - xs[j]; + const double w = wv ? wv[j] : 1.0; + mse_w += w * e * e; + x2_w += w * xs[j] * xs[j]; + + if (act) { + const double a = act[j]; + bias_num += e * a; + bias_den += a * a; } } - // Normalize by n_per_row to get a per-row average scale - double row_err = mse_w / std::max(1, n_per_row); - if (activation_slice && bias_lambda != 0.0) { - // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) - const double bias = std::abs(bias_sum) / std::max(1, n_per_row); - row_err += bias_lambda * bias; + const double eps = 1e-30; + double row_err = mse_w / (x2_w + eps); + + if (act && bias_lambda != 0.0) { + const double bias_norm = bias_num * bias_num / (bias_den + eps); + row_err += bias_lambda * bias_norm; } slice_err += row_err; sample_offset += n_per_row; } - // Scale the slice contribution by the sampling factor - const double rows_per_expert = (double) nrows; - const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); + const auto rows_per_expert = nrows; + const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 5ef493ea1a01385c02ef4c56d38dfe5e116c47c6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 09:48:29 +0100 Subject: [PATCH 028/148] Exclude embeddings and output tensor --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f5fa309c444..32013e47baf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,8 +697,10 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= params->quantize_output_tensor || name != "output.weight"; q &= !params->only_copy; + // TODO: Exclude embeddings and output tensors? 
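+            // the embedding and output matrices are usually the largest single tensors, so they
+            // are left to the regular ftype rules rather than letting them swallow the bpw budget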
+ q &= params->quantize_output_tensor || name != "output.weight"; + q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 95b2ab2800e26a5bd5b60c61f9593d720a97eb7a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 10:46:37 +0100 Subject: [PATCH 029/148] Change error estimate to use normalised weighted MSE --- src/llama-quant.cpp | 204 +++++++++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 70 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 32013e47baf..629056ee065 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -661,8 +662,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q6_K }; auto name_tn = LLM_TN(model.arch); @@ -752,103 +752,125 @@ static std::unordered_map target_bpw_type( const ggml_type typ, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, - const std::vector & values_sample, - const std::vector & activations_sample) -> double + const float * values_sample, + const float * activations_sample, + std::vector & qbuf, + std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - // Cannot dequantize candidate -> assign very high error - return 1e35f; - } + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t total_sampled_rows = f32_sample.size() / n_per_row; if (total_sampled_rows == 0) { return 0.0; } - const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; - std::vector qbuf(qbuf_size); - std::vector deq(f32_sample.size()); + const size_t row_sz = ggml_row_size(typ, n_per_row); + const size_t need_q = row_sz * total_sampled_rows; + if (qbuf.size() < need_q) { qbuf.resize(need_q); } + if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } - // Quantize all sampled rows at once and dequantize back - size_t qbuf_offset = 0; - size_t f32_offset = 0; + // Quantize sampled rows slice-by-slice + size_t qoff = 0; + size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); - qbuf_offset += ggml_row_size(typ, n_per_row) * rs; - f32_offset += rs * n_per_row; + const float * value = values_sample ? 
values_sample + slice * n_per_row : nullptr; + + (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + + qoff += row_sz * rs; + foff += (size_t)rs * n_per_row; } + // Dequantize to deq if (typ == GGML_TYPE_F16) { - const auto *const src = (const ggml_fp16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else if (typ == GGML_TYPE_BF16) { - const auto *const src = (const ggml_bf16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else { - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + const ggml_type_traits * traits = ggml_get_type_traits(typ); + if (!traits || !traits->to_float) { + // no dequantizer available + return 1e35; + } + traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); } + // Compute error + size_t off = 0; double total_err = 0.0; - size_t sample_offset = 0; + const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * act = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } + + const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; + const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; double slice_err = 0.0; - for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + sample_offset; - const float * ys = deq.data() + sample_offset; + + for (int64_t r = 0; r < rs; ++r) { + const float * x = f32_sample.data() + off; + const float * y = deq.data() + off; double mse_w = 0.0; double x2_w = 0.0; - double bias_num = 0.0; - double bias_den = 0.0; - - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = ys[j] - xs[j]; - const double w = wv ? 
wv[j] : 1.0; - mse_w += w * e * e; - x2_w += w * xs[j] * xs[j]; + double bnum = 0.0; + double bden = 0.0; - if (act) { + if (wv && act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; + const double a = act[j]; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else if (wv) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + } + } else if (act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; const double a = act[j]; - bias_num += e * a; - bias_den += a * a; + mse_w += e * e; + x2_w += x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; + mse_w += e * e; + x2_w += x[j] * x[j]; } } - const double eps = 1e-30; double row_err = mse_w / (x2_w + eps); - if (act && bias_lambda != 0.0) { - const double bias_norm = bias_num * bias_num / (bias_den + eps); - row_err += bias_lambda * bias_norm; + row_err += bias_lambda * (bnum * bnum) / (bden + eps); } slice_err += row_err; - sample_offset += n_per_row; + off += (size_t)n_per_row; } - const auto rows_per_expert = nrows; - const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + // scale back up to the full number of rows in this slice + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; - }; +}; std::vector all; all.reserve(tensors.size()); @@ -887,38 +909,70 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const int64_t rows_per_expert = nrows_total; - const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); std::vector f32_sample; std::vector values_sample; std::vector activations_sample; std::vector sample_rows_per_slice(ne2); + std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; - for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); + } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); current_sampled_rows++; } sample_rows_per_slice[slice] = current_sampled_rows; } + auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { + const size_t want = (size_t)ne2 * (size_t)n_per_row; + dst.clear(); + if (!src || src_sz == 0) { return; } + + if (src_sz == want) { + dst.resize(want); + std::memcpy(dst.data(), src, want * sizeof(float)); + } else if (src_sz == (size_t)n_per_row) { + dst.resize(want); + for (int64_t s = 0; s < ne2; ++s) { + std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); + } + } else { + // Mismatch – safer to skip using it for this tensor + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", + __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + } + }; + if (values_all) { - values_sample.resize(ne2 * n_per_row); - std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + // get size from the map (not just the raw pointer) + auto itv = values_data->find(remap_imatrix(name, mapped)); + const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); + copy_or_broadcast(values_all, sz, values_sample); } if (activations_all) { - activations_sample.resize(ne2 * n_per_row); - std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + auto ita = activations_data->find(remap_imatrix(name, mapped)); + const size_t sz = ita == activations_data->end() ? 
0 : ita->second.size(); + copy_or_broadcast(activations_all, sz, activations_sample); } tensor_info info; info.w = tw; info.n_elements = nelem; + // Prepare scratch buffers sized for the largest candidate row size + size_t total_sampled_rows = f32_sample.size() / n_per_row; + + // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); @@ -926,18 +980,28 @@ static std::unordered_map target_bpw_type( quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); } - // Build per-tensor candidate list + // Compute maximum row size among compatible candidates (to size qbuf once) + size_t max_row_sz = 0; + std::vector compatible_candidates; + compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } + compatible_candidates.push_back(tt); + max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); + } - // Compute bpw and bytes + std::vector qbuf(max_row_sz * total_sampled_rows); + std::vector deq(f32_sample.size()); + + // Now evaluate candidates + for (ggml_type tt : compatible_candidates) { auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - - // Estimate error using the pre-sampled data - auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } From e01dad886bd2314146ce768240fd0c8a2abecabb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 12:47:13 +0100 Subject: [PATCH 030/148] Parallelise candidate evaluation --- src/llama-quant.cpp | 87 ++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 629056ee065..3cade0bf6fc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 256, + int sample_rows_per_expert = 384, float bias_lambda = 1.0 ) { struct candidate_types { @@ -758,16 +758,17 @@ static std::unordered_map target_bpw_type( std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; + const size_t nels = f32_sample.size(); + const size_t total_sampled_rows = nels / (size_t)n_per_row; if (total_sampled_rows == 0) { return 0.0; } const size_t row_sz = ggml_row_size(typ, n_per_row); const size_t need_q = row_sz * total_sampled_rows; if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } + if (deq.size() < nels) { deq.resize(nels); } // Quantize sampled rows slice-by-slice size_t qoff = 0; @@ -777,31 +778,31 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); - qoff += row_sz * rs; - foff += (size_t)rs * n_per_row; + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize to deq + // Dequantize into deq if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); } else { const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { - // no dequantizer available + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); return 1e35; } - traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); + + traits->to_float(qbuf.data(), deq.data(), (int) nels); } // Compute error + const double eps = 1e-12; size_t off = 0; double total_err = 0.0; - const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; @@ -817,9 +818,9 @@ static std::unordered_map target_bpw_type( const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; - double bnum = 0.0; - double bden = 0.0; + double x2_w = 0.0; + double bnum = 0.0; + double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,8 +829,8 @@ static std::unordered_map target_bpw_type( const double a = act[j]; mse_w += w * e * e; x2_w += w * x[j] * x[j]; - bnum += e * a; - bden += a * a; + bnum += w * e * a; // weighted bias + bden += w * a * a; // weighted norm } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -856,7 +857,9 @@ static std::unordered_map target_bpw_type( } double row_err = mse_w / (x2_w + eps); + if (act && bias_lambda != 0.0) { + // penalize squared projection of error onto activations row_err += bias_lambda * (bnum * bnum) / (bden + eps); } @@ -864,7 +867,7 @@ static std::unordered_map target_bpw_type( off += (size_t)n_per_row; } - // scale back up to the full number of rows in this slice + // scale to full rows in this slice (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } @@ -982,10 +985,14 @@ static std::unordered_map target_bpw_type( // Compute maximum row size among compatible candidates (to size qbuf once) size_t max_row_sz = 0; + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; 
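        // every type that survives make_compatible() lands in compatible_candidates;
        // max_row_sz records the widest candidate row so one scratch buffer fits them all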
compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values_all) { continue; } + if (is_iq(ts_type) && !has_valid_imatrix) { + LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + continue; + } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } compatible_candidates.push_back(tt); @@ -996,13 +1003,37 @@ static std::unordered_map target_bpw_type( std::vector deq(f32_sample.size()); // Now evaluate candidates - for (ggml_type tt : compatible_candidates) { - auto bpw = (float)tensor_bpw(t, tt); - size_t bytes = total_bytes(t, tt); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); - info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); + std::vector cand_out(compatible_candidates.size()); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + + int n_eval_threads = std::max(1, nthread); + std::atomic cidx{0}; + std::vector eval_workers; + eval_workers.reserve(n_eval_threads); + + for (int ti = 0; ti < n_eval_threads; ++ti) { + eval_workers.emplace_back([&] { + // thread-local scratch + std::vector tl_qbuf(qbuf.size()); + std::vector tl_deq(deq.size()); + + for (;;) { + const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + if (i >= compatible_candidates.size()) { break; } + + const ggml_type tt = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(t, tt); + const size_t bytes = total_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); + cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + } + }); + } + for (auto &th : eval_workers) { th.join(); } + + for (auto &c : cand_out) { + if (c.bytes > 0) { info.candidate.push_back(c); } } if (info.candidate.empty()) { From 887490c5ec3c679e8bc0c274b743b483e7c595e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 15:11:49 +0100 Subject: [PATCH 031/148] Dequantise sampled rows only --- src/llama-quant.cpp | 71 ++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3cade0bf6fc..547281bd7d1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 384, + int sample_rows_per_expert = 512, float bias_lambda = 1.0 ) { struct candidate_types { @@ -699,7 +699,7 @@ static std::unordered_map target_bpw_type( q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; // TODO: Exclude embeddings and output tensors? 
- q &= params->quantize_output_tensor || name != "output.weight"; + // q &= params->quantize_output_tensor || name != "output.weight"; q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; @@ -896,31 +896,35 @@ static std::unordered_map target_bpw_type( const int64_t nelem = ggml_nelements(t); std::vector> f32_conv_buf; - float * f32_data = nullptr; - - if (t->type == GGML_TYPE_F32) { - f32_data = (float *)t->data; - } else { - llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread); - f32_data = (float *)f32_conv_buf.data(); - } - const float * values_all = get_values(name); const float * activations_all = get_activations(name); - // Sample the tensor rows once, before looping through quantization candidates. + // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); + + const ggml_type src_type = t->type; + const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const bool src_is_quant = ggml_is_quantized(src_type); + const size_t src_row_sz = ggml_row_size(src_type, n_per_row); std::vector f32_sample; + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + std::vector values_sample; std::vector activations_sample; - std::vector sample_rows_per_slice(ne2); + std::vector sample_rows_per_slice(ne2, 0); + + // deterministic sampling seed based on tensor name + fixed constant + std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); + + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); + + // Temporary buffer for one dequantized row + std::vector rowbuf((size_t)n_per_row); - std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -928,10 +932,30 @@ static std::unordered_map target_bpw_type( std::uniform_int_distribution dist(0, stride - 1); offset = dist(rng); } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - current_sampled_rows++; + if (src_type == GGML_TYPE_F32) { + const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + } else if (src_type == GGML_TYPE_F16) { + const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_type == GGML_TYPE_BF16) { + const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_is_quant) { + const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (!src_traits || 
!src_traits->to_float) {
+                    throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type)));
+                }
+                src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row);
+                f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end());
+            } else {
+                throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type)));
+            }
+            ++current_sampled_rows;
         }
         sample_rows_per_slice[slice] = current_sampled_rows;
     }
@@ -999,15 +1023,16 @@ static std::unordered_map target_bpw_type(
            max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row));
        }

-        std::vector qbuf(max_row_sz * total_sampled_rows);
-        std::vector deq(f32_sample.size());
+        std::sort(compatible_candidates.begin(), compatible_candidates.end());
+        compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end());

         // Now evaluate candidates
         std::vector cand_out(compatible_candidates.size());
         const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data();
         const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data();
-
-        int n_eval_threads = std::max(1, nthread);
+        std::vector qbuf(max_row_sz * total_sampled_rows);
+        std::vector deq(f32_sample.size());
+        int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size()));
         std::atomic cidx{0};
         std::vector eval_workers;

From 9e11f82e8f5ad29cb62cba0bab7014db17a0b2c2 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Thu, 21 Aug 2025 16:25:31 +0100
Subject: [PATCH 032/148] Precompute error denominator in estimate_error()

---
 src/llama-quant.cpp | 154 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 121 insertions(+), 33 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 547281bd7d1..03f8a4bd117 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -598,8 +598,8 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *

 // Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl
 // sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute
-// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight
-// for bias and error, 2.0 means twice as much weight for bias
+// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE),
+// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias
 static std::unordered_map target_bpw_type(
     llama_model_loader & ml,
     std::vector> & buffer,
@@ -658,7 +658,7 @@ static std::unordered_map target_bpw_type(
     GGML_TYPE_IQ3_S,
     GGML_TYPE_IQ4_XS,
     GGML_TYPE_IQ4_NL,
-    // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it
+    // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
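+    // e.g. a 5.5 bpw target cannot be met with IQ types alone (IQ4_NL tops out near 4.5 bpw),
+    // so the Q5/Q6 entries below give an IQ mix the headroom to get there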
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, @@ -770,7 +770,68 @@ static std::unordered_map target_bpw_type( if (qbuf.size() < need_q) { qbuf.resize(need_q); } if (deq.size() < nels) { deq.resize(nels); } - // Quantize sampled rows slice-by-slice + // Precompute denominators: + // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 + // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise + std::vector x2_per_row(total_sampled_rows, 0.0); + std::vector bden_per_slice(ne2, 0.0); + + const bool has_w = (values_sample != nullptr); + const bool has_a = (activations_sample != nullptr); + + // Precompute bden per slice (depends only on w,a) + if (has_a) { + for (int64_t s = 0; s < ne2; ++s) { + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * act = activations_sample + s * n_per_row; + double bden = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += (double) wv[j] * a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += a * a; + } + } + bden_per_slice[s] = bden; + } + } + + // Precompute x2 per sampled row + { + size_t off = 0; + size_t row_idx = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = sample_rows_per_slice[s]; + if (rs == 0) { continue; } + + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + + for (int64_t r = 0; r < rs; ++r, ++row_idx) { + const float * x = f32_sample.data() + off; + double x2 = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double xx = x[j]; + x2 += w * xx * xx; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double xx = x[j]; + x2 += xx * xx; + } + } + x2_per_row[row_idx] = x2; + off += (size_t)n_per_row; + } + } + } + + // Quantize sampled rows slice-by-slice into qbuf size_t qoff = 0; size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { @@ -784,43 +845,50 @@ static std::unordered_map target_bpw_type( foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); - } else { + // Dequantize into deq (row-wise if needed to avoid int overflow) + { const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); - return 1e35; - } + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); + } else if (typ == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + return 1e35; + } - traits->to_float(qbuf.data(), deq.data(), (int) nels); + size_t done = 0; + while (done < nels) { + const size_t chunk = std::min((size_t)n_per_row, nels - done); + traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + done += chunk; + } + } } // Compute error const double eps = 1e-12; size_t off = 0; + size_t row_idx = 0; double total_err = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { 
continue; } - const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; - const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; + const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; + const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; + const double bden = has_a ? bden_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r) { + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + off; const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; double bnum = 0.0; - double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,52 +896,49 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = act[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; - bnum += w * e * a; // weighted bias - bden += w * a * a; // weighted norm + bnum += w * e * a; } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { const double w = wv[j]; const double e = y[j] - x[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; } } else if (act) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; const double a = act[j]; mse_w += e * e; - x2_w += x[j] * x[j]; bnum += e * a; - bden += a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; mse_w += e * e; - x2_w += x[j] * x[j]; } } - double row_err = mse_w / (x2_w + eps); - + // corrected normalization: divide the full numerator by x2 + double numer = mse_w; if (act && bias_lambda != 0.0) { - // penalize squared projection of error onto activations - row_err += bias_lambda * (bnum * bnum) / (bden + eps); + const double proj = bnum * bnum / (bden + eps); + numer += bias_lambda * proj; } + const double denom = x2_per_row[row_idx] + eps; + const double row_err = numer / denom; + slice_err += row_err; off += (size_t)n_per_row; } - // scale to full rows in this slice (nrows) + // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; -}; + }; std::vector all; all.reserve(tensors.size()); @@ -1067,6 +1132,29 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } + // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. 
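+        // e.g. two requested types that collapse to the same fallback produce identical
+        //      byte sizes here; only the lower-error copy of such a pair survives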
+ { + std::vector pruned; + pruned.reserve(info.candidate.size()); + // Sort by bytes asc, error asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); + + double best_err = std::numeric_limits::infinity(); + size_t last_bytes = std::numeric_limits::max(); + + for (const auto &c : info.candidate) { + if (c.error < best_err || c.bytes > last_bytes) { + pruned.push_back(c); + best_err = std::min(best_err, (double)c.error); + last_bytes = c.bytes; + } + } + info.candidate.swap(pruned); + } + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } From 5b6f1e9fde8dc6fd3456358c5b5c758b1f10b11c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 19:18:54 +0100 Subject: [PATCH 033/148] General code refactor --- src/llama-quant.cpp | 407 +++++++++++++++++++++----------------------- 1 file changed, 192 insertions(+), 215 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 03f8a4bd117..85191a66ae8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,10 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl -// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute -// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE), -// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias +// Returns per-tensor type overrides to meet target BPW at lowest ppl static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -609,9 +606,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * values_data, const std::unordered_map> * activations_data, const llama_model_quantize_params * params, - int nthread, - int sample_rows_per_expert = 512, - float bias_lambda = 1.0 + int nthread ) { struct candidate_types { ggml_type type; @@ -621,15 +616,15 @@ static std::unordered_map target_bpw_type( }; struct tensor_info { - const llama_model_loader::llama_tensor_weight * w; - std::vector candidate; + const llama_model_loader::llama_tensor_weight * w = nullptr; + std::vector candidate = {}; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; }; - const ggml_type k_candidates[] = { + constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, @@ -648,7 +643,7 @@ static std::unordered_map target_bpw_type( #endif }; - const ggml_type iq_candidates[] = { + constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, @@ -665,46 +660,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto name_tn = LLM_TN(model.arch); - float target_bpw = params->target_bpw; - - auto can_quantize = [&](const ggml_tensor * t) -> bool { - const std::string name = ggml_get_name(t); - bool q = name.rfind("weight") == name.size() - 6; - q &= ggml_n_dims(t) >= 2; - q &= name.find("_norm.weight") == std::string::npos; - q &= name.find("ffn_gate_inp.weight") == std::string::npos; - q &= name.find("altup") == std::string::npos; - q &= name.find("laurel") == 
std::string::npos; - q &= name.find("per_layer_model_proj") == std::string::npos; - q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); - q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); - q &= name.find("ssm_conv1d.weight") == std::string::npos; - q &= name.find("shortconv.conv.weight") == std::string::npos; - q &= name.find("time_mix_first.weight") == std::string::npos; - q &= name.find("time_mix_w0.weight") == std::string::npos; - q &= name.find("time_mix_w1.weight") == std::string::npos; - q &= name.find("time_mix_w2.weight") == std::string::npos; - q &= name.find("time_mix_v0.weight") == std::string::npos; - q &= name.find("time_mix_v1.weight") == std::string::npos; - q &= name.find("time_mix_v2.weight") == std::string::npos; - q &= name.find("time_mix_a0.weight") == std::string::npos; - q &= name.find("time_mix_a1.weight") == std::string::npos; - q &= name.find("time_mix_a2.weight") == std::string::npos; - q &= name.find("time_mix_g1.weight") == std::string::npos; - q &= name.find("time_mix_g2.weight") == std::string::npos; - q &= name.find("time_mix_decay_w1.weight") == std::string::npos; - q &= name.find("time_mix_decay_w2.weight") == std::string::npos; - q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= !params->only_copy; - // TODO: Exclude embeddings and output tensors? - // q &= params->quantize_output_tensor || name != "output.weight"; - q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); - - return q; - }; - auto get_values = [&](const std::string & tensor_name) -> const float * { if (!values_data) { return nullptr; } const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); @@ -719,7 +674,7 @@ static std::unordered_map target_bpw_type( return it->second.data(); }; - auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; @@ -729,8 +684,8 @@ static std::unordered_map target_bpw_type( auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { const int64_t nelem = ggml_nelements(t); - const size_t bytes = total_bytes(t, typ); - return bytes * 8.0 / nelem; + const size_t bytes = tensor_bytes(t, typ); + return (double)bytes * 8.0 / (double)nelem; }; auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { @@ -747,189 +702,220 @@ static std::unordered_map target_bpw_type( return GGML_TYPE_F16; }; + auto name_tn = LLM_TN(model.arch); + auto can_quantize = [&](const ggml_tensor * t) -> bool { + const std::string name = ggml_get_name(t); + bool q = name.rfind("weight") == name.size() - 6; + q &= ggml_n_dims(t) >= 2; + q &= name.find("_norm.weight") == std::string::npos; + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + q &= name.find("per_layer_model_proj") == std::string::npos; + q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + q &= name.find("attn_rel_b.weight") == std::string::npos; + q &= !params->only_copy; + // TODO: Exclude embeddings and output tensors? + // q &= params->quantize_output_tensor || name != "output.weight"; + q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); + + return q; + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, - const ggml_type typ, + const ggml_type quant_type, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, const float * values_sample, const float * activations_sample, - std::vector & qbuf, - std::vector & deq) -> double + std::vector & quantized_buffer, + std::vector & dequantized_buffer) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t nels = f32_sample.size(); - const size_t total_sampled_rows = nels / (size_t)n_per_row; - if (total_sampled_rows == 0) { return 0.0; } + const size_t sample_element_count = f32_sample.size(); + const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + if (sample_row_count == 0) { return 0.0; } - const size_t row_sz = ggml_row_size(typ, n_per_row); - const size_t need_q = row_sz * total_sampled_rows; - if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < nels) { deq.resize(nels); } + const size_t row_size = ggml_row_size(quant_type, n_per_row); + const size_t buffer_size = row_size * sample_row_count; + if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - // Precompute denominators: - // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 - // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise - std::vector x2_per_row(total_sampled_rows, 0.0); - std::vector bden_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); - const bool has_w = (values_sample != nullptr); - const bool has_a = (activations_sample != nullptr); - - // Precompute bden per slice (depends only on w,a) - if (has_a) { + // Precompute bias denominator per slice + const bool has_values = (values_sample != nullptr); + const bool has_activations = (activations_sample != nullptr); + if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; - const float * act = activations_sample + s * n_per_row; - double bden = 0.0; - if (has_w) { + const float * values = has_values ? values_sample + s * n_per_row : nullptr; + const float * activations = activations_sample + s * n_per_row; + double bias_denominator = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += (double) wv[j] * a * a; + const double a = activations[j]; + bias_denominator += values[j] * a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += a * a; + const double a = activations[j]; + bias_denominator += a * a; } } - bden_per_slice[s] = bden; + bias_denominator_per_slice[s] = bias_denominator; } } - // Precompute x2 per sampled row + // Compute squared norms of sampled rows { - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = sample_rows_per_slice[s]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - double x2 = 0.0; - if (has_w) { + const float * row = f32_sample.data() + offset; + double rsn = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; - const double xx = x[j]; - x2 += w * xx * xx; + const double v = values[j]; + const double x = row[j]; + rsn += v * x * x; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double xx = x[j]; - x2 += xx * xx; + const double x = row[j]; + rsn += x * x; } } - x2_per_row[row_idx] = x2; - off += (size_t)n_per_row; + row_sq_norm[row_idx] = rsn; + offset += (size_t)n_per_row; } } } - // Quantize sampled rows slice-by-slice into qbuf - size_t qoff = 0; - size_t foff = 0; + // Quantize sampled rows slice-by-slice into quantized_buffer + size_t quantised_offset = 0; + size_t floats_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); - qoff += row_sz * (size_t)rs; - foff += (size_t)rs * (size_t)n_per_row; + quantised_offset += row_size * (size_t)rs; + floats_offset += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq (row-wise if needed to avoid int overflow) + // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); } else { if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } size_t done = 0; - while (done < nels) { - const size_t chunk = std::min((size_t)n_per_row, nels - done); - traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + while (done < sample_element_count) { + const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); + traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); done += chunk; } } } // Compute error - const double eps = 1e-12; - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; - const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; - const double bden = has_a ? 
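// (sketch of the metric computed in the loop below, in this code's own notation) For each
// sampled row x with dequantized row y, error e = y - x, imatrix weights w and mean
// activations a:
//
//   row_err = [ sum_j w_j*e_j^2 + lambda * (sum_j w_j*e_j*a_j)^2 / (sum_j w_j*a_j^2 + eps) ]
//             / ( sum_j w_j*x_j^2 + eps )
//
// i.e. a weighted relative MSE plus a penalty for error that is systematically aligned
// with the activations; w_j is treated as 1 wherever no imatrix entry is present.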
bden_per_slice[slice] : 0.0; - + const float * values = has_values ? values_sample + slice * n_per_row : nullptr; + const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; + const double bias_denominator = has_activations ? bias_denominator_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - const float * y = deq.data() + off; - - double mse_w = 0.0; - double bnum = 0.0; - - if (wv && act) { + const float * x = f32_sample.data() + offset; + const float * y = dequantized_buffer.data() + offset; + double weighted_mse = 0.0; + double bias_numerator = 0.0; + if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += w * e * e; - bnum += w * e * a; + const double a = activations[j]; + weighted_mse += v * e * e; + bias_numerator += v * e * a; } - } else if (wv) { + } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - mse_w += w * e * e; + weighted_mse += v * e * e; } - } else if (act) { + } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += e * e; - bnum += e * a; + const double a = activations[j]; + weighted_mse += e * e; + bias_numerator += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - mse_w += e * e; + weighted_mse += e * e; } } - // corrected normalization: divide the full numerator by x2 - double numer = mse_w; - if (act && bias_lambda != 0.0) { - const double proj = bnum * bnum / (bden + eps); - numer += bias_lambda * proj; + double err_numerator = weighted_mse; + constexpr double epsilon = 1e-12; + constexpr float bias_lambda = 1.0; + //bias_lambda defines the weight of the bias term in the weigthed MSE error function + // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, + // 2.0 means twice as much weight for bias, etc + if (activations && bias_lambda != 0.0) { + const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); + err_numerator += bias_lambda * proj; } - const double denom = x2_per_row[row_idx] + eps; - const double row_err = numer / denom; - + const double err_denominator = row_sq_norm[row_idx] + epsilon; + const double row_err = err_numerator / err_denominator; slice_err += row_err; - off += (size_t)n_per_row; + offset += (size_t)n_per_row; } // scale to full rows (nrows) @@ -942,14 +928,11 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); - for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); @@ -959,37 +942,26 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - const int64_t nelem = ggml_nelements(t); - std::vector> f32_conv_buf; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); - // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
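// (note on the sampling that follows) Rows are taken per 2D slice at a fixed stride, and
// the RNG is seeded from a hash of the tensor name XOR'd with a constant, so repeated runs
// pick identical rows and candidate errors stay reproducible. A minimal standalone form of
// the seeding, assuming <string> and <random> and the surrounding variable names:
//
//   std::mt19937 rng(std::hash<std::string>{}(name) ^ 0xeabada55cafed00d);
//   const int64_t stride = std::max<int64_t>(1, nrows_total / sample_rows_max);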
t->ne[2] : 1; - const ggml_type src_type = t->type; - const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); - const bool src_is_quant = ggml_is_quantized(src_type); - const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - + // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute + int sample_rows_per_expert = 512; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - std::vector values_sample; - std::vector activations_sample; - std::vector sample_rows_per_slice(ne2, 0); - // deterministic sampling seed based on tensor name + fixed constant std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); - + std::vector sample_rows_per_slice(ne2, 0); const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); - - // Temporary buffer for one dequantized row - std::vector rowbuf((size_t)n_per_row); - + std::vector row_buffer(n_per_row); + const ggml_type src_type = t->type; + const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const bool src_is_quant = ggml_is_quantized(src_type); + const size_t src_row_sz = ggml_row_size(src_type, n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -1004,19 +976,19 @@ static std::unordered_map target_bpw_type( f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } - src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } @@ -1045,6 +1017,10 @@ static std::unordered_map target_bpw_type( } }; + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + std::vector values_sample; + std::vector activations_sample; if (values_all) { // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); @@ -1057,6 +1033,7 @@ static std::unordered_map 
target_bpw_type( copy_or_broadcast(activations_all, sz, activations_sample); } + const int64_t nelem = ggml_nelements(t); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1067,12 +1044,12 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); + quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); } else { - quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); + quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); } - // Compute maximum row size among compatible candidates (to size qbuf once) + // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; @@ -1092,21 +1069,20 @@ static std::unordered_map target_bpw_type( compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); // Now evaluate candidates - std::vector cand_out(compatible_candidates.size()); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - std::vector qbuf(max_row_sz * total_sampled_rows); - std::vector deq(f32_sample.size()); + std::vector eval_candidates(compatible_candidates.size()); + const float *values = values_sample.empty() ? nullptr : values_sample.data(); + const float *activations = activations_sample.empty() ? nullptr : activations_sample.data(); + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; eval_workers.reserve(n_eval_threads); - for (int ti = 0; ti < n_eval_threads; ++ti) { eval_workers.emplace_back([&] { // thread-local scratch - std::vector tl_qbuf(qbuf.size()); - std::vector tl_deq(deq.size()); + std::vector tl_quantized_buffer(quantized_buffer.size()); + std::vector tl_dequantised_buffer(dequantised_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); @@ -1114,15 +1090,16 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = total_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); - cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + const size_t bytes = tensor_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); } + for (auto &th : eval_workers) { th.join(); } - for (auto &c : cand_out) { + for (auto &c : eval_candidates) { if (c.bytes > 0) { info.candidate.push_back(c); } } @@ -1132,7 +1109,7 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. 
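// A minimal, self-contained sketch of the pruning rule applied here (assumes <vector>,
// <algorithm> and <limits>; `cand` is a stand-in for candidate_types): after sorting by
// bytes ascending with ties broken by error, a candidate survives only if its error is
// strictly lower than that of every candidate of smaller or equal size seen before it.
struct cand { size_t bytes; double error; };
static std::vector<cand> pareto_front(std::vector<cand> cs) {
    std::sort(cs.begin(), cs.end(), [](const cand & a, const cand & b) {
        return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
    });
    std::vector<cand> front;
    double best_err = std::numeric_limits<double>::infinity();
    for (const auto & c : cs) {
        if (c.error < best_err) { // dominated candidates (>= bytes and >= error) are dropped
            front.push_back(c);
            best_err = c.error;
        }
    }
    return front;
}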
+ // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. { std::vector pruned; pruned.reserve(info.candidate.size()); @@ -1155,36 +1132,37 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - // Collapse candidates with identical storage size (bytes) { - std::vector uniq; - uniq.reserve(info.candidate.size()); + std::vector unique; + unique.reserve(info.candidate.size()); + // Sort by bpw asc, error asc, bytes asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bpw != b.bpw) { return a.bpw < b.bpw; } + if (a.error != b.error) { return a.error < b.error; } + return a.bytes < b.bytes; + }); for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; + size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { best = info.candidate[j]; } + if (info.candidate[j].error < best.error) { + best = info.candidate[j]; + } ++j; } - uniq.push_back(best); + unique.push_back(best); i = j; } - info.candidate.swap(uniq); + info.candidate.swap(unique); } // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; - all.push_back(std::move(info)); } @@ -1196,6 +1174,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { b += ti.candidate[ti.choice].bytes; } + return b; }; @@ -1204,6 +1183,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { w += ti.n_elements; } + return w; }; @@ -1215,12 +1195,14 @@ static std::unordered_map target_bpw_type( // Precompute current bpw double bpw_now = current_bpw(); + float target_bpw = params->target_bpw; // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } @@ -1268,6 +1250,7 @@ static std::unordered_map target_bpw_type( best = upgrade{ i, j, err, delta_bytes, ratio }; } } + return best; }; @@ -1286,16 +1269,12 @@ static std::unordered_map target_bpw_type( } } - // We might still be below target but taking any single upgrade overshoots. - // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
+ // We might still be below target so we try to find the best upgrade one last time { - double under_gap = target_bpw - bpw_now; - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; double best_over_gap = 1e300; - + double under_gap = target_bpw - bpw_now; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } @@ -1305,19 +1284,16 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } size_t over_bytes = now_bytes + delta_bytes; double bpw_over = (double)over_bytes * 8.0 / (double)tw; - - double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); + double over_gap = std::abs(bpw_over - (double)target_bpw); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; @@ -1339,6 +1315,7 @@ static std::unordered_map target_bpw_type( __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } From ec0afbe79ff001af56846365f91f97240bd2dbf4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 01:46:09 +0100 Subject: [PATCH 034/148] Include embeddings and output tensors --- src/llama-quant.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 85191a66ae8..b9e3c19a89a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -733,9 +733,6 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; - // TODO: Exclude embeddings and output tensors? - // q &= params->quantize_output_tensor || name != "output.weight"; - q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 35c1504441eb03b126b15a6ddd4625f094dc7dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:01:57 +0100 Subject: [PATCH 035/148] Fix byte count for 3d or higher tensors --- src/llama-quant.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b9e3c19a89a..8cc5f221ea1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -676,10 +676,9 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); - return (size_t)ne2 * (size_t)nrows * row_sz; + const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = ggml_nrows(t); + return (size_t)nrows * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { From bb0d912c1f93de2ef1af4ef9fb467c4862012898 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:02:56 +0100 Subject: [PATCH 036/148] Update comments --- src/llama-quant.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8cc5f221ea1..4b846c7d0c7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -703,6 +703,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { + // This list should be kept in sync with llama_tensor_quantize_impl() const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -902,7 +903,7 @@ static std::unordered_map target_bpw_type( constexpr float bias_lambda = 1.0; //bias_lambda defines the weight of the bias term in the weigthed MSE error function // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc + // 2.0 means twice as much weight for bias, etc. Default is 1.0. if (activations && bias_lambda != 0.0) { const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); err_numerator += bias_lambda * proj; @@ -1192,7 +1193,7 @@ static std::unordered_map target_bpw_type( double bpw_now = current_bpw(); float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) + // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { From 2f13fee795639841de46b8f415a233062aa5d2b8 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:05:55 +0100 Subject: [PATCH 037/148] Parameterise type --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b846c7d0c7..e5e27da5096 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -760,8 +760,8 @@ static std::unordered_map target_bpw_type( if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); // Precompute bias denominator per slice const bool has_values = (values_sample != nullptr); From 47cdbe21552324cd79b9243485eeb455cab4673a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:11:11 +0100 Subject: [PATCH 038/148] Reduce sampling window to speedup process --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e5e27da5096..5460669e7ce 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -945,7 +945,7 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - int sample_rows_per_expert = 512; + constexpr int sample_rows_per_expert = 384; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); From 01c927fb94163ddb36365323683274071c034690 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:14:14 +0100 Subject: [PATCH 039/148] Improve pareto efficient candidate selection --- src/llama-quant.cpp | 49 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5460669e7ce..14d9087f53e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1106,54 +1106,33 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. { std::vector pruned; pruned.reserve(info.candidate.size()); - // Sort by bytes asc, error asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + + // Sort by bytes ascending, error ascending + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); double best_err = std::numeric_limits::infinity(); size_t last_bytes = std::numeric_limits::max(); - - for (const auto &c : info.candidate) { - if (c.error < best_err || c.bytes > last_bytes) { - pruned.push_back(c); - best_err = std::min(best_err, (double)c.error); + for (const auto & c : info.candidate) { + // Only keep the best error seen so far at strictly larger byte sizes + if (c.bytes != last_bytes) { + // first time we see this byte size last_bytes = c.bytes; - } - } - info.candidate.swap(pruned); - } - - // Collapse candidates with identical storage size (bytes) - { - std::vector unique; - unique.reserve(info.candidate.size()); - // Sort by bpw asc, error asc, bytes asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - - for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; - candidate_types best = info.candidate[i]; - // group same-byte entries, keep the one with the lowest error - while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { - best = info.candidate[j]; + if (c.error < best_err) { + pruned.push_back(c); + best_err = c.error; } - ++j; + } else { + // same bytes: we already sorted by error; skip } - unique.push_back(best); - i = j; } - info.candidate.swap(unique); + info.candidate.swap(pruned); } // Initialize choice at the smallest bpw candidate From 897decbe8a062ded079f1f1a866392571ed7f95f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:15:11 +0100 Subject: [PATCH 040/148] Show skipped IQ tensors --- src/llama-quant.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 14d9087f53e..c5c19f3c5f3 100644 --- 
a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,7 +1019,6 @@ static std::unordered_map target_bpw_type( std::vector values_sample; std::vector activations_sample; if (values_all) { - // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); copy_or_broadcast(values_all, sz, values_sample); @@ -1053,7 +1052,7 @@ static std::unordered_map target_bpw_type( compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); @@ -1214,13 +1213,11 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } double err = cur.error - nxt.error; err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; From f05c8483d8b138c58a41ecdf32f95947bb130be5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:17:58 +0100 Subject: [PATCH 041/148] Improve dequantized_buffer fill --- src/llama-quant.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c5c19f3c5f3..db4a0e1a20e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -843,12 +843,9 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } - - size_t done = 0; - while (done < sample_element_count) { - const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); - traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); - done += chunk; + const size_t row_size = ggml_row_size(quant_type, n_per_row); + for (size_t r = 0; r < sample_row_count; ++r) { + traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); } } } From fea99d051ad3a9f3cce3cdf084074e0655f47e14 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:57:58 +0100 Subject: [PATCH 042/148] Refactor and combine lambdas --- src/llama-quant.cpp | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db4a0e1a20e..10993e89c6a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -660,20 +660,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto get_values = [&](const std::string & tensor_name) -> const float * { - if (!values_data) { return nullptr; } - const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); - if (it == values_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto get_activations = [&](const std::string & tensor_name) -> const float * { - if (!activations_data) { return nullptr; } - const auto it = 
activations_data->find(remap_imatrix(tensor_name, mapped)); - if (it == activations_data->end()) { return nullptr; } - return it->second.data(); - }; - auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -991,6 +977,15 @@ static std::unordered_map target_bpw_type( sample_rows_per_slice[slice] = current_sampled_rows; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { + if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); + const auto it = m->find(key); + if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; + }; + + // Copy this row's side data (values and activations), or broadcasts to all slices auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); @@ -1005,26 +1000,17 @@ static std::unordered_map target_bpw_type( std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } } else { - // Mismatch – safer to skip using it for this tensor LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", __func__, name.c_str(), src_sz, (size_t)n_per_row, want); } }; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); + const auto [values_all, values_sz] = side_data(values_data, name); + const auto [activations_all, activations_sz] = side_data(activations_data, name); std::vector values_sample; std::vector activations_sample; - if (values_all) { - auto itv = values_data->find(remap_imatrix(name, mapped)); - const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); - copy_or_broadcast(values_all, sz, values_sample); - } - if (activations_all) { - auto ita = activations_data->find(remap_imatrix(name, mapped)); - const size_t sz = ita == activations_data->end() ? 0 : ita->second.size(); - copy_or_broadcast(activations_all, sz, activations_sample); - } + if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } + if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } const int64_t nelem = ggml_nelements(t); tensor_info info; From 6d17889addf3aa18000334e1dd958111104cdf3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:58:46 +0100 Subject: [PATCH 043/148] Log if override is from tensor-type or from bpw-target --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 10993e89c6a..721deaddad0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1049,8 +1049,8 @@ static std::unordered_map target_bpw_type( // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); - const float *values = values_sample.empty() ? nullptr : values_sample.data(); - const float *activations = activations_sample.empty() ? nullptr : activations_sample.data(); + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? 
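// (sketch of the side-data layout assumed by copy_or_broadcast above) It accepts either
// one set of per-column statistics shared by all expert slices (src_sz == n_per_row) or
// one set per slice (src_sz == ne2 * n_per_row). E.g. with ne2 = 8 experts and
// n_per_row = 4096, a 4096-entry imatrix row is replicated 8 times into a 32768-float
// buffer; any other size is ignored with a warning rather than risk misaligned weighting.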
nullptr : activations_sample.data(); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); @@ -1656,15 +1656,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); // get bpw override const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end()) { new_type = override->second; } - // unless the user specifies a type, and the tensor geometry will not require fallback quantisation + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); + new_type = override->second; + } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); const std::string tensor_name(tensor->name); for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } From 9a4b1154974d5ddbfb9d9d3f785f5a29bb202fac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:01 +0100 Subject: [PATCH 044/148] Explicitly adding include --- src/llama-quant.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 721deaddad0..d17b21d0086 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -4,6 +4,7 @@ #include "llama-model-loader.h" #include +#include #include #include #include From f75265f55bb1d4470dea57f4c9e3ad108cc343a1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:37 +0100 Subject: [PATCH 045/148] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d17b21d0086..6e3aa3f83d1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1535,7 +1535,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } From 73124a9921b967fe9e5afbb9f48924a3d48983a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:17:22 +0100 Subject: [PATCH 046/148] Refactor estimate_error() --- src/llama-quant.cpp | 131 ++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e3aa3f83d1..3c358fb67e2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -742,38 +742,33 @@ static std::unordered_map target_bpw_type( const size_t sample_row_count = sample_element_count / 
(size_t)n_per_row; if (sample_row_count == 0) { return 0.0; } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - const size_t buffer_size = row_size * sample_row_count; - if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); + const size_t buffer_sz = row_sz * sample_row_count; + + if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + const bool has_values = values_sample != nullptr; + const bool has_activations = activations_sample != nullptr; - // Precompute bias denominator per slice - const bool has_values = (values_sample != nullptr); - const bool has_activations = (activations_sample != nullptr); + // Bias denominators per slice (only needed if we have activations) + std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { const float * values = has_values ? values_sample + s * n_per_row : nullptr; const float * activations = activations_sample + s * n_per_row; - double bias_denominator = 0.0; - if (has_values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += values[j] * a * a; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += a * a; - } + double denom = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = activations[j]; + const double w = values ? values[j] : 1.0; + denom += w * a * a; } - bias_denominator_per_slice[s] = bias_denominator; + bias_denominator_per_slice[s] = denom; } } - // Compute squared norms of sampled rows + // Compute per-row squared norms with weighting (if values are provided) + std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; @@ -784,18 +779,18 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * row = f32_sample.data() + offset; + const float * x = f32_sample.data() + offset; double rsn = 0.0; - if (has_values) { + if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; - const double x = row[j]; - rsn += v * x * x; + const double v = values[j]; + const double xx = x[j]; + rsn += v * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double x = row[j]; - rsn += x * x; + const double xx = x[j]; + rsn += xx * xx; } } row_sq_norm[row_idx] = rsn; @@ -805,35 +800,44 @@ static std::unordered_map target_bpw_type( } // Quantize sampled rows slice-by-slice into quantized_buffer - size_t quantised_offset = 0; - size_t floats_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; - if (rs == 0) { continue; } + { + size_t q_offset = 0; + size_t f_offset = 0; + for (int64_t slice = 0; slice < ne2; ++slice) { + const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } - const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); + const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - quantised_offset += row_size * (size_t)rs; - floats_offset += (size_t)rs * (size_t)n_per_row; + q_offset += row_sz * (size_t)rs; + f_offset += (size_t)rs * (size_t)n_per_row; + } } // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return 1e35; - } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - for (size_t r = 0; r < sample_row_count; ++r) { - traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); + auto row_to_float = [&](size_t r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); + return false; + } + traits->to_float(src, dst, (int)n_per_row); } + + return true; + }; + + for (size_t r = 0; r < sample_row_count; ++r) { + if (!row_to_float(r)) { return 1e35; } } } @@ -847,20 +851,22 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denominator = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const double bias_denom = has_activations ? 
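// (interpretation, for the loop below) bias_num accumulates the w-weighted inner product
// <e, a> between the quantization error and the mean activations, and
// bias_num^2 / bias_denom is the squared length of e's projection onto a under that inner
// product. A candidate type can have a small weighted MSE yet a consistently signed error
// along a -- exactly the component that shifts the layer's output -- and this term charges
// for it.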
bias_denominator_per_slice[slice] : 0.0; + double slice_err = 0.0; + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; double weighted_mse = 0.0; - double bias_numerator = 0.0; + double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double v = values[j]; const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += v * e * e; - bias_numerator += v * e * a; + bias_num += v * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -873,7 +879,7 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += e * e; - bias_numerator += e * a; + bias_num += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -882,24 +888,19 @@ static std::unordered_map target_bpw_type( } } - double err_numerator = weighted_mse; + constexpr float bias_lambda = 1.75f; constexpr double epsilon = 1e-12; - constexpr float bias_lambda = 1.0; - //bias_lambda defines the weight of the bias term in the weigthed MSE error function - // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc. Default is 1.0. - if (activations && bias_lambda != 0.0) { - const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); - err_numerator += bias_lambda * proj; + double err_num = weighted_mse; + if (activations && bias_lambda != 0.0f) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += (double)bias_lambda * proj; } - const double err_denominator = row_sq_norm[row_idx] + epsilon; - const double row_err = err_numerator / err_denominator; - slice_err += row_err; + const double err_den = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / err_den; offset += (size_t)n_per_row; } - // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 68ae5e66cea41457a3ed11018374b64e2f94d3d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:50:55 +0100 Subject: [PATCH 047/148] Improve list of candidate types --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3c358fb67e2..392a23b5ca2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1023,21 +1023,20 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) - std::vector quant_candidates; - if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); - } else { - quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); - } + const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; + const size_t base_sz = is_iq(params->ftype) ? 
sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); - // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + std::vector compatible_candidates; - compatible_candidates.reserve(quant_candidates.size()); - for (ggml_type ts_type : quant_candidates) { + compatible_candidates.reserve(base_sz); + + for (size_t i = 0; i < base_sz; ++i) { + ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", + __func__, ggml_type_name(ts_type), name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); From decafae27060ed923c69ce3b89db505538a9b230 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 11:30:11 +0100 Subject: [PATCH 048/148] Adjust bias_lambda --- src/llama-quant.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 392a23b5ca2..4ce651723f8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -888,7 +888,9 @@ static std::unordered_map target_bpw_type( } } - constexpr float bias_lambda = 1.75f; + // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { @@ -1024,7 +1026,7 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); + const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; From 3856d60328349c5b2a4e381d6fdff20d272415ab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 14:45:07 +0100 Subject: [PATCH 049/148] Restrict quant types per family --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ce651723f8..7615376e31c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -628,11 +628,7 @@ static std::unordered_map target_bpw_type( constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, @@ -646,19 +642,12 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, - GGML_TYPE_IQ1_M, - GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, - // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
- GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -888,8 +877,8 @@ static std::unordered_map target_bpw_type( } } - // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; From 61c0e01f500ef2610904045c6a7852956c7ba6ba Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:36:03 +0100 Subject: [PATCH 050/148] Execute bpw_overrides() only if an imatrix file is provided --- src/llama-quant.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7615376e31c..4ed94540687 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1525,9 +1525,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::unordered_map bpw_overrides = {}; - if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + if (params->target_bpw != -1.0f && !params->only_copy) { + if (params->imatrix) { + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + } else { + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + } else { + LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + } } int cur_split = -1; From d4ac2106fb5b9e1a98d6aef8a0931e73e46f324e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:39:10 +0100 Subject: [PATCH 051/148] Improve logging and some minor code refactoring --- src/llama-quant.cpp | 26 +++++++++++++++----------- tools/quantize/quantize.cpp | 7 +------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ed94540687..407a63d887d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -132,7 +132,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< for (const auto & p : mapped) { if (p.second == blk) { - LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first); return new_name.replace(match.position(1), match.length(1), std::to_string(p.first)); } } @@ -1257,7 +1256,7 @@ static std::unordered_map target_bpw_type( // Build the override map std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw); + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); for (const auto & ti : all) { 
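    // each line below reports the chosen type, its bpw and the estimated error; the
    // headline bpw of the finished mix follows from the same fields. A minimal sketch,
    // illustrative only and not part of this patch (assumes every ti.choice is set):
    auto achieved_bpw = [](const std::vector<tensor_info> & v) -> double {
        size_t bytes = 0;
        size_t elems = 0;
        for (const auto & x : v) {
            bytes += x.candidate[x.choice].bytes; // bytes of the selected candidate
            elems += (size_t)x.n_elements;        // number of weights in this tensor
        }
        return elems > 0 ? (double)bytes * 8.0 / (double)elems : 0.0;
    };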
LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); @@ -1352,7 +1351,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size())); + LLAMA_LOG_INFO("================================ Have weights data with %d entries",int(values_data->size())); qs.has_imatrix = true; // check imatrix for nans or infs for (const auto & kv : *values_data) { @@ -1367,7 +1366,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->activations) { activations_data = static_cast>*>(params->activations); if (activations_data) { - LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size())); + LLAMA_LOG_INFO(" and %d activations",int(activations_data->size())); qs.has_activations = true; // check activations for nans or infs for (const auto & kv : *activations_data) { @@ -1379,6 +1378,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1655,12 +1655,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!params->pure && ggml_is_quantized(default_type)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - // get bpw override - const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end() && override->second != new_type) { - LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); - new_type = override->second; + + // get quantization type overrides targeting a given bits per weight budget + if (params->target_bpw != -1.0f && !bpw_overrides.empty()) { + const auto override = bpw_overrides.find(name); + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw override %s) ", ggml_type_name(new_type)); + new_type = override->second; + } } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); @@ -1668,7 +1672,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type override %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } @@ -1699,7 +1703,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (values_data) { auto it = values_data->find(remap_imatrix(tensor->name, mapped)); if (it == values_data->end()) { - LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s, ", __func__, tensor->name); } else { if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { imatrix = it->second.data(); diff --git 
a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index b907008cb4f..77fa6b90cea 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -399,12 +399,7 @@ static int prepare_imatrix(const std::string & imatrix_file, values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); } - if (!values_data.empty()) { - printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size())); - } - if (!activations_data.empty()) { - printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size())); - } + return m_last_call; } From 4286690019f21cae3abb92a7903c6675a3367e5e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 26 Aug 2025 21:39:40 +0100 Subject: [PATCH 052/148] Minor comment update --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 407a63d887d..cbbfdedfbd6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest ppl +// Returns per-tensor type overrides to meet target BPW at lowest error static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, From 04946114c9009cd04f665ed98b55304e376e19d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:01:03 +0100 Subject: [PATCH 053/148] Refactor epsilon into a function-wide variable --- src/llama-quant.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cbbfdedfbd6..da1267ddbc6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -649,6 +649,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q8_0 }; + constexpr double epsilon = 1e-12; + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -1193,7 +1195,7 @@ static std::unordered_map target_bpw_type( double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { + if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; } } @@ -1208,7 +1210,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + 1e-12) { + if (bpw_next <= target_bpw + epsilon) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1241,7 +1243,7 @@ static std::unordered_map target_bpw_type( double ratio = err / (double)(delta_bytes * 8ull); double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { + if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - best_over_gap) <= epsilon && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; } From 8df1d00ae4042a1eee38c1fc9ac06137d5ce5078 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:04:28 +0100 Subject: [PATCH 054/148] Add 
directional scaling --- src/llama-quant.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index da1267ddbc6..a9621eab8e1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -900,6 +900,27 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : 1e35; }; + auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { + if (!activations) { return 1.0f; } + // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) + // If no values, use v=1 + double sum_v = 0.0; + double sum_aw2 = 0.0; + double sum_a2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double v = values ? std::max(0.0f, values[j]) : 1.0; + const double a = activations[j]; + sum_v += v; + sum_aw2 += v * a * a; + sum_a2 += a * a; + } + const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); + const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); + const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; + + // Clamp to a reasonable range + return (float)std::clamp(scale, 0.5, 2.0); + }; std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 66aff8fa1ee1d34c7faaa0ff658a730a9554ef36 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:06:42 +0100 Subject: [PATCH 055/148] Add precise_lambda() --- src/llama-quant.cpp | 102 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a9621eab8e1..662760fbe9a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -921,6 +921,108 @@ static std::unordered_map target_bpw_type( // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; + + // Returns an adaptive lambda for this tensor using a small probe set + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + auto precise_lambda = [&](const ggml_tensor * t, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const float * values, + const float * activations, + const std::vector & compatible_candidates) -> float + { + // No activations => no projection term + if (!activations) { return 0.0f; } + + // pick a tiny probe set: try to spread around mid-range types + std::vector probes; + probes.reserve(3); + auto push_if = [&](const ggml_type tiny) { + if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { + probes.push_back(tiny); + } + }; + + // Prefer family-consistent probes; fall back to whatever exists + push_if(GGML_TYPE_Q4_K); + push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q5_K); + if (probes.empty() && !compatible_candidates.empty()) { + probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); + } + if (probes.size() == 1 && compatible_candidates.size() >= 2) { + probes.push_back(compatible_candidates.front()); + } + if (probes.empty()) { return 0.0f; } + + // Scratch buffers (reused) + const int64_t n_per_row = t->ne[0]; + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + size_t max_row_sz = 0; + for (auto pt : probes) { + max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); + } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector 
dequantized_buffer(f32_sample.size()); + + std::vector ratios; + ratios.reserve(probes.size()); + + for (const auto pt : probes) { + // err at lambda=0 => pure weighted MSE part + double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); + // err at lambda=1 => weighted MSE + projection penalty + const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); + + const double p = std::max(0.0, err1 - err0); // projection term contribution + const double m = std::max(0.0, err0); // MSE term contribution + if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { + ratios.push_back(m / p); + } + } + + if (ratios.empty()) { return 0.0f; } + + std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); + double lambda = ratios[ratios.size() / 2]; + + // activations directional scale + const float scale = directional_scale(values, activations, n_per_row); + lambda *= scale; + + // clamp to safe range + lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; + }; + + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + if (!activations) { return 0.0f; } + double s = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; + const double aw = std::sqrt(w) * activations[j]; + const double aw2 = aw * aw; + s += aw2; + s2 += aw2 * aw2; + } + if (s2 <= 0.0) { return 0.0f; } + const auto d = (double)n_per_row; + //const double p = s * s / (d * s2 + epsilon); + //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); + // Map p in (0,1] to lambda in [0,8] decreasing + double base = 1.0 - s * s / (d * s2 + epsilon); + base = std::clamp(base, 0.0, 1.0); + + // activations directional scale + const double scale = directional_scale(values, activations, n_per_row); + // clamp to safe range + const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + + return (float)lambda; + }; + std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 556f6b04fed2092568e31948708af8102c9e5433 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:08:08 +0100 Subject: [PATCH 056/148] Add --precise-lambda option --- include/llama.h | 1 + src/llama-quant.cpp | 27 +++++++++++++++++---------- tools/quantize/quantize.cpp | 6 +++++- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index 01c5b67c755..3a5bda32eab 100644 --- a/include/llama.h +++ b/include/llama.h @@ -357,6 +357,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 662760fbe9a..98fc11d8403 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -722,7 +722,8 @@ static std::unordered_map target_bpw_type( const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, - std::vector & dequantized_buffer) -> double + std::vector & dequantized_buffer, + float bias_lambda) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -878,10 
+879,6 @@ static std::unordered_map target_bpw_type( } } - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger - constexpr float bias_lambda = 1.5f; - constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { const double proj = bias_num * bias_num / (bias_denom + epsilon); @@ -1163,6 +1160,15 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); + // Compute adaptive bias_lambda for this tensor + float bias_lambda = 0.0f; + { + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); + bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : + fast_lambda(values, activations, n_per_row); + } + // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); const float * values = values_sample.empty() ? nullptr : values_sample.data(); @@ -1186,7 +1192,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1301,7 +1307,6 @@ static std::unordered_map target_bpw_type( }; auto recompute_best_upgrade = [&]() -> upgrade { - const double eps = 1e-12; upgrade best{ -1, -1, 0.0, 0, -1.0 }; for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; @@ -1653,10 +1658,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__); } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } + LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? 
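    // "precise" probes two or three mid-range candidate types and takes the median
    // MSE-to-projection ratio per tensor; "fast" is a closed-form estimate built from
    // the participation ratio of the sqrt(weight)-scaled activations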
"precise lambda (slow)" : "fast lambda"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1966,7 +1972,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.precise_lambda =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 77fa6b90cea..0c9460513c8 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,9 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -538,6 +540,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) { + params.precise_lambda = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From eab8708244db703c5c7219261b0c875c4b57825f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 30 Aug 2025 10:14:46 +0100 Subject: [PATCH 057/148] Minor factoring for efficiency and correctness --- src/llama-quant.cpp | 126 +++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 98fc11d8403..db688fdf02c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest error +// Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -650,6 +650,7 @@ static std::unordered_map target_bpw_type( }; constexpr double epsilon = 1e-12; + constexpr double infinity = std::numeric_limits::infinity(); auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -680,7 +681,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() + // This list should be kept in sync with 
llama_tensor_quantize_impl() to avoid drift const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -730,9 +731,15 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { return 0.0; } + size_t expected_rows = 0; + for (int64_t s = 0; s < ne2; ++s) { + expected_rows += (size_t)sample_rows_per_slice[s]; + } + if (expected_rows != sample_row_count) { return infinity; } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,15 +757,15 @@ static std::unordered_map target_bpw_type( const float * activations = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; const double a = activations[j]; - const double w = values ? values[j] : 1.0; denom += w * a * a; } bias_denominator_per_slice[s] = denom; } } - // Compute per-row squared norms with weighting (if values are provided) + // Per-row squared norms with weighting std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; @@ -768,15 +775,14 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; double rsn = 0.0; if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double xx = x[j]; - rsn += v * xx * xx; + rsn += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -790,7 +796,7 @@ static std::unordered_map target_bpw_type( } } - // Quantize sampled rows slice-by-slice into quantized_buffer + // Quantize sampled rows per slice -> quantized_buffer { size_t q_offset = 0; size_t f_offset = 0; @@ -800,35 +806,32 @@ static std::unordered_map target_bpw_type( const float * value = has_values ? 
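            // imatrix side data is stored per expert slice: ne2 contiguous blocks of
            // n_per_row importance weights, hence the slice * n_per_row offset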
values_sample + slice * n_per_row : nullptr; (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; f_offset += (size_t)rs * (size_t)n_per_row; } } - // Dequantize into dequantized_buffer + // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - auto row_to_float = [&](size_t r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return false; + + const bool is_fp16 = quant_type == GGML_TYPE_F16; + const bool is_bf16 = quant_type == GGML_TYPE_BF16; + if (!is_fp16 && !is_bf16 && traits && traits->to_float) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + } else { + for (size_t r = 0; r < sample_row_count; ++r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + if (is_fp16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { return infinity; } + traits->to_float(src, dst, (int)n_per_row); } - traits->to_float(src, dst, (int)n_per_row); } - - return true; - }; - - for (size_t r = 0; r < sample_row_count; ++r) { - if (!row_to_float(r)) { return 1e35; } } } @@ -836,6 +839,7 @@ static std::unordered_map target_bpw_type( size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; + for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -843,9 +847,7 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? 
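            // bias_denom is the precomputed sum of w * a^2 for this slice; dividing the
            // squared bias projection by it keeps the penalty scale-free in the activations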
bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -853,17 +855,17 @@ static std::unordered_map target_bpw_type( double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; const double a = activations[j]; - weighted_mse += v * e * e; - bias_num += v * e * a; + weighted_mse += w * e * e; + bias_num += w * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; - weighted_mse += v * e * e; + weighted_mse += w * e * e; } } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -881,26 +883,28 @@ static std::unordered_map target_bpw_type( double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += (double)bias_lambda * proj; + if (bias_denom > 0.0) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += bias_lambda * proj; + } } - const double err_den = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / err_den; + const double denom = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / denom; offset += (size_t)n_per_row; } const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; + if (!std::isfinite(total_err)) { return infinity; } } - return std::isfinite(total_err) ? total_err : 1e35; + return std::isfinite(total_err) ? total_err : infinity; }; + // Scaling factor to increase lambda when activations are concentrated auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { if (!activations) { return 1.0f; } - // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) - // If no values, use v=1 double sum_v = 0.0; double sum_aw2 = 0.0; double sum_a2 = 0.0; @@ -915,13 +919,10 @@ static std::unordered_map target_bpw_type( const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); const double scale = denom > 0.0 ? 
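        // i.e. scale = ||sqrt(w) .* a|| / (sqrt(sum(w)) * RMS(a)): roughly 1 when the
        // activations are evenly spread, larger when the imatrix mass concentrates on
        // strongly activated channels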
std::sqrt(sum_aw2) / denom : 1.0; - // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; - // Returns an adaptive lambda for this tensor using a small probe set - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -929,10 +930,8 @@ static std::unordered_map target_bpw_type( const float * activations, const std::vector & compatible_candidates) -> float { - // No activations => no projection term if (!activations) { return 0.0f; } - // pick a tiny probe set: try to spread around mid-range types std::vector probes; probes.reserve(3); auto push_if = [&](const ggml_type tiny) { @@ -941,7 +940,6 @@ static std::unordered_map target_bpw_type( } }; - // Prefer family-consistent probes; fall back to whatever exists push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); push_if(GGML_TYPE_Q5_K); @@ -953,19 +951,18 @@ static std::unordered_map target_bpw_type( } if (probes.empty()) { return 0.0f; } - // Scratch buffers (reused) + // Scratch buffers const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; for (auto pt : probes) { max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); - std::vector ratios; ratios.reserve(probes.size()); - for (const auto pt : probes) { // err at lambda=0 => pure weighted MSE part double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); @@ -984,17 +981,17 @@ static std::unordered_map target_bpw_type( std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); double lambda = ratios[ratios.size() / 2]; - // activations directional scale const float scale = directional_scale(values, activations, n_per_row); lambda *= scale; - - // clamp to safe range lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; }; + // Faster to compute but lower precision. Best option for the vast majority of models auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } + double s = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { @@ -1004,17 +1001,13 @@ static std::unordered_map target_bpw_type( s += aw2; s2 += aw2 * aw2; } + if (s2 <= 0.0) { return 0.0f; } const auto d = (double)n_per_row; - //const double p = s * s / (d * s2 + epsilon); - //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); - // Map p in (0,1] to lambda in [0,8] decreasing double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - // activations directional scale const double scale = directional_scale(values, activations, n_per_row); - // clamp to safe range const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; return (float)lambda; @@ -1036,13 +1029,13 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - // Dequantize only sampled rows into f32_sample + // Dequantize sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
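        // ne2 > 1 only for stacked 3D tensors (e.g. per-expert weights in MoE models);
        // rows are sampled per slice below so every expert contributes to the estimate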
t->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - constexpr int sample_rows_per_expert = 384; + // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute + constexpr int sample_rows_per_expert = 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); @@ -1096,6 +1089,7 @@ static std::unordered_map target_bpw_type( const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; }; @@ -1104,7 +1098,6 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); if (!src || src_sz == 0) { return; } - if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); @@ -1160,7 +1153,8 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); - // Compute adaptive bias_lambda for this tensor + // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. + // Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger float bias_lambda = 0.0f; { const float * values = values_sample.empty() ? nullptr : values_sample.data(); From 04c07b3272f067ba30d32fb82d693fb0013cc47d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 10 Sep 2025 18:00:56 +0100 Subject: [PATCH 058/148] Add better control over MSE and directional bias computation --- include/llama.h | 2 +- src/llama-quant.cpp | 41 +++++++++---------------------------- tools/quantize/quantize.cpp | 31 +++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/include/llama.h b/include/llama.h index d0ca37dc65a..ba6c185346c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate + int32_t bpw_bias; // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db688fdf02c..74ceb3de9cc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -902,26 +902,6 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : infinity; }; - // Scaling factor to increase lambda when activations are concentrated - auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { - if (!activations) { return 1.0f; } - double sum_v = 0.0; - double sum_aw2 = 0.0; - double sum_a2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values ? 
std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - sum_v += v; - sum_aw2 += v * a * a; - sum_a2 += a * a; - } - const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); - const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); - const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; - - return (float)std::clamp(scale, 0.5, 2.0); - }; - // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, @@ -979,11 +959,7 @@ static std::unordered_map target_bpw_type( if (ratios.empty()) { return 0.0f; } std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - double lambda = ratios[ratios.size() / 2]; - - const float scale = directional_scale(values, activations, n_per_row); - lambda *= scale; - lambda = std::clamp(lambda, 0.0, 8.0); + const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); return (float)lambda; }; @@ -1007,8 +983,7 @@ static std::unordered_map target_bpw_type( double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - const double scale = directional_scale(values, activations, n_per_row); - const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; return (float)lambda; }; @@ -1159,8 +1134,11 @@ static std::unordered_map target_bpw_type( { const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : - fast_lambda(values, activations, n_per_row); + if (params->bpw_bias == 1) { + bias_lambda = fast_lambda(values, activations, n_per_row); + } else if (params->bpw_bias == 2) { + bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); + } } // Now evaluate candidates @@ -1656,7 +1634,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? "precise lambda (slow)" : "fast lambda"); + const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"}; + LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1967,7 +1946,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.precise_lambda =*/ false + /*.bpw_bias =*/ 1 }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0c9460513c8..0fe65daea0d 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,7 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 16.0\n");
     printf("                            Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
-    printf("    --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n");
+    printf("    --bpw-bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n");
     printf("    --keep-split: will generate quantized model in the same shards as input\n");
     printf("    --override-kv KEY=TYPE:VALUE\n");
     printf("                            Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -496,6 +496,27 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
     return true;
 }
 
+static bool parse_bpw_bias(const char * data, int & bpw_bias) {
+    if (!data) {
+        printf("\n%s: error bias type not provided\n\n", __func__);
+        return false;
+    }
+
+    try {
+        bpw_bias = std::stoi(data);
+        if (bpw_bias < 0 || bpw_bias > 2) {
+            printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__);
+            return false;
+        }
+    }
+    catch (const std::exception & e) {
+        printf("\n%s: '%s' is not valid. Error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__, data);
+        return false;
+    }
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -510,6 +531,7 @@ int main(int argc, char ** argv) {
     std::vector<tensor_quantization> tensor_types;
     std::vector<int> prune_layers;
     float target_bpw = -1.0f;
+    int bpw_bias = 1;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -540,8 +562,11 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
                 usage(argv[0]);
             }
-        } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) {
-            params.precise_lambda = true;
+        } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) {
+            if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) {
+                usage(argv[0]);
+            }
+            params.bpw_bias = bpw_bias;
         } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
             if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                 usage(argv[0]);

From 886536d80ab5c227cd6c3f8813b8b5fbf5bea41d Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 13 Sep 2025 08:27:23 +0100
Subject: [PATCH 059/148] Increase error type precision

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 74ceb3de9cc..c4c525c68e4 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -612,7 +612,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         ggml_type type;
         float     bpw;
         size_t    bytes;
-        float     error;
+        double    error;
     };
 
     struct tensor_info {

From bc8762f27f185c5db1cbd0d8ec3bcc8e1771856d Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 13 Sep 2025 08:33:22 +0100
Subject: [PATCH 060/148] Capture surrounding function name

---
 src/llama-quant.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c4c525c68e4..cae908803be 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -651,6 +651,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     constexpr double epsilon = 1e-12;
     constexpr double infinity = std::numeric_limits<double>::infinity();
+    const char * func = __func__;
 
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
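        // bytes for the whole tensor at type `typ`: ggml_row_size() already accounts
        // for the block layout, so this is just bytes-per-row times the row count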
const int64_t n_per_row = t->ne[0]; @@ -1083,7 +1084,7 @@ static std::unordered_map target_bpw_type( } } else { LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + func, name.c_str(), src_sz, (size_t)n_per_row, want); } }; From 4dff85fbe54336130155a8e4fa5e7f4db48f4451 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:41:37 +0100 Subject: [PATCH 061/148] Improve precise_lambda() efficiency --- src/llama-quant.cpp | 126 ++++++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 40 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cae908803be..1677b242d9e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -725,7 +725,9 @@ static std::unordered_map target_bpw_type( const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda) -> double + float bias_lambda, + double * out_mse = nullptr, + double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -733,13 +735,23 @@ static std::unordered_map target_bpw_type( const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { return 0.0; } + if (sample_row_count == 0) { + if (out_mse) { *out_mse = 0.0; } + if (out_proj) { *out_proj = 0.0; } + + return 0.0; + } size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { expected_rows += (size_t)sample_rows_per_slice[s]; } - if (expected_rows != sample_row_count) { return infinity; } + if (expected_rows != sample_row_count) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,7 +762,7 @@ static std::unordered_map target_bpw_type( const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; - // Bias denominators per slice (only needed if we have activations) + // Bias denominators per slice std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { @@ -815,7 +827,6 @@ static std::unordered_map target_bpw_type( // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; const bool is_bf16 = quant_type == GGML_TYPE_BF16; if (!is_fp16 && !is_bf16 && traits && traits->to_float) { @@ -825,12 +836,19 @@ static std::unordered_map target_bpw_type( uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); - } else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { return infinity; } - traits->to_float(src, dst, (int)n_per_row); + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); + } + else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); + } + else { + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } + traits->to_float(src, dst, (int) n_per_row); } } } @@ -839,8 
+857,8 @@ static std::unordered_map target_bpw_type( // Compute error size_t offset = 0; size_t row_idx = 0; - double total_err = 0.0; - + double total_mse = 0.0; + double total_proj = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -848,7 +866,11 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; + std::vector row_mse_norm; + std::vector row_proj_norm; + row_mse_norm.reserve(rs); + if (activations) { row_proj_norm.reserve(rs); } + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -868,13 +890,6 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; weighted_mse += w * e * e; } - } else if (activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += e * e; - bias_num += e * a; - } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; @@ -882,28 +897,64 @@ static std::unordered_map target_bpw_type( } } - double err_num = weighted_mse; - if (activations && bias_lambda != 0.0f) { + const double denom_x = row_sq_norm[row_idx]; + double m_norm = weighted_mse / (denom_x + epsilon); + row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); + + if (activations) { + double p_norm = 0.0; if (bias_denom > 0.0) { const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += bias_lambda * proj; + p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - - const double denom = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / denom; offset += (size_t)n_per_row; } + // Trimmed sum to avoid outlier rows dominating the results + auto trimmed_sum = [&](std::vector & v) -> double { + if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); + if (n < 50) { + double s = 0.0; + for (const double z : v) { s += z; } + return s; + } + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side + k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + double s = 0.0; + for (int64_t i = k; i < n - k; ++i) { + s += v[i]; + } + + return s; + }; + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); - total_err += slice_err * scale_rows; - if (!std::isfinite(total_err)) { return infinity; } + + total_mse += trimmed_sum(row_mse_norm) * scale_rows; + if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } } + if (out_mse) { *out_mse = total_mse; } + if (out_proj) { *out_proj = total_proj; } + + const double total_err = total_mse + bias_lambda * total_proj; return std::isfinite(total_err) ? 
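        // any NaN or overflow in the accumulation collapses to +inf here, so a
        // numerically broken candidate can never win the per-tensor ranking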
total_err : infinity; }; - // Higher precision but much longer to compute + // Higher precision but longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -936,22 +987,17 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; - for (auto pt : probes) { - max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - } + for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); + std::vector ratios; ratios.reserve(probes.size()); for (const auto pt : probes) { - // err at lambda=0 => pure weighted MSE part - double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); - // err at lambda=1 => weighted MSE + projection penalty - const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); - - const double p = std::max(0.0, err1 - err0); // projection term contribution - const double m = std::max(0.0, err0); // MSE term contribution + double m = 0.0; + double p = 0.0; + (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { ratios.push_back(m / p); } From 7d85993f268d9fa35bea9178f6acf2d72833dffa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:44:41 +0100 Subject: [PATCH 062/148] Minor refactoring --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1677b242d9e..15ea36721e8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -617,7 +617,7 @@ static std::unordered_map target_bpw_type( struct tensor_info { const llama_model_loader::llama_tensor_weight * w = nullptr; - std::vector candidate = {}; + std::vector candidate; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; @@ -972,8 +972,8 @@ static std::unordered_map target_bpw_type( } }; - push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q5_K); if (probes.empty() && !compatible_candidates.empty()) { probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); @@ -1011,7 +1011,7 @@ static std::unordered_map target_bpw_type( return (float)lambda; }; - // Faster to compute but lower precision. Best option for the vast majority of models + // Faster to compute but may yield lower precision. Best option for the vast majority of cases auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } @@ -1057,12 +1057,10 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - constexpr int sample_rows_per_expert = 256; + const int sample_rows_per_expert = activations_data ? 
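    // with activations present the per-expert sampling budget is doubled (512 vs 256
    // rows); the extra rows mainly steady the bias-projection estimate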
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - // deterministic sampling seed based on tensor name + fixed constant - std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); std::vector sample_rows_per_slice(ne2, 0); const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); @@ -1072,6 +1070,7 @@ static std::unordered_map target_bpw_type( const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; int64_t offset = 0; if (stride > 1) { @@ -1084,11 +1083,11 @@ static std::unordered_map target_bpw_type( const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { @@ -1211,7 +1210,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); + const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1240,7 +1239,7 @@ static std::unordered_map target_bpw_type( return a.error < b.error; }); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_bytes = std::numeric_limits::max(); for (const auto & c : info.candidate) { // Only keep the best error seen so far at strictly larger byte sizes From 12e816b51199b38a6571141d5f1e5f1039ebe706 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:24:23 +0100 Subject: [PATCH 063/148] Replace greedy allocator with lagrangian relaxation --- src/llama-quant.cpp | 266 ++++++++++++++++++++++++++------------------ 1 file changed, 156 insertions(+), 110 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 15ea36721e8..a369d50ffe6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1266,152 +1266,198 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Greedy allocation from minimum bpw upward to reach 
target_bpw - auto current_total_bytes = [&]() -> size_t { - size_t b = 0; + // Lagrangian relaxation to minimise error subject to a bpw target constraint + auto total_bytes = [&]() -> size_t { + size_t tb = 0; for (const auto & ti : all) { - b += ti.candidate[ti.choice].bytes; + tb += ti.candidate[ti.choice].bytes; } - return b; + return tb; }; - auto total_weights = [&]() -> size_t { - size_t w = 0; - for (const auto & ti : all) { - w += ti.n_elements; - } - - return w; - }; + size_t total_elems = 0; + size_t min_bytes = 0; + size_t max_bytes = 0; + for (const auto & ti : all) { + total_elems += (size_t)ti.n_elements; + min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor + max_bytes += ti.candidate.back().bytes; // largest candidate per tensor + } - const size_t tw = total_weights(); - auto current_bpw = [&]() -> double { - return (double)current_total_bytes() * 8.0f / (double)tw; - }; + if (total_elems == 0) { return {}; } - // Precompute current bpw - double bpw_now = current_bpw(); + const double target_bpw = params->target_bpw; + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); - float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) - if (bpw_now >= target_bpw) { + auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func); for (const auto & ti : all) { + LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", + func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } return overrides; + }; + + if (budget_bytes <= min_bytes) { + for (auto & ti : all) { ti.choice = 0; } + + return emit_overrides(); } + if (budget_bytes >= max_bytes) { + for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } - struct upgrade { - int idx; - int next; - double err; - size_t delta_bytes; - double ratio; - }; + return emit_overrides(); + } - // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info & ti) -> int { - const auto & cand = ti.candidate; - const auto & cur = cand[ti.choice]; - int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { - ++j; - } + auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { + choice.resize(all.size()); + bytes = 0; + err = 0.0; + for (size_t i = 0; i < all.size(); ++i) { + const auto & cand = all[i].candidate; + int best_j = 0; + double best_val = infinity; + for (int j = 0; j < (int)cand.size(); ++j) { + const double bits = (double)cand[j].bytes * 8.0; + const double val = cand[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + best_val = val; + best_j = j; + } + } - return j < (int)cand.size() ? 
j : -1; + choice[i] = best_j; + bytes += cand[best_j].bytes; + err += cand[best_j].error; + } }; - auto recompute_best_upgrade = [&]() -> upgrade { - upgrade best{ -1, -1, 0.0, 0, -1.0 }; - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - const int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - double err = cur.error - nxt.error; - err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { - best = upgrade{ i, j, err, delta_bytes, ratio }; + size_t bytes_lo = 0; + size_t bytes_hi = 0; + size_t bytes_mid = 0; + double mu_lo = 0.0; + double mu_hi = 1.0; + double err_lo = 0.0; + double err_hi = 0.0; + double err_mid = 0.0; + std::vector choice_lo; + std::vector choice_hi; + std::vector choice_mid; + std::vector best_under_choice; + std::vector best_over_choice; + + lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo); + + // increase mu until we get under budget or hit a safety cap + { + int expand = 0; + while (true) { + lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); + if (bytes_hi <= budget_bytes) { + break; + } + mu_hi *= 2.0; + if (++expand > 60) { + break; } } + } - return best; - }; + double best_under_gap = infinity; + double best_over_gap = infinity; + double best_under_err = infinity; + double best_over_err = infinity; + for (int it = 0; it < 40; ++it) { + double mu = 0.5 * (mu_lo + mu_hi); + lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); - while (true) { - upgrade up = recompute_best_upgrade(); - if (up.idx < 0) { break; } + const double gap = std::abs((double)bytes_mid - (double)budget_bytes); - size_t now_bytes = current_total_bytes(); - size_t next_bytes = now_bytes + up.delta_bytes; - double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + epsilon) { - all[up.idx].choice = up.next; - bpw_now = bpw_next; + if (bytes_mid > budget_bytes) { + // Too big, need stronger penalty + mu_lo = mu; + + if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { + best_over_gap = gap; + best_over_err = err_mid; + best_over_choice = choice_mid; + } } else { - break; - } - } + // Under budget, good candidate + mu_hi = mu; - // We might still be below target so we try to find the best upgrade one last time - { - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; - double best_over_gap = 1e300; - double under_gap = target_bpw - bpw_now; - size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - size_t over_bytes = now_bytes + delta_bytes; - double bpw_over = (double)over_bytes * 8.0 / (double)tw; - double err = cur.error - nxt.error; - if (err < 0.0) { err = 0.0; } - double ratio = err / (double)(delta_bytes * 8ull); - - double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - 
best_over_gap) <= epsilon && ratio > best_over.ratio)) { - best_over_gap = over_gap; - best_over = upgrade{ i, j, err, delta_bytes, ratio }; + if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { + best_under_gap = gap; + best_under_err = err_mid; + best_under_choice = choice_mid; } } + } - if (best_over.idx >= 0) { - if (best_over_gap < under_gap) { - all[best_over.idx].choice = best_over.next; + if (!best_under_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_under_choice[i]; + } + } else if (!best_over_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_over_choice[i]; + } + } else { + // Pick whichever side we already have, or keep minimal + if (bytes_hi <= budget_bytes && !choice_hi.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = choice_hi[i]; + } + } else { + for (auto & ti : all) { + ti.choice = 0; } } } - // Build the override map - std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); - for (const auto & ti : all) { - LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", - __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); - overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + // Spend any remaining budget with best upgrades that still fit (one pass) + { + auto cur_bytes = total_bytes(); + while (true) { + int best_i = -1; + int best_j = -1; + double best_ratio = -1.0; + size_t best_delta = 0; + + for (int i = 0; i < (int)all.size(); ++i) { + const auto & ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { + continue; + } + + int j = ti.choice + 1; + while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } + if (j >= (int)ti.candidate.size()) { continue; } + + size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; + if (cur_bytes + delta > budget_bytes) { continue; } + + double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double ratio = err_gain / (double)(delta * 8); + if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { + best_ratio = ratio; + best_delta = delta; + best_i = i; + best_j = j; + } + } + + if (best_i < 0) { break; } + all[best_i].choice = best_j; + cur_bytes += best_delta; + } } - return overrides; + return emit_overrides(); } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { From 2b516068e2ef0e51373be32b1917eb7295bcfc54 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:41:52 +0100 Subject: [PATCH 064/148] "Convexify" candidate list --- src/llama-quant.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a369d50ffe6..955e6c12fe3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1257,6 +1257,32 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } + // Enforce convexity in (bytes, error) curve + { + const auto & c = info.candidate; + if (c.size() >= 3) { + std::vector convex; + convex.reserve(c.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) -> double { + const double dx = (double)b.bytes - (double)a.bytes; + if 
(dx <= 0.0) { return infinity; } + + return ((double)b.error - (double)a.error) / dx; + }; + + for (const auto & p : c) { + while (convex.size() >= 2) { + double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); + double s2 = slope(convex[convex.size() - 1], p); + if (s2 + epsilon < s1) { convex.pop_back(); } + else { break; } + } + convex.push_back(p); + } + info.candidate.swap(convex); + } + } + // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; From 8503d59ee44bc30b0d030cceb5e17590b334730d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 11:49:18 +0100 Subject: [PATCH 065/148] Increase IQ options --- src/llama-quant.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 955e6c12fe3..41fd819f86f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -641,12 +641,21 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + // TODO: find better way to handle F16/BF16 +#ifdef GGML_USE_METAL + GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; From c709e1a3353cbefbe58320c2eae1a1edafc0f618 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 14 Sep 2025 22:38:27 +0100 Subject: [PATCH 066/148] Fix MoE tensor estimation --- src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 41fd819f86f..1efb1c5eeed 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1021,27 +1021,38 @@ static std::unordered_map target_bpw_type( }; // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { if (!activations) { return 0.0f; } - double s = 0.0; - double s2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double aw = std::sqrt(w) * activations[j]; - const double aw2 = aw * aw; - s += aw2; - s2 += aw2 * aw2; - } + double accum = 0.0; + int ns = 0; + + for (int64_t s = 0; s < std::max(1, ne2); ++s) { + const float * v = values ? values + s * n_per_row : nullptr; + const float * a = activations + s * n_per_row; + + double s1 = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; + const double aw = std::sqrt(w) * a[j]; + const double aw2 = aw * aw; + s1 += aw2; + s2 += aw2 * aw2; + } - if (s2 <= 0.0) { return 0.0f; } - const auto d = (double)n_per_row; - double base = 1.0 - s * s / (d * s2 + epsilon); - base = std::clamp(base, 0.0, 1.0); + if (s1 > 0.0) { + const double n = (double)n_per_row; + double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + double lambda = 8.0 * (c / (c + 1.0)); + accum += std::clamp(lambda, 0.0, 8.0); + ++ns; + } + } - const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; + if (ns == 0) { return 0.0f; } - return (float)lambda; + return (float)(accum / ns); }; std::vector all; @@ -1190,7 +1201,7 @@ static std::unordered_map target_bpw_type( const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); if (params->bpw_bias == 1) { - bias_lambda = fast_lambda(values, activations, n_per_row); + bias_lambda = fast_lambda(values, activations, n_per_row, ne2); } else if (params->bpw_bias == 2) { bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); } From 14fae69a7bb932fadbc5dd62072a254866512650 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:31:31 +0100 Subject: [PATCH 067/148] General refactoring --- src/llama-quant.cpp | 75 +++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c6051a480c0..6e5562379cf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -729,19 +729,19 @@ static std::unordered_map target_bpw_type( auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, + const std::vector & rows_sample, const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda, + float tensor_bias_lambda, + const float * slice_bias_lambda, double * out_mse = nullptr, double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { @@ -753,8 +753,9 @@ static std::unordered_map target_bpw_type( size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { - expected_rows += (size_t)sample_rows_per_slice[s]; + expected_rows += (size_t)rows_sample[s]; } + if (expected_rows != sample_row_count) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -783,17 +784,18 @@ static std::unordered_map target_bpw_type( const double a = activations[j]; denom += w * a * a; } + bias_denominator_per_slice[s] = denom; } } - // Per-row squared norms with weighting + // Weighted per-row squared norms std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { - const int64_t rs = sample_rows_per_slice[s]; + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; @@ -823,7 +825,7 @@ static std::unordered_map target_bpw_type( size_t q_offset = 0; size_t f_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * value = has_values ? values_sample + slice * n_per_row : nullptr; @@ -843,21 +845,19 @@ static std::unordered_map target_bpw_type( } else { for (size_t r = 0; r < sample_row_count; ++r) { uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); - } - else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); - } - else { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } return infinity; } - traits->to_float(src, dst, (int) n_per_row); + traits->to_float(src, dst, (int)n_per_row); } } } @@ -1098,20 +1098,20 @@ static std::unordered_map target_bpw_type( offset = dist(rng); } - for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } @@ -1120,9 +1120,11 @@ static std::unordered_map target_bpw_type( } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } + ++current_sampled_rows; } - sample_rows_per_slice[slice] = current_sampled_rows; + + rows_sample[slice] = current_sampled_rows; } auto side_data = [&](const std::unordered_map> * m, 
const std::string & tensor_name) -> std::pair { @@ -1160,7 +1162,7 @@ static std::unordered_map target_bpw_type( if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } - const int64_t nelem = ggml_nelements(t); + const int64_t nelem = ggml_nelements(tensor); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1185,8 +1187,9 @@ static std::unordered_map target_bpw_type( __func__, ggml_type_name(ts_type), name.c_str()); continue; } - ggml_type tt = make_compatible(t, ts_type); - if (!is_compatible(t, tt)) { continue; } + + ggml_type tt = make_compatible(tensor, ts_type); + if (!is_compatible(tensor, tt)) { continue; } compatible_candidates.push_back(tt); max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); } @@ -1222,16 +1225,16 @@ static std::unordered_map target_bpw_type( // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantised_buffer(dequantised_buffer.size()); - for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } - const ggml_type tt = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = tensor_bytes(t, tt); - const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); - eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; + const ggml_type tensor_types = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_types); + const size_t bytes = tensor_bytes(tensor, tensor_types); + const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); } @@ -1244,8 +1247,8 @@ static std::unordered_map target_bpw_type( if (info.candidate.empty()) { // As a last resort, keep original type - float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); + float bpw = ggml_nbytes(tensor) * 8.0f / nelem; + info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. 
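[Illustrative aside, not part of the patch: the Pareto filter described in the comment above, shown as a self-contained C++ sketch. The simplified cand struct and the pareto_filter name are assumptions made for this example; the patch applies the same idea to the per-tensor candidate list.]

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct cand { size_t bytes; double error; };

    // Sort by bytes ascending (ties broken by error), then keep a candidate
    // only if its error strictly improves on every cheaper candidate.
    static std::vector<cand> pareto_filter(std::vector<cand> v) {
        std::sort(v.begin(), v.end(), [](const cand & a, const cand & b) {
            return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
        });
        std::vector<cand> out;
        double best_err = std::numeric_limits<double>::infinity();
        for (const auto & c : v) {
            if (c.error < best_err) {
                out.push_back(c);
                best_err = c.error;
            }
        }
        return out;
    }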
@@ -1274,6 +1277,7 @@ static std::unordered_map target_bpw_type( // same bytes: we already sorted by error; skip } } + info.candidate.swap(pruned); } @@ -1299,6 +1303,7 @@ static std::unordered_map target_bpw_type( } convex.push_back(p); } + info.candidate.swap(convex); } } @@ -1312,7 +1317,6 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Lagrangian relaxation to minimise error subject to a bpw target constraint auto total_bytes = [&]() -> size_t { size_t tb = 0; for (const auto & ti : all) { @@ -1359,6 +1363,7 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } + // Lagrangian relaxation to minimise error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); bytes = 0; @@ -1406,6 +1411,7 @@ static std::unordered_map target_bpw_type( if (bytes_hi <= budget_bytes) { break; } + mu_hi *= 2.0; if (++expand > 60) { break; @@ -1422,11 +1428,9 @@ static std::unordered_map target_bpw_type( lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); - if (bytes_mid > budget_bytes) { // Too big, need stronger penalty mu_lo = mu; - if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { best_over_gap = gap; best_over_err = err_mid; @@ -1435,7 +1439,6 @@ static std::unordered_map target_bpw_type( } else { // Under budget, good candidate mu_hi = mu; - if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { best_under_gap = gap; best_under_err = err_mid; From a36946997e2c365e9317062f14e298af6e9928a9 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:36:54 +0100 Subject: [PATCH 068/148] Replace fast_bias() for per slice version and remove precise_bias() --- src/llama-quant.cpp | 167 +++++++++++++++----------------------------- 1 file changed, 58 insertions(+), 109 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e5562379cf..fe10365772a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -868,8 +868,9 @@ static std::unordered_map target_bpw_type( size_t row_idx = 0; double total_mse = 0.0; double total_proj = 0.0; + double total_bias = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + slice * n_per_row : nullptr; @@ -918,21 +919,24 @@ static std::unordered_map target_bpw_type( } row_proj_norm.push_back(p_norm); } + offset += (size_t)n_per_row; } // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); if (n < 50) { double s = 0.0; for (const double z : v) { s += z; } + return s; } - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side + k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); double s = 0.0; @@ -944,11 +948,17 @@ static std::unordered_map target_bpw_type( }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); + const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; + const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; - total_mse += trimmed_sum(row_mse_norm) * scale_rows; - if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + total_mse += slice_mse; + total_proj += slice_proj; - if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + // per-slice lambda if provided, otherwise use scalar + const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + total_bias += bl * slice_proj; + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -959,100 +969,42 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = total_mse + bias_lambda * total_proj; + const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + return std::isfinite(total_err) ? 
total_err : infinity; }; - // Higher precision but longer to compute - auto precise_lambda = [&](const ggml_tensor * t, - const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, - const float * values, - const float * activations, - const std::vector & compatible_candidates) -> float + // Returns lambda per slice or 0.0 if no activations + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { - if (!activations) { return 0.0f; } - - std::vector probes; - probes.reserve(3); - auto push_if = [&](const ggml_type tiny) { - if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { - probes.push_back(tiny); - } - }; - - push_if(GGML_TYPE_Q3_K); - push_if(GGML_TYPE_Q4_K); - push_if(GGML_TYPE_Q5_K); - if (probes.empty() && !compatible_candidates.empty()) { - probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); - } - if (probes.size() == 1 && compatible_candidates.size() >= 2) { - probes.push_back(compatible_candidates.front()); - } - if (probes.empty()) { return 0.0f; } - - // Scratch buffers - const int64_t n_per_row = t->ne[0]; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; - size_t max_row_sz = 0; - for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - - std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantized_buffer(f32_sample.size()); - - std::vector ratios; - ratios.reserve(probes.size()); - for (const auto pt : probes) { - double m = 0.0; - double p = 0.0; - (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); - if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { - ratios.push_back(m / p); - } - } - - if (ratios.empty()) { return 0.0f; } - - std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); - - return (float)lambda; - }; - - // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { - if (!activations) { return 0.0f; } - - double accum = 0.0; - int ns = 0; + std::vector lambdas(std::max(1, ne2), 0.0f); + if (!activations) { return lambdas; } for (int64_t s = 0; s < std::max(1, ne2); ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; - double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; const double aw = std::sqrt(w) * a[j]; const double aw2 = aw * aw; s1 += aw2; s2 += aw2 * aw2; } + float l = 0.0f; if (s1 > 0.0) { - const double n = (double)n_per_row; - double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + const auto n = (double)n_per_row; + const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); double lambda = 8.0 * (c / (c + 1.0)); - accum += std::clamp(lambda, 0.0, 8.0); - ++ns; + l = (float)std::clamp(lambda, 0.0, 12.0); } - } - if (ns == 0) { return 0.0f; } + lambdas[(size_t)s] = l; + } - return (float)(accum / ns); + return lambdas; }; std::vector all; @@ -1060,32 +1012,33 @@ static std::unordered_map target_bpw_type( for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; - const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } + ggml_tensor * tensor = tw->tensor; + const std::string name = ggml_get_name(tensor); + if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); if (!ml.use_mmap) { - if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); } - t->data = buffer.data(); + if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } + tensor->data = buffer.data(); } - ml.load_data_for(t); + + ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample - const int64_t n_per_row = t->ne[0]; - const int64_t nrows_total = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows_total = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - const int sample_rows_per_expert = activations_data ? 512 : 256; + // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute + const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; - f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector sample_rows_per_slice(ne2, 0); - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); + std::vector rows_sample(ne2, 0); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); std::vector row_buffer(n_per_row); - const ggml_type src_type = t->type; + const ggml_type src_type = tensor->type; const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); @@ -1199,23 +1152,20 @@ static std::unordered_map target_bpw_type( // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. 
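[Illustrative aside, not part of the patch: a simplified, single-slice version of the estimate_lambda() heuristic introduced above. The slice_lambda name is an assumption made for this example; the patch computes the same quantity once per expert slice and keeps the results in a vector.]

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // lambda grows with how concentrated the weighted squared activations are:
    // c is a normalised dispersion term, and c / (c + 1) saturates it so the
    // result stays bounded after the 8x scale.
    static float slice_lambda(const float * values, const float * activations, int64_t n) {
        if (!activations || n <= 0) { return 0.0f; }
        constexpr double epsilon = 1e-12;
        double s1 = 0.0;
        double s2 = 0.0;
        for (int64_t j = 0; j < n; ++j) {
            const double w = values ? std::max(0.0f, values[j]) : 1.0;
            const double z = w * activations[j] * activations[j]; // (sqrt(w) * a)^2
            s1 += z;
            s2 += z * z;
        }
        if (s1 <= 0.0) { return 0.0f; }
        const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / (double)n);
        return (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0);
    }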
// Larger values favour quantisation types that produce smaller bias even if the MSE is slightly bigger
-        float bias_lambda = 0.0f;
-        {
-            const float * values = values_sample.empty() ? nullptr : values_sample.data();
-            const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-            if (params->bpw_bias == 1) {
-                bias_lambda = fast_lambda(values, activations, n_per_row, ne2);
-            } else if (params->bpw_bias == 2) {
-                bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates);
-            }
-        }
-
-    // Now evaluate candidates
-    std::vector<candidate_types> eval_candidates(compatible_candidates.size());
+    float tensor_lambda = 0.0f;
     const float * values = values_sample.empty() ? nullptr : values_sample.data();
     const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
+    auto lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+    double acc = 0.0;
+    int ns = 0;
+    for (float l : lambdas) { acc += l; ++ns; }
+    tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+
+    // Evaluate candidates
+    std::vector<candidate_types> eval_candidates(compatible_candidates.size());
     std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
     std::vector<float> dequantised_buffer(f32_sample.size());
+    const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
     int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size()));
     std::atomic<size_t> cidx{0};
     std::vector<std::thread> eval_workers;

From 9e74f8341120d5f26939267e96fbaba04451d516 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 20 Sep 2025 23:06:37 +0100
Subject: [PATCH 069/148] Replace --bpw-bias flag with --no-bias

---
 include/llama.h             |  2 +-
 src/llama-quant.cpp         | 18 +++++++++-------
 tools/quantize/quantize.cpp | 42 ++++++++-----------------------------
 3 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index ba6c185346c..502bedbb802 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -365,7 +365,7 @@ extern "C" {
         void * tensor_types;  // pointer to vector containing tensor types
         void * prune_layers;  // pointer to vector containing layer indices to prune
         float target_bpw;     // target bits per weight (bpw)
-        int32_t bpw_bias;     // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)
+        bool no_bias;         // use mean square error estimation only (no alignment bias)
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 9d7a9f97428..9e7d9d295cf 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1153,13 +1153,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         // Adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE.
         // Larger values favour quantisation types that produce smaller bias even if the MSE is slightly bigger
         float tensor_lambda = 0.0f;
+        std::vector<float> lambdas;
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
         const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-        auto lambdas = estimate_lambda(values, activations, n_per_row, ne2);
-        double acc = 0.0;
-        int ns = 0;
-        for (float l : lambdas) { acc += l; ++ns; }
-        tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+        if (!params->no_bias) {
+            double acc = 0.0;
+            int ns = 0;
+            lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+            for (float l : lambdas) { acc += l; ++ns; }
+            tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+        }
@@ -1726,8 +1729,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             } else {
                 LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__);
             }
-            const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"};
-            LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]);
+            LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no alignment bias)" : "alignment bias (default)");
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
             bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
         } else {
@@ -2038,7 +2040,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.tensor_type =*/ nullptr,
         /*.prune_layers =*/ nullptr,
         /*.target_bpw =*/ -1.0f,
-        /*.bpw_bias =*/ 1
+        /*.no_bias =*/ false
     };

     return result;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 0fe65daea0d..03018cc3012 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -117,12 +117,12 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftyp

 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable);
+    printf("       [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
-    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    printf("  --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf("  --pure: disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
@@ -134,7 +134,8 @@ static void usage(const char * executable) {
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n");
    printf("      Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
-    printf("  --bpw_bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n");
+    printf("  --no-bias: use mean square error estimation only (no alignment bias)\n");
+    printf("      Advanced option to use MSE only and disable alignment bias error estimation\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -496,27 +497,6 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
     return true;
 }

-static bool parse_bpw_bias(const char * data, int & bpw_bias) {
-    if (!data) {
-        printf("\n%s: error bias type not provided\n\n", __func__);
-        return false;
-    }
-
-    try {
-        bpw_bias = std::stoi(data);
-        if (bpw_bias < 0 || bpw_bias > 2) {
-            printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__);
-            return false;
-        }
-    }
-    catch (const std::exception & e) {
-        printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); - return false; - } - - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -531,7 +511,6 @@ int main(int argc, char ** argv) { std::vector tensor_types; std::vector prune_layers; float target_bpw = -1.0f; - int bpw_bias = 1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -562,11 +541,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) { - if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) { - usage(argv[0]); - } - params.bpw_bias = bpw_bias; + } else if (strcmp(argv[arg_idx], "--no-bias") == 0) { + params.no_bias = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From e8e2aed17a4ade7b14021e05f2a55f9b8f26510f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:41:44 +0100 Subject: [PATCH 070/148] Refactor row sampling --- src/llama-quant.cpp | 49 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9e7d9d295cf..4a8c08e68f7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1029,7 +1029,6 @@ static std::unordered_map target_bpw_type( const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute const int rows_sample_per_expert = activations_data ? 
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); @@ -1037,11 +1036,30 @@ static std::unordered_map target_bpw_type( std::vector rows_sample(ne2, 0); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); - std::vector row_buffer(n_per_row); const ggml_type src_type = tensor->type; - const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); + + std::vector row_buffer(n_per_row); + auto row_to_fp32 = [&](const uint8_t * src, float * dst) { + if (src_type == GGML_TYPE_F32) { + std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); + } else if (src_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (src_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else if (src_is_quant) { + if (!src_traits || !src_traits->to_float) { + throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); + } + + src_traits->to_float(src, dst, (int)n_per_row); + } else { + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + } + }; + for (int64_t slice = 0; slice < ne2; ++slice) { std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; @@ -1052,31 +1070,18 @@ static std::unordered_map target_bpw_type( } for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } - src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), 
row_buffer.begin(), row_buffer.end()); } ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From bdefdb673c0d28b59c23d505307536b4f1724858 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:07 +0100 Subject: [PATCH 071/148] Refactor copy_or_broadcast() --- src/llama-quant.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4a8c08e68f7..b1302df431b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1087,6 +1087,7 @@ static std::unordered_map target_bpw_type( auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } @@ -1095,22 +1096,27 @@ static std::unordered_map target_bpw_type( }; // Copy this row's side data (values and activations), or broadcasts to all slices - auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { - const size_t want = (size_t)ne2 * (size_t)n_per_row; + auto copy_or_broadcast = [&](const float * src, size_t src_sz, std::vector & dst) { dst.clear(); if (!src || src_sz == 0) { return; } + + const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); - } else if (src_sz == (size_t)n_per_row) { + + return; + } + if (src_sz == (size_t)n_per_row) { dst.resize(want); for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - } else { - LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - func, name.c_str(), src_sz, (size_t)n_per_row, want); + + return; } + + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want); }; const auto [values_all, values_sz] = side_data(values_data, name); From 6b8cedf3bcd2282e9f31b00026178d6bb393fc3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:31 +0100 Subject: [PATCH 072/148] Refactor estimate_lambda() --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b1302df431b..ebacf688062 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -975,30 +975,29 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector - { - std::vector lambdas(std::max(1, ne2), 0.0f); + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + const int64_t ns = std::max(1, ne2); + std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } - for (int64_t s = 0; s < std::max(1, ne2); ++s) { + for (int64_t s = 0; s < ns; ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
std::max(0.0f, v[j]) : 1.0; - const double aw = std::sqrt(w) * a[j]; - const double aw2 = aw * aw; - s1 += aw2; - s2 += aw2 * aw2; + const double aw2 = std::sqrt(w) * a[j]; + const double z = aw2 * aw2; + s1 += z; + s2 += z * z; } float l = 0.0f; if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - double lambda = 8.0 * (c / (c + 1.0)); - l = (float)std::clamp(lambda, 0.0, 12.0); + l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From c466c53808e566f5eb81a654c9f131064246cdaf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:54 +0100 Subject: [PATCH 073/148] Refactor pareto pruning and convexification --- src/llama-quant.cpp | 91 +++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ebacf688062..ab6601a8bf9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1146,8 +1146,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", - __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } @@ -1214,60 +1213,54 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } - // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve { - std::vector pruned; - pruned.reserve(info.candidate.size()); - - // Sort by bytes ascending, error ascending - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - return a.error < b.error; - }); - - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : info.candidate) { - // Only keep the best error seen so far at strictly larger byte sizes - if (c.bytes != last_bytes) { - // first time we see this byte size - last_bytes = c.bytes; - if (c.error < best_err) { - pruned.push_back(c); - best_err = c.error; + auto & candidates = info.candidate; + if (!candidates.empty()) { + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + + return a.error < b.error; + }); + + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = infinity; + size_t last_bytes = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_bytes) { + last_bytes = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); + } } - } else { - // same bytes: we already sorted by error; skip } - } - info.candidate.swap(pruned); - } + candidates.swap(pareto); - // Enforce convexity in (bytes, error) curve - { - const auto & c = info.candidate; - if (c.size() >= 3) { - std::vector convex; - convex.reserve(c.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) -> double { - const double dx = (double)b.bytes - (double)a.bytes; - if (dx <= 0.0) { return infinity; } - 
- return ((double)b.error - (double)a.error) / dx; - }; - - for (const auto & p : c) { - while (convex.size() >= 2) { - double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); - double s2 = slope(convex[convex.size() - 1], p); - if (s2 + epsilon < s1) { convex.pop_back(); } - else { break; } + if (candidates.size() >= 3) { + std::vector hull; + hull.reserve(candidates.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + + return dx <= 0.0 ? infinity : (b.error - a.error) / dx; + }; + + for (const auto & p : candidates) { + while (hull.size() >= 2) { + double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) { hull.pop_back(); } + else { break; } + } + + hull.push_back(p); } - convex.push_back(p); - } - info.candidate.swap(convex); + candidates.swap(hull); + } } } From b433fd95472c39c4974892aa9100e3cdc7b9c63d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:43:09 +0100 Subject: [PATCH 074/148] Refactor last budget pass --- src/llama-quant.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ab6601a8bf9..e062b2dc6a3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1433,19 +1433,16 @@ static std::unordered_map target_bpw_type( double best_ratio = -1.0; size_t best_delta = 0; for (int i = 0; i < (int)all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { - continue; - } - + const auto &ti = all[i]; int j = ti.choice + 1; + // skip same-bytes entries while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; if (cur_bytes + delta > budget_bytes) { continue; } - double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); double ratio = err_gain / (double)(delta * 8); if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; @@ -1454,7 +1451,6 @@ static std::unordered_map target_bpw_type( best_j = j; } } - if (best_i < 0) { break; } all[best_i].choice = best_j; cur_bytes += best_delta; From b6c008fd8a12a9b1970c4810585cbd540bf0737e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:04:13 +0100 Subject: [PATCH 075/148] Refactor helper lambdas --- src/llama-quant.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e062b2dc6a3..d31552ea23a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -665,28 +665,23 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); - const int64_t nrows = ggml_nrows(t); - return (size_t)nrows * row_sz; + return (size_t)ggml_nrows(t) * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { - const int64_t nelem = ggml_nelements(t); const size_t bytes = tensor_bytes(t, typ); - return (double)bytes * 8.0 / (double)nelem; + return (double)bytes * 8.0 / (double)ggml_nelements(t); }; auto is_compatible = 
[&](const ggml_tensor * t, const ggml_type typ) -> bool { - const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } - return n_per_row % blck == 0; + return blck <= 1 || (t->ne[0] % blck) == 0; }; auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } + if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); - if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; + return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; auto name_tn = LLM_TN(model.arch); @@ -1080,7 +1075,7 @@ static std::unordered_map target_bpw_type( ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From 7386d4eadd64006ac7f0fbc992d7d4bcb195bd6c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:18:26 +0100 Subject: [PATCH 076/148] Refactor row sampling --- src/llama-quant.cpp | 83 +++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d31552ea23a..f2dab6a898a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,64 +1019,73 @@ static std::unordered_map target_bpw_type( ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample + const int rows_sample_per_expert = activations_data ? 512 : 256; const int64_t n_per_row = tensor->ne[0]; const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - - const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector rows_sample(ne2, 0); - const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); - const int64_t stride = std::max(1, nrows_total / rows_sample_max); const ggml_type src_type = tensor->type; const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - std::vector row_buffer(n_per_row); + // Convert a single row to fp32 auto row_to_fp32 = [&](const uint8_t * src, float * dst) { - if (src_type == GGML_TYPE_F32) { + const ggml_type t = src_type; + if (t == GGML_TYPE_F32) { std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); - } else if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (src_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else if (src_is_quant) { - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } + return; + } + if (t == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + return; + } + if (t == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + return; + } - src_traits->to_float(src, dst, (int)n_per_row); - } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + if (src_is_quant) { + GGML_ASSERT(src_traits && src_traits->to_float); + src_traits->to_float(src, dst, (int) n_per_row); + return; } + + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(t))); }; - for (int64_t slice = 0; slice < ne2; 
++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); - int64_t current_sampled_rows = 0; - int64_t offset = 0; - if (stride > 1) { - std::uniform_int_distribution dist(0, stride - 1); - offset = dist(rng); - } + // Sample rows randomly per slice + { + f32_sample.clear(); + std::vector row_buffer(n_per_row); + for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); + } - for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { - const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (src_type == GGML_TYPE_F32) { - auto src_f32 = (const float *)src_row; - f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); - } else { - row_to_fp32(src_row, row_buffer.data()); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + int64_t current = 0; + for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (src_type == GGML_TYPE_F32) { + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); + } else { + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + } + + ++current; } - ++current_sampled_rows; + rows_sample[slice] = current; } - - rows_sample[slice] = current_sampled_rows; } auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { From 08146fd67f5ec6b93e2406340afaaa5aa336596a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:03 +0100 Subject: [PATCH 077/148] Refactor side_data() and copy_or_broadcast() --- src/llama-quant.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f2dab6a898a..b8eb12690e3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1088,14 +1088,12 @@ static std::unordered_map target_bpw_type( } } - auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { - if (!m) { return {nullptr, 0}; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) { + if (!m) { return std::pair{nullptr, 0}; } const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); - if (it == m->end()) { return {nullptr, 0}; } - - return { it->second.data(), it->second.size() }; + return it == m->end() ? 
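+        // this tensor's side data (imatrix values or activations); {nullptr, 0} when none is available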
std::pair{nullptr, 0} : std::pair{ it->second.data(), it->second.size() }; }; // Copy this row's side data (values and activations), or broadcasts to all slices @@ -1105,9 +1103,7 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { - dst.resize(want); - std::memcpy(dst.data(), src, want * sizeof(float)); - + dst.assign(src, src + want); return; } if (src_sz == (size_t)n_per_row) { @@ -1115,7 +1111,6 @@ static std::unordered_map target_bpw_type( for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - return; } From 17be7615ce070af61cd1a0f80b38947c3fea5709 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:28 +0100 Subject: [PATCH 078/148] Refactor candidate types build --- src/llama-quant.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b8eb12690e3..beac311d50e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1133,19 +1133,17 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + size_t max_row_sz = 0; const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); - - size_t max_row_sz = 0; - const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; - std::vector compatible_candidates; compatible_candidates.reserve(base_sz); for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } From b09662f86aefb5750842c9d68dac42db9054e90c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:49 +0100 Subject: [PATCH 079/148] Refactor estimate_lambda() --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index beac311d50e..63779ded487 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -982,8 +982,8 @@ static std::unordered_map target_bpw_type( double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
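                // per-channel importance from the imatrix, clamped to non-negative; defaults to 1.0 without an imatrix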
std::max(0.0f, v[j]) : 1.0; - const double aw2 = std::sqrt(w) * a[j]; - const double z = aw2 * aw2; + const double aw = std::sqrt(w) * a[j]; + const double z = aw * aw; s1 += z; s2 += z * z; } @@ -992,7 +992,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a7ee915e19d9acd7a1187ba7d8d772d3a52a8f0d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:20:06 +0100 Subject: [PATCH 080/148] Refactor trimmed_sum() --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 63779ded487..67de29df872 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -920,26 +920,15 @@ static std::unordered_map target_bpw_type( // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { - if (v.empty()) { return 0.0; } - const int64_t n = (int64_t)v.size(); - if (n < 50) { - double s = 0.0; - for (const double z : v) { s += z; } - - return s; - } + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% + k = std::clamp(k, 0, n / 32); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - double s = 0.0; - for (int64_t i = k; i < n - k; ++i) { - s += v[i]; - } - - return s; + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); From 1a3e9ea4c88c40b7fea3a94ff45522531f31f005 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:00 +0100 Subject: [PATCH 081/148] Refactor estimate_error() --- src/llama-quant.cpp | 191 ++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 106 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 67de29df872..b3e4b3cbf7b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -737,12 +737,12 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { + const size_t sample_elems = f32_sample.size(); + const size_t sample_rows = n_per_row > 0 ? 
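+            // number of whole sampled rows actually held in the f32 sample buffer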
sample_elems / (size_t)n_per_row : 0; + + if (sample_rows == 0) { if (out_mse) { *out_mse = 0.0; } if (out_proj) { *out_proj = 0.0; } - return 0.0; } @@ -751,105 +751,102 @@ static std::unordered_map target_bpw_type( expected_rows += (size_t)rows_sample[s]; } - if (expected_rows != sample_row_count) { + if (expected_rows != sample_rows) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } const size_t row_sz = ggml_row_size(quant_type, n_per_row); - const size_t buffer_sz = row_sz * sample_row_count; + const size_t buf_sz = row_sz * sample_rows; - if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } - if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } + if (quantized_buffer.size() < buf_sz) { quantized_buffer.resize(buf_sz); } + if (dequantized_buffer.size() < sample_elems) { dequantized_buffer.resize(sample_elems); } const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; // Bias denominators per slice - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector bias_denom(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - const float * activations = activations_sample + s * n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - denom += w * a * a; + const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double aj = a[j]; + denom += w * aj * aj; } - bias_denominator_per_slice[s] = denom; + bias_denom[s] = denom; } } - // Weighted per-row squared norms - std::vector row_sq_norm(sample_row_count, 0.0); + // Row squared norms (weighted if values present) + std::vector row_sq_norm(sample_rows, 0.0); { - size_t offset = 0; - size_t row_idx = 0; + size_t off = 0; + size_t ridx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - double rsn = 0.0; - if (values) { + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + double sum = 0.0; + if (v) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); + const double w = std::max(0.0f, v[j]); const double xx = x[j]; - rsn += w * xx * xx; + sum += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double xx = x[j]; - rsn += xx * xx; + sum += xx * xx; } } - row_sq_norm[row_idx] = rsn; - offset += (size_t)n_per_row; + + row_sq_norm[ridx] = sum; + off += (size_t)n_per_row; } } } - // Quantize sampled rows per slice -> quantized_buffer + // Quantize per slice into quantized_buffer { - size_t q_offset = 0; - size_t f_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + size_t qoff = 0; + size_t foff = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; - f_offset += (size_t)rs * (size_t)n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + foff, quantized_buffer.data() + qoff, 0, rs, n_per_row, v); + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } } - // quantized_buffer -> dequantized_buffer + // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; - const bool is_bf16 = quant_type == GGML_TYPE_BF16; - if (!is_fp16 && !is_bf16 && traits && traits->to_float) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); } else { - for (size_t r = 0; r < sample_row_count; ++r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (is_fp16) { + if (quant_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (is_bf16) { + } else if (quant_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } traits->to_float(src, dst, (int)n_per_row); @@ -858,94 +855,77 @@ static std::unordered_map target_bpw_type( } } - // Compute error - size_t offset = 0; - size_t row_idx = 0; + // Compute error per slice with trimmed aggregation + auto trimmed_sum = [&](std::vector & v) -> double { + const int64_t n = (int64_t)v.size(); + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + k = std::clamp(k, 0, n / 32); // but no more than ~3% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + }; + + size_t off = 0; + size_t ridx = 0; double total_mse = 0.0; double total_proj = 0.0; double total_bias = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + slice * n_per_row : nullptr; - const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = has_activations ? activations_sample + s * n_per_row : nullptr; + const double denom_bias = has_activations ? 
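+                // normaliser for the bias projection: this slice's precomputed sum of w * a^2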
bias_denom[s] : 0.0; std::vector row_mse_norm; - std::vector row_proj_norm; row_mse_norm.reserve(rs); - if (activations) { row_proj_norm.reserve(rs); } + std::vector row_proj_norm; + if (a) { row_proj_norm.reserve(rs); } - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - const float * y = dequantized_buffer.data() + offset; - double weighted_mse = 0.0; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + const float * y = dequantized_buffer.data() + off; + double w_mse = 0.0; double bias_num = 0.0; - if (values && activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += w * e * e; - bias_num += w * e * a; - } - } else if (values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - weighted_mse += w * e * e; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - weighted_mse += e * e; - } + for (int64_t j = 0; j < n_per_row; ++j) { + const double wj = v ? std::max(0.0f, v[j]) : 1.0; + const double e = y[j] - x[j]; + w_mse += wj * e * e; + if (a) { bias_num += wj * e * a[j]; } } - const double denom_x = row_sq_norm[row_idx]; - double m_norm = weighted_mse / (denom_x + epsilon); + const double denom_x = row_sq_norm[ridx]; + const double m_norm = w_mse / (denom_x + epsilon); row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); - if (activations) { + if (a) { double p_norm = 0.0; - if (bias_denom > 0.0) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); + if (denom_bias > 0.0) { + const double proj = bias_num * bias_num / (denom_bias + epsilon); p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - offset += (size_t)n_per_row; + off += (size_t)n_per_row; } - // Trimmed sum to avoid outlier rows dominating the results - auto trimmed_sum = [&](std::vector & v) -> double { - const int64_t n = (int64_t)v.size(); - if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - - int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::clamp(k, 0, n / 32); // cap at ~3.125% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); - }; - const double scale_rows = (double)nrows / std::max(1.0, (double)rs); const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; - const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; + const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; total_mse += slice_mse; total_proj += slice_proj; - // per-slice lambda if provided, otherwise use scalar - const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + const double bl = slice_bias_lambda ? 
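+            // per-slice lambda when provided, otherwise the tensor-level scalar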
(double)std::max(0.0f, slice_bias_lambda[s]) : (double)tensor_bias_lambda; total_bias += bl * slice_proj; if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } } @@ -954,7 +934,6 @@ static std::unordered_map target_bpw_type( if (out_proj) { *out_proj = total_proj; } const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; - return std::isfinite(total_err) ? total_err : infinity; }; From 9a1656eb975fa9f1024a8de029e22a762e49719b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:35 +0100 Subject: [PATCH 082/148] Refactor pareto optimise and convexify --- src/llama-quant.cpp | 84 ++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b3e4b3cbf7b..751a26c63aa 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1179,55 +1179,53 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - { - auto & candidates = info.candidate; - if (!candidates.empty()) { - std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - - return a.error < b.error; - }); - - std::vector pareto; - pareto.reserve(candidates.size()); - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : candidates) { - if (c.bytes != last_bytes) { - last_bytes = c.bytes; - if (c.error < best_err) { - best_err = c.error; - pareto.push_back(c); - } - } - } + auto pareto_convex = [](std::vector & candidates) { + if (candidates.empty()) return; - candidates.swap(pareto); - - if (candidates.size() >= 3) { - std::vector hull; - hull.reserve(candidates.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - - for (const auto & p : candidates) { - while (hull.size() >= 2) { - double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) { hull.pop_back(); } - else { break; } - } + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); - hull.push_back(p); + // Pareto by bytes -> error + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = std::numeric_limits::infinity(); + size_t last_b = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_b) { + last_b = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); } + } + } - candidates.swap(hull); + candidates.swap(pareto); + if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull + + // Convex hull (lower envelope) + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + return dx <= 0.0 ? 
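+                // duplicate or non-increasing byte sizes are treated as infinitely steep segments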
infinity : (b.error - a.error) / dx; + }; + + std::vector hull; hull.reserve(candidates.size()); + for (const auto & p : candidates) { + while (hull.size() >= 2) { + const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + const double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) hull.pop_back(); + else { break; } } + + hull.push_back(p); } - } + candidates.swap(hull); + }; + + pareto_convex(info.candidate); // Initialize choice at the smallest bpw candidate info.choice = 0; From 0d5f18303e25e6b4e4dc21f963ca6672b9b12d0f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:22:00 +0100 Subject: [PATCH 083/148] Refactor lagrange_penalty() --- src/llama-quant.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 751a26c63aa..204fbfecad8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1288,21 +1288,21 @@ static std::unordered_map target_bpw_type( bytes = 0; err = 0.0; for (size_t i = 0; i < all.size(); ++i) { - const auto & cand = all[i].candidate; + const auto & candidate = all[i].candidate; int best_j = 0; double best_val = infinity; - for (int j = 0; j < (int)cand.size(); ++j) { - const double bits = (double)cand[j].bytes * 8.0; - const double val = cand[j].error + mu * bits; - if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + for (int j = 0; j < (int)candidate.size(); ++j) { + const double bits = (double)candidate[j].bytes * 8.0; + const double val = candidate[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) { best_val = val; best_j = j; } } choice[i] = best_j; - bytes += cand[best_j].bytes; - err += cand[best_j].error; + bytes += candidate[best_j].bytes; + err += candidate[best_j].error; } }; From 814f6b66be4b5ebbe286201eafe8361a37d39a98 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:45:09 +0100 Subject: [PATCH 084/148] Minor general refactoring --- src/llama-quant.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 204fbfecad8..93b5fb0ebad 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -860,7 +860,8 @@ static std::unordered_map target_bpw_type( const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side k = std::clamp(k, 0, n / 32); // but no more than ~3% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); @@ -1190,7 +1191,7 @@ static std::unordered_map target_bpw_type( // Pareto by bytes -> error std::vector pareto; pareto.reserve(candidates.size()); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_b = std::numeric_limits::max(); for (const auto & c : candidates) { if (c.bytes != last_b) { @@ -1273,12 +1274,10 @@ static std::unordered_map target_bpw_type( if (budget_bytes <= min_bytes) { for (auto & ti : all) { ti.choice = 0; } - return emit_overrides(); } if (budget_bytes >= max_bytes) { for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } - return emit_overrides(); } @@ -1327,14 +1326,10 @@ static std::unordered_map 
target_bpw_type( int expand = 0; while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); - if (bytes_hi <= budget_bytes) { - break; - } + if (bytes_hi <= budget_bytes) { break; } mu_hi *= 2.0; - if (++expand > 60) { - break; - } + if (++expand > 60) { break; } // safety cap } } From e92db008bc848b109f2931162a69c7010f675b70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:20:48 +0100 Subject: [PATCH 085/148] Refactor quantisation checks into its own function --- src/llama-quant.cpp | 140 ++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 83 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93b5fb0ebad..3544653a56b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -21,6 +21,60 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static bool is_quantizable(const std::string & name, const llm_arch arch, const llama_model_quantize_params * params) { + if (params->only_copy) { return false; } + + const auto tn = LLM_TN(arch); + + // This used to be a regex, but has an extreme cost to compile times. + bool q = name.size() >= 6 && name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // Do not quantize norm tensors + q &= name.find("_norm.weight") == std::string::npos; + + // Do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // These are very small (e.g. 4x4) + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + + // These are not too big so keep them as it is + q &= name.find("per_layer_model_proj") == std::string::npos; + + // Do not quantize positional embeddings and token types (BERT) + q &= name != tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // Do not quantize Jamba, Mamba, LFM2's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + + // Do not quantize ARWKV, RWKV's small yet 2D weights + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // Do not quantize relative position bias (T5) + q &= name.find("attn_rel_b.weight") == std::string::npos; + + return q; +} + static bool is_iq(const enum ggml_type t) { switch (t) { case GGML_TYPE_IQ1_S: @@ -684,40 +738,9 @@ static std::unordered_map target_bpw_type( return is_compatible(t, fb) ? 
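        // prefer the designated fallback type; F16 has block size 1 and is always compatible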
fb : GGML_TYPE_F16; }; - auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() to avoid drift - const std::string name = ggml_get_name(t); - bool q = name.rfind("weight") == name.size() - 6; - q &= ggml_n_dims(t) >= 2; - q &= name.find("_norm.weight") == std::string::npos; - q &= name.find("ffn_gate_inp.weight") == std::string::npos; - q &= name.find("altup") == std::string::npos; - q &= name.find("laurel") == std::string::npos; - q &= name.find("per_layer_model_proj") == std::string::npos; - q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); - q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); - q &= name.find("ssm_conv1d.weight") == std::string::npos; - q &= name.find("shortconv.conv.weight") == std::string::npos; - q &= name.find("time_mix_first.weight") == std::string::npos; - q &= name.find("time_mix_w0.weight") == std::string::npos; - q &= name.find("time_mix_w1.weight") == std::string::npos; - q &= name.find("time_mix_w2.weight") == std::string::npos; - q &= name.find("time_mix_v0.weight") == std::string::npos; - q &= name.find("time_mix_v1.weight") == std::string::npos; - q &= name.find("time_mix_v2.weight") == std::string::npos; - q &= name.find("time_mix_a0.weight") == std::string::npos; - q &= name.find("time_mix_a1.weight") == std::string::npos; - q &= name.find("time_mix_a2.weight") == std::string::npos; - q &= name.find("time_mix_g1.weight") == std::string::npos; - q &= name.find("time_mix_g2.weight") == std::string::npos; - q &= name.find("time_mix_decay_w1.weight") == std::string::npos; - q &= name.find("time_mix_decay_w2.weight") == std::string::npos; - q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= !params->only_copy; - - return q; + if (ggml_n_dims(t) < 2) { return false; } + return is_quantizable(ggml_get_name(t), model.arch, params); }; // Estimate error for a given type using a sampled subset of rows @@ -1747,57 +1770,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; - + bool quantize = ggml_n_dims(tensor) >= 2 && is_quantizable(name, model.arch, params); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 
4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba's small yet 2D weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d.weight") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; ggml_type new_type; void * new_data; From fecc472c6175bc65217d6f29855acf81477a5125 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:26:38 +0100 Subject: [PATCH 086/148] Fix typos in variable names --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3544653a56b..8a709ddfdd5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1165,7 +1165,7 @@ static std::unordered_map target_bpw_type( // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantised_buffer(f32_sample.size()); + std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? 
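        // optional per-slice bias weights passed to estimate_error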
nullptr : lambdas.data(); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; @@ -1175,7 +1175,7 @@ static std::unordered_map target_bpw_type( eval_workers.emplace_back([&] { // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); - std::vector tl_dequantised_buffer(dequantised_buffer.size()); + std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } @@ -1184,7 +1184,7 @@ static std::unordered_map target_bpw_type( const auto bpw = (float)tensor_bpw(tensor, tensor_types); const size_t bytes = tensor_bytes(tensor, tensor_types); const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda); eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); From 896cdc21217ab4d0b2bcb8b18938d3c0efc94dc1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:36 +0100 Subject: [PATCH 087/148] Refactor potential overflow --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8a709ddfdd5..52d7984e2a4 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1002,7 +1002,7 @@ static std::unordered_map target_bpw_type( const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } tensor->data = buffer.data(); From b748a1efa7dd0ab0d4064574530b4b045b27bbfc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:54 +0100 Subject: [PATCH 088/148] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 52d7984e2a4..2652f5c86e5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1709,7 +1709,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no aligment bias)" : "aligment bias (default)"); + LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? 
"MSE only (no alignment bias)" : "alignment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { From c855094dff509c97f6cc268e28f123262e67b6f7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:09:11 +0100 Subject: [PATCH 089/148] Exit loop if no better solution found --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2652f5c86e5..8ee052a8e55 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1347,9 +1347,12 @@ static std::unordered_map target_bpw_type( // increase mu until we get under budget or hit a safety cap { int expand = 0; + size_t prev_bytes_hi = std::numeric_limits::max(); while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); if (bytes_hi <= budget_bytes) { break; } + if (bytes_hi >= prev_bytes_hi) { break; } + prev_bytes_hi = bytes_hi; mu_hi *= 2.0; if (++expand > 60) { break; } // safety cap From 1fbc59f867b283d1f66a87a8b1f45d265cf69fca Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:10 +0100 Subject: [PATCH 090/148] Replace slope with cross product --- src/llama-quant.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8ee052a8e55..0b2f15f0a66 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1230,22 +1230,27 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - std::vector hull; hull.reserve(candidates.size()); - for (const auto & p : candidates) { + for (const auto & c : candidates) { + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; + while (hull.size() >= 2) { - const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - const double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) hull.pop_back(); - else { break; } + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + hull.pop_back(); + } else { + break; + } } - hull.push_back(p); + hull.push_back(c); } + candidates.swap(hull); }; From f184450806163bd1af0eecaff5c31639cf3eaf8f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:42 +0100 Subject: [PATCH 091/148] Fix minor logic flaw --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b2f15f0a66..4c0ec3063a9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -957,7 +957,7 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = slice_bias_lambda ? 
total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + const double total_err = total_mse + total_bias; return std::isfinite(total_err) ? total_err : infinity; }; From d79ade2e8e45057d9006b0b096888501ae639aab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:26 +0100 Subject: [PATCH 092/148] Adjust for small vector size --- src/llama-quant.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4c0ec3063a9..08e1c97185d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -885,9 +885,8 @@ static std::unordered_map target_bpw_type( if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, n / 32); // but no more than ~3% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 7ba6001ec8fda89e7d513ced2da7b9aa3532cb70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:54 +0100 Subject: [PATCH 093/148] Simplify candidates sorting --- src/llama-quant.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08e1c97185d..f4c0ea0fcd9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1209,6 +1209,10 @@ static std::unordered_map target_bpw_type( if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); + const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + return a.bytes == b.bytes; + }); + candidates.erase(last, candidates.end()); // Pareto by bytes -> error std::vector pareto; From d36ee0a0a86a65e1d730e788d735c1606ebeb49a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:41:56 +0100 Subject: [PATCH 094/148] Add comments to explain magic numbers --- src/llama-quant.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f4c0ea0fcd9..93007f281ea 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -739,7 +739,7 @@ static std::unordered_map target_bpw_type( }; auto can_quantize = [&](const ggml_tensor * t) -> bool { - if (ggml_n_dims(t) < 2) { return false; } + if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors return is_quantizable(ggml_get_name(t), model.arch, params); }; @@ -882,10 +882,10 @@ static std::unordered_map target_bpw_type( auto trimmed_sum = [&](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; @@ -1289,7 +1289,7 @@ static 
std::unordered_map target_bpw_type( if (total_elems == 0) { return {}; } const double target_bpw = params->target_bpw; - size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; @@ -1362,8 +1362,8 @@ static std::unordered_map target_bpw_type( if (bytes_hi >= prev_bytes_hi) { break; } prev_bytes_hi = bytes_hi; - mu_hi *= 2.0; - if (++expand > 60) { break; } // safety cap + mu_hi *= 2.0; // double the penalty multiplier to reduce tensor sizes + if (++expand > 60) { break; } // safety cap to prevent an infinite loop } } @@ -1371,8 +1371,8 @@ static std::unordered_map target_bpw_type( double best_over_gap = infinity; double best_under_err = infinity; double best_over_err = infinity; - for (int it = 0; it < 40; ++it) { - double mu = 0.5 * (mu_lo + mu_hi); + for (int it = 0; it < 40; ++it) { // binary search iterations for optimal Lagrange multiplier (40 ≈ 1e-12 precision) + double mu = 0.5 * (mu_lo + mu_hi); // midpoint of current bounds lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); @@ -1435,7 +1435,7 @@ static std::unordered_map target_bpw_type( if (cur_bytes + delta > budget_bytes) { continue; } double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - double ratio = err_gain / (double)(delta * 8); + double ratio = err_gain / (double)(delta * 8); // error reduction per bit if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; best_delta = delta; From 8eedcf74bc4df64eb7fe5b4935390dc9ad73d104 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:42:37 +0100 Subject: [PATCH 095/148] Increase scale multiplier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93007f281ea..0f05c8f9566 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -983,7 +983,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a74b410f5f6bd11ff42cc1f40fa93242d0f67940 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:49:47 +0100 Subject: [PATCH 096/148] Move is_iq() into a lambda and remove unused variables --- src/llama-quant.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f05c8f9566..af564ce03e0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -727,11 +727,28 @@ static std::unordered_map target_bpw_type( return (double)bytes * 8.0 / (double)ggml_nelements(t); }; - auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t blck = ggml_blck_size(typ); return blck <= 1 || (t->ne[0] % blck) == 0; }; + auto is_iq = [](const enum ggml_type t) { + switch (t) { + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_NL: 
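+            // (all i-quants; these candidates are evaluated only when a usable imatrix is available)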
+ case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } + }; + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); @@ -995,8 +1012,6 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { - std::vector workers; - workers.reserve(std::max(1, nthread)); ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } From dbdd179a92426c2031e4bee1ba0ccace45ea29fe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:50:20 +0100 Subject: [PATCH 097/148] Combine quant types --- src/llama-quant.cpp | 75 ++++++++------------------------------------- 1 file changed, 13 insertions(+), 62 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index af564ce03e0..f36b9202d53 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -75,43 +75,6 @@ static bool is_quantizable(const std::string & name, const llm_arch arch, const return q; } -static bool is_iq(const enum ggml_type t) { - switch (t) { - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - return true; - default: - return false; - } -} - -static bool is_iq(const enum llama_ftype t) { - switch (t) { - case LLAMA_FTYPE_MOSTLY_IQ1_S: - case LLAMA_FTYPE_MOSTLY_IQ1_M: - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: - case LLAMA_FTYPE_MOSTLY_IQ2_XS: - case LLAMA_FTYPE_MOSTLY_IQ2_S: - case LLAMA_FTYPE_MOSTLY_IQ2_M: - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: - case LLAMA_FTYPE_MOSTLY_IQ3_XS: - case LLAMA_FTYPE_MOSTLY_IQ3_S: - case LLAMA_FTYPE_MOSTLY_IQ3_M: - case LLAMA_FTYPE_MOSTLY_IQ4_XS: - case LLAMA_FTYPE_MOSTLY_IQ4_NL: - return true; - default: - return false; - } -} - static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -678,33 +641,21 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - constexpr ggml_type k_quants[] = { - GGML_TYPE_Q2_K, - GGML_TYPE_Q3_K, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -// TODO: find better way to handle F16/BF16 -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif - }; - - constexpr ggml_type iq_quants[] = { + // subset of quantization types with the best accuracy/size tradeoff + constexpr ggml_type quant_types[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, - GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_S, + GGML_TYPE_Q2_K, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, - // TODO: find better way to handle F16/BF16 #ifdef GGML_USE_METAL GGML_TYPE_F16 #else @@ -896,7 +847,7 @@ static std::unordered_map target_bpw_type( } // Compute error per slice with trimmed aggregation - auto trimmed_sum = [&](std::vector & v) -> double { + auto trimmed_sum = [](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets @@ -978,7 +929,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, 
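+        // values: per-channel imatrix importance; activations: per-channel activation averages (derived from the in_sum/counts data)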
const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1141,8 +1092,8 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; size_t max_row_sz = 0; - const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); + const ggml_type * base_arr = quant_types; + const size_t base_sz = std::size(quant_types); std::vector compatible_candidates; compatible_candidates.reserve(base_sz); From dd4f4bd0b88c4d59613033ba941d85e7ce1d9547 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:23:48 +0100 Subject: [PATCH 098/148] Reduce bpw range --- src/llama-quant.cpp | 7 +------ tools/quantize/quantize.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f36b9202d53..03863520147 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,12 +655,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif + GGML_TYPE_Q8_0 }; constexpr double epsilon = 1e-12; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 03018cc3012..69e03179b3b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-bias: use mean square error estimation only (no aligment bias)\n"); printf(" Advanced option use MSE only and disable aligment bias error estimation\n"); @@ -484,13 +484,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 16.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 8.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); return false; } From d16945730eac146d87d158a97ef053f845921f01 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:25:29 +0100 Subject: [PATCH 099/148] Refactor outlier trimming --- src/llama-quant.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 03863520147..df36a705c2f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -847,8 +847,7 @@ static std::unordered_map target_bpw_type( if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 + int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 87cba659089342ef4e4c2209d9a750555ae140e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:26:30 +0100 Subject: [PATCH 100/148] Tighten worker allocator --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index df36a705c2f..90931f25e7b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1136,7 +1136,7 @@ static std::unordered_map target_bpw_type( std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { - const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); if (i >= compatible_candidates.size()) { break; } const ggml_type tensor_types = compatible_candidates[i]; From 8a2c71f471842a9b2dcc0bc33592cd7adb8b8dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:29 +0100 Subject: [PATCH 101/148] Check for direction reversal --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 90931f25e7b..601b9ada427 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1204,7 +1204,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance hull.pop_back(); } else { break; From 3d75b14c0f2fc605fb39a3cb425c4c2482b8d8f5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:58 +0100 Subject: [PATCH 102/148] Simplify dequantisation --- src/llama-quant.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 601b9ada427..316dd35fa86 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -819,25 +819,16 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); - } else { - for (size_t r = 0; r < sample_rows; ++r) { - 
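Note on the trimming above: the per-row errors are aggregated with a fixed fraction dropped from each tail, so a handful of pathological rows cannot dominate a tensor's error estimate. A minimal standalone sketch of the idea follows; the names are illustrative, not the PR's code, and it assumes the per-row errors arrive as a plain std::vector<double>:

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <vector>

    // Sum the middle of the distribution, dropping `tail` of the samples
    // from each end. Small inputs are summed whole, since trimming a
    // handful of samples is mostly noise.
    static double trimmed_sum_sketch(std::vector<double> v, double tail = 0.025) {
        const int64_t n = (int64_t)v.size();
        if (n == 0) { return 0.0; }
        if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); }
        const int64_t k = (int64_t)std::floor(tail * (double)n); // samples dropped per tail
        std::sort(v.begin(), v.end());
        return std::accumulate(v.begin() + k, v.end() - k, 0.0);
    }

PATCH 115 later revisits this and divides by the retained count, turning the trimmed sum into a trimmed mean.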
From 87cba659089342ef4e4c2209d9a750555ae140e3 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:26:30 +0100
Subject: [PATCH 100/148] Tighten worker allocator

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index df36a705c2f..90931f25e7b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1136,7 +1136,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
                 std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
                 for (;;) {
-                    const size_t i = cidx.fetch_add(1, std::memory_order_relaxed);
+                    const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
                     if (i >= compatible_candidates.size()) { break; }
 
                     const ggml_type tensor_types = compatible_candidates[i];

From 8a2c71f471842a9b2dcc0bc33592cd7adb8b8dfe Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:27:29 +0100
Subject: [PATCH 101/148] Check for direction reversal

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 90931f25e7b..601b9ada427 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1204,7 +1204,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             };
 
             while (hull.size() >= 2) {
-                if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) {
+                if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance
                     hull.pop_back();
                 } else {
                     break;
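The sign test in PATCH 101 is the standard monotone-chain convexity check over (bytes, error) candidate points; the commit's change to a small negative tolerance keeps nearly collinear points instead of dropping them. A self-contained sketch of the underlying construction, with a hypothetical point type (the PR's candidate struct carries more fields):

    #include <vector>

    struct pt { double x; double y; }; // x = bytes, y = estimated error

    // > 0: c turns left of a->b; <= 0: b is redundant for the lower hull.
    static double cross_product(const pt & a, const pt & b, const pt & c) {
        return (b.x - a.x) * (c.y - a.y) - (b.y - a.y) * (c.x - a.x);
    }

    // Lower-hull pass over points already sorted by x: pop points that
    // would create a concave corner, keep the rest.
    static std::vector<pt> lower_hull(const std::vector<pt> & sorted_by_x) {
        std::vector<pt> hull;
        for (const pt & c : sorted_by_x) {
            while (hull.size() >= 2 &&
                   cross_product(hull[hull.size() - 2], hull.back(), c) <= 0.0) {
                hull.pop_back();
            }
            hull.push_back(c);
        }
        return hull;
    }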
From 3d75b14c0f2fc605fb39a3cb425c4c2482b8d8f5 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:27:58 +0100
Subject: [PATCH 102/148] Simplify dequantisation

---
 src/llama-quant.cpp | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 601b9ada427..316dd35fa86 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -819,25 +819,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         // Dequantize into dequantized_buffer
         {
             const ggml_type_traits * traits = ggml_get_type_traits(quant_type);
-            if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) {
-                traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row));
-            } else {
-                for (size_t r = 0; r < sample_rows; ++r) {
-                    const uint8_t * src = quantized_buffer.data() + r * row_sz;
-                    float * dst = dequantized_buffer.data() + r * (size_t)n_per_row;
-                    if (quant_type == GGML_TYPE_F16) {
-                        ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
-                    } else if (quant_type == GGML_TYPE_BF16) {
-                        ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
-                    } else {
-                        if (!traits || !traits->to_float) {
-                            if (out_mse) { *out_mse = infinity; }
-                            if (out_proj) { *out_proj = 0.0; }
-                            return infinity;
-                        }
-                        traits->to_float(src, dst, (int)n_per_row);
-                    }
-                }
+            if (!traits || !traits->to_float) {
+                if (out_mse) { *out_mse = infinity; }
+                if (out_proj) { *out_proj = 0.0; }
+                return infinity;
+            }
+
+            for (size_t r = 0; r < sample_rows; ++r) {
+                const uint8_t * src = quantized_buffer.data() + r * row_sz;
+                float * dst = dequantized_buffer.data() + r * (size_t)n_per_row;
+                traits->to_float(src, dst, (int)n_per_row);
             }
         }

From e49e241d37e7fd7f25142ee514c9e129c304083b Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:28:39 +0100
Subject: [PATCH 103/148] Calculate bpw over all tensors

---
 src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 316dd35fa86..699264553ac 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1219,6 +1219,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     if (all.empty()) { return {}; }
 
+    // Compute total elements across all tensors and bytes for non-quantizable tensors
+    size_t nq_elements = 0;
+    size_t nq_bytes = 0;
+    for (const auto & it : ml.weights_map) {
+        const ggml_tensor * tensor = it.second.tensor;
+        const std::string name = it.first;
+        nq_elements += (size_t)ggml_nelements(tensor);
+        if (!is_quantizable(name, model.arch, params)) {
+            nq_bytes += ggml_nbytes(tensor);
+        }
+    }
+
     auto total_bytes = [&]() -> size_t {
         size_t tb = 0;
         for (const auto & ti : all) {
@@ -1228,19 +1240,20 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return tb;
     };
 
-    size_t total_elems = 0;
+    size_t q_elements = 0;
     size_t min_bytes = 0;
     size_t max_bytes = 0;
     for (const auto & ti : all) {
-        total_elems += (size_t)ti.n_elements;
+        q_elements += (size_t)ti.n_elements;
         min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor
         max_bytes += ti.candidate.back().bytes;  // largest candidate per tensor
     }
-    if (total_elems == 0) { return {}; }
+    if (q_elements == 0) { return {}; }
 
     const double target_bpw = params->target_bpw;
-    size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes
+    size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0);
+    size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes;
 
     auto emit_overrides = [&]() -> std::unordered_map<std::string, ggml_type> {
         std::unordered_map<std::string, ggml_type> overrides;
@@ -1374,29 +1387,35 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             int best_i = -1;
             int best_j = -1;
             double best_ratio = -1.0;
-            size_t best_delta = 0;
+            double best_gain = -1.0;
+
             for (int i = 0; i < (int)all.size(); ++i) {
                 const auto &ti = all[i];
                 int j = ti.choice + 1;
-                // skip same-bytes entries
                 while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; }
                 if (j >= (int)ti.candidate.size()) { continue; }
-                size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes;
-                if (cur_bytes + delta > budget_bytes) { continue; }
+                size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes;
+                if (cur_bytes + delta_bytes > budget_bytes) { continue; }
 
                 double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error);
-                double ratio = err_gain / (double)(delta * 8); // error reduction per bit
-                if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) {
+                if (err_gain < epsilon) { continue; } // no real improvement
+
+                double ratio = err_gain / (double)delta_bytes; // error reduction per byte
+                // For tie-breaking, prioritize the largest absolute error improvement.
+                if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) {
                     best_ratio = ratio;
-                    best_delta = delta;
+                    best_gain = err_gain;
                     best_i = i;
                     best_j = j;
                 }
             }
-            if (best_i < 0) { break; }
+
+            if (best_i < 0) { break; } // no more upgrades within budget found
+
+            size_t upgrade_cost = all[best_i].candidate[best_j].bytes - all[best_i].candidate[all[best_i].choice].bytes;
            all[best_i].choice = best_j;
-            cur_bytes += best_delta;
+            cur_bytes += upgrade_cost;
         }
     }
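The byte-budget arithmetic introduced in PATCH 103 is easiest to sanity-check with concrete numbers: the target bpw is applied to every element in the model, and the bytes already committed to non-quantizable tensors are then subtracted. A sketch with made-up figures:

    #include <cstdio>

    int main() {
        const double target_bpw   = 4.5;    // requested average bits per weight
        const double all_elements = 8.0e9;  // every element, quantizable or not (illustrative)
        const double fixed_bytes  = 1.2e9;  // bytes spent on tensors that will not be touched

        // Whole-model byte target, then the budget left for quantizable tensors.
        const double total_budget = target_bpw * all_elements / 8.0; // 4.5e9 bytes
        const double quant_budget = total_budget - fixed_bytes;      // 3.3e9 bytes

        printf("total %.0f bytes, %.0f left for quantizable tensors\n", total_budget, quant_budget);
        return 0;
    }

This is why the commit is titled "Calculate bpw over all tensors": before it, the budget was derived from quantizable elements only, so the final file's effective bpw drifted above the request.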
From b3b8a111a58a8a1586c763382463ccdf9bba3f6a Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 28 Sep 2025 18:45:25 +0100
Subject: [PATCH 104/148] Compute rows based on tensor shape and slice count

---
 src/llama-quant.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 699264553ac..7bfb8751aeb 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -650,9 +650,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_IQ3_XXS,
         GGML_TYPE_Q3_K,
         GGML_TYPE_IQ4_XS,
-        GGML_TYPE_Q4_1,
         GGML_TYPE_Q4_K,
-        GGML_TYPE_Q5_1,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
         GGML_TYPE_Q8_0
@@ -961,10 +959,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         ml.load_data_for(tensor);
 
         // Dequantize sampled rows into f32_sample
-        const int rows_sample_per_expert = activations_data ? 512 : 256;
         const int64_t n_per_row = tensor->ne[0];
         const int64_t nrows_total = tensor->ne[1];
         const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1;
+
+        // Compute rows based on tensor shape and slice count
+        auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t {
+            const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024;
+            const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrows_total
+            const double slice_budget = tensor_budget * scale_rows / std::max<int64_t>(1, n2);
+            const int64_t min_rows = has_acts ? 128 : 64;
+            const int64_t max_rows = 4096;
+            int64_t total_rows = std::llround(slice_budget / std::max<int64_t>(1, n));
+            total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows)));
+            if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors
+            return total_rows;
+        };
+
+        const int64_t rows_sample_per_expert = sample_rows(n_per_row, nrows_total, ne2, activations_data != nullptr);
         std::vector<float> f32_sample;
         f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row);
         std::vector<int64_t> rows_sample(ne2, 0);

From f5d8811ddde7533c561ad77d358d1d509a57ff9f Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 1 Oct 2025 19:04:43 +0100
Subject: [PATCH 105/148] Prioritise important tensors

---
 src/llama-quant.cpp | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7bfb8751aeb..a93d982e634 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -656,6 +656,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_Q8_0
     };
 
+    const char * important_tensors[] = {
+        ".output.weight",
+        ".attn_output.weight",
+        ".ffn_down.weight",
+        ".ffn_down_shexp.weight"
+    };
+
     constexpr double epsilon = 1e-12;
     constexpr double infinity = std::numeric_limits<double>::infinity();
     const char * func = __func__;
@@ -1288,6 +1295,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }
 
+    auto is_important = [&](const std::string & tensor_name) -> bool {
+        return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char * imp) {
+                return tensor_name.find(imp) != std::string::npos;
+            }
+        );
+    };
+
     // Lagrangian relaxation to minimise error subject to a bpw target constraint
     auto lagrange_penalty = [&](const double mu, std::vector<int> & choice, size_t & bytes, double & err) {
         choice.resize(all.size());
@@ -1295,11 +1309,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         err = 0.0;
         for (size_t i = 0; i < all.size(); ++i) {
             const auto & candidate = all[i].candidate;
+            const std::string tensor_name = ggml_get_name(all[i].w->tensor);
+            double effective_mu = mu;
+            if (is_important(tensor_name)) { effective_mu *= 0.1; } // important tensors get 10x lower penalty
+
             int best_j = 0;
             double best_val = infinity;
             for (int j = 0; j < (int)candidate.size(); ++j) {
                 const double bits = (double)candidate[j].bytes * 8.0;
-                const double val = candidate[j].error + mu * bits;
+                const double val = candidate[j].error + effective_mu * bits;
                 if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) {
                     best_val = val;
                     best_j = j;
@@ -1402,18 +1420,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         double best_gain = -1.0;
 
         for (int i = 0; i < (int)all.size(); ++i) {
-            const auto &ti = all[i];
+            const auto & ti = all[i];
+            const std::string tensor_name = ggml_get_name(ti.w->tensor);
             int j = ti.choice + 1;
             while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; }
-            if (j >= (int)ti.candidate.size()) { continue; }
+            if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available
             size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes;
-            if (cur_bytes + delta_bytes > budget_bytes) { continue; }
+            if (cur_bytes + delta_bytes > budget_bytes) { continue; } // won't fit in budget
 
             double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error);
-            if (err_gain < epsilon) { continue; } // no real improvement
+            if (err_gain < epsilon) { continue; } // no error improvement
 
             double ratio = err_gain / (double)delta_bytes; // error reduction per byte
+            if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost
+
             // For tie-breaking, prioritize the largest absolute error improvement.
             if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) {
                 best_ratio = ratio;
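PATCH 105's "effective mu" folds tensor importance into the usual Lagrangian trade-off error + mu * bits: scaling mu down for important tensors makes extra bits cheaper for them, so they settle on larger, more accurate types at the same global penalty. A compact sketch of one penalty evaluation, with hypothetical types (the real code also tie-breaks on bytes and sweeps mu until the byte total fits the budget):

    #include <cstddef>
    #include <vector>

    struct cand { double error; double bits; };

    // Pick, for one tensor, the candidate minimising error + mu * bits.
    static size_t pick_candidate(const std::vector<cand> & cands, double mu, bool important) {
        const double effective_mu = important ? mu * 0.1 : mu; // bits are 10x cheaper here
        size_t best = 0;
        double best_val = cands[0].error + effective_mu * cands[0].bits;
        for (size_t j = 1; j < cands.size(); ++j) {
            const double val = cands[j].error + effective_mu * cands[j].bits;
            if (val < best_val) { best_val = val; best = j; }
        }
        return best;
    }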
From 940db63144d7369f88145a099370cf1bd33a45b7 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Fri, 3 Oct 2025 11:08:02 +0100
Subject: [PATCH 106/148] Select quantization type if target_bpw is set unless
 user specifies type and threads

---
 tools/quantize/quantize.cpp | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 69e03179b3b..89cf0fbf80b 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -497,6 +497,24 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
     return true;
 }
 
+static const char * get_ftype(const float bpw) {
+    const std::map<float, const char *> quant_bpw = {
+        {1.5625, "IQ1_S"},
+        {1.7500, "IQ1_M"},
+        {2.0625, "IQ2_XXS"},
+        {2.6250, "Q2_K"},
+        {3.0625, "IQ3_XXS"},
+        {3.4375, "Q3_K"},
+        {4.2500, "IQ4_XS"},
+        {4.5000, "Q4_K"},
+        {5.5000, "Q5_K"},
+        {6.5625, "Q6_K"},
+        {8.5000, "Q8_0"}
+    };
+
+    return quant_bpw.lower_bound(bpw)->second;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -655,6 +673,7 @@ int main(int argc, char ** argv) {
 
     std::string ftype_str;
     std::string suffix = ".gguf";
+    std::vector<char *> tmp_argv(argv, argv + argc);
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
@@ -678,7 +697,21 @@ int main(int argc, char ** argv) {
     }
 
     arg_idx++;
-    if (argc <= arg_idx) {
+    // select quantization type if target_bpw is set unless user specifies type and threads
+    if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) {
+        auto * ftype = const_cast<char *>(get_ftype(params.target_bpw));
+        if (argc == arg_idx) {
+            tmp_argv.push_back(ftype);
+            tmp_argv.push_back(nullptr);
+            argv = const_cast<char **>(tmp_argv.data());
+            argc++;
+        } else {
+            tmp_argv.insert(tmp_argv.end() - 1, ftype);
+            tmp_argv.push_back(nullptr);
+            argv = const_cast<char **>(tmp_argv.data());
+            argc++;
+        }
+    } else if (argc <= arg_idx) {
         fprintf(stderr, "%s: missing ftype\n", __func__);
         return 1;
     }
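get_ftype() relies on std::map::lower_bound returning the first bucket whose key is >= the requested bpw; because parse_target_bpw caps the target at 8.0 and the largest key is 8.5, the iterator can never be end(). A small usage sketch of the same pattern (reduced bucket set for brevity):

    #include <cstdio>
    #include <map>

    int main() {
        const std::map<float, const char *> quant_bpw = {
            {2.6250f, "Q2_K"}, {4.5000f, "Q4_K"}, {8.5000f, "Q8_0"},
        };
        // 4.1 bpw is not a bucket boundary; lower_bound rounds up to Q4_K.
        const auto it = quant_bpw.lower_bound(4.1f);
        printf("4.1 bpw -> %s\n", it->second);
        return 0;
    }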
From 66d4aed173aba8b3b4e05c6d7b46ca8911ec7ddf Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 4 Oct 2025 08:21:01 +0100
Subject: [PATCH 107/148] Minor refactoring

---
 tools/quantize/quantize.cpp | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 89cf0fbf80b..d355f972742 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -700,17 +700,11 @@ int main(int argc, char ** argv) {
     // select quantization type if target_bpw is set unless user specifies type and threads
     if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) {
         auto * ftype = const_cast<char *>(get_ftype(params.target_bpw));
-        if (argc == arg_idx) {
-            tmp_argv.push_back(ftype);
-            tmp_argv.push_back(nullptr);
-            argv = const_cast<char **>(tmp_argv.data());
-            argc++;
-        } else {
-            tmp_argv.insert(tmp_argv.end() - 1, ftype);
-            tmp_argv.push_back(nullptr);
-            argv = const_cast<char **>(tmp_argv.data());
-            argc++;
-        }
+        if (argc == arg_idx) { tmp_argv.push_back(ftype); }
+        else { tmp_argv.insert(tmp_argv.end() - 1, ftype); }
+        tmp_argv.push_back(nullptr);
+        argv = const_cast<char **>(tmp_argv.data());
+        argc++;
     } else if (argc <= arg_idx) {
         fprintf(stderr, "%s: missing ftype\n", __func__);
         return 1;

From 560e8c9d70964320a0283936b0d8e9fd198356ee Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 14:41:42 +0100
Subject: [PATCH 108/148] Relax lambda clamping

---
 src/llama-quant.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a93d982e634..422c929f0c8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -701,7 +701,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     };
 
     auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
-        if (is_compatible(t, typ)) return typ;
+        if (is_compatible(t, typ)) { return typ; }
         ggml_type fb = fallback_type(typ);
         return is_compatible(t, fb) ? fb : GGML_TYPE_F16;
     };
@@ -941,7 +941,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (s1 > 0.0) {
                 const auto n = (double)n_per_row;
                 const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n);
-                l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0);
+                l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 16.0);
             }
 
             lambdas[(size_t)s] = l;
@@ -1035,7 +1035,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) {
                 const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
                 if (src_type == GGML_TYPE_F32) {
-                    auto src_f32 = (const float *)src_row;
+                    const auto * src_f32 = (const float *)src_row;
                    f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row);
                 } else {
                     row_to_fp32(src_row, row_buffer.data());
@@ -1173,7 +1173,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     // Keep only the pareto-optimal candidates and enforce convexity in (bytes, error) curve
     auto pareto_convex = [](std::vector<candidate_types> & candidates) {
-        if (candidates.empty()) return;
+        if (candidates.empty()) { return; }
 
         std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
             if (a.bytes != b.bytes) { return a.bytes < b.bytes; }

From 533cda3076b5ae26d120f04b7aaa813f7b7a5ac7 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:16:33 +0100
Subject: [PATCH 109/148] Add signal handler

---
 src/llama-quant.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 422c929f0c8..50c8dbf4238 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include <csignal>
 #include 
 #include 
 #include 
@@ -613,6 +614,12 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }
 
+static std::atomic<bool> bpw_stop{ false };
+
+static void signal_handler(int) {
+    bpw_stop.store(true, std::memory_order_relaxed);
+}
+
 // Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
     llama_model_loader & ml,
@@ -711,6 +718,22 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return is_quantizable(ggml_get_name(t), model.arch, params);
     };
 
+    auto install_signal_handlers = [] {
+        static std::once_flag once;
+        std::call_once(once, [] {
+            std::signal(SIGINT, signal_handler);
+            std::signal(SIGTERM, signal_handler);
+        });
+    };
+
+    auto uninstall_signal_handlers = [] {
+        static std::once_flag once;
+        std::call_once(once, [] {
+            std::signal(SIGINT, SIG_DFL);
+            std::signal(SIGTERM, SIG_DFL);
+        });
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From e48ca32f19095ba0c47058dc7a703c1bb52977e0 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:17:27 +0100
Subject: [PATCH 110/148] Add save_bpw_state()

---
 src/llama-quant.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 50c8dbf4238..3080b0ed715 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -734,6 +734,56 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         });
     };
 
+    // Saved state per tensor
+    struct saved_info {
+        std::vector<candidate_types> candidate;
+        int choice = -1;
+        float min_bpw = 0.0f;
+        float max_bpw = 0.0f;
+        size_t n_elements = 0;
+    };
+
+    auto save_bpw_state = [&](const std::vector<tensor_info> & all_vec) {
+        const std::string tmp = checkpoint_file + ".tmp";
+        std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
+        if (!ofs) { return; } // best-effort
+        const float target_bpw = params->target_bpw;
+        const uint8_t bias_mode = params->no_bias ? 1 : 0;
+        ofs.write((const char *)&file_magic, sizeof(file_magic));
+        ofs.write((const char *)&target_bpw, sizeof(target_bpw));
+        ofs.write((const char *)&bias_mode, sizeof(bias_mode));
+        const uint64_t n = all_vec.size();
+        ofs.write((const char *)&n, sizeof(n));
+        for (const auto & ti : all_vec) {
+            const std::string name = ggml_get_name(ti.w->tensor);
+            const uint32_t len = (uint32_t)name.size();
+            ofs.write((const char *)&len, sizeof(len));
+            ofs.write(name.data(), len);
+
+            const uint64_t cn = ti.candidate.size();
+            ofs.write((const char *)&cn, sizeof(cn));
+            ofs.write((const char *)&ti.choice, sizeof(ti.choice));
+            ofs.write((const char *)&ti.min_bpw, sizeof(ti.min_bpw));
+            ofs.write((const char *)&ti.max_bpw, sizeof(ti.max_bpw));
+            const uint64_t ne = ti.n_elements;
+            ofs.write((const char *)&ne, sizeof(ne));
+
+            for (const auto & c : ti.candidate) {
+                const int32_t t = c.type;
+                const uint64_t b = c.bytes;
+                ofs.write((const char *)&t, sizeof(t));
+                ofs.write((const char *)&c.bpw, sizeof(c.bpw));
+                ofs.write((const char *)&b, sizeof(b));
+                ofs.write((const char *)&c.error, sizeof(c.error));
+            }
+        }
+
+        ofs.close();
+        std::remove(checkpoint_file.c_str()); // TODO: handle errors
+        std::rename(tmp.c_str(), checkpoint_file.c_str());
+        LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From 02c3073b81cc7fa26219419c517331b3e3243379 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:18:36 +0100
Subject: [PATCH 111/148] Add load_bpw_state()

---
 src/llama-quant.cpp | 64 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3080b0ed715..4d0dc6a36e3 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -672,7 +672,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     constexpr double epsilon = 1e-12;
     constexpr double infinity = std::numeric_limits<double>::infinity();
+    constexpr uint32_t file_magic = 0x42505731; // BPW1
     const char * func = __func__;
+    const std::string checkpoint_file = ml.arch_name + ".bpw_state";
 
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
         const int64_t n_per_row = t->ne[0];
@@ -784,6 +786,68 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
     };
 
+    auto load_bpw_state = [&]() -> std::unordered_map<std::string, saved_info> {
+        std::unordered_map<std::string, saved_info> out;
+        std::ifstream ifs(checkpoint_file, std::ios::binary);
+        if (!ifs) { return out; }
+
+        uint32_t magic = 0;
+        float target_bpw = 0.0f;
+        uint8_t bias_mode = 0;
+        ifs.read((char *)&magic, sizeof(magic));
+        ifs.read((char *)&target_bpw, sizeof(target_bpw));
+        ifs.read((char *)&bias_mode, sizeof(bias_mode));
+        if (magic != file_magic) {
+            LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str());
+            return out;
+        }
+        if (target_bpw != params->target_bpw) {
+            LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str());
+            return out;
+        }
+        if (bias_mode != (params->no_bias ? 1 : 0)) {
+            LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str());
+            return out;
+        }
+
+        uint64_t n = 0;
+        ifs.read((char *)&n, sizeof(n));
+        for (uint64_t i = 0; i < n; ++i) {
+            uint32_t len = 0;
+            ifs.read((char *)&len, sizeof(len));
+            std::string name(len, '\0');
+            ifs.read(name.data(), len);
+
+            uint64_t cn = 0;
+            ifs.read((char *)&cn, sizeof(cn));
+
+            saved_info si;
+            ifs.read((char *)&si.choice, sizeof(si.choice));
+            ifs.read((char *)&si.min_bpw, sizeof(si.min_bpw));
+            ifs.read((char *)&si.max_bpw, sizeof(si.max_bpw));
+            uint64_t ne = 0;
+            ifs.read((char *)&ne, sizeof(ne));
+            si.n_elements = (size_t)ne;
+
+            si.candidate.resize(cn);
+            for (size_t j = 0; j < si.candidate.size(); ++j) {
+                int32_t t = 0;
+                uint64_t b = 0;
+                ifs.read((char *)&t, sizeof(t));
+                si.candidate[j].type = (ggml_type)t;
+                ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw));
+                ifs.read((char *)&b, sizeof(b));
+                si.candidate[j].bytes = (size_t)b;
+                ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error));
+            }
+
+            out.emplace(std::move(name), std::move(si));
+        }
+
+        LLAMA_LOG_INFO("%s: loaded bpw state for %lu tensors from %s\n", func, out.size(), checkpoint_file.c_str());
+        return out;
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From 74c62ed4e63e4e95f031875b6ead5718f5fb900a Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:19:03 +0100
Subject: [PATCH 112/148] Add delete_bpw_state()

---
 src/llama-quant.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 4d0dc6a36e3..9212c885632 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -848,6 +848,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return out;
     };
 
+    auto delete_bpw_state = [&] {
+        LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str());
+        std::remove(checkpoint_file.c_str());
+    };
+
+    auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
+        if (bpw_stop.load(std::memory_order_relaxed)) {
+            LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+            save_bpw_state(all_vec);
+            throw std::runtime_error("user interrupted the process");
+        }
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From 46706cec28ad83b8ab10781493b84343b5b0f048 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:20:28 +0100
Subject: [PATCH 113/148] Persist progress

---
 src/llama-quant.cpp | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 9212c885632..640672aec73 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1100,12 +1100,28 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return lambdas;
     };
 
+    install_signal_handlers();
+    auto bpw_data = load_bpw_state();
     std::vector<tensor_info> all;
     all.reserve(tensors.size());
     for (const auto * tw : tensors) {
         ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
         if (!can_quantize(tensor)) { continue; }
+        check_signal_handler(all);
+
+        // If we already have fully evaluated this tensor then reuse it
+        if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) {
+            tensor_info info;
+            info.w = tw;
+            info.candidate = it_saved->second.candidate;
+            info.choice = it_saved->second.choice;
+            info.min_bpw = it_saved->second.min_bpw;
+            info.max_bpw = it_saved->second.max_bpw;
+            info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor);
+            all.push_back(std::move(info));
+            continue;
+        }
 
         LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor));
         if (!ml.use_mmap) {
@@ -1296,6 +1312,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
             std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
             for (;;) {
+                if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived
                 const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
                 if (i >= compatible_candidates.size()) { break; }
 
@@ -1311,6 +1328,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         for (auto &th : eval_workers) { th.join(); }
 
+        // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry
+        if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) {
+            check_signal_handler(all);
+        }
+
         for (auto &c : eval_candidates) {
             if (c.bytes > 0) { info.candidate.push_back(c); }
         }
@@ -1384,6 +1406,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.min_bpw = info.candidate.front().bpw;
         info.max_bpw = info.candidate.back().bpw;
         all.push_back(std::move(info));
+        check_signal_handler(all); // save after each tensor
     }
 
     if (all.empty()) { return {}; }
@@ -1441,7 +1464,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }
     if (budget_bytes >= max_bytes) {
-        for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; }
+        for (auto & ti : all) { ti.choice = (int)ti.candidate.size() - 1; }
         return emit_overrides();
     }

From 84ada44894dec721124613820bf640b97ac3e784 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:20:56 +0100
Subject: [PATCH 114/148] Uninstall signal handler and cleanup

---
 src/llama-quant.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 640672aec73..eb5c9124b5f 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1625,6 +1625,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
     }
 
+    delete_bpw_state(); // we're done, clear any checkpoint
+    uninstall_signal_handlers();
+
     return emit_overrides();
 }

From 044fa783c7e5e87bddf667fbe7396628e827b455 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Mon, 6 Oct 2025 21:40:37 +0100
Subject: [PATCH 115/148] Fix trimming logic

---
 src/llama-quant.cpp | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index eb5c9124b5f..aeb1542607c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -849,8 +849,12 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     };
 
     auto delete_bpw_state = [&] {
-        LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str());
-        std::remove(checkpoint_file.c_str());
+        std::ifstream ifs(checkpoint_file);
+        if (ifs.good()) {
+            LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str());
+            std::remove(checkpoint_file.c_str());
+        }
+
     };
 
     auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
@@ -988,14 +992,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     }
 
     // Compute error per slice with trimmed aggregation
-    auto trimmed_sum = [](std::vector<double> & v) -> double {
+    auto trimmed_mean = [](std::vector<double> & v) -> double {
         const int64_t n = (int64_t)v.size();
         if (n == 0) { return 0.0; }
-        if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets
-
-        int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution
+        double sum = std::accumulate(v.begin(), v.end(), 0.0);
+        if (n < 50) { return sum / (double)n; } // too few elements to trim
+        int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 5% (2.5% each side)
         std::sort(v.begin(), v.end());
-        return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0);
+        const auto num = (double)(n - 2 * k);
+        sum = std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0);
+        return sum / std::max(1.0, num);
     };
 
     size_t off = 0;
@@ -1028,7 +1034,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         const double denom_x = row_sq_norm[ridx];
-        const double m_norm = w_mse / (denom_x + epsilon); 
+        const double m_norm = w_mse / (denom_x + epsilon);
         row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity);
 
         if (a) {
@@ -1044,9 +1050,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             off += (size_t)n_per_row;
         }
 
-        const double scale_rows = (double)nrows / std::max(1.0, (double)rs);
-        const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows;
-        const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0;
+        const double slice_mse = trimmed_mean(row_mse_norm) * (double)nrows;
+        const double slice_proj = a ? trimmed_mean(row_proj_norm) * (double)nrows : 0.0;
 
         total_mse += slice_mse;
         total_proj += slice_proj;

From c11184a3c11917aba2c3d360a9cbb3bf3ebaf38a Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Thu, 9 Oct 2025 11:58:01 +0100
Subject: [PATCH 116/148] Generate model ID hash

---
 src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index aeb1542607c..5388d5a072a 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -674,7 +674,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     constexpr double infinity = std::numeric_limits<double>::infinity();
     constexpr uint32_t file_magic = 0x42505731; // BPW1
     const char * func = __func__;
-    const std::string checkpoint_file = ml.arch_name + ".bpw_state";
 
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
         const int64_t n_per_row = t->ne[0];
@@ -745,6 +744,26 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t n_elements = 0;
     };
 
+    auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t {
+        uint64_t h = 5381;
+        for (size_t i = 0; i < n; ++i) {
+            h = (h << 5) + h + data[i];
+        }
+        return h ? h : 0xeabada55cafed00d;
+    };
+
+    auto metadata_id = [&](const gguf_context * ctx) -> uint64_t {
+        const size_t sz = gguf_get_meta_size(ctx);
+        std::vector<uint8_t> buf(sz);
+        gguf_get_meta_data(ctx, buf.data());
+        return djb2_hash(buf.data(), buf.size());
+    };
+
+    char hex[17];
+    const uint64_t model_id = metadata_id(ml.meta.get());
+    std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id);
+    const std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state";
+
     auto save_bpw_state = [&](const std::vector<tensor_info> & all_vec) {
         const std::string tmp = checkpoint_file + ".tmp";
         std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
@@ -752,6 +771,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const float target_bpw = params->target_bpw;
         const uint8_t bias_mode = params->no_bias ? 1 : 0;
         ofs.write((const char *)&file_magic, sizeof(file_magic));
+        ofs.write((const char *)&model_id, sizeof(model_id));
         ofs.write((const char *)&target_bpw, sizeof(target_bpw));
         ofs.write((const char *)&bias_mode, sizeof(bias_mode));
         const uint64_t n = all_vec.size();
@@ -781,9 +801,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         ofs.close();
-        std::remove(checkpoint_file.c_str()); // TODO: handle errors
+        std::remove(checkpoint_file.c_str());
         std::rename(tmp.c_str(), checkpoint_file.c_str());
-        LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+        LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
     };
 
     auto load_bpw_state = [&]() -> std::unordered_map<std::string, saved_info> {
@@ -792,22 +812,27 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (!ifs) { return out; }
 
         uint32_t magic = 0;
-        float target_bpw = 0.0f;
-        uint8_t bias_mode = 0;
+        uint64_t id = 0;
+        float bpw = 0.0f;
+        uint8_t bias = 0;
         ifs.read((char *)&magic, sizeof(magic));
-        ifs.read((char *)&target_bpw, sizeof(target_bpw));
-        ifs.read((char *)&bias_mode, sizeof(bias_mode));
+        ifs.read((char *)&id, sizeof(id));
+        ifs.read((char *)&bpw, sizeof(bpw));
+        ifs.read((char *)&bias, sizeof(bias));
         if (magic != file_magic) {
             LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
-        }
-        if (target_bpw != params->target_bpw) {
-            LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str());
+        } else if (id != model_id) {
+            LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
-        }
-        if (bias_mode != (params->no_bias ? 1 : 0)) {
+        } else if (bpw != params->target_bpw) {
+            LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str());
+            return out;
+        } else if (bias != (params->no_bias ? 1 : 0)) {
             LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
+        } else {
+            LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func);
         }
 
         uint64_t n = 0;
         ifs.read((char *)&n, sizeof(n));
@@ -859,7 +884,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
         if (bpw_stop.load(std::memory_order_relaxed)) {
-            LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+            LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
             save_bpw_state(all_vec);
             throw std::runtime_error("user interrupted the process");
         }
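PATCH 116 keys the checkpoint name on a djb2 hash of the GGUF metadata, so a state file is only resumed against the exact model that produced it (the 0xeabada55cafed00d fallback simply avoids a zero ID). The hash itself, isolated as a runnable sketch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint64_t djb2(const uint8_t * data, size_t n) {
        uint64_t h = 5381;
        for (size_t i = 0; i < n; ++i) {
            h = (h << 5) + h + data[i]; // h * 33 + byte
        }
        return h ? h : 0xeabada55cafed00dULL; // never return a zero ID
    }

    int main() {
        const char * meta = "example metadata blob"; // stand-in for the GGUF metadata bytes
        printf("%016llx\n", (unsigned long long)djb2((const uint8_t *)meta, strlen(meta)));
        return 0;
    }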
From 3a3d807fc3aacc01715047bcc893f925f5343c6b Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Fri, 10 Oct 2025 13:10:42 +0100
Subject: [PATCH 117/148] Remove bias mode computation

---
 src/llama-quant.cpp | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5388d5a072a..7b3e956193b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -769,11 +769,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
         if (!ofs) { return; } // best-effort
         const float target_bpw = params->target_bpw;
-        const uint8_t bias_mode = params->no_bias ? 1 : 0;
         ofs.write((const char *)&file_magic, sizeof(file_magic));
         ofs.write((const char *)&model_id, sizeof(model_id));
         ofs.write((const char *)&target_bpw, sizeof(target_bpw));
-        ofs.write((const char *)&bias_mode, sizeof(bias_mode));
         const uint64_t n = all_vec.size();
         ofs.write((const char *)&n, sizeof(n));
         for (const auto & ti : all_vec) {
@@ -814,11 +812,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         uint32_t magic = 0;
         uint64_t id = 0;
         float bpw = 0.0f;
-        uint8_t bias = 0;
         ifs.read((char *)&magic, sizeof(magic));
         ifs.read((char *)&id, sizeof(id));
         ifs.read((char *)&bpw, sizeof(bpw));
-        ifs.read((char *)&bias, sizeof(bias));
         if (magic != file_magic) {
             LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
@@ -828,9 +824,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         } else if (bpw != params->target_bpw) {
             LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str());
             return out;
-        } else if (bias != (params->no_bias ? 1 : 0)) {
-            LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str());
-            return out;
         } else {
             LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func);
         }
@@ -1319,13 +1312,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> lambdas;
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
         const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-        if (!params->no_bias) {
-            double acc = 0.0;
-            int ns = 0;
-            lambdas = estimate_lambda(values, activations, n_per_row, ne2);
-            for (float l : lambdas) { acc += l; ++ns; }
-            tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
-        }
+        double acc = 0.0;
+        int ns = 0;
+        lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+        for (float l : lambdas) { acc += l; ++ns; }
+        tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
 
         // Evaluate candidates
         std::vector<candidate_types> eval_candidates(compatible_candidates.size());
@@ -1925,11 +1916,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     if (params->target_bpw != -1.0f && !params->only_copy) {
         if (params->imatrix) {
             if (params->activations) {
-                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__);
+                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__);
             } else {
-                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__);
+                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
             }
-            LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no alignment bias)" : "alignment bias (default)");
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
             bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
         } else {

From c93131cef6dbb4e415fd2b3625f644c6714e7465 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Fri, 10 Oct 2025 13:26:51 +0100
Subject: [PATCH 118/148] Remove --no-bias option

---
 include/llama.h             | 1 -
 src/llama-quant.cpp         | 3 +--
 tools/quantize/quantize.cpp | 6 +-----
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 16f61247272..1df8f96920c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -365,7 +365,6 @@ extern "C" {
         void * tensor_types; // pointer to vector containing tensor types
         void * prune_layers; // pointer to vector containing layer indices to prune
         float target_bpw; // target bits per weight (bpw)
-        bool no_bias; // use mean square error estimation only (no alignment bias)
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7b3e956193b..4ad5124d1ab 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -2180,8 +2180,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.kv_overrides =*/ nullptr,
         /*.tensor_type =*/ nullptr,
         /*.prune_layers =*/ nullptr,
-        /*.target_bpw =*/ -1.0f,
-        /*.no_bias =*/ false
+        /*.target_bpw =*/ -1.0f
     };
 
     return result;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index d355f972742..c254c3f6b24 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 [[noreturn]]
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable);
-    printf("       [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@@ -134,8 +134,6 @@ static void usage(const char * executable) {
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n");
     printf("      Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
-    printf("  --no-bias: use mean square error estimation only (no alignment bias)\n");
-    printf("      Advanced option to use MSE only and disable alignment bias error estimation\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -559,8 +557,6 @@ int main(int argc, char ** argv) {
         if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
             usage(argv[0]);
         }
-    } else if (strcmp(argv[arg_idx], "--no-bias") == 0) {
-        params.no_bias = true;
     } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
         if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
             usage(argv[0]);

From 5b0d3f6d5ad46596e0f30c967c00e2dc2b93d8da Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 11 Oct 2025 10:04:48 +0100
Subject: [PATCH 119/148] Automatically determine if bias error is significant

---
 src/llama-quant.cpp | 52 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 4ad5124d1ab..07a88f0fd68 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -637,6 +637,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         float bpw;
         size_t bytes;
         double error;
+        double mse = 0.0;
+        double proj = 0.0;
     };
 
     struct tensor_info {
@@ -1340,9 +1342,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     const ggml_type tensor_types = compatible_candidates[i];
                     const auto bpw = (float)tensor_bpw(tensor, tensor_types);
                     const size_t bytes = tensor_bytes(tensor, tensor_types);
+                    double mse = 0.0;
+                    double proj = 0.0;
                     const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
-                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda);
-                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err };
+                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
+                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
                 }
             });
         }
@@ -1354,8 +1358,48 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             check_signal_handler(all);
         }
 
-        for (auto &c : eval_candidates) {
-            if (c.bytes > 0) { info.candidate.push_back(c); }
+        // Check if biasing is needed
+        bool bias_needed = false;
+        if (!lambdas.empty()) {
+            int min_mse = -1;
+            int min_bias = -1;
+            {
+                double best_mse = std::numeric_limits<double>::infinity();
+                double best_err = std::numeric_limits<double>::infinity();
+                for (int i = 0; i < (int)eval_candidates.size(); ++i) {
+                    const auto & c = eval_candidates[i];
+                    if (c.bytes == 0) { continue; }
+                    if (c.mse < best_mse) {
+                        best_mse = c.mse;
+                        min_mse = i;
+                    }
+                    if (c.error < best_err) {
+                        best_err = c.error;
+                        min_bias = i;
+                    }
+                }
+            }
+
+            if (min_mse != min_bias) {
+                bias_needed = true;
+            } else {
+                double max_rel_bias = 0.0;
+                for (const auto & c : eval_candidates) {
+                    if (c.bytes == 0) { continue; }
+                    const double mse = std::max(c.mse, epsilon);
+                    const double bias_term = std::max(0.0, c.error - c.mse);
+                    const double rel = bias_term / mse;
+                    max_rel_bias = std::max(rel, max_rel_bias);
+                }
+
+                bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE?
+            }
+        }
+
+        for (auto & c : eval_candidates) {
+            if (c.bytes == 0) { continue; }
+            const double final_err = bias_needed ? c.error : c.mse;
+            info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj });
         }
 
         if (info.candidate.empty()) {
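The heuristic in PATCH 119 can be read as: keep the bias-adjusted error only when it would change a decision (a different per-tensor argmin) or when the bias term is a large fraction of the MSE for some candidate. A minimal sketch of the second test, using an illustrative struct rather than the PR's:

    #include <algorithm>
    #include <vector>

    struct eval { double error; double mse; }; // error = mse + bias term

    // True when, for any candidate, the bias term reaches 50% of its MSE.
    static bool bias_is_significant(const std::vector<eval> & cands, double eps = 1e-12) {
        double max_rel = 0.0;
        for (const eval & c : cands) {
            const double mse  = std::max(c.mse, eps);
            const double bias = std::max(0.0, c.error - c.mse);
            max_rel = std::max(max_rel, bias / mse);
        }
        return max_rel >= 0.5;
    }

When the test fails, the per-candidate errors collapse to plain MSE, which keeps the later knapsack stage from chasing noise in the bias estimate.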
From 12e0524f3a24d4d5c8a81546fff83fee81e0d3e1 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 12 Oct 2025 15:12:15 +0100
Subject: [PATCH 120/148] Reduce compute time by parallelising tensor
 processing - courtesy of https://github.com/ddh0

---
 src/llama-quant.cpp | 189 +++++++++++++++++++++++---------------------
 1 file changed, 101 insertions(+), 88 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 07a88f0fd68..c607651b05b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include <optional>
 
 // Quantization types. Changes to this struct must be replicated in quantize.cpp
 struct tensor_quantization {
@@ -623,7 +624,6 @@ static void signal_handler(int) {
 // Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
     llama_model_loader & ml,
-    std::vector<no_init<uint8_t>> & buffer,
     const llama_model & model,
     const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
     const std::map & mapped,
@@ -659,6 +659,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_IQ3_XXS,
         GGML_TYPE_Q3_K,
         GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
         GGML_TYPE_Q4_K,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
@@ -1127,16 +1128,22 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     install_signal_handlers();
     auto bpw_data = load_bpw_state();
-    std::vector<tensor_info> all;
-    all.reserve(tensors.size());
-    for (const auto * tw : tensors) {
+
+    // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
+    auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
+                              std::vector<no_init<uint8_t>> & thread_local_buffer,
+                              std::mutex & loader_mutex,
+                              std::mutex & log_mutex) -> std::optional<tensor_info>
+    {
         ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
-        if (!can_quantize(tensor)) { continue; }
-        check_signal_handler(all);
+        if (bpw_stop.load(std::memory_order_relaxed)) {
+            return std::nullopt;
+        }
 
-        // If we already have fully evaluated this tensor then reuse it
-        if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) {
+        // check for pre-computed results from a checkpoint file.
+        auto it_saved = bpw_data.find(name);
+        if (it_saved != bpw_data.end()) {
             tensor_info info;
             info.w = tw;
             info.candidate = it_saved->second.candidate;
@@ -1144,17 +1151,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             info.min_bpw = it_saved->second.min_bpw;
             info.max_bpw = it_saved->second.max_bpw;
             info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor);
-            all.push_back(std::move(info));
-            continue;
+            return info;
+        }
+        {
+            std::lock_guard<std::mutex> lock(log_mutex);
+            LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor));
         }
-
-        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor));
 
         if (!ml.use_mmap) {
-            if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); }
-            tensor->data = buffer.data();
+            if (thread_local_buffer.size() < ggml_nbytes(tensor)) { thread_local_buffer.resize(ggml_nbytes(tensor)); }
+            tensor->data = thread_local_buffer.data();
+        }
+        {
+            std::lock_guard<std::mutex> lock(loader_mutex);
+            ml.load_data_for(tensor);
         }
-
-        ml.load_data_for(tensor);
 
         // Dequantize sampled rows into f32_sample
         const int64_t n_per_row = tensor->ne[0];
@@ -1170,7 +1181,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             const int64_t max_rows = 4096;
             int64_t total_rows = std::llround(slice_budget / std::max<int64_t>(1, n));
             total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows)));
-            if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors
+            if (rows <= min_rows * 2) { total_rows = rows; }
             return total_rows;
         };
@@ -1191,17 +1202,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 return;
             }
             if (t == GGML_TYPE_F16) {
-                ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row);
+                ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
                 return;
             }
             if (t == GGML_TYPE_BF16) {
-                ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row);
+                ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
                 return;
             }
-
             if (src_is_quant) {
                 GGML_ASSERT(src_traits && src_traits->to_float);
-                src_traits->to_float(src, dst, (int) n_per_row);
+                src_traits->to_float(src, dst, (int)n_per_row);
                 return;
             }
@@ -1266,6 +1276,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 return;
             }
 
+            std::lock_guard<std::mutex> lock(log_mutex);
             LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want);
         };
@@ -1276,12 +1287,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); }
         if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); }
 
-        const int64_t nelem = ggml_nelements(tensor);
         tensor_info info;
         info.w = tw;
-        info.n_elements = nelem;
-
-        // Prepare scratch buffers sized for the largest candidate row size
+        info.n_elements = ggml_nelements(tensor);
         size_t total_sampled_rows = f32_sample.size() / n_per_row;
 
         // Build list of candidate types first (compatible ones)
@@ -1295,7 +1303,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         for (size_t i = 0; i < base_sz; ++i) {
             ggml_type ts_type = base_arr[i];
             if (is_iq(ts_type) && !has_valid_imatrix) {
-                LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str());
+                std::lock_guard<std::mutex> lock(log_mutex);
+                LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str());
                 continue;
             }
@@ -1325,58 +1334,38 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
         std::vector<float> dequantized_buffer(f32_sample.size());
         const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
-        int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size()));
-        std::atomic<size_t> cidx{0};
-        std::vector<std::thread> eval_workers;
-        eval_workers.reserve(n_eval_threads);
-        for (int ti = 0; ti < n_eval_threads; ++ti) {
-            eval_workers.emplace_back([&] {
-                // thread-local scratch
-                std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
-                std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
-                for (;;) {
-                    if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived
-                    const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
-                    if (i >= compatible_candidates.size()) { break; }
-
-                    const ggml_type tensor_types = compatible_candidates[i];
-                    const auto bpw = (float)tensor_bpw(tensor, tensor_types);
-                    const size_t bytes = tensor_bytes(tensor, tensor_types);
-                    double mse = 0.0;
-                    double proj = 0.0;
-                    const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
-                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
-                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
-                }
-            });
-        }
+        for (size_t i = 0; i < compatible_candidates.size(); ++i) {
+            if (bpw_stop.load(std::memory_order_relaxed)) { break; }
 
-        for (auto &th : eval_workers) { th.join(); }
-
-        // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry
-        if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) {
-            check_signal_handler(all);
+            const ggml_type tensor_types = compatible_candidates[i];
+            const auto bpw = (float)tensor_bpw(tensor, tensor_types);
+            const size_t bytes = tensor_bytes(tensor, tensor_types);
+            double mse = 0.0;
+            double proj = 0.0;
+            const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
+                quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
+            eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
         }
 
+        if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
+
         // Check if biasing is needed
         bool bias_needed = false;
         if (!lambdas.empty()) {
             int min_mse = -1;
             int min_bias = -1;
-            {
-                double best_mse = std::numeric_limits<double>::infinity();
-                double best_err = std::numeric_limits<double>::infinity();
-                for (int i = 0; i < (int)eval_candidates.size(); ++i) {
-                    const auto & c = eval_candidates[i];
-                    if (c.bytes == 0) { continue; }
-                    if (c.mse < best_mse) {
-                        best_mse = c.mse;
-                        min_mse = i;
-                    }
-                    if (c.error < best_err) {
-                        best_err = c.error;
-                        min_bias = i;
-                    }
+            double best_mse = std::numeric_limits<double>::infinity();
+            double best_err = std::numeric_limits<double>::infinity();
+            for (int i = 0; i < (int)eval_candidates.size(); ++i) {
+                const auto & c = eval_candidates[i];
+                if (c.bytes == 0) { continue; }
+                if (c.mse < best_mse) {
+                    best_mse = c.mse;
+                    min_mse = i;
+                }
+                if (c.error < best_err) {
+                    best_err = c.error;
+                    min_bias = i;
                 }
             }
@@ -1388,8 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 if (c.bytes == 0) { continue; }
                 const double mse = std::max(c.mse, epsilon);
                 const double bias_term = std::max(0.0, c.error - c.mse);
-                const double rel = bias_term / mse;
-                max_rel_bias = std::max(rel, max_rel_bias);
+                max_rel_bias = std::max(bias_term / mse, max_rel_bias);
             }
 
             bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE?
@@ -1404,7 +1392,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         if (info.candidate.empty()) {
             // As a last resort, keep original type
-            float bpw = ggml_nbytes(tensor) * 8.0f / nelem;
+            float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements;
             info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 });
         }
@@ -1416,26 +1404,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
             return a.error < b.error;
         });
-        const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
+        candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
             return a.bytes == b.bytes;
-        });
-        candidates.erase(last, candidates.end());
-
-        // Pareto by bytes -> error
+        }), candidates.end());
         std::vector<candidate_types> pareto;
         pareto.reserve(candidates.size());
         double best_err = infinity;
-        size_t last_b = std::numeric_limits<size_t>::max();
         for (const auto & c : candidates) {
-            if (c.bytes != last_b) {
-                last_b = c.bytes;
-                if (c.error < best_err) {
-                    best_err = c.error;
-                    pareto.push_back(c);
-                }
+            if (c.error < best_err) {
+                best_err = c.error;
+                pareto.push_back(c);
             }
         }
-
         candidates.swap(pareto);
 
         if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull
@@ -1470,10 +1450,43 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.choice = 0;
         info.min_bpw = info.candidate.front().bpw;
         info.max_bpw = info.candidate.back().bpw;
-        all.push_back(std::move(info));
-        check_signal_handler(all); // save after each tensor
+
+        return info;
+    };
+
+    std::vector<tensor_info> all; // this vector will be populated by the parallel workers
+    {
+        std::atomic<size_t> tensor_idx{0}; // shared work queue index for all threads
+        const size_t num_tensors_to_process = tensors.size();
+        std::mutex loader_mutex;
+        std::mutex log_mutex;
+        std::mutex results_mutex;
+        std::vector<std::thread> workers;
+        int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process));
+
+        for (int i = 0; i < num_threads_to_spawn; ++i) {
+            workers.emplace_back([&]() {
+                std::vector<no_init<uint8_t>> thread_local_buffer;
+                while (true) {
+                    const size_t current_idx = tensor_idx.fetch_add(1);
+                    if (current_idx >= num_tensors_to_process) { break; }
+                    const auto * tw = tensors[current_idx];
+                    if (!can_quantize(tw->tensor)) { continue; }
+                    // Execute the main processing logic for this tensor
+                    std::optional<tensor_info> result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex);
+                    if (result_info) {
+                        std::lock_guard<std::mutex> lock(results_mutex);
+                        all.push_back(std::move(*result_info));
+                    }
+                }
+            });
+        }
+
+        for (auto & w : workers) { w.join(); }
     }
 
+    check_signal_handler(all);
+
     if (all.empty()) { return {}; }
 
     // Compute total elements across all tensors and bytes for non-quantizable tensors
@@ -1965,7 +1978,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
         }
         LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
+        bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
     } else {
         LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__);
     }
00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 16:30:35 +0100 Subject: [PATCH 121/148] Add quant types --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c607651b05b..56e63f9bb76 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,8 +655,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -1155,7 +1158,7 @@ static std::unordered_map target_bpw_type( } { std::lock_guard lock(log_mutex); - LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", func, name.c_str(), ggml_nelements(tensor)); } if (!ml.use_mmap) { @@ -1457,19 +1460,19 @@ static std::unordered_map target_bpw_type( std::vector all; // this vector will be populated by the parallel workers { std::atomic tensor_idx{0}; // shared work queue index for all threads - const size_t num_tensors_to_process = tensors.size(); + const size_t tensors_to_process = tensors.size(); std::mutex loader_mutex; std::mutex log_mutex; std::mutex results_mutex; std::vector workers; - int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process)); + int threads_to_spawn = std::max(1, std::min(nthread, (int)tensors_to_process)); - for (int i = 0; i < num_threads_to_spawn; ++i) { + for (int i = 0; i < threads_to_spawn; ++i) { workers.emplace_back([&]() { std::vector> thread_local_buffer; while (true) { const size_t current_idx = tensor_idx.fetch_add(1); - if (current_idx >= num_tensors_to_process) { break; } + if (current_idx >= tensors_to_process) { break; } const auto * tw = tensors[current_idx]; if (!can_quantize(tw->tensor)) { continue; } // Execute the main processing logic for this tensor From ca282302b5cde95945f8337e6df264d92e878501 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 18:23:23 +0100 Subject: [PATCH 122/148] Add --keep-bpw-state option --- include/llama.h | 1 + src/llama-quant.cpp | 16 +++++----------- tools/quantize/quantize.cpp | 5 ++++- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/llama.h b/include/llama.h index 14e12d7c518..f745e2110b7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,6 +366,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool keep_bpw_state; // keep bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 56e63f9bb76..4b243f1f557 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -659,7 +659,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, - GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -773,11 +772,9 @@ static std::unordered_map target_bpw_type( auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); - if (!ofs) { return; } // best-effort - const float target_bpw = params->target_bpw; + if (!ofs) { return; } ofs.write((const char 
*)&file_magic, sizeof(file_magic)); ofs.write((const char *)&model_id, sizeof(model_id)); - ofs.write((const char *)&target_bpw, sizeof(target_bpw)); const uint64_t n = all_vec.size(); ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { @@ -817,19 +814,14 @@ static std::unordered_map target_bpw_type( uint32_t magic = 0; uint64_t id = 0; - float bpw = 0.0f; ifs.read((char *)&magic, sizeof(magic)); ifs.read((char *)&id, sizeof(id)); - ifs.read((char *)&bpw, sizeof(bpw)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (bpw != params->target_bpw) { - LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); - return out; } else { LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } @@ -874,7 +866,7 @@ static std::unordered_map target_bpw_type( auto delete_bpw_state = [&] { std::ifstream ifs(checkpoint_file); - if (ifs.good()) { + if (ifs.good() && !params->keep_bpw_state) { LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } @@ -1489,6 +1481,7 @@ static std::unordered_map target_bpw_type( } check_signal_handler(all); + if (params->keep_bpw_state) { save_bpw_state(all); } if (all.empty()) { return {}; } @@ -2240,7 +2233,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.keep_bpw_state =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c254c3f6b24..ad2563a48d2 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -134,6 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -557,6 +558,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { + params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From b1b58e67df30453edd64706abda76d3c42f0bb03 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 14:54:32 +0100 Subject: [PATCH 123/148] Refactor signal handlers --- src/llama-quant.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b243f1f557..d1fa4295530 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -632,6 +632,22 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { + // RAII guard for signal handlers + bpw_stop.store(false, std::memory_order_relaxed); + struct signal_scope_guard { + using handler_t = void (*)(int); + handler_t prev_int = SIG_DFL; + handler_t prev_term = SIG_DFL; + signal_scope_guard() { + prev_int = std::signal(SIGINT, signal_handler); + prev_term = std::signal(SIGTERM, signal_handler); + } + ~signal_scope_guard() { + std::signal(SIGINT, prev_int); + std::signal(SIGTERM, prev_term); + } + } _signal_guard; + struct candidate_types { ggml_type type; float bpw; @@ -724,22 +740,6 @@ static std::unordered_map target_bpw_type( return is_quantizable(ggml_get_name(t), model.arch, params); }; - auto install_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - }); - }; - - auto uninstall_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, SIG_DFL); - std::signal(SIGTERM, SIG_DFL); - }); - }; - // Saved state per tensor struct saved_info { std::vector candidate; @@ -1121,7 +1121,6 @@ static std::unordered_map target_bpw_type( return lambdas; }; - install_signal_handlers(); auto bpw_data = load_bpw_state(); // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 @@ -1700,7 +1699,6 @@ static std::unordered_map target_bpw_type( } delete_bpw_state(); // we're done, clear any checkpoint - uninstall_signal_handlers(); return emit_overrides(); } From cd734b89ce3b2af611fd168975a5921f33b475eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 15:15:23 +0100 Subject: [PATCH 124/148] Update quant types --- src/llama-quant.cpp | 3 ++- tools/quantize/quantize.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d1fa4295530..7543ec69618 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -681,7 +681,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + 
GGML_TYPE_F16 }; const char * important_tensors[] = { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ad2563a48d2..e67649beb97 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -501,6 +501,8 @@ static const char * get_ftype(const float bpw) { {1.5625, "IQ1_S"}, {1.7500, "IQ1_M"}, {2.0625, "IQ2_XXS"}, + {2.3125, "IQ2_XS"}, + {2.5625, "IQ2_S"}, {2.6250, "Q2_K"}, {3.0625, "IQ3_XXS"}, {3.4375, "Q3_K"}, From b7911f14314387e4101957d4eb4df9650660c877 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 17:46:45 +0100 Subject: [PATCH 125/148] Minor refactoring --- src/llama-quant.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7543ec69618..0f256eface7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1122,9 +1122,9 @@ static std::unordered_map target_bpw_type( return lambdas; }; - auto bpw_data = load_bpw_state(); + const auto bpw_data = load_bpw_state(); - // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1330,7 +1330,7 @@ static std::unordered_map target_bpw_type( std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data(); for (size_t i = 0; i < compatible_candidates.size(); ++i) { - if (bpw_stop.load(std::memory_order_relaxed)) { break; } + if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } const ggml_type tensor_types = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(tensor, tensor_types); @@ -1383,6 +1383,8 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? 
c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); + // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", + // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1426,7 +1428,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); } else { break; @@ -1670,7 +1672,6 @@ static std::unordered_map target_bpw_type( const auto & ti = all[i]; const std::string tensor_name = ggml_get_name(ti.w->tensor); int j = ti.choice + 1; - while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; From a6853ea2ae7d828e535874e6f2244786921df594 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:20:24 +0100 Subject: [PATCH 126/148] Add tensor type and depth heuristics --- src/llama-quant.cpp | 94 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f256eface7..38d20e3d0f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -16,6 +16,7 @@ #include #include #include +#include // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { @@ -685,13 +686,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_F16 }; - const char * important_tensors[] = { - ".output.weight", - ".attn_output.weight", - ".ffn_down.weight", - ".ffn_down_shexp.weight" - }; - constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 @@ -1544,11 +1538,89 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto is_important = [&](const std::string & tensor_name) -> bool { - return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char* imp) { - return tensor_name.find(imp) != std::string::npos; + auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + std::unordered_map scores; + for (const auto & ti : all_vec) { + const std::string name = ggml_get_name(ti.w->tensor); + float total_score = 0.0f; + float depth_score = 0.0f; + float type_score = 0.0f; + + // Depth component: output, embeddings & early/late layers are important + if (name.find("output.weight") != std::string::npos || + name.find("token_embd.weight") != std::string::npos) { + depth_score = 1.0f; + } + else if (name.find(".attn_output.weight") != std::string::npos) { + depth_score = 0.9f; + } else { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + depth_score = 0.2f + 0.6f * center_dist; + } + } + + // Type component: certain tensor types are more important + if (name.find("output.weight") != std::string::npos) { + type_score = 1.0f; + } else if 
(name.find(".attn_output.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".ffn_down.weight") != std::string::npos || + name.find(".ffn_down_shexp.weight") != std::string::npos || + name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.8f; + } else if (name.find(".attn_q.weight") != std::string::npos || + name.find(".attn_k.weight") != std::string::npos || + name.find(".attn_v.weight") != std::string::npos || + name.find(".attn_qkv.weight") != std::string::npos) { + type_score = 0.7f; + } else if (name.find(".ffn_up.weight") != std::string::npos || + name.find(".ffn_gate.weight") != std::string::npos || + name.find(".ffn_up_shexp.weight") != std::string::npos || + name.find(".ffn_gate_shexp.weight") != std::string::npos || + name.find(".ffn_up_exps.weight") != std::string::npos || + name.find(".ffn_gate_exps.weight") != std::string::npos) { + type_score = 0.6f; + } else if (name.find("token_embd.weight") != std::string::npos) { + type_score = 0.5f; } - ); + + // Weighted combination + total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth + scores[name] = total_score; + } + + return scores; + }; + + auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { + const auto scores = tensor_importance(all_vec); + + // Sort by score + std::vector> sorted_scores(scores.begin(), scores.end()); + std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); + + // Select top percentile + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + + std::unordered_set important; + for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { + important.insert(sorted_scores[i].first); + //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + } + + LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + return important; + }; + + const auto important_set = select_tensors(all); + + auto is_important = [&](const std::string & tensor_name) -> bool { + return important_set.count(tensor_name) > 0; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From 0b3e930d5204d3c4be96179835f5378811814247 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:41:26 +0100 Subject: [PATCH 127/148] Add option to override bpw state file name --- include/llama.h | 1 + src/llama-quant.cpp | 21 +++++++++++++++++++-- tools/quantize/quantize.cpp | 15 +++++++++++---- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/llama.h b/include/llama.h index f745e2110b7..ce04011e191 100644 --- a/include/llama.h +++ b/include/llama.h @@ -367,6 +367,7 @@ extern "C" { void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file + void * bpw_state; // pointer to bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 38d20e3d0f3..1dee52d58d9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -762,7 +762,23 @@ static std::unordered_map target_bpw_type( char hex[17]; const uint64_t model_id = metadata_id(ml.meta.get()); std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); - const std::string checkpoint_file = ml.arch_name + "-" + 
std::string(hex) + ".bpw_state"; + std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + if (params->keep_bpw_state && params->bpw_state) { + const auto * filename = static_cast(params->bpw_state); + std::ifstream ifs(filename, std::ios::binary); + if (ifs.good()) { + checkpoint_file = std::string(filename); + } else { + std::ofstream ofs(filename, std::ios::binary | std::ios::app); + if (ofs.is_open()) { + checkpoint_file = std::string(filename); + ofs.close(); + std::remove(checkpoint_file.c_str()); + } else { + LLAMA_LOG_WARN("%s: %s is not a valid file name. Using %s instead\n", func, filename, checkpoint_file.c_str()); + } + } + } auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; @@ -2306,7 +2322,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.keep_bpw_state =*/ false + /*.keep_bpw_state =*/ false, + /*.bpw_state =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index e67649beb97..945acbe2887 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,8 +117,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); + printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -128,13 +128,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); + printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. Example: --tensor-type attn_q=q8_0\n"); printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); + printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); + printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -562,6 +563,12 @@ int main(int argc, char ** argv) { } } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; + } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { + if (arg_idx < argc-1) { + params.bpw_state = argv[++arg_idx]; + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From a5103933bb4eec23b71bd8ccaae3b80710a1a82a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 15:11:48 +0100 Subject: [PATCH 128/148] Minor refactoring --- src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1dee52d58d9..b8391a4f2c2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -647,7 +647,7 @@ static std::unordered_map target_bpw_type( std::signal(SIGINT, prev_int); std::signal(SIGTERM, prev_term); } - } _signal_guard; + } signal_guard; struct candidate_types { ggml_type type; @@ -683,7 +683,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, +#ifdef GGML_USE_METAL GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; @@ -1004,17 +1008,30 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (!traits || !traits->to_float) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } - - for (size_t r = 0; r < sample_rows; ++r) { - const uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - traits->to_float(src, dst, (int)n_per_row); + if (quant_type == GGML_TYPE_F16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_fp16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_fp16_to_fp32_row(src, dst, (int)n_per_row); + } + } else if (quant_type == GGML_TYPE_BF16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_bf16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_bf16_to_fp32_row(src, dst, (int)n_per_row); + } + } else { + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + return infinity; + } + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + traits->to_float(src, dst, (int)n_per_row); + } } } @@ -1500,13 +1517,11 @@ static 
std::unordered_map target_bpw_type( // Compute total elements across all tensors and bytes for non-quantizable tensors size_t nq_elements = 0; size_t nq_bytes = 0; - for (const auto & it : ml.weights_map) { - const ggml_tensor * tensor = it.second.tensor; - const std::string name = it.first; + for (const auto * it : tensors) { + const ggml_tensor * tensor = it->tensor; + const std::string name = ggml_get_name(tensor); nq_elements += (size_t)ggml_nelements(tensor); - if (!is_quantizable(name, model.arch, params)) { - nq_bytes += ggml_nbytes(tensor); - } + if (!can_quantize(tensor)) { nq_bytes += ggml_nbytes(tensor); } } auto total_bytes = [&]() -> size_t { From fa1df81d49a0512cb4dc6b9b2afc10e7af86bcf2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 20:52:23 +0100 Subject: [PATCH 129/148] Finetune heuristics --- src/llama-quant.cpp | 51 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 91b127789cc..5e3893151c6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1577,13 +1577,9 @@ static std::unordered_map target_bpw_type( float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output, embeddings & early/late layers are important - if (name.find("output.weight") != std::string::npos || - name.find("token_embd.weight") != std::string::npos) { + // Depth component: output & early/late layers are important + if (name == "output.weight") { depth_score = 1.0f; - } - else if (name.find(".attn_output.weight") != std::string::npos) { - depth_score = 0.9f; } else { static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); std::smatch match; @@ -1591,38 +1587,40 @@ static std::unordered_map target_bpw_type( const int layer = std::stoi(match[1]); const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.2f + 0.6f * center_dist; + depth_score = 0.9f * center_dist; } } - // Type component: certain tensor types are more important - if (name.find("output.weight") != std::string::npos) { + // Type component: certain tensor types have more impact on model quality + if (name == "output.weight") { type_score = 1.0f; - } else if (name.find(".attn_output.weight") != std::string::npos) { - type_score = 0.9f; } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_shexp.weight") != std::string::npos || name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".attn_output.weight") != std::string::npos || + name.find(".time_mix_output.weight") != std::string::npos || + name.find(".attn_o.weight") != std::string::npos) { type_score = 0.8f; - } else if (name.find(".attn_q.weight") != std::string::npos || - name.find(".attn_k.weight") != std::string::npos || - name.find(".attn_v.weight") != std::string::npos || - name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.7f; } else if (name.find(".ffn_up.weight") != std::string::npos || name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_shexp.weight") != std::string::npos || - name.find(".ffn_gate_shexp.weight") != std::string::npos || name.find(".ffn_up_exps.weight") != std::string::npos || name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.6f; + type_score = 0.3f; + } else if (name.find(".attn_q.weight") != std::string::npos || + name.find(".attn_k.weight") 
!= std::string::npos || + name.find(".attn_v.weight") != std::string::npos || + name.find(".attn_qkv.weight") != std::string::npos) { + type_score = 0.2f; } else if (name.find("token_embd.weight") != std::string::npos) { - type_score = 0.5f; + type_score = 0.1f; } // Weighted combination - total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth - scores[name] = total_score; + total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + if (total_score != 0.0f) { + scores[name] = total_score; + LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); + } } return scores; @@ -1636,15 +1634,16 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { important.insert(sorted_scores[i].first); - //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); } - LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size(); + LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); return important; }; From 00ddf039b306882a8a15761624bcdd673f666f71 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:38:49 +0100 Subject: [PATCH 130/148] Update usage --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 945acbe2887..f994999e591 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); From 543b5a99db2b74e2b74cb87a222a25586479bd9b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:57:03 +0100 Subject: [PATCH 131/148] Fix lambda capture --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5e3893151c6..e6c9bfa7f0c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [](std::vector & candidates) { + auto pareto_convex = [epsilon](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 27bf25e93c9309b96a151c1d8c4eef8fdad0cb21 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 22:04:35 +0100 Subject: [PATCH 132/148] Fix lambda capture --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e6c9bfa7f0c..08f1b302934 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - constexpr double epsilon = 1e-12; + const double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; @@ -1118,7 +1118,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [epsilon](std::vector & candidates) { + auto pareto_convex = [&](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 04561d5782b930e781627eee5ffcbb6b06e8b558 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 21 Oct 2025 12:53:26 +0100 Subject: [PATCH 133/148] Update epsilon specifier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08f1b302934..5280b9a02af 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - const double epsilon = 1e-12; + constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; From d6ccd5649ac6db0ad87156cf92f036737cf82be3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 25 Oct 2025 12:09:20 +0100 Subject: [PATCH 134/148] Finetune heuristics --- src/llama-quant.cpp | 81 ++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
5280b9a02af..617c7d94737 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -838,7 +838,7 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else { - LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } uint64_t n = 0; @@ -1569,54 +1569,59 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + auto tensor_depth = [&](const std::string & name) -> float { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + + // Depth component: output, embeddings & early/late layers are important + if (name == "output.weight" || name == "token_embd.weight") { + return 1.0f; + } + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + return 0.01f + 0.9f * center_dist; + } + + return 0.0f; + }; + + auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { std::unordered_map scores; - for (const auto & ti : all_vec) { - const std::string name = ggml_get_name(ti.w->tensor); + for (const auto & t : all_tensors) { + const std::string name = ggml_get_name(t.w->tensor); float total_score = 0.0f; float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output & early/late layers are important + // Type component: certain tensor types have more impact on model quality + const std::vector>> tensor_scores = { + {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, + {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, + {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, + {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, + {0.2f, {"token_embd.weight"}} + }; if (name == "output.weight") { - depth_score = 1.0f; + type_score = 1.0f; } else { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.9f * center_dist; + for (const auto& ts : tensor_scores) { + const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { + return name.find(pattern) != std::string::npos; + }); + if (found) { + type_score = ts.first; + break; + } } } - - // Type component: certain tensor types have more impact on model quality - if (name == "output.weight") { - type_score = 1.0f; - } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_exps.weight") != std::string::npos) { - type_score = 0.9f; - } else if (name.find(".attn_output.weight") != std::string::npos || - name.find(".time_mix_output.weight") != std::string::npos || - name.find(".attn_o.weight") != std::string::npos) { - type_score = 0.8f; - } else if (name.find(".ffn_up.weight") != std::string::npos || - name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_exps.weight") != std::string::npos || - 
name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.3f; - } else if (name.find(".attn_q.weight") != std::string::npos || - name.find(".attn_k.weight") != std::string::npos || - name.find(".attn_v.weight") != std::string::npos || - name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.2f; - } else if (name.find("token_embd.weight") != std::string::npos) { - type_score = 0.1f; + if (type_score > 0.0f) { + depth_score = tensor_depth(name); } // Weighted combination - total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth if (total_score != 0.0f) { scores[name] = total_score; LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); @@ -1634,7 +1639,7 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { From 5303212324c90745eb82c3e5f5abb32b184cb7fa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 26 Oct 2025 17:40:52 +0000 Subject: [PATCH 135/148] Simplify tensor selection --- src/llama-quant.cpp | 99 +++++---------------------------------------- 1 file changed, 11 insertions(+), 88 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 617c7d94737..04f4ff341af 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -11,11 +11,12 @@ #include #include #include +#include +#include #include #include #include #include -#include #include // Quantization types. 
Changes to this struct must be replicated in quantize.cpp @@ -1151,7 +1152,7 @@ static std::unordered_map target_bpw_type( const auto bpw_data = load_bpw_state(); - // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Parallelize tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1569,93 +1570,15 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_depth = [&](const std::string & name) -> float { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - - // Depth component: output, embeddings & early/late layers are important - if (name == "output.weight" || name == "token_embd.weight") { - return 1.0f; - } - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - return 0.01f + 0.9f * center_dist; - } - - return 0.0f; - }; - - auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { - std::unordered_map scores; - for (const auto & t : all_tensors) { - const std::string name = ggml_get_name(t.w->tensor); - float total_score = 0.0f; - float depth_score = 0.0f; - float type_score = 0.0f; - - // Type component: certain tensor types have more impact on model quality - const std::vector>> tensor_scores = { - {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, - {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, - {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, - {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, - {0.2f, {"token_embd.weight"}} - }; - if (name == "output.weight") { - type_score = 1.0f; - } else { - for (const auto& ts : tensor_scores) { - const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { - return name.find(pattern) != std::string::npos; - }); - if (found) { - type_score = ts.first; - break; - } - } - } - if (type_score > 0.0f) { - depth_score = tensor_depth(name); - } - - // Weighted combination - total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth - if (total_score != 0.0f) { - scores[name] = total_score; - LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); - } - } - - return scores; - }; - - auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { - const auto scores = tensor_importance(all_vec); - - // Sort by score - std::vector> sorted_scores(scores.begin(), scores.end()); - std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); - - // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front - - std::unordered_set important; - for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { - important.insert(sorted_scores[i].first); - LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); - } - - const auto pct = 100.0 * (double)important.size() 
/ (double)sorted_scores.size(); - LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); - return important; - }; - - const auto important_set = select_tensors(all); - + // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - return important_set.count(tensor_name) > 0; + const auto important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + return important; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From f8863b9a80822bb58e7406fd35d4452a97c4639a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 28 Oct 2025 15:22:32 +0000 Subject: [PATCH 136/148] Minor refactoring --- src/llama-quant.cpp | 48 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 04f4ff341af..fdce1f4285b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -694,6 +694,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 + constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -731,7 +732,7 @@ static std::unordered_map target_bpw_type( auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) { return typ; } - ggml_type fb = fallback_type(typ); + const ggml_type fb = fallback_type(typ); return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; @@ -754,7 +755,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; } - return h ? h : 0xeabada55cafed00d; + return h ? 
h : arbitrary_magic; }; auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { @@ -795,7 +796,7 @@ static std::unordered_map target_bpw_type( ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { const std::string name = ggml_get_name(ti.w->tensor); - const uint32_t len = (uint32_t)name.size(); + const auto len = (uint32_t)name.size(); ofs.write((const char *)&len, sizeof(len)); ofs.write(name.data(), len); @@ -835,13 +836,14 @@ static std::unordered_map target_bpw_type( if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (id != model_id) { + } + if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else { - LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); + uint64_t n = 0; ifs.read((char *)&n, sizeof(n)); for (uint64_t i = 0; i < n; ++i) { @@ -862,15 +864,15 @@ static std::unordered_map target_bpw_type( si.n_elements = (size_t)ne; si.candidate.resize(cn); - for (size_t j = 0; j < si.candidate.size(); ++j) { + for (auto & s : si.candidate) { int32_t t = 0; uint64_t b = 0; ifs.read((char *)&t, sizeof(t)); - si.candidate[j].type = (ggml_type)t; - ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw)); + s.type = (ggml_type)t; + ifs.read((char *)&s.bpw, sizeof(s.bpw)); ifs.read((char *)&b, sizeof(b)); - si.candidate[j].bytes = (size_t)b; - ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error)); + s.bytes = (size_t)b; + ifs.read((char *)&s.error, sizeof(s.error)); } out.emplace(std::move(name), std::move(si)); @@ -886,7 +888,6 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } - }; auto check_signal_handler = [&](const std::vector & all_vec) { @@ -1198,10 +1199,10 @@ static std::unordered_map target_bpw_type( // Compute rows based on tensor shape and slice count auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t { const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024; - const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrt + const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large tensors const double slice_budget = tensor_budget * scale_rows / std::max(1, n2); const int64_t min_rows = has_acts ? 
128 : 64; - const int64_t max_rows = 4096; + constexpr int64_t max_rows = 4096; // row limit to avoid excessive memory use int64_t total_rows = std::llround(slice_budget / std::max(1, n)); total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); if (rows <= min_rows * 2) { total_rows = rows; } @@ -1246,7 +1247,7 @@ static std::unordered_map target_bpw_type( f32_sample.clear(); std::vector row_buffer(n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + std::mt19937 rng(std::hash{}(name) ^ arbitrary_magic ^ slice); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); int64_t offset = 0; @@ -1411,8 +1412,6 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); - // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", - // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1445,16 +1444,15 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; std::vector hull; hull.reserve(candidates.size()); for (const auto & c : candidates) { - auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { - const double dx1 = (double)h1.bytes - (double)h0.bytes; - const double dy1 = h1.error - h0.error; - const double dx2 = (double)p.bytes - (double)h0.bytes; - const double dy2 = p.error - h0.error; - return dx1 * dy2 - dx2 * dy1; - }; - while (hull.size() >= 2) { if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); From 6e32244a06b1ffe513b1694ee647e92c09904dac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 21:53:07 +0000 Subject: [PATCH 137/148] Read statistics from imatrix --- include/llama.h | 1 + src/llama-quant.cpp | 28 ++++++++++----- tools/quantize/quantize.cpp | 68 +++++++++++++++++++++++++++++-------- 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/include/llama.h b/include/llama.h index ce04011e191..517ef5e0fbe 100644 --- a/include/llama.h +++ b/include/llama.h @@ -368,6 +368,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fdce1f4285b..a8153494f92 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,6 +631,7 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, + const std::unordered_map> * statistics_data, const 
llama_model_quantize_params * params, int nthread ) { @@ -1815,6 +1816,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; + const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1845,6 +1847,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + if (params->statistics) { + statistics_data = static_cast>*>(params->statistics); + if (statistics_data) { + LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); + } + } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1999,15 +2007,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__); - } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); - } + const char* base_msg = params->activations + ? (params->statistics + ? "imatrix with activations and statistics provided, process will be more accurate\n" + : "imatrix with activations provided, process will be accurate\n") + : "imatrix without activations provided, process will be less accurate\n"; + if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } + else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); } else { - LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } } @@ -2269,7 +2280,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, - /*.bpw_state =*/ nullptr + /*.bpw_state =*/ nullptr, + /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index f994999e591..0b2b05b60a6 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,7 +221,8 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -256,24 +257,28 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; + const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; if (name.empty()) { continue; } - if (string_remove_suffix(name, sums2_suffix)) { - // in_sum2 + if (string_remove_suffix(name, sums_suffix)) { + // in_sum std::get<0>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, sums2_suffix)) { + // in_sum2 + std::get<1>(sums_counts_for[std::move(name)]) = cur; } else if (string_remove_suffix(name, counts_suffix)) { // counts - std::get<1>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, sums_suffix)) { - // in_sum std::get<2>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, stats_suffix)) { + // stats + std::get<3>(sums_counts_for[std::move(name)]) = cur; } else { // ignore other tensors @@ -282,11 +287,12 @@ static int load_imatrix(const std::string & imatrix_file, for (const auto & sc : sums_counts_for) { const std::string & name = sc.first; - const struct ggml_tensor * sums = std::get<2>(sc.second); - const struct ggml_tensor * sums2 = std::get<0>(sc.second); - const struct ggml_tensor * counts = std::get<1>(sc.second); + const struct ggml_tensor * sums = std::get<0>(sc.second); + const struct ggml_tensor * sums2 = std::get<1>(sc.second); + const struct ggml_tensor * counts = std::get<2>(sc.second); + const struct ggml_tensor * stats = std::get<3>(sc.second); - // check that sums, sums2 and counts have the same shape + // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); gguf_free(ctx_gguf); @@ -302,6 +308,19 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } + if (stats) { + auto & statistics = statistics_data[name]; + statistics.resize(ggml_nelements(stats)); + if (stats->type == GGML_TYPE_F32) { + std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); + } else { + fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", + __func__, ggml_type_name(stats->type), name.c_str()); + statistics.clear(); + statistics_data.erase(name); + } + + } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -354,10 +373,11 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); } if (values_data.empty()) { return m_last_call; } @@ -380,11 +400,20 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } + for (auto st = statistics_data.begin(); st != statistics_data.end();) { + auto pos = st->first.find(name); + if (pos != std::string::npos) { + st = statistics_data.erase(st); + } else { + ++st; + } + } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> 
tmp_activations; + std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -398,9 +427,16 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } + for (auto & s : statistics_data) { + auto pos = s.first.find(name); + if (pos != std::string::npos) { + tmp_statistics.emplace(std::move(s)); + } + } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); + statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -617,7 +653,8 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); + std::unordered_map> statistics_data; + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -657,6 +694,9 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } + if (!statistics_data.empty()) { + params.statistics = &statistics_data; + } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From c59bb6d49d025765091d7c83a9b95528395de283 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 22:11:40 +0000 Subject: [PATCH 138/148] Add Euclidean-Cosine score to identify important tensors --- src/llama-quant.cpp | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a8153494f92..957dd5f3677 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,12 +1571,25 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - const auto important = tensor_name == "output.weight" || - tensor_name.find(".ffn_down.weight") != std::string::npos || - tensor_name.find(".ffn_down_exps.weight") != std::string::npos || - tensor_name.find(".attn_output.weight") != std::string::npos || - tensor_name.find(".time_mix_output.weight") != std::string::npos || - tensor_name.find(".attn_o.weight") != std::string::npos; + bool important = false; + + if (statistics_data) { + float ecs = 0.0f; // Euclidean-Cosine score + const std::string key = remap_imatrix(tensor_name, mapped); + const auto tstats = statistics_data->find(key); + if (tstats != statistics_data->end() && !tstats->second.empty()) { + ecs = tstats->second.front(); + important = ecs == 100.0f; // mark as important if ecs is 100% + } + } else { + important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + } + return important; }; From ac8cfbdd12eb2207098e3bcc4aee9347aa8366bc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 17 Nov 2025 18:03:09 +0000 Subject: [PATCH 139/148] Improved is_important() logic --- src/llama-quant.cpp | 19 +++++++++++++++---- 1 file changed, 15 
insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 739172c70f4..1e8a2cda9c9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,8 +635,8 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { - // RAII guard for signal handlers bpw_stop.store(false, std::memory_order_relaxed); + // Signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1574,12 +1574,23 @@ static std::unordered_map target_bpw_type( bool important = false; if (statistics_data) { - float ecs = 0.0f; // Euclidean-Cosine score const std::string key = remap_imatrix(tensor_name, mapped); const auto tstats = statistics_data->find(key); if (tstats != statistics_data->end() && !tstats->second.empty()) { - ecs = tstats->second.front(); - important = ecs == 100.0f; // mark as important if ecs is 100% + float ecs = 0.0f; // Euclidean-Cosine score + float l2 = 0.0f; // L2 Euclidean Distance + float cs = 0.0f; // Cosine Similarity + try { + // ecs = tstats->second.at(0); + l2 = tstats->second.at(1); + cs = tstats->second.at(2); + } catch (std::out_of_range &) { + LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); + return false; + } + ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) + // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); + important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% } } else { important = tensor_name == "output.weight" || From a0ba913613235c1639f92877f09e82c3db6fef47 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Nov 2025 11:19:44 +0000 Subject: [PATCH 140/148] Fix lambda capture bug in Windows and initialise candidate_types struct --- src/llama-quant.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e8a2cda9c9..86ca165b6cb 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -652,10 +652,10 @@ static std::unordered_map target_bpw_type( } signal_guard; struct candidate_types { - ggml_type type; - float bpw; - size_t bytes; - double error; + ggml_type type = GGML_TYPE_COUNT; + float bpw = 0.0f; + size_t bytes = 0; + double error = 0.0; double mse = 0.0; double proj = 0.0; }; @@ -751,7 +751,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t { + auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t { uint64_t h = 5381; for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; From 9ec3e6e2629d294e7ae95ee58634c360475e67d7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:49:53 +0000 Subject: [PATCH 141/148] Remove processing statistics_data --- include/llama.h | 1 - src/llama-quant.cpp | 19 ++---------- tools/quantize/quantize.cpp | 61 ++++++------------------------------- 3 files changed, 12 insertions(+), 69 deletions(-) diff --git a/include/llama.h b/include/llama.h index 3515ee1a13b..c82a4147f4c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,6 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct 
llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 86ca165b6cb..99759a27c8f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,7 +631,6 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, - const std::unordered_map> * statistics_data, const llama_model_quantize_params * params, int nthread ) { @@ -1840,7 +1839,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; - const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1871,12 +1869,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (params->statistics) { - statistics_data = static_cast>*>(params->statistics); - if (statistics_data) { - LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); - } - } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -2031,16 +2023,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - const char* base_msg = params->activations - ? (params->statistics - ? "imatrix with activations and statistics provided, process will be more accurate\n" - : "imatrix with activations provided, process will be accurate\n") - : "imatrix without activations provided, process will be less accurate\n"; - if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } - else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); + + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } @@ -2305,7 +2291,6 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0b2b05b60a6..aabcd73986f 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,8 +221,7 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -257,10 +256,9 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; - const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; @@ -276,11 +274,7 @@ static int load_imatrix(const std::string & imatrix_file, } else if (string_remove_suffix(name, counts_suffix)) { // counts std::get<2>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, stats_suffix)) { - // stats - std::get<3>(sums_counts_for[std::move(name)]) = cur; - } - else { + } else { // ignore other tensors } } @@ -290,7 +284,6 @@ static int load_imatrix(const std::string & imatrix_file, const struct ggml_tensor * sums = std::get<0>(sc.second); const struct ggml_tensor * sums2 = std::get<1>(sc.second); const struct ggml_tensor * counts = std::get<2>(sc.second); - const struct ggml_tensor * stats = std::get<3>(sc.second); // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { @@ -308,19 +301,6 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } - if (stats) { - auto & statistics = statistics_data[name]; - statistics.resize(ggml_nelements(stats)); - if (stats->type == GGML_TYPE_F32) { - std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); - } else { - fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", - __func__, ggml_type_name(stats->type), name.c_str()); - statistics.clear(); - statistics_data.erase(name); - } - - } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -373,23 +353,22 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); } if (values_data.empty()) { return m_last_call; } if (!excluded_weights.empty()) { for (const auto & name : excluded_weights) { - for (auto it = values_data.begin(); it != values_data.end();) { - auto pos = it->first.find(name); + for (auto vt = values_data.begin(); vt != values_data.end();) { + auto pos = vt->first.find(name); if (pos != std::string::npos) { - it = values_data.erase(it); + vt = values_data.erase(vt); } else { - ++it; + ++vt; } } for (auto at = activations_data.begin(); at != activations_data.end();) { @@ -400,20 +379,11 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } - for (auto st = statistics_data.begin(); st != statistics_data.end();) { - auto pos = st->first.find(name); - if (pos != std::string::npos) { - st = statistics_data.erase(st); - } else { - ++st; - } - } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> tmp_activations; - std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -427,16 +397,9 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } - for (auto & s : statistics_data) { - auto 
pos = s.first.find(name); - if (pos != std::string::npos) { - tmp_statistics.emplace(std::move(s)); - } - } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); - statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -653,8 +616,7 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - std::unordered_map> statistics_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -694,9 +656,6 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } - if (!statistics_data.empty()) { - params.statistics = &statistics_data; - } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From 1c9993e13198a28db1b5a8e7cd0fcb5d6bcf89eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:51:04 +0000 Subject: [PATCH 142/148] Add --disable-tensor-importance option --- include/llama.h | 1 + src/llama-quant.cpp | 39 ++++++++++++++----------------------- tools/quantize/quantize.cpp | 4 ++++ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/include/llama.h b/include/llama.h index c82a4147f4c..1f5b2e8a2b2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,6 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + bool disable_tensor_importance; // treat all tensors equally during quantization } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 99759a27c8f..2b9aba091b9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1570,29 +1570,10 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - bool important = false; - - if (statistics_data) { - const std::string key = remap_imatrix(tensor_name, mapped); - const auto tstats = statistics_data->find(key); - if (tstats != statistics_data->end() && !tstats->second.empty()) { - float ecs = 0.0f; // Euclidean-Cosine score - float l2 = 0.0f; // L2 Euclidean Distance - float cs = 0.0f; // Cosine Similarity - try { - // ecs = tstats->second.at(0); - l2 = tstats->second.at(1); - cs = tstats->second.at(2); - } catch (std::out_of_range &) { - LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); - return false; - } - ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) - // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); - important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% - } - } else { - important = tensor_name == "output.weight" || + bool important = tensor_name == "output.weight"; + if (!important && !params->disable_tensor_importance) { + important = tensor_name.find(".attn_v.weight") != std::string::npos || + tensor_name.find(".time_mix_value.weight") != std::string::npos || 
tensor_name.find(".ffn_down.weight") != std::string::npos || tensor_name.find(".ffn_down_exps.weight") != std::string::npos || tensor_name.find(".attn_output.weight") != std::string::npos || @@ -2023,7 +2004,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__); + } else { + LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); + } + if (params->disable_tensor_importance) { + LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + } else { + LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); @@ -2291,6 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, + /*.disable_tensor_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index aabcd73986f..4fee8c91a1c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,6 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); + printf(" Advanced option to disable allocating more bpw budget to important tensors. 
It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -560,6 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { + params.disable_tensor_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 661600842096145db52a4c631bfe0303a5d454ee Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 24 Nov 2025 18:26:45 +0000 Subject: [PATCH 143/148] Use more descriptive option naming --- include/llama.h | 2 +- src/llama-quant.cpp | 10 +++++----- tools/quantize/quantize.cpp | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1f5b2e8a2b2..50e61d49761 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - bool disable_tensor_importance; // treat all tensors equally during quantization + bool no_importance; // allocate target bpw budget equitably across all tensors } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2b9aba091b9..c468a3e4fc9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,7 +1571,7 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { bool important = tensor_name == "output.weight"; - if (!important && !params->disable_tensor_importance) { + if (!important && !params->no_importance) { important = tensor_name.find(".attn_v.weight") != std::string::npos || tensor_name.find(".time_mix_value.weight") != std::string::npos || tensor_name.find(".ffn_down.weight") != std::string::npos || @@ -2009,10 +2009,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); } - if (params->disable_tensor_importance) { - LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + if (params->no_importance) { + LLAMA_LOG_INFO("%s: distributing bpw budget equitably across all tensors\n", __func__); } else { - LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + LLAMA_LOG_INFO("%s: assigning more bpw budget to important tensors\n", __func__); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); @@ -2281,7 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.disable_tensor_importance =*/ false + /*.no_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 4fee8c91a1c..dd4b860e1b9 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,9 +117,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, 
llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); - printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); + printf(" [--target-bpw n] [--no-importance] [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -134,8 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); - printf(" Advanced option to disable allocating more bpw budget to important tensors. It may increase quality for some models\n"); + printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); + printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -562,8 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { - params.disable_tensor_importance = true; + } else if (strcmp(argv[arg_idx], "--no-importance") == 0) { + params.no_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 69a32b6f508a4d0d38f52cf91cc8cd5b42a4bf62 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:28:43 +0000 Subject: [PATCH 144/148] Relax target bpw range --- tools/quantize/quantize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index dd4b860e1b9..ebeea653365 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. 
May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); @@ -485,13 +485,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 8.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 16.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); return false; } From 5b557ca958d3b0cb4293e12aafe21135c0c12142 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:30:20 +0000 Subject: [PATCH 145/148] Minor refactoring --- src/llama-quant.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c468a3e4fc9..2cb58d46bdb 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,7 +635,7 @@ static std::unordered_map target_bpw_type( int nthread ) { bpw_stop.store(false, std::memory_order_relaxed); - // Signal handlers + // SIGINT/SIGTERM signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1361,14 +1361,14 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < compatible_candidates.size(); ++i) { if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } - const ggml_type tensor_types = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(tensor, tensor_types); - const size_t bytes = tensor_bytes(tensor, tensor_types); + const ggml_type tensor_type = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_type); + const size_t bytes = tensor_bytes(tensor, tensor_type); double mse = 0.0; double proj = 0.0; - const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + const auto err = estimate_error(tensor, tensor_type, f32_sample, rows_sample, values, activations, quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; + eval_candidates[i] = candidate_types{ tensor_type, bpw, bytes, err, mse, proj }; } if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } From 229109f329c498078f84da39b2c1ebb807e60646 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:31:39 +0000 Subject: [PATCH 146/148] Increase importance boost for final pass --- src/llama-quant.cpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2cb58d46bdb..44f84ec949d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1714,7 +1714,7 @@ static std::unordered_map target_bpw_type( if (err_gain < epsilon) { continue; } // no error improvement double ratio = err_gain / (double)delta_bytes; // error reduction per byte - if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost + if (is_important(tensor_name)) { ratio *= 5.0; } // important tensors get 5x boost // For tie-breaking, prioritize the largest absolute error improvement. if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { From b97cda628960d66a9fcc301062a1dc3925feae9f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 23:52:51 +0000 Subject: [PATCH 147/148] Add B/F16 to get_ftype() --- tools/quantize/quantize.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ebeea653365..a1426ea4a3f 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -512,7 +512,12 @@ static const char * get_ftype(const float bpw) { {4.5000, "Q4_K"}, {5.5000, "Q5_K"}, {6.5625, "Q6_K"}, - {8.5000, "Q8_0"} + {8.5000, "Q8_0"}, +#ifdef GGML_USE_METAL + {16.0000, "F16"} +#else + {16.0000, "BF16"} +#endif }; return quant_bpw.lower_bound(bpw)->second; From 37cf51ebd032e63c7901835cdd85a0e7e9109e25 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 30 Nov 2025 00:29:35 +0000 Subject: [PATCH 148/148] Process bpw targets up to B/F16 --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44f84ec949d..6c6926dee85 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -2089,7 +2089,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && ggml_is_quantized(default_type)) { + if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
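
The sketches below restate, in standalone form, the core mechanisms this series touches. They are illustrative reductions under stated assumptions, not the in-tree implementations; any name not quoted from the hunks above is hypothetical.

First, the lower-envelope pruning whose cross_product lambda is hoisted out of the candidate loop in the first diff above. A minimal sketch, assuming candidates arrive sorted by ascending bytes and keeping only the two fields the hull pass reads:

#include <cstddef>
#include <cstdio>
#include <vector>

struct candidate { size_t bytes; double error; };

// Monotone lower hull over (bytes, error): a candidate lying on or above the
// segment joining its neighbours can never win the error-per-byte trade-off,
// so it is dropped before the greedy allocation runs.
static std::vector<candidate> lower_envelope(const std::vector<candidate> & sorted_by_bytes, const double epsilon) {
    auto cross_product = [](const candidate & h0, const candidate & h1, const candidate & p) -> double {
        const double dx1 = (double)h1.bytes - (double)h0.bytes;
        const double dy1 = h1.error - h0.error;
        const double dx2 = (double)p.bytes - (double)h0.bytes;
        const double dy2 = p.error - h0.error;
        return dx1 * dy2 - dx2 * dy1;
    };
    std::vector<candidate> hull;
    hull.reserve(sorted_by_bytes.size());
    for (const auto & c : sorted_by_bytes) {
        while (hull.size() >= 2 && cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) {
            hull.pop_back();
        }
        hull.push_back(c);
    }
    return hull;
}

int main() {
    // The middle point sits above the chord from (100, 1.0) to (300, 0.2),
    // so the envelope keeps only the two endpoints.
    const std::vector<candidate> cands = { { 100, 1.0 }, { 200, 0.9 }, { 300, 0.2 } };
    for (const auto & c : lower_envelope(cands, 1e-12)) {
        std::printf("%zu bytes -> error %.2f\n", c.bytes, c.error);
    }
    return 0;
}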
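
The sample_rows budget in that same diff decides how many rows to sample per expert slice. A worked reconstruction follows; the extraction dropped template arguments from the hunk, so std::max<int64_t> is assumed where the text shows std::max(1, n):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// n is the row width in elements, rows the total row count, n2 the number of
// slices (experts), has_acts whether activations are available.
static int64_t sample_rows(const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) {
    const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024;
    const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large tensors
    const double slice_budget = tensor_budget * scale_rows / (double)std::max<int64_t>(1, n2);
    const int64_t min_rows = has_acts ? 128 : 64;
    constexpr int64_t max_rows = 4096; // row limit to avoid excessive memory use
    int64_t total_rows = std::llround(slice_budget / (double)std::max<int64_t>(1, n));
    total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows)));
    if (rows <= min_rows * 2) { total_rows = rows; }
    return total_rows;
}

int main() {
    // A 4096-wide tensor with 4096 rows and one slice, activations available:
    // budget = 1 MiB, scale = 1.0, so roughly 1 MiB / 4096 = 256 rows.
    std::printf("%lld\n", (long long)sample_rows(4096, 4096, 1, true));
    return 0;
}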
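
Patch 139's Euclidean-Cosine score, removed again in patch 141 in favour of the name-based importance list, reduces to the following. The formula and the 99.99 threshold are verbatim from the patch; relative to the general form in its comment, p = 10 and q = 1:

#include <cmath>
#include <cstdio>

// ecs = 100 - (100 / (1 + (l2/p)^2)) * |cs|^q with p = 10, q = 1; l2 and cs
// are read from the per-tensor .stats vector of the imatrix.
static float ecs_score(const float l2, const float cs) {
    return 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs));
}

int main() {
    // A small distance with aligned directions scores near 0 (not boosted)...
    std::printf("ecs(l2 = 1, cs = 0.99)   = %.2f\n", ecs_score(1.0f, 0.99f));   // 1.98
    // ...while a large distance with low similarity approaches 100, clearing
    // the 99.99 threshold that marks a tensor as important in the patch.
    std::printf("ecs(l2 = 500, cs = 0.01) = %.4f\n", ecs_score(500.0f, 0.01f)); // 99.9996
    return 0;
}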
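
Patches 142 and 146 together shape one greedy selection step of the allocator: error reduction per extra byte, multiplied by 5 for important tensors (raised from 2 in patch 146), with ties broken on absolute gain. A reduced sketch; the upgrade struct and its fields are illustrative, not the in-tree types:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

struct upgrade {
    double err_gain;    // error reduction if this tensor moves to its next candidate
    size_t delta_bytes; // extra bytes that upgrade costs
    bool   important;   // result of is_important()
};

// Returns the index of the best upgrade, or -1 when nothing improves.
static int pick_best(const std::vector<upgrade> & options, const double epsilon) {
    int best = -1;
    double best_ratio = 0.0;
    double best_gain = 0.0;
    for (size_t i = 0; i < options.size(); ++i) {
        const upgrade & u = options[i];
        if (u.delta_bytes == 0 || u.err_gain < epsilon) { continue; } // no error improvement
        double ratio = u.err_gain / (double)u.delta_bytes;            // error reduction per byte
        if (u.important) { ratio *= 5.0; }                            // important tensors get 5x boost
        // For tie-breaking, prioritize the largest absolute error improvement.
        if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && u.err_gain > best_gain)) {
            best = (int)i;
            best_ratio = ratio;
            best_gain = u.err_gain;
        }
    }
    return best;
}

int main() {
    const std::vector<upgrade> opts = {
        { 0.010, 1000, false }, // 1.0e-5 per byte
        { 0.008, 1000, true  }, // 0.8e-5 per byte, boosted to 4.0e-5
    };
    std::printf("%d\n", pick_best(opts, 1e-12)); // prints 1
    return 0;
}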
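
Finally, patch 147's get_ftype() snaps a bpw value to the first preset at or above it via std::map::lower_bound(); because patch 144 caps --target-bpw at 16.0 and 16.0 is the largest key, the lookup cannot return end(). A sketch abridged to the entries visible in the hunk:

#include <cstdio>
#include <map>

int main() {
    static const std::map<float, const char *> quant_bpw = {
        { 4.5000f,  "Q4_K" },
        { 5.5000f,  "Q5_K" },
        { 6.5625f,  "Q6_K" },
        { 8.5000f,  "Q8_0" },
        { 16.0000f, "BF16" }, // F16 when built with GGML_USE_METAL
    };
    // lower_bound() returns the first preset whose effective bpw is >= target.
    std::printf("%s\n", quant_bpw.lower_bound(5.1f)->second);  // Q5_K
    std::printf("%s\n", quant_bpw.lower_bound(16.0f)->second); // BF16
    return 0;
}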