From ba7335efb363515052a5f8aa755e4a5cd1250150 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 09:54:29 +0100
Subject: [PATCH 001/148] Refactor variable name

---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/llama.h b/include/llama.h
index 545e957e5f5..b17e8f33533 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -354,6 +354,7 @@ extern "C" {
         bool pure;           // quantize all tensors to the default type
         bool keep_split;     // quantize to the same number of shards
         void * imatrix;      // pointer to importance matrix data
+        void * activations;  // pointer to activations data
         void * kv_overrides; // pointer to vector containing overrides
         void * tensor_types; // pointer to vector containing tensor types
         void * prune_layers; // pointer to vector containing layer indices to prune

From 4d9491141b591d31f7fb91940ef4b1cf41bf94f6 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:43:21 +0100
Subject: [PATCH 002/148] Add target_bpw parameter

---
 include/llama.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/llama.h b/include/llama.h
index b17e8f33533..f44e2383d0e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -358,6 +358,7 @@ extern "C" {
         void * kv_overrides; // pointer to vector containing overrides
         void * tensor_types; // pointer to vector containing tensor types
         void * prune_layers; // pointer to vector containing layer indices to prune
+        float target_bpw;    // target bits per weight (bpw)
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {

From cfec4048abc478cd2769d1908e3ecc53ad2f28bd Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:43:51 +0100
Subject: [PATCH 003/148] Update usage

---
 tools/quantize/quantize.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 470dc3d916b..b2d62f1490d 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -132,6 +132,7 @@ static void usage(const char * executable) {
     printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
     printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
     printf("      Advanced option to remove all tensors from the given layers\n");
+    printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
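Taken together, patches 001-002 expose the new knobs as plain struct members of llama_model_quantize_params. A minimal caller sketch against the API as declared above (illustrative only: file names are placeholders, and filling the two maps is what the tools/quantize changes in the later patches do):

    #include "llama.h"
    #include <string>
    #include <unordered_map>
    #include <vector>

    // per-column statistics keyed by tensor name, filled by the caller
    // (e.g. from an imatrix file, as in the quantize tool patches below)
    std::unordered_map<std::string, std::vector<float>> values;       // E[a^2]
    std::unordered_map<std::string, std::vector<float>> activations;  // E[a]

    llama_model_quantize_params qp = llama_model_quantize_default_params();
    qp.imatrix     = &values;
    qp.activations = &activations;
    qp.target_bpw  = 4.25f; // -1.0f (the default, see patch 010) leaves the search off
    llama_model_quantize("model-f16.gguf", "model-target-bpw.gguf", &qp);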
From 5e85fb3ff34c5253c3dfa51eb5b9b9bfd6aaaaea Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:46:36 +0100
Subject: [PATCH 004/148] Add parse_target_bpw()

---
 tools/quantize/quantize.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index b2d62f1490d..afd2edb156e 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -441,6 +441,27 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
     return true;
 }
 
+static bool parse_target_bpw(const char * data, float & target_bpw) {
+    if (!data) {
+        printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__);
+        return false;
+    }
+
+    try {
+        target_bpw = std::stof(data);
+        if (target_bpw < 0.0f || target_bpw > 8.0f) {
+            printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__);
+            return false;
+        }
+    }
+    catch (const std::exception & e) {
+        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data);
+        return false;
+    }
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);

From e6d55dc47b42054dcef4a72145cfffb3cb26bd0f Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:49:01 +0100
Subject: [PATCH 005/148] Load activations

---
 tools/quantize/quantize.cpp | 46 ++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index afd2edb156e..3d07abd2d0a 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -247,56 +247,69 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    const std::string sums_suffix{ ".in_sum2" };
+    const std::string sums2_suffix{ ".in_sum2" };
     const std::string counts_suffix{ ".counts" };
+    const std::string sums_suffix{ ".in_sum" };
 
-    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+    std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
     for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
         std::string name = cur->name;
 
         if (name.empty()) {
             continue;
         }
 
-        if (string_remove_suffix(name, sums_suffix)) {
+        if (string_remove_suffix(name, sums2_suffix)) {
             // in_sum2
-            sums_counts_for[std::move(name)].first = cur;
+            std::get<0>(sums_counts_for[std::move(name)]) = cur;
         } else if (string_remove_suffix(name, counts_suffix)) {
             // counts
-            sums_counts_for[std::move(name)].second = cur;
+            std::get<1>(sums_counts_for[std::move(name)]) = cur;
+        } else if (string_remove_suffix(name, sums_suffix)) {
+            // in_sum
+            std::get<2>(sums_counts_for[std::move(name)]) = cur;
+        }
+        else {
-        } else {
             // ignore other tensors
         }
     }
 
     for (const auto & sc : sums_counts_for) {
         const std::string & name = sc.first;
-        const struct ggml_tensor * sums   = sc.second.first;
-        const struct ggml_tensor * counts = sc.second.second;
+        const struct ggml_tensor * sums   = std::get<2>(sc.second);
+        const struct ggml_tensor * sums2  = std::get<0>(sc.second);
+        const struct ggml_tensor * counts = std::get<1>(sc.second);
 
-        if (!sums || !counts) {
+        // check that sums, sums2 and counts have the same shape
+        if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) {
             fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str());
             gguf_free(ctx_gguf);
             ggml_free(ctx);
             exit(1);
         }
 
-        const int64_t ne0 = sums->ne[0];
-        const int64_t ne1 = sums->ne[1];
+        const int64_t ne0 = sums2->ne[0];
+        const int64_t ne1 = sums2->ne[1];
 
-        auto & e = imatrix_data[name];
-        e.resize(ggml_nelements(sums));
+        auto & activations = activations_data[name];
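+        // values: per-column mean of squared activations E[a^2], from *.in_sum2 / counts
+        // activations: per-column mean activation E[a], from *.in_sum / counts (optional)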
+        auto & values = values_data[name];
+        if (sums) {
+            activations.resize(ggml_nelements(sums));
+        }
+        values.resize(ggml_nelements(sums2));
 
         float max_count = 0.0f;
         for (int64_t j = 0; j < ne1; ++j) {
             const float count = ((const float *) counts->data)[j];
             if (count > 0.0f) {
                 for (int64_t i = 0; i < ne0; ++i) {
-                    e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count;
+                    values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count;
+                    if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; }
                 }
             } else {
                 // Partial imatrix data, this tensor never got any input during calibration
                 for (int64_t i = 0; i < ne0; ++i) {
-                    e[j*ne0 + i] = 1;
+                    values[j*ne0 + i] = 1;
+                    if (sums) { activations[j*ne0 + i] = 0; }
                 }
             }
             if (count > max_count) {
@@ -304,7 +317,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {

From 10758dfd1ebcd9e6864be0cb6b24ff2df9050526 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:50:37 +0100
Subject: [PATCH 006/148] Populate activations_data with imatrix activations
 if present

---
 tools/quantize/quantize.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 3d07abd2d0a..c2a4767fc9e 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -561,10 +561,11 @@ int main(int argc, char ** argv) {
     }
 
     std::vector<std::string> imatrix_datasets;
-    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
-    if (!imatrix_data.empty()) {
-        params.imatrix = &imatrix_data;
+    std::unordered_map<std::string, std::vector<float>> values_data;
+    std::unordered_map<std::string, std::vector<float>> activations_data;
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data);
+    if (!values_data.empty()) {
+        params.imatrix = &values_data;
         {
             llama_model_kv_override kvo;
             std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);

From 0edbf0c176236b795d8707504388052839556b67 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:51:58 +0100
Subject: [PATCH 007/148] Process activations

---
 tools/quantize/quantize.cpp | 51 +++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index c2a4767fc9e..2c45adab751 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -215,7 +215,10 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
 }
 
-static int load_imatrix(const std::string & imatrix_file, std::vector<std::string> & imatrix_datasets, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file,
+    std::vector<std::string> & imatrix_datasets,
+    std::unordered_map<std::string, std::vector<float>> & values_data,
+    std::unordered_map<std::string, std::vector<float>> & activations_data) {
 
     struct ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
 
 static int prepare_imatrix(const std::string & imatrix_file,
         std::vector<std::string> & imatrix_dataset,
         const std::vector<std::string> & included_weights,
         const std::vector<std::string> & excluded_weights,
-        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+        std::unordered_map<std::string, std::vector<float>> & values_data,
+        std::unordered_map<std::string, std::vector<float>> & activations_data) {
     int m_last_call = -1;
     if (!imatrix_file.empty()) {
-        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data);
     }
-    if (imatrix_data.empty()) {
+    if (values_data.empty()) {
         return m_last_call;
     }
     if (!excluded_weights.empty()) {
         for (const auto & name : excluded_weights) {
-            for (auto it = imatrix_data.begin(); it != imatrix_data.end();) {
+            for (auto it = values_data.begin(); it != values_data.end();) {
                 auto pos = it->first.find(name);
                 if (pos != std::string::npos) {
-                    it = imatrix_data.erase(it);
+                    it = values_data.erase(it);
                 } else {
                     ++it;
                 }
             }
+            for (auto at = activations_data.begin(); at != activations_data.end();) {
+                auto pos = at->first.find(name);
+                if (pos != std::string::npos) {
+                    at = activations_data.erase(at);
+                } else {
+                    ++at;
+                }
+            }
         }
     }
     if (!included_weights.empty()) {
-        std::unordered_map<std::string, std::vector<float>> tmp;
+        std::unordered_map<std::string, std::vector<float>> tmp_values;
+        std::unordered_map<std::string, std::vector<float>> tmp_activations;
         for (const auto & name : included_weights) {
-            for (auto & e : imatrix_data) {
+            for (auto & e : values_data) {
                 auto pos = e.first.find(name);
                 if (pos != std::string::npos) {
-                    tmp.emplace(std::move(e));
+                    tmp_values.emplace(std::move(e));
+                }
+            }
+            for (auto & a : activations_data) {
+                auto pos = a.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp_activations.emplace(std::move(a));
                 }
             }
         }
-        imatrix_data = std::move(tmp);
+        values_data = std::move(tmp_values);
+        activations_data = std::move(tmp_activations);
+    }
+    if (!values_data.empty()) {
+        printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size()));
     }
-    if (!imatrix_data.empty()) {
-        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    if (!activations_data.empty()) {
+        printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size()));
     }
     return m_last_call;
 }

From e8774744584689db682866b71121597fe4d35c84 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:54:02 +0100
Subject: [PATCH 008/148] Process target_bpw parameter

---
 tools/quantize/quantize.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 2c45adab751..5331dec80ca 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -512,6 +512,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_model_kv_override> kv_overrides;
     std::vector<tensor_quantization> tensor_types;
     std::vector<int> prune_layers;
+    float target_bpw = -1.0f;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -538,6 +539,10 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--target-bpw") == 0) {
+            if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
             if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                 usage(argv[0]);

From 1b3d5b574414ffc03c5d575ef470c74f4e509a80 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:56:02 +0100
Subject: [PATCH 009/148] Populate params

---
 tools/quantize/quantize.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 5331dec80ca..86a96cdfcca 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -616,7 +616,7 @@ int main(int argc, char ** argv) {
             llama_model_kv_override kvo;
             std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
             kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.val_i64 = imatrix_data.size();
+            kvo.val_i64 = values_data.size();
             kv_overrides.emplace_back(std::move(kvo));
         }
 
@@ -628,6 +628,9 @@ int main(int argc, char ** argv) {
             kv_overrides.emplace_back(std::move(kvo));
         }
     }
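+    // forward the mean-activation table through the opaque pointer, mirroring params.imatrix above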
+    if (!activations_data.empty()) {
+        params.activations = &activations_data;
+    }
     if (!kv_overrides.empty()) {
         kv_overrides.emplace_back();
         kv_overrides.back().key[0] = 0;
@@ -639,6 +642,9 @@ int main(int argc, char ** argv) {
     if (!prune_layers.empty()) {
         params.prune_layers = &prune_layers;
     }
+    if (target_bpw != -1.0f) {
+        params.target_bpw = target_bpw;
+    }
 
     llama_backend_init();
 
@@ -701,7 +707,7 @@ int main(int argc, char ** argv) {
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-        params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+        params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && values_data.empty()) {
         fprintf(stderr, "\n==========================================================================================================\n");
         fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
         fprintf(stderr, "==========================================================================================================\n\n\n");

From a22a9deeeeb51e6f647bb185301b9874538d0324 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 10:57:44 +0100
Subject: [PATCH 010/148] Refactor variable and add target_bpw

---
 src/llama-quant.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1d0361cc166..2e1ca7216e9 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1062,9 +1062,11 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.pure                 =*/ false,
         /*.keep_split           =*/ false,
         /*.imatrix              =*/ nullptr,
+        /*.activations          =*/ nullptr,
         /*.kv_overrides         =*/ nullptr,
         /*.tensor_type          =*/ nullptr,
-        /*.prune_layers         =*/ nullptr
+        /*.prune_layers         =*/ nullptr,
+        /*.target_bpw           =*/ -1.0f
     };
 
     return result;

From c96b8eef949b479d505b63788d2c214e4221abcb Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:00:05 +0100
Subject: [PATCH 011/148] Add fallback_type enum

---
 src/llama-quant.cpp | 26 ++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 2e1ca7216e9..b2879bc8470 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -19,6 +19,32 @@ struct tensor_quantization {
     ggml_type quant = GGML_TYPE_COUNT;
 };
 
+static enum ggml_type fallback_type(const enum ggml_type new_type) {
+    switch (new_type) {
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
+            return GGML_TYPE_Q4_0; // symmetric-ish fallback
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_IQ4_XS:
+            return GGML_TYPE_IQ4_NL;
+        case GGML_TYPE_Q4_K:
+            return GGML_TYPE_Q5_0;
+        case GGML_TYPE_Q5_K:
+            return GGML_TYPE_Q5_1;
+        case GGML_TYPE_Q6_K:
+            return GGML_TYPE_Q8_0;
+        default:
+            return new_type;
+    }
+}
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
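The fallback chain above only matters for tensors whose row width is incompatible with a candidate's block size; its intended effect is easiest to see with concrete values (illustrative expectations against the switch above, not a test in the series):

    // Q4_K needs rows divisible by its 256-wide superblock; when they are not,
    // the compatibility check added in a later patch walks this chain instead:
    ggml_type t;
    t = fallback_type(GGML_TYPE_Q4_K);  // -> GGML_TYPE_Q5_0 (32-wide blocks)
    t = fallback_type(GGML_TYPE_IQ3_S); // -> GGML_TYPE_IQ4_NL
    t = fallback_type(GGML_TYPE_Q8_0);  // default case: type returned unchanged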
From 9adae08789aefeb945b55858afbdf047e818147f Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:00:50 +0100
Subject: [PATCH 012/148] Add is_iq()

---
 src/llama-quant.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index b2879bc8470..1e837a7d41c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -19,6 +19,22 @@ struct tensor_quantization {
     ggml_type quant = GGML_TYPE_COUNT;
 };
 
+static bool is_iq(const enum ggml_type t) {
+    switch (t) {
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ1_M:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
+            return true;
+        default:
+            return false;
+    }
+}
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
         case GGML_TYPE_TQ1_0:

From 017945a3b20726dc000da1245ecdbf539a7ba0cf Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:03:52 +0100
Subject: [PATCH 013/148] Validate if imatrix contains activations

---
 src/llama-quant.cpp | 48 ++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1e837a7d41c..fdda5d35a10 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -35,6 +35,7 @@ static bool is_iq(const enum ggml_type t) {
             return false;
     }
 }
+
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
@@ -61,6 +62,7 @@ static enum ggml_type fallback_type(const enum ggml_type new_type) {
             return new_type;
     }
 }
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -131,10 +133,11 @@ struct quantize_state_impl {
     int i_ffn_gate = 0;
     int i_ffn_up   = 0;
 
-    int n_k_quantized = 0;
-    int n_fallback    = 0;
+    int n_k_quantized = 0;
+    int n_fallback    = 0;
 
-    bool has_imatrix = false;
+    bool has_imatrix     = false;
+    bool has_activations = false;
 
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
@@ -652,14 +655,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     if (params->only_copy) {
         ftype = ml.ftype;
     }
-    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
+    const std::unordered_map<std::string, std::vector<float>> * values_data = nullptr;
+    const std::unordered_map<std::string, std::vector<float>> * activations_data = nullptr;
     if (params->imatrix) {
-        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
-        if (imatrix_data) {
-            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+        values_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+        if (values_data) {
+            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size()));
             qs.has_imatrix = true;
             // check imatrix for nans or infs
-            for (const auto & kv : *imatrix_data) {
+            for (const auto & kv : *values_data) {
                 for (float f : kv.second) {
                     if (!std::isfinite(f)) {
                         throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
                     }
                 }
             }
         }
     }
+    if (params->activations) {
+        activations_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->activations);
+        if (activations_data) {
+            LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size()));
+            qs.has_activations = true;
+            // check activations for nans or infs
+            for (const auto & kv : *activations_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("activations contain non-finite value %f\n", f));
+                    }
+                }
+            }
+        }
+    }
 
-    const size_t align = GGUF_DEFAULT_ALIGNMENT;
     gguf_context_ptr ctx_out { gguf_init_empty() };
 
     std::vector<int> prune_list = {};
@@ -846,6 +864,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     const auto tn = LLM_TN(model.arch);
     new_ofstream(0);
     for (const auto * it : tensors) {
+        const size_t align = GGUF_DEFAULT_ALIGNMENT;
         const auto & weight = *it;
         ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
@@ -864,10 +883,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
-               ++idx, ml.n_tensors,
-               ggml_get_name(tensor),
-               llama_format_tensor_shape(tensor).c_str(),
-               ggml_type_name(tensor->type));
+               ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type));
 
         // This used to be a regex, but <regex> has an extreme cost to compile times.
         bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
@@ -967,9 +983,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
             const int64_t nelements = ggml_nelements(tensor);
 
             const float * imatrix = nullptr;
-            if (imatrix_data) {
-                auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
-                if (it == imatrix_data->end()) {
+            if (values_data) {
+                auto it = values_data->find(remap_imatrix(tensor->name, mapped));
+                if (it == values_data->end()) {
                     LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                 } else {
                     if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {

From 92f49ab39949221ff84b4f70d4528e4f5f43db93 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:05:01 +0100
Subject: [PATCH 014/148] Add target_bpw_type() logic

---
 src/llama-quant.cpp | 482 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 482 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fdda5d35a10..1e24303c528 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -575,6 +575,488 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }
 
+// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality.
+// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert
+// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert
+// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2])
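+// Per sampled row, the error proxy computed below is
+//   err = (1/n) * sum_j e_j^2 * E[a_j^2] + bias_lambda * |(1/n) * sum_j e_j * E[a_j]|
+// with e_j = dequant_j - orig_j; E[a_j^2] is taken as 1 when no imatrix entry exists,
+// and the bias term drops out when no mean activations are available.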
+static std::unordered_map<std::string, ggml_type> target_bpw_type(
+    llama_model_loader & ml,
+    std::vector<no_init<uint8_t>> & read_data,
+    const llama_model & model,
+    const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
+    const std::map<int, int> & mapped,
+    const std::unordered_map<std::string, std::vector<float>> * values_data,
+    const std::unordered_map<std::string, std::vector<float>> * activations_data,
+    float target_bpw,
+    int nthread,
+    int sample_rows_per_expert = 128,
+    float bias_lambda = 1.0
+) {
+    struct candidate_types {
+        ggml_type type;
+        float bpw;
+        size_t bytes;
+        float error; // lower is better
+    };
+
+    struct tensor_info {
+        const llama_model_loader::llama_tensor_weight * w;
+        std::vector<candidate_types> candidate; // sorted by bpw ascending
+        int choice = -1;                        // index into cand
+        float min_bpw = 0.0;
+        float max_bpw = 0.0;
+        size_t n_elements = 0;
+    };
+
+    auto name_tn = LLM_TN(model.arch);
+
+    // The candidate types we consider; adjust as needed
+    const ggml_type base_candidates[] = {
+        // Model's
+        GGML_TYPE_IQ1_S,
+        GGML_TYPE_IQ1_M,
+        GGML_TYPE_IQ2_XXS,
+        GGML_TYPE_IQ2_XS,
+        GGML_TYPE_IQ2_S,
+        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_S,
+        GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
+        GGML_TYPE_Q2_K,
+        GGML_TYPE_Q3_K,
+        GGML_TYPE_Q4_0,
+        GGML_TYPE_Q4_1,
+        GGML_TYPE_Q4_K,
+        GGML_TYPE_Q5_0,
+        GGML_TYPE_Q5_1,
+        GGML_TYPE_Q5_K,
+        GGML_TYPE_Q6_K,
+        GGML_TYPE_Q8_0
+    };
+
+    auto can_quantize = [&](const ggml_tensor * t) -> bool {
+        const std::string name = ggml_get_name(t);
+        bool q = name.rfind("weight") == name.size() - 6;
+        q &= (ggml_n_dims(t) >= 2);
+        q &= name.find("_norm.weight") == std::string::npos;
+        //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight");
+        //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight");
+        q &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        q &= name.find("altup") == std::string::npos;
+        q &= name.find("laurel") == std::string::npos;
+        q &= name.find("per_layer_model_proj") == std::string::npos;
+        q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight");
+        q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight");
+        q &= name.find("ssm_conv1d.weight") == std::string::npos;
+        q &= name.find("shortconv.conv.weight") == std::string::npos;
+        q &= name.find("time_mix_first.weight") == std::string::npos;
+        q &= name.find("time_mix_w0.weight") == std::string::npos;
+        q &= name.find("time_mix_w1.weight") == std::string::npos;
+        q &= name.find("time_mix_w2.weight") == std::string::npos;
+        q &= name.find("time_mix_v0.weight") == std::string::npos;
+        q &= name.find("time_mix_v1.weight") == std::string::npos;
+        q &= name.find("time_mix_v2.weight") == std::string::npos;
+        q &= name.find("time_mix_a0.weight") == std::string::npos;
+        q &= name.find("time_mix_a1.weight") == std::string::npos;
+        q &= name.find("time_mix_a2.weight") == std::string::npos;
+        q &= name.find("time_mix_g1.weight") == std::string::npos;
+        q &= name.find("time_mix_g2.weight") == std::string::npos;
+        q &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        q &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+        q &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
+        q &= name.find("attn_rel_b.weight") == std::string::npos;
+        return q;
+    };
+
+    auto get_values = [&](const std::string & tensor_name) -> const float * {
+        if (!values_data) { return nullptr; }
+        const auto it = values_data->find(remap_imatrix(tensor_name, mapped));
+        if (it == values_data->end()) { return nullptr; }
+        return it->second.data();
+    };
+
+    auto get_activations = [&](const std::string & tensor_name) -> const float * {
+        if (!activations_data) { return nullptr; }
+        const auto it = activations_data->find(remap_imatrix(tensor_name, mapped));
+        if (it == activations_data->end()) { return nullptr; }
+        return it->second.data();
+    };
+
+    auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
+        const int64_t n_per_row = t->ne[0];
+        const int64_t nrows = t->ne[1];
+        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+        const size_t row_sz = ggml_row_size(typ, n_per_row);
+        return (size_t)ne2 * (size_t)nrows * row_sz;
+    };
+
+    auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double {
+        const int64_t nelem = ggml_nelements(t);
+        const size_t bytes = total_bytes(t, typ);
+        return bytes * 8.0 / nelem;
+    };
+
+    auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool {
+        const int64_t n_per_row = t->ne[0];
+        const int64_t blck = ggml_blck_size(typ);
+        if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc
+        return n_per_row % blck == 0;
+    };
+
+    auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
+        if (is_compatible(t, typ)) { return typ; }
+        ggml_type fb = fallback_type(typ);
+        if (is_compatible(t, fb)) { return fb; }
+        return GGML_TYPE_F16; // final guard
+    };
+
+    // Estimate error for a given type using a sampled subset of rows.
+    // Uses both imatrix (E[a^2]) and activations (E[a]) if available.
+    auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double {
+        const int64_t n_per_row = t->ne[0];
+        const int64_t nrows = t->ne[1];
+        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+
+        const ggml_type_traits * traits = ggml_get_type_traits(typ);
+        if (!traits || !traits->to_float) {
+            // cannot dequantize candidate -> assign very high error
+            return 1e35f;
+        }
+
+        // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly
+        const int64_t rows_per_expert = nrows;
+        const int64_t sample_rows = std::max<int64_t>(1, std::min<int64_t>(rows_per_expert, sample_rows_per_expert));
+        const int64_t stride = std::max<int64_t>(1, rows_per_expert / sample_rows);
+
+        const size_t row_sz = ggml_row_size(typ, n_per_row);
+        std::vector<uint8_t> qbuf(row_sz * sample_rows);
+        std::vector<float> f32_sample(sample_rows * n_per_row);
+        std::vector<float> deq(sample_rows * n_per_row);
+
+        float total_err = 0.0;
+
+        for (int64_t i03 = 0; i03 < ne2; ++i03) {
+            const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr;
+            const float * activation = activations_all ? (activations_all + i03 * n_per_row) : nullptr;
+
+            // Assemble sampled rows into contiguous f32_sample
+            int64_t rs = 0;
+            for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) {
+                const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row;
+                std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row);
+                ++rs;
+            }
+            if (rs == 0) { continue; }
+
+            // Quantize sampled rows in one chunk; pass the imatrix for this expert slice
+            const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value);
+            (void)got; // not strictly needed here
+
+            // Dequantize
+            traits->to_float(qbuf.data(), deq.data(), rs * n_per_row);
+
+            // Compute error proxy per sampled row
+            for (int64_t s = 0; s < rs; ++s) {
+                const float * xs = f32_sample.data() + s * n_per_row;
+                const float * ys = deq.data() + s * n_per_row;
+
+                float mse_w = 0.0;
+                float bias = 0.0;
+                float bias_sum = 0.0;
+
+                if (value) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const float e = ys[j] - xs[j];
+                        mse_w += e * e * value[j];
+                        if (activation) {
+                            bias_sum += e * activation[j];
+                        }
+                    }
+                } else {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const float e = ys[j] - xs[j];
+                        mse_w += e*e;
+                        if (activation) {
+                            bias_sum += e * activation[j];
+                        }
+                    }
+                }
+
+                if (activation) {
+                    bias = std::abs(bias_sum);
+                }
+
+                // Normalize by n_per_row to get a per-row average scale
+                float row_err = mse_w / std::max<int64_t>(1, n_per_row);
+                if (bias_lambda != 0.0) {
+                    row_err += bias_lambda * (bias / std::max<int64_t>(1, n_per_row));
+                }
+
+                total_err += row_err;
+            }
+
+            // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor
+            const float scale_rows = rows_per_expert / std::max<int64_t>(1, rs);
+            total_err *= scale_rows;
+        }
+
+        return total_err;
+    };
+
+    // Produce per-tensor candidate lists
+    std::vector<tensor_info> all;
+    all.reserve(tensors.size());
+
+    for (const auto * tw : tensors) {
+        // Temporary workers for dequantization
+        std::vector<std::thread> workers;
+        workers.reserve(std::max(1, nthread));
+
+        ggml_tensor * t = tw->tensor;
+        const std::string name = ggml_get_name(t);
+
+        if (!can_quantize(t)) {
+            continue;
+        }
+
+        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(t)) {
+                read_data.resize(ggml_nbytes(t));
+            }
+            t->data = read_data.data();
+        }
+        ml.load_data_for(t);
+
+        // Prepare f32 weights for error estimates
+        const int64_t nelem = ggml_nelements(t);
+        std::vector<no_init<float>> f32_conv_buf;
+        float * f32_data = nullptr;
+
+        if (t->type == GGML_TYPE_F32) {
+            f32_data = (float *)t->data;
+        } else {
+            llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread);
+            f32_data = (float *)f32_conv_buf.data();
+        }
+
+        const float * values = get_values(name);
+        const float * activations = get_activations(name);
+
+        tensor_info info;
+        info.w = tw;
+        info.n_elements = nelem;
+
+        // Candidate build with compatibility handling and availability checks
+        for (ggml_type ts_type : base_candidates) {
+            // Skip IQ* without imatrix
+            if (is_iq(ts_type) && !values) { continue; }
+            ggml_type tt = make_compatible(t, ts_type);
+            // After fallback, if still incompatible, skip
+            if (!is_compatible(t, tt)) { continue; }
+
+            // Compute bpw and bytes
+            auto bpw = (float)tensor_bpw(t, tt);
+            size_t bytes = total_bytes(t, tt);
+
+            // Estimate error
+            auto err = (float)estimate_error(t, f32_data, tt, values, activations);
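+            // one candidate entry per admissible type; the budget search below keeps exactly one per tensor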
+            info.candidate.push_back(candidate_types{tt, bpw, bytes, err});
+        }
+
+        if (info.candidate.empty()) {
+            // as a last resort, keep original type
+            float bpw = ggml_nbytes(t) * 8.0f / nelem;
+            info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0});
+        }
+
+        // Sort by bpw ascending
+        std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) {
+            if (a.bpw != b.bpw) { return a.bpw < b.bpw; }
+            if (a.error != b.error) { return a.error < b.error; }
+            return a.bytes < b.bytes;
+        });
+
+        // collapse candidates with identical storage size (bytes)
+        {
+            std::vector<candidate_types> uniq;
+            uniq.reserve(info.candidate.size());
+
+            for (size_t i = 0; i < info.candidate.size(); ) {
+                size_t j = i + 1;
+                candidate_types best = info.candidate[i];
+                // group same-byte entries, keep the one with the lowest error
+                while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) {
+                    if (info.candidate[j].error < best.error) { best = info.candidate[j]; }
+                    ++j;
+                }
+                uniq.push_back(best);
+                i = j;
+            }
+            info.candidate.swap(uniq);
+        }
+
+        // Initialize choice at the smallest bpw candidate
+        info.choice = 0;
+        info.min_bpw = info.candidate.front().bpw;
+        info.max_bpw = info.candidate.back().bpw;
+
+        all.push_back(std::move(info));
+    }
+
+    if (all.empty()) { return {}; }
+
+    // Greedy allocation from minimum bpw upward to reach target_bpw
+    // Start with minimal bpw assignment
+    auto current_total_bytes = [&]() -> size_t {
+        size_t b = 0;
+        for (const auto & ti : all) {
+            b += ti.candidate[ti.choice].bytes;
+        }
+        return b;
+    };
+
+    auto total_weights = [&]() -> size_t {
+        size_t w = 0;
+        for (const auto & ti : all) {
+            w += ti.n_elements;
+        }
+        return w;
+    };
+
+    const size_t tw = total_weights();
+    auto current_bpw = [&]() -> double {
+        return (double)current_total_bytes() * 8.0f / (double)tw;
+    };
+
+    // Precompute current bpw
+    double bpw_now = current_bpw();
+
+    // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw)
+    if (bpw_now >= target_bpw) {
+        std::unordered_map<std::string, ggml_type> overrides;
+        for (const auto & ti : all) {
+            overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type;
+        }
+        return overrides;
+    }
+
+    struct upgrade {
+        int idx;            // tensor index
+        int next;           // next candidate index (strictly larger bytes)
+        double err;         // error reduction
+        size_t delta_bytes; // increase in bytes
+        double ratio;       // err per added bit
+    };
+
+    // Find next strictly-larger candidate index for a tensor
+    auto next_distinct_idx = [&](const tensor_info &ti) -> int {
+        const auto &cand = ti.candidate;
+        const auto &cur = cand[ti.choice];
+        int j = ti.choice + 1;
+        while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j;
+        return j < (int)cand.size() ? j : -1;
+    };
+
+    auto recompute_best_upgrade = [&]() -> upgrade {
+        const double eps = 1e-12;
+        upgrade best{-1, -1, 0.0, 0, -1.0};
+        for (int i = 0; i < (int)all.size(); ++i) {
+            const auto &ti = all[i];
+            if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }
+
+            int j = next_distinct_idx(ti);
+            if (j < 0) { continue; } // no larger-size candidate remains
+
+            const auto &cur = ti.candidate[ti.choice];
+            const auto &nxt = ti.candidate[j];
+
+            size_t delta_bytes = nxt.bytes - cur.bytes;
+            if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe
+
+            double err = (double)cur.error - (double)nxt.error;
+            err = std::max(err, 0.0); // do not penalize due to sampling noise
+
+            double ratio = err / (double)(delta_bytes * 8ull);
+            if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) {
+                best = upgrade{i, j, err, delta_bytes, ratio};
+            }
+        }
+        return best;
+    };
+
+    while (true) {
+        upgrade up = recompute_best_upgrade();
+        if (up.idx < 0) { break; }
+
+        size_t now_bytes = current_total_bytes();
+        size_t next_bytes = now_bytes + up.delta_bytes;
+        double bpw_next = (double)next_bytes * 8.0 / (double)tw;
+
+        if (bpw_next <= (double)target_bpw + 1e-12) {
+            all[up.idx].choice = up.next;
+            bpw_now = bpw_next;
+        } else {
+            break;
+        }
+    }
+
+    // We might still be below target but taking any single upgrade overshoots.
+    {
+        double under_gap = (double)target_bpw - bpw_now;
+
+        upgrade best_over{-1, -1, 0.0, 0, -1.0};
+        double best_over_gap = 1e300;
+
+        size_t now_bytes = current_total_bytes();
+
+        for (int i = 0; i < (int)all.size(); ++i) {
+            const auto &ti = all[i];
+            if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }
+
+            int j = next_distinct_idx(ti);
+            if (j < 0) { continue; }
+
+            const auto &cur = ti.candidate[ti.choice];
+            const auto &nxt = ti.candidate[j];
+
+            size_t delta_bytes = nxt.bytes - cur.bytes;
+            if (delta_bytes == 0) { continue; }
+
+            size_t over_bytes = now_bytes + delta_bytes;
+            double bpw_over = (double)over_bytes * 8.0 / (double)tw;
+
+            double over_gap = std::abs(bpw_over - (double)target_bpw);
+
+            double err = (double)cur.error - (double)nxt.error;
+            if (err < 0.0) { err = 0.0; }
+            double ratio = err / (double)(delta_bytes * 8ull);
+
+            if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) {
+                best_over_gap = over_gap;
+                best_over = upgrade{i, j, err, delta_bytes, ratio};
+            }
+        }
+
+        if (best_over.idx >= 0) {
+            if (best_over_gap < under_gap) {
+                all[best_over.idx].choice = best_over.next;
+            }
+        }
+    }
+
+    // Build the override map
+    std::unordered_map<std::string, ggml_type> overrides;
+    LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw);
+    for (const auto & ti : all) {
+        LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n",
+            __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error);
+        overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type;
+    }
+    return overrides;
+}
+
 static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;

From 1187f6aa9eb4cf7a3bf3945d0ecd292a49c03efa Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 11:07:03 +0100
Subject: [PATCH 015/148] Implement bpw_overrides call

---
 src/llama-quant.cpp | 9 +++++++++
 1
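Patch 014's allocation is a classic greedy knapsack: every tensor starts at its cheapest candidate, and the upgrade with the best error reduction per added bit is taken while the bpw budget holds. Stripped of the llama.cpp plumbing, the same rule looks like this (a standalone sketch with toy candidates; the final overshoot pass of the patch is omitted):

    #include <cstdio>
    #include <vector>

    struct Cand   { double bpw; double err; };                  // one quant type option
    struct Tensor { size_t n; std::vector<Cand> c; size_t pick = 0; };

    int main() {
        // two fake tensors, candidates sorted by bpw ascending as in the patch
        std::vector<Tensor> ts = {
            { 1000, {{2.6, 9.0}, {3.4, 4.0}, {4.5, 1.0}} },
            { 3000, {{2.6, 3.0}, {3.4, 2.5}, {4.5, 0.5}} },
        };
        const double target_bpw = 3.6;
        size_t total_n = 0; for (auto & t : ts) total_n += t.n;

        while (true) {
            int best = -1; double best_ratio = -1;
            double bits = 0; for (auto & u : ts) bits += u.c[u.pick].bpw * u.n;
            for (int i = 0; i < (int)ts.size(); ++i) {
                auto & t = ts[i];
                if (t.pick + 1 >= t.c.size()) continue;
                const double dbits = (t.c[t.pick + 1].bpw - t.c[t.pick].bpw) * t.n;
                const double derr  = t.c[t.pick].err - t.c[t.pick + 1].err;
                if ((bits + dbits) / total_n > target_bpw) continue; // stays in budget?
                const double ratio = derr / dbits;                   // error saved per extra bit
                if (ratio > best_ratio) { best_ratio = ratio; best = i; }
            }
            if (best < 0) break;
            ++ts[best].pick;
        }
        double bits = 0; for (auto & t : ts) bits += t.c[t.pick].bpw * t.n;
        printf("final mix: %.3f bpw (picks: %zu, %zu)\n", bits / total_n, ts[0].pick, ts[1].pick);
    }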
 file changed, 9 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1e24303c528..b0b3be76cad 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1314,6 +1314,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         }
     }
 
+    std::unordered_map<std::string, ggml_type> bpw_overrides = {};
+    if (params->target_bpw != -1.0f) {
+        LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw);
+        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread);
+    }
+
     int cur_split = -1;
     std::ofstream fout;
     auto close_ofstream = [&]() {
@@ -1430,6 +1436,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
             if (!params->pure && ggml_is_quantized(default_type)) {
                 int fallback = qs.n_fallback;
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // get bpw override
+                const auto override = bpw_overrides.find(name);
+                if (override != bpw_overrides.end()) { new_type = override->second; }
                 // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
                 if (params->tensor_types && qs.n_fallback - fallback == 0) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);

From 5aceb9e3ae016ed057a0963934c53203b74ad3c5 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 22:29:27 +0100
Subject: [PATCH 016/148] Refactor variable names

---
 src/llama-quant.cpp | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index b0b3be76cad..5af70c1c9b8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -575,13 +575,13 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }
 
-// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality.
-// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert
-// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert
-// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2])
+// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl
+// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute
+// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight
+// for bias and error, 2.0 means twice as much weight for bias
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
     llama_model_loader & ml,
-    std::vector<no_init<uint8_t>> & read_data,
+    std::vector<no_init<uint8_t>> & buffer,
     const llama_model & model,
     const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
     const std::map<int, int> & mapped,
@@ -735,24 +735,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         float total_err = 0.0;
 
-        for (int64_t i03 = 0; i03 < ne2; ++i03) {
-            const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr;
-            const float * activation = activations_all ? (activations_all + i03 * n_per_row) : nullptr;
+        for (int64_t slice = 0; slice < ne2; ++slice) {
+            const float * value = values_all ? (values_all + slice * n_per_row) : nullptr;
+            const float * activation = activations_all ? (activations_all + slice * n_per_row) : nullptr;
 
-            // Assemble sampled rows into contiguous f32_sample
             int64_t rs = 0;
             for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) {
-                const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row;
+                const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row;
                 std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row);
                 ++rs;
             }
             if (rs == 0) { continue; }
 
-            // Quantize sampled rows in one chunk; pass the imatrix for this expert slice
             const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value);
-            (void)got; // not strictly needed here
+            (void)got;
 
-            // Dequantize
             traits->to_float(qbuf.data(), deq.data(), rs * n_per_row);
 
             // Compute error proxy per sampled row
@@ -821,10 +818,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
         if (!ml.use_mmap) {
-            if (read_data.size() < ggml_nbytes(t)) {
-                read_data.resize(ggml_nbytes(t));
-            }
-            t->data = read_data.data();
+            if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); }
+            t->data = buffer.data();
         }
         ml.load_data_for(t);

From ee05d6bc0b250a7c19b9dedf504163509ef736f8 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 22:32:53 +0100
Subject: [PATCH 017/148] Update comments

---
 src/llama-quant.cpp | 32 +++++++++++++-------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5af70c1c9b8..546f6b438c7 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -596,13 +596,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     struct candidate_types {
         ggml_type type;
         float bpw;
         size_t bytes;
-        float error; // lower is better
+        float error;
     };
 
     struct tensor_info {
         const llama_model_loader::llama_tensor_weight * w;
-        std::vector<candidate_types> candidate; // sorted by bpw ascending
-        int choice = -1;                        // index into cand
+        std::vector<candidate_types> candidate;
+        int choice = -1;
         float min_bpw = 0.0;
         float max_bpw = 0.0;
         size_t n_elements = 0;
     };
 
     auto name_tn = LLM_TN(model.arch);
 
-    // The candidate types we consider; adjust as needed
     const ggml_type base_candidates[] = {
         // Model's
         GGML_TYPE_IQ1_S,
@@ -639,8 +638,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         bool q = name.rfind("weight") == name.size() - 6;
         q &= (ggml_n_dims(t) >= 2);
         q &= name.find("_norm.weight") == std::string::npos;
-        //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight");
-        //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight");
         q &= name.find("ffn_gate_inp.weight") == std::string::npos;
         q &= name.find("altup") == std::string::npos;
         q &= name.find("laurel") == std::string::npos;
@@ -719,7 +716,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const ggml_type_traits * traits = ggml_get_type_traits(typ);
         if (!traits || !traits->to_float) {
-            // cannot dequantize candidate -> assign very high error
+            // Cannot dequantize candidate -> assign very high error
             return 1e35f;
         }
@@ -842,12 +839,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.w = tw;
         info.n_elements = nelem;
 
-        // Candidate build with compatibility handling and availability checks
+        // Build per-tensor candidate list
         for (ggml_type ts_type : base_candidates) {
-            // Skip IQ* without imatrix
             if (is_iq(ts_type) && !values) { continue; }
             ggml_type tt = make_compatible(t, ts_type);
-            // After fallback, if still incompatible, skip
             if (!is_compatible(t, tt)) { continue; }
 
             // Compute bpw and bytes
@@ -861,19 +856,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         if (info.candidate.empty()) {
-            // as a last resort, keep original type
+            // As a last resort, keep original type
             float bpw = ggml_nbytes(t) * 8.0f / nelem;
             info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0});
         }
 
-        // Sort by bpw ascending
         std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) {
             if (a.bpw != b.bpw) { return a.bpw < b.bpw; }
             if (a.error != b.error) { return a.error < b.error; }
             return a.bytes < b.bytes;
         });
 
-        // collapse candidates with identical storage size (bytes)
+        // Collapse candidates with identical storage size (bytes)
         {
             std::vector<candidate_types> uniq;
             uniq.reserve(info.candidate.size());
@@ -903,7 +897,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     if (all.empty()) { return {}; }
 
     // Greedy allocation from minimum bpw upward to reach target_bpw
-    // Start with minimal bpw assignment
     auto current_total_bytes = [&]() -> size_t {
         size_t b = 0;
         for (const auto & ti : all) {
             b += ti.candidate[ti.choice].bytes;
         }
         return b;
     };
@@ -938,11 +931,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     struct upgrade {
-        int idx;            // tensor index
-        int next;           // next candidate index (strictly larger bytes)
-        double err;         // error reduction
-        size_t delta_bytes; // increase in bytes
-        double ratio;       // err per added bit
+        int idx;
+        int next;
+        double err;
+        size_t delta_bytes;
+        double ratio;
     };
 
     // Find next strictly-larger candidate index for a tensor
     auto next_distinct_idx = [&](const tensor_info &ti) -> int {
         const auto &cand = ti.candidate;
         const auto &cur = cand[ti.choice];
         int j = ti.choice + 1;
         while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j;
         return j < (int)cand.size() ? j : -1;
     };
@@ -998,6 +991,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     }
 
     // We might still be below target but taking any single upgrade overshoots.
+    // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio.
     {
         double under_gap = (double)target_bpw - bpw_now;

From f22b3097eb144a913d02fbb445cbdb9b97e91859 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 22:34:01 +0100
Subject: [PATCH 018/148] Avoid division by zero if truncation occurs

---
 src/llama-quant.cpp | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 546f6b438c7..3911eba43b6 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -790,28 +790,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor
-        const float scale_rows = rows_per_expert / std::max<int64_t>(1, rs);
+        const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs);
         total_err *= scale_rows;
     }
 
     return total_err;
     };
 
-    // Produce per-tensor candidate lists
     std::vector<tensor_info> all;
     all.reserve(tensors.size());
 
     for (const auto * tw : tensors) {
-        // Temporary workers for dequantization
         std::vector<std::thread> workers;
         workers.reserve(std::max(1, nthread));
 
         ggml_tensor * t = tw->tensor;
         const std::string name = ggml_get_name(t);
 
-        if (!can_quantize(t)) {
-            continue;
-        }
+        if (!can_quantize(t)) { continue; }
 
         LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t));
         if (!ml.use_mmap) {
             if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); }
             t->data = buffer.data();
         }
         ml.load_data_for(t);
 
-        // Prepare f32 weights for error estimates
         const int64_t nelem = ggml_nelements(t);
         std::vector<no_init<float>> f32_conv_buf;
         float * f32_data = nullptr;
@@ -955,13 +950,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }
 
             int j = next_distinct_idx(ti);
-            if (j < 0) { continue; } // no larger-size candidate remains
+            if (j < 0) { continue; }
 
             const auto &cur = ti.candidate[ti.choice];
             const auto &nxt = ti.candidate[j];
 
             size_t delta_bytes = nxt.bytes - cur.bytes;
-            if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe
+            if (delta_bytes == 0) { continue; }
 
             double err = (double)cur.error - (double)nxt.error;
             err = std::max(err, 0.0); // do not penalize due to sampling noise

From 936294f6afb10aea69ac5ae85fcc29313b49cd9e Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Tue, 19 Aug 2025 23:31:22 +0100
Subject: [PATCH 019/148] Increase precision for error calculation

---
 src/llama-quant.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3911eba43b6..a4a10da062b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -730,7 +730,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> f32_sample(sample_rows * n_per_row);
         std::vector<float> deq(sample_rows * n_per_row);
 
-        float total_err = 0.0;
+        double total_err = 0.0;
 
         for (int64_t slice = 0; slice < ne2; ++slice) {
             const float * value = values_all ? (values_all + slice * n_per_row) : nullptr;
@@ -754,9 +754,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 const float * xs = f32_sample.data() + s * n_per_row;
                 const float * ys = deq.data() + s * n_per_row;
 
-                float mse_w = 0.0;
-                float bias = 0.0;
-                float bias_sum = 0.0;
+                double mse_w = 0.0;
+                double bias = 0.0;
+                double bias_sum = 0.0;
 
                 if (value) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
                         const float e = ys[j] - xs[j];
                         mse_w += e * e * value[j];
                         if (activation) {
                             bias_sum += e * activation[j];
                         }
                     }
                 } else {
                     for (int64_t j = 0; j < n_per_row; ++j) {
                         const float e = ys[j] - xs[j];
-                        mse_w += e*e;
+                        mse_w += e * e;
                         if (activation) {
                             bias_sum += e * activation[j];
                         }
                     }
                 }
 
-                if (activation) {
-                    bias = std::abs(bias_sum);
-                }
+                if (activation) { bias = std::abs(bias_sum); }
 
                 // Normalize by n_per_row to get a per-row average scale
-                float row_err = mse_w / std::max<int64_t>(1, n_per_row);
+                double row_err = mse_w / std::max<int64_t>(1, n_per_row);
                 if (bias_lambda != 0.0) {
                     row_err += bias_lambda * (bias / std::max<int64_t>(1, n_per_row));
                 }
 
                 total_err += row_err;
             }
 
             // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor
-            const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs);
+            const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs);
             total_err *= scale_rows;
         }
 
-        return total_err;
+        return std::isfinite(total_err) ? total_err : 1e35;
     };

From 5cd69a6809c56922e1b973ce900f3680c28a5117 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 20 Aug 2025 09:41:39 +0100
Subject: [PATCH 020/148] Add F16/BF16 type

---
 src/llama-quant.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a4a10da062b..5522fe39d28 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -630,7 +630,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_Q5_1,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
-        GGML_TYPE_Q8_0
+        GGML_TYPE_Q8_0,
+// TODO: find better way to handle F16/BF16
+#ifdef GGML_USE_METAL
+        GGML_TYPE_F16
+#else
+        GGML_TYPE_BF16
+#endif
     };

From 69586e212e76849fcdff17e68e8023b91025b415 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 20 Aug 2025 13:23:11 +0100
Subject: [PATCH 021/148] Add F16/BF16 type

---
 tools/quantize/quantize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 86a96cdfcca..b907008cb4f 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -132,7 +132,7 @@ static void usage(const char * executable) {
     printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
     printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
     printf("      Advanced option to remove all tensors from the given layers\n");
-    printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n");
+    printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -486,13 +486,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
 
     try {
         target_bpw = std::stof(data);
-        if (target_bpw < 0.0f || target_bpw > 8.0f) {
-            printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__);
+        if (target_bpw < 0.0f || target_bpw > 16.0f) {
+            printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__);
             return false;
         }
     }
     catch (const std::exception & e) {
-        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data);
+        printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data);
         return false;
     }

From 29b2dc3ec0ddefde21394007649df6c268ebca3d Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 20 Aug 2025 13:27:01 +0100
Subject: [PATCH 022/148] Do not mix K and IQ quants

---
 src/llama-quant.cpp | 62 +++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5522fe39d28..9dc903874fb 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -36,6 +36,26 @@ static bool is_iq(const enum ggml_type t) {
     }
 }
 
+static bool is_iq(const enum llama_ftype t) {
+    switch (t) {
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
         case GGML_TYPE_TQ1_0:
@@ -587,7 +607,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     const std::map<int, int> & mapped,
     const std::unordered_map<std::string, std::vector<float>> * values_data,
     const std::unordered_map<std::string, std::vector<float>> * activations_data,
-    float target_bpw,
+    const llama_model_quantize_params * params,
     int nthread,
     int sample_rows_per_expert = 128,
     float bias_lambda = 1.0
@@ -608,19 +628,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
-    auto name_tn = LLM_TN(model.arch);
-
-    const ggml_type base_candidates[] = {
-        // Model's
-        GGML_TYPE_IQ1_S,
-        GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ2_XXS,
-        GGML_TYPE_IQ2_XS,
-        GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS,
-        GGML_TYPE_IQ3_S,
-        GGML_TYPE_IQ4_XS,
-        GGML_TYPE_IQ4_NL,
+    const ggml_type k_candidates[] = {
         GGML_TYPE_Q2_K,
         GGML_TYPE_Q3_K,
         GGML_TYPE_Q4_0,
         GGML_TYPE_Q4_1,
         GGML_TYPE_Q4_K,
         GGML_TYPE_Q5_0,
         GGML_TYPE_Q5_1,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
         GGML_TYPE_Q8_0,
 // TODO: find better way to handle F16/BF16
 #ifdef GGML_USE_METAL
         GGML_TYPE_F16
 #else
         GGML_TYPE_BF16
 #endif
     };
 
+    const ggml_type iq_candidates[] = {
+        GGML_TYPE_IQ1_S,
+        GGML_TYPE_IQ1_M,
+        GGML_TYPE_IQ2_XXS,
+        GGML_TYPE_IQ2_XS,
+        GGML_TYPE_IQ2_S,
+        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_S,
+        GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
+    };
+
+    auto name_tn = LLM_TN(model.arch);
+    float target_bpw = params->target_bpw;
+
     auto can_quantize = [&](const ggml_tensor * t) -> bool {
@@ -838,8 +861,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.w = tw;
         info.n_elements = nelem;
 
+        std::vector<ggml_type> quant_candidates;
+        if (is_iq(params->ftype)) {
+            quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates));
+        } else {
+            quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates));
+        }
+
         // Build per-tensor candidate list
-        for (ggml_type ts_type : base_candidates) {
+        for (ggml_type ts_type : quant_candidates) {
             if (is_iq(ts_type) && !values) { continue; }
             ggml_type tt = make_compatible(t, ts_type);
             if (!is_compatible(t, tt)) { continue; }
@@ -1305,7 +1335,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     std::unordered_map<std::string, ggml_type> bpw_overrides = {};
     if (params->target_bpw != -1.0f) {
         LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread);
values_data, activations_data, params->target_bpw, nthread); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } int cur_split = -1; From 43caadf783a4bae41011e3b9aca5bbe79185a7a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:24:48 +0100 Subject: [PATCH 023/148] Add better fallbacks for IQ mixes --- src/llama-quant.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9dc903874fb..c412191c8f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -657,6 +657,12 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, + // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto name_tn = LLM_TN(model.arch); From 52da4a4f8c28d063378d54dd806da03614251e76 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:05 +0100 Subject: [PATCH 024/148] Skip if output.weight or type is COPY --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c412191c8f3..786adfe547b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,6 +697,9 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; + q &= params->quantize_output_tensor || name != "output.weight"; + q &= !params->only_copy; + return q; }; From 3f0118d6029450955c43cd84109bdfc36a8cecd3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:37 +0100 Subject: [PATCH 025/148] Fix bias lambda bug --- src/llama-quant.cpp | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 786adfe547b..44cf9e30e3c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -782,52 +782,47 @@ static std::unordered_map target_bpw_type( } if (rs == 0) { continue; } - const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - (void)got; - + // Quantize sample rows and dequantize back + (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); - // Compute error proxy per sampled row + // Compute error proxy per sampled slice + double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { const float * xs = f32_sample.data() + s * n_per_row; const float * ys = deq.data() + s * n_per_row; double mse_w = 0.0; - double bias = 0.0; double bias_sum = 0.0; if (value) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e * value[j]; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } - if (activation) { bias = std::abs(bias_sum); } - // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (bias_lambda != 0.0) { - row_err += bias_lambda * (bias / std::max(1, n_per_row)); + if (activation && 
bias_lambda != 0.0) { + // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) + const double bias = std::abs(bias_sum) / std::max(1, n_per_row); + row_err += bias_lambda * bias; } - total_err += row_err; + slice_err += row_err; } - // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); - total_err *= scale_rows; + // Scale the slice contribution by the sampling factor + const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; @@ -1002,7 +997,7 @@ static std::unordered_map target_bpw_type( if (delta_bytes == 0) { continue; } double err = (double)cur.error - (double)nxt.error; - err = std::max(err, 0.0); // do not penalize due to sampling noise + err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { From b0b33b7ccbc5880e6ac5206ea309ee328e685c08 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 20:58:26 +0100 Subject: [PATCH 026/148] Optimise tensor sampling --- src/llama-quant.cpp | 197 ++++++++++++++++++++++++++------------------ 1 file changed, 119 insertions(+), 78 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44cf9e30e3c..830bf915cfc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -609,7 +609,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 128, + int sample_rows_per_expert = 256, float bias_lambda = 1.0 ) { struct candidate_types { @@ -671,7 +671,7 @@ static std::unordered_map target_bpw_type( auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; - q &= (ggml_n_dims(t) >= 2); + q &= ggml_n_dims(t) >= 2; q &= name.find("_norm.weight") == std::string::npos; q &= name.find("ffn_gate_inp.weight") == std::string::npos; q &= name.find("altup") == std::string::npos; @@ -719,9 +719,9 @@ static std::unordered_map target_bpw_type( auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); return (size_t)ne2 * (size_t)nrows * row_sz; }; @@ -734,7 +734,7 @@ static std::unordered_map target_bpw_type( auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc + if (blck <= 1) { return true; } return n_per_row % blck == 0; }; @@ -742,15 +742,20 @@ static std::unordered_map target_bpw_type( if (is_compatible(t, typ)) { return typ; } ggml_type fb = fallback_type(typ); if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; // final guard + return GGML_TYPE_F16; }; - // Estimate error for a given type using a sampled subset of rows. - // Uses both imatrix (E[a^2]) and activations (E[a]) if available. 
- auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double { + // Estimate error for a given type using a sampled subset of rows + auto estimate_error = [&](const ggml_tensor * t, + const ggml_type typ, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const std::vector & values_sample, + const std::vector & activations_sample) -> double + { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { @@ -758,70 +763,73 @@ static std::unordered_map target_bpw_type( return 1e35f; } - // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly - const int64_t rows_per_expert = nrows; - const int64_t sample_rows = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows); - - const size_t row_sz = ggml_row_size(typ, n_per_row); - std::vector qbuf(row_sz * sample_rows); - std::vector f32_sample(sample_rows * n_per_row); - std::vector deq(sample_rows * n_per_row); + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + if (total_sampled_rows == 0) { return 0.0; } - double total_err = 0.0; + const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; + std::vector qbuf(qbuf_size); + std::vector deq(f32_sample.size()); + // Quantize all sampled rows at once and dequantize back + size_t qbuf_offset = 0; + size_t f32_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; - const float * activation = activations_all ? (activations_all + slice * n_per_row) : nullptr; - - int64_t rs = 0; - for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { - const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; - std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); - ++rs; - } + const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - // Quantize sample rows and dequantize back - (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); + const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); + qbuf_offset += ggml_row_size(typ, n_per_row) * rs; + f32_offset += rs * n_per_row; + } + + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + + double total_err = 0.0; + size_t sample_offset = 0; + + for (int64_t slice = 0; slice < ne2; ++slice) { + const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * activation_slice = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; + const int64_t rs = sample_rows_per_slice[slice]; - // Compute error proxy per sampled slice double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + s * n_per_row; - const float * ys = deq.data() + s * n_per_row; + const float * xs = f32_sample.data() + sample_offset; + const float * ys = deq.data() + sample_offset; - double mse_w = 0.0; + double mse_w = 0.0; double bias_sum = 0.0; - if (value) { + if (value_slice) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; - mse_w += e * e * value[j]; - if (activation) { bias_sum += e * activation[j]; } + mse_w += e * e * value_slice[j]; + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { bias_sum += e * activation[j]; } + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (activation && bias_lambda != 0.0) { + if (activation_slice && bias_lambda != 0.0) { // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) const double bias = std::abs(bias_sum) / std::max(1, n_per_row); row_err += bias_lambda * bias; } slice_err += row_err; + sample_offset += n_per_row; } // Scale the slice contribution by the sampling factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + const double rows_per_expert = (double) nrows; + const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); total_err += slice_err * scale_rows; } @@ -858,8 +866,40 @@ static std::unordered_map target_bpw_type( f32_data = (float *)f32_conv_buf.data(); } - const float * values = get_values(name); - const float * activations = get_activations(name); + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + + // Sample the tensor rows once, before looping through quantization candidates. + const int64_t n_per_row = t->ne[0]; + const int64_t nrows_total = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; + const int64_t rows_per_expert = nrows_total; + const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); + const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + + std::vector f32_sample; + std::vector values_sample; + std::vector activations_sample; + std::vector sample_rows_per_slice(ne2); + + for (int64_t slice = 0; slice < ne2; ++slice) { + int64_t current_sampled_rows = 0; + for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + current_sampled_rows++; + } + sample_rows_per_slice[slice] = current_sampled_rows; + } + + if (values_all) { + values_sample.resize(ne2 * n_per_row); + std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + } + if (activations_all) { + activations_sample.resize(ne2 * n_per_row); + std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + } tensor_info info; info.w = tw; @@ -874,7 +914,7 @@ static std::unordered_map target_bpw_type( // Build per-tensor candidate list for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values) { continue; } + if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } @@ -882,19 +922,18 @@ static std::unordered_map target_bpw_type( auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - // Estimate error - auto err = (float)estimate_error(t, f32_data, tt, values, activations); - - info.candidate.push_back(candidate_types{tt, bpw, bytes, err}); + // Estimate error using the pre-sampled data + auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } if (info.candidate.empty()) { // As a last resort, keep original type float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); + info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } return a.bytes < b.bytes; @@ -905,7 +944,7 @@ static std::unordered_map target_bpw_type( std::vector uniq; uniq.reserve(info.candidate.size()); - for (size_t i = 0; i < info.candidate.size(); ) { + for (size_t i = 0; i < info.candidate.size();) { size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error @@ -972,36 +1011,39 @@ static std::unordered_map target_bpw_type( }; // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info &ti) -> int { - const auto &cand = ti.candidate; - const auto &cur = cand[ti.choice]; + auto next_distinct_idx = [&](const tensor_info & ti) -> int { + const auto & cand = ti.candidate; + const auto & cur = cand[ti.choice]; int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j; + while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { + ++j; + } + 
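+        // e.g. with candidate byte sizes {100, 100, 120} and choice == 0, this returns index 2,
+        // so an accepted upgrade always strictly increases the tensor's byte footprint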
return j < (int)cand.size() ? j : -1; }; auto recompute_best_upgrade = [&]() -> upgrade { const double eps = 1e-12; - upgrade best{-1, -1, 0.0, 0, -1.0}; - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + upgrade best{ -1, -1, 0.0, 0, -1.0 }; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - int j = next_distinct_idx(ti); + const int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; + const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { - best = upgrade{i, j, err, delta_bytes, ratio}; + best = upgrade{ i, j, err, delta_bytes, ratio }; } } return best; @@ -1014,8 +1056,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - - if (bpw_next <= (double)target_bpw + 1e-12) { + if (bpw_next <= target_bpw + 1e-12) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1026,22 +1067,22 @@ static std::unordered_map target_bpw_type( // We might still be below target but taking any single upgrade overshoots. // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
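        // e.g. with bpw_now = 3.95 and target_bpw = 4.00, upgrades landing at 4.03 and 4.10
        // both overshoot; the 4.03 one wins on gap, and ties fall back to the better ratio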
{ - double under_gap = (double)target_bpw - bpw_now; + double under_gap = target_bpw - bpw_now; - upgrade best_over{-1, -1, 0.0, 0, -1.0}; - double best_over_gap = 1e300; + upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; + double best_over_gap = 1e300; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } @@ -1051,13 +1092,13 @@ static std::unordered_map target_bpw_type( double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; - best_over = upgrade{i, j, err, delta_bytes, ratio}; + best_over = upgrade{ i, j, err, delta_bytes, ratio }; } } From 35ad0fc4addf92e9dc0700a88004962731f3c9e0 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 23:27:20 +0100 Subject: [PATCH 027/148] Improve error estimation using weighted MSE --- src/llama-quant.cpp | 62 +++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 830bf915cfc..f5fa309c444 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -783,14 +783,26 @@ static std::unordered_map target_bpw_type( f32_offset += rs * n_per_row; } - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + if (typ == GGML_TYPE_F16) { + const auto *const src = (const ggml_fp16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else if (typ == GGML_TYPE_BF16) { + const auto *const src = (const ggml_bf16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else { + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + } double total_err = 0.0; size_t sample_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * activation_slice = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; + const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * act = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; double slice_err = 0.0; @@ -799,37 +811,37 @@ static std::unordered_map target_bpw_type( const float * ys = deq.data() + sample_offset; double mse_w = 0.0; - double bias_sum = 0.0; - - if (value_slice) { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e * value_slice[j]; - if (activation_slice) { bias_sum += e * activation_slice[j]; } - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e; - if (activation_slice) { bias_sum += e * activation_slice[j]; } + double x2_w = 0.0; + double bias_num = 0.0; + double bias_den = 0.0; + + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = ys[j] - xs[j]; + const double w = wv ? wv[j] : 1.0; + mse_w += w * e * e; + x2_w += w * xs[j] * xs[j]; + + if (act) { + const double a = act[j]; + bias_num += e * a; + bias_den += a * a; } } - // Normalize by n_per_row to get a per-row average scale - double row_err = mse_w / std::max(1, n_per_row); - if (activation_slice && bias_lambda != 0.0) { - // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) - const double bias = std::abs(bias_sum) / std::max(1, n_per_row); - row_err += bias_lambda * bias; + const double eps = 1e-30; + double row_err = mse_w / (x2_w + eps); + + if (act && bias_lambda != 0.0) { + const double bias_norm = bias_num * bias_num / (bias_den + eps); + row_err += bias_lambda * bias_norm; } slice_err += row_err; sample_offset += n_per_row; } - // Scale the slice contribution by the sampling factor - const double rows_per_expert = (double) nrows; - const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); + const auto rows_per_expert = nrows; + const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 5ef493ea1a01385c02ef4c56d38dfe5e116c47c6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 09:48:29 +0100 Subject: [PATCH 028/148] Exclude embeddings and output tensor --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f5fa309c444..32013e47baf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,8 +697,10 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= params->quantize_output_tensor || name != "output.weight"; q &= !params->only_copy; + // TODO: Exclude embeddings and output tensors? 
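+            // the embedding and output matrices are usually the largest single tensors, so they
+            // are left to the regular ftype rules rather than letting them swallow the bpw budget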
+ q &= params->quantize_output_tensor || name != "output.weight"; + q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 95b2ab2800e26a5bd5b60c61f9593d720a97eb7a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 10:46:37 +0100 Subject: [PATCH 029/148] Change error estimate to use normalised weighted MSE --- src/llama-quant.cpp | 204 +++++++++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 70 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 32013e47baf..629056ee065 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -661,8 +662,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q6_K }; auto name_tn = LLM_TN(model.arch); @@ -752,103 +752,125 @@ static std::unordered_map target_bpw_type( const ggml_type typ, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, - const std::vector & values_sample, - const std::vector & activations_sample) -> double + const float * values_sample, + const float * activations_sample, + std::vector & qbuf, + std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - // Cannot dequantize candidate -> assign very high error - return 1e35f; - } + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t total_sampled_rows = f32_sample.size() / n_per_row; if (total_sampled_rows == 0) { return 0.0; } - const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; - std::vector qbuf(qbuf_size); - std::vector deq(f32_sample.size()); + const size_t row_sz = ggml_row_size(typ, n_per_row); + const size_t need_q = row_sz * total_sampled_rows; + if (qbuf.size() < need_q) { qbuf.resize(need_q); } + if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } - // Quantize all sampled rows at once and dequantize back - size_t qbuf_offset = 0; - size_t f32_offset = 0; + // Quantize sampled rows slice-by-slice + size_t qoff = 0; + size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); - qbuf_offset += ggml_row_size(typ, n_per_row) * rs; - f32_offset += rs * n_per_row; + const float * value = values_sample ? 
values_sample + slice * n_per_row : nullptr; + + (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + + qoff += row_sz * rs; + foff += (size_t)rs * n_per_row; } + // Dequantize to deq if (typ == GGML_TYPE_F16) { - const auto *const src = (const ggml_fp16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else if (typ == GGML_TYPE_BF16) { - const auto *const src = (const ggml_bf16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else { - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + const ggml_type_traits * traits = ggml_get_type_traits(typ); + if (!traits || !traits->to_float) { + // no dequantizer available + return 1e35; + } + traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); } + // Compute error + size_t off = 0; double total_err = 0.0; - size_t sample_offset = 0; + const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * act = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } + + const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; + const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; double slice_err = 0.0; - for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + sample_offset; - const float * ys = deq.data() + sample_offset; + + for (int64_t r = 0; r < rs; ++r) { + const float * x = f32_sample.data() + off; + const float * y = deq.data() + off; double mse_w = 0.0; double x2_w = 0.0; - double bias_num = 0.0; - double bias_den = 0.0; - - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = ys[j] - xs[j]; - const double w = wv ? 
wv[j] : 1.0; - mse_w += w * e * e; - x2_w += w * xs[j] * xs[j]; + double bnum = 0.0; + double bden = 0.0; - if (act) { + if (wv && act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; + const double a = act[j]; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else if (wv) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + } + } else if (act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; const double a = act[j]; - bias_num += e * a; - bias_den += a * a; + mse_w += e * e; + x2_w += x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; + mse_w += e * e; + x2_w += x[j] * x[j]; } } - const double eps = 1e-30; double row_err = mse_w / (x2_w + eps); - if (act && bias_lambda != 0.0) { - const double bias_norm = bias_num * bias_num / (bias_den + eps); - row_err += bias_lambda * bias_norm; + row_err += bias_lambda * (bnum * bnum) / (bden + eps); } slice_err += row_err; - sample_offset += n_per_row; + off += (size_t)n_per_row; } - const auto rows_per_expert = nrows; - const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + // scale back up to the full number of rows in this slice + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; - }; +}; std::vector all; all.reserve(tensors.size()); @@ -887,38 +909,70 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const int64_t rows_per_expert = nrows_total; - const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); std::vector f32_sample; std::vector values_sample; std::vector activations_sample; std::vector sample_rows_per_slice(ne2); + std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; - for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); + } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); current_sampled_rows++; } sample_rows_per_slice[slice] = current_sampled_rows; } + auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { + const size_t want = (size_t)ne2 * (size_t)n_per_row; + dst.clear(); + if (!src || src_sz == 0) { return; } + + if (src_sz == want) { + dst.resize(want); + std::memcpy(dst.data(), src, want * sizeof(float)); + } else if (src_sz == (size_t)n_per_row) { + dst.resize(want); + for (int64_t s = 0; s < ne2; ++s) { + std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); + } + } else { + // Mismatch – safer to skip using it for this tensor + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", + __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + } + }; + if (values_all) { - values_sample.resize(ne2 * n_per_row); - std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + // get size from the map (not just the raw pointer) + auto itv = values_data->find(remap_imatrix(name, mapped)); + const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); + copy_or_broadcast(values_all, sz, values_sample); } if (activations_all) { - activations_sample.resize(ne2 * n_per_row); - std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + auto ita = activations_data->find(remap_imatrix(name, mapped)); + const size_t sz = ita == activations_data->end() ? 
0 : ita->second.size(); + copy_or_broadcast(activations_all, sz, activations_sample); } tensor_info info; info.w = tw; info.n_elements = nelem; + // Prepare scratch buffers sized for the largest candidate row size + size_t total_sampled_rows = f32_sample.size() / n_per_row; + + // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); @@ -926,18 +980,28 @@ static std::unordered_map target_bpw_type( quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); } - // Build per-tensor candidate list + // Compute maximum row size among compatible candidates (to size qbuf once) + size_t max_row_sz = 0; + std::vector compatible_candidates; + compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } + compatible_candidates.push_back(tt); + max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); + } - // Compute bpw and bytes + std::vector qbuf(max_row_sz * total_sampled_rows); + std::vector deq(f32_sample.size()); + + // Now evaluate candidates + for (ggml_type tt : compatible_candidates) { auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - - // Estimate error using the pre-sampled data - auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } From e01dad886bd2314146ce768240fd0c8a2abecabb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 12:47:13 +0100 Subject: [PATCH 030/148] Parallelise candidate evaluation --- src/llama-quant.cpp | 87 ++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 629056ee065..3cade0bf6fc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 256, + int sample_rows_per_expert = 384, float bias_lambda = 1.0 ) { struct candidate_types { @@ -758,16 +758,17 @@ static std::unordered_map target_bpw_type( std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; + const size_t nels = f32_sample.size(); + const size_t total_sampled_rows = nels / (size_t)n_per_row; if (total_sampled_rows == 0) { return 0.0; } const size_t row_sz = ggml_row_size(typ, n_per_row); const size_t need_q = row_sz * total_sampled_rows; if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } + if (deq.size() < nels) { deq.resize(nels); } // Quantize sampled rows slice-by-slice size_t qoff = 0; @@ -777,31 +778,31 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); - qoff += row_sz * rs; - foff += (size_t)rs * n_per_row; + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize to deq + // Dequantize into deq if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); } else { const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { - // no dequantizer available + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); return 1e35; } - traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); + + traits->to_float(qbuf.data(), deq.data(), (int) nels); } // Compute error + const double eps = 1e-12; size_t off = 0; double total_err = 0.0; - const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; @@ -817,9 +818,9 @@ static std::unordered_map target_bpw_type( const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; - double bnum = 0.0; - double bden = 0.0; + double x2_w = 0.0; + double bnum = 0.0; + double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,8 +829,8 @@ static std::unordered_map target_bpw_type( const double a = act[j]; mse_w += w * e * e; x2_w += w * x[j] * x[j]; - bnum += e * a; - bden += a * a; + bnum += w * e * a; // weighted bias + bden += w * a * a; // weighted norm } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -856,7 +857,9 @@ static std::unordered_map target_bpw_type( } double row_err = mse_w / (x2_w + eps); + if (act && bias_lambda != 0.0) { + // penalize squared projection of error onto activations row_err += bias_lambda * (bnum * bnum) / (bden + eps); } @@ -864,7 +867,7 @@ static std::unordered_map target_bpw_type( off += (size_t)n_per_row; } - // scale back up to the full number of rows in this slice + // scale to full rows in this slice (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } @@ -982,10 +985,14 @@ static std::unordered_map target_bpw_type( // Compute maximum row size among compatible candidates (to size qbuf once) size_t max_row_sz = 0; + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; 
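        // every type that survives make_compatible() lands in compatible_candidates;
        // max_row_sz records the widest candidate row so one scratch buffer fits them all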
compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values_all) { continue; } + if (is_iq(ts_type) && !has_valid_imatrix) { + LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + continue; + } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } compatible_candidates.push_back(tt); @@ -996,13 +1003,37 @@ static std::unordered_map target_bpw_type( std::vector deq(f32_sample.size()); // Now evaluate candidates - for (ggml_type tt : compatible_candidates) { - auto bpw = (float)tensor_bpw(t, tt); - size_t bytes = total_bytes(t, tt); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); - info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); + std::vector cand_out(compatible_candidates.size()); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + + int n_eval_threads = std::max(1, nthread); + std::atomic cidx{0}; + std::vector eval_workers; + eval_workers.reserve(n_eval_threads); + + for (int ti = 0; ti < n_eval_threads; ++ti) { + eval_workers.emplace_back([&] { + // thread-local scratch + std::vector tl_qbuf(qbuf.size()); + std::vector tl_deq(deq.size()); + + for (;;) { + const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + if (i >= compatible_candidates.size()) { break; } + + const ggml_type tt = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(t, tt); + const size_t bytes = total_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); + cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + } + }); + } + for (auto &th : eval_workers) { th.join(); } + + for (auto &c : cand_out) { + if (c.bytes > 0) { info.candidate.push_back(c); } } if (info.candidate.empty()) { From 887490c5ec3c679e8bc0c274b743b483e7c595e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 15:11:49 +0100 Subject: [PATCH 031/148] Dequantise sampled rows only --- src/llama-quant.cpp | 71 ++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3cade0bf6fc..547281bd7d1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 384, + int sample_rows_per_expert = 512, float bias_lambda = 1.0 ) { struct candidate_types { @@ -699,7 +699,7 @@ static std::unordered_map target_bpw_type( q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; // TODO: Exclude embeddings and output tensors? 
- q &= params->quantize_output_tensor || name != "output.weight"; + // q &= params->quantize_output_tensor || name != "output.weight"; q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; @@ -896,31 +896,35 @@ static std::unordered_map target_bpw_type( const int64_t nelem = ggml_nelements(t); std::vector> f32_conv_buf; - float * f32_data = nullptr; - - if (t->type == GGML_TYPE_F32) { - f32_data = (float *)t->data; - } else { - llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread); - f32_data = (float *)f32_conv_buf.data(); - } - const float * values_all = get_values(name); const float * activations_all = get_activations(name); - // Sample the tensor rows once, before looping through quantization candidates. + // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); + + const ggml_type src_type = t->type; + const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const bool src_is_quant = ggml_is_quantized(src_type); + const size_t src_row_sz = ggml_row_size(src_type, n_per_row); std::vector f32_sample; + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + std::vector values_sample; std::vector activations_sample; - std::vector sample_rows_per_slice(ne2); + std::vector sample_rows_per_slice(ne2, 0); + + // deterministic sampling seed based on tensor name + fixed constant + std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); + + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); + + // Temporary buffer for one dequantized row + std::vector rowbuf((size_t)n_per_row); - std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -928,10 +932,30 @@ static std::unordered_map target_bpw_type( std::uniform_int_distribution dist(0, stride - 1); offset = dist(rng); } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - current_sampled_rows++; + if (src_type == GGML_TYPE_F32) { + const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + } else if (src_type == GGML_TYPE_F16) { + const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_type == GGML_TYPE_BF16) { + const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_is_quant) { + const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (!src_traits || 
!src_traits->to_float) {
+                    throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type)));
+                }
+                src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row);
+                f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end());
+            } else {
+                throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type)));
+            }
+            ++current_sampled_rows;
         }
         sample_rows_per_slice[slice] = current_sampled_rows;
     }
@@ -999,15 +1023,16 @@ static std::unordered_map target_bpw_type(
            max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row));
        }

-        std::vector qbuf(max_row_sz * total_sampled_rows);
-        std::vector deq(f32_sample.size());
+        std::sort(compatible_candidates.begin(), compatible_candidates.end());
+        compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end());

         // Now evaluate candidates
         std::vector cand_out(compatible_candidates.size());
         const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data();
         const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data();
-
-        int n_eval_threads = std::max(1, nthread);
+        std::vector qbuf(max_row_sz * total_sampled_rows);
+        std::vector deq(f32_sample.size());
+        int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size()));
         std::atomic cidx{0};
         std::vector eval_workers;

From 9e11f82e8f5ad29cb62cba0bab7014db17a0b2c2 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Thu, 21 Aug 2025 16:25:31 +0100
Subject: [PATCH 032/148] Precompute error denominator in estimate_error()

---
 src/llama-quant.cpp | 154 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 121 insertions(+), 33 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 547281bd7d1..03f8a4bd117 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -598,8 +598,8 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *

 // Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl
 // sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute
-// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight
-// for bias and error, 2.0 means twice as much weight for bias
+// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE),
+// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias
 static std::unordered_map target_bpw_type(
     llama_model_loader & ml,
     std::vector> & buffer,
@@ -658,7 +658,7 @@ static std::unordered_map target_bpw_type(
     GGML_TYPE_IQ3_S,
     GGML_TYPE_IQ4_XS,
     GGML_TYPE_IQ4_NL,
-    // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it
+    // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
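+    // e.g. a 5.5 bpw target cannot be met with IQ types alone (IQ4_NL tops out near 4.5 bpw),
+    // so the Q5/Q6 entries below give an IQ mix the headroom to get there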
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, @@ -770,7 +770,68 @@ static std::unordered_map target_bpw_type( if (qbuf.size() < need_q) { qbuf.resize(need_q); } if (deq.size() < nels) { deq.resize(nels); } - // Quantize sampled rows slice-by-slice + // Precompute denominators: + // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 + // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise + std::vector x2_per_row(total_sampled_rows, 0.0); + std::vector bden_per_slice(ne2, 0.0); + + const bool has_w = (values_sample != nullptr); + const bool has_a = (activations_sample != nullptr); + + // Precompute bden per slice (depends only on w,a) + if (has_a) { + for (int64_t s = 0; s < ne2; ++s) { + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * act = activations_sample + s * n_per_row; + double bden = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += (double) wv[j] * a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += a * a; + } + } + bden_per_slice[s] = bden; + } + } + + // Precompute x2 per sampled row + { + size_t off = 0; + size_t row_idx = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = sample_rows_per_slice[s]; + if (rs == 0) { continue; } + + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + + for (int64_t r = 0; r < rs; ++r, ++row_idx) { + const float * x = f32_sample.data() + off; + double x2 = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double xx = x[j]; + x2 += w * xx * xx; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double xx = x[j]; + x2 += xx * xx; + } + } + x2_per_row[row_idx] = x2; + off += (size_t)n_per_row; + } + } + } + + // Quantize sampled rows slice-by-slice into qbuf size_t qoff = 0; size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { @@ -784,43 +845,50 @@ static std::unordered_map target_bpw_type( foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); - } else { + // Dequantize into deq (row-wise if needed to avoid int overflow) + { const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); - return 1e35; - } + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); + } else if (typ == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + return 1e35; + } - traits->to_float(qbuf.data(), deq.data(), (int) nels); + size_t done = 0; + while (done < nels) { + const size_t chunk = std::min((size_t)n_per_row, nels - done); + traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + done += chunk; + } + } } // Compute error const double eps = 1e-12; size_t off = 0; + size_t row_idx = 0; double total_err = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { 
continue; } - const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; - const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; + const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; + const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; + const double bden = has_a ? bden_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r) { + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + off; const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; double bnum = 0.0; - double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,52 +896,49 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = act[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; - bnum += w * e * a; // weighted bias - bden += w * a * a; // weighted norm + bnum += w * e * a; } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { const double w = wv[j]; const double e = y[j] - x[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; } } else if (act) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; const double a = act[j]; mse_w += e * e; - x2_w += x[j] * x[j]; bnum += e * a; - bden += a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; mse_w += e * e; - x2_w += x[j] * x[j]; } } - double row_err = mse_w / (x2_w + eps); - + // corrected normalization: divide the full numerator by x2 + double numer = mse_w; if (act && bias_lambda != 0.0) { - // penalize squared projection of error onto activations - row_err += bias_lambda * (bnum * bnum) / (bden + eps); + const double proj = bnum * bnum / (bden + eps); + numer += bias_lambda * proj; } + const double denom = x2_per_row[row_idx] + eps; + const double row_err = numer / denom; + slice_err += row_err; off += (size_t)n_per_row; } - // scale to full rows in this slice (nrows) + // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; -}; + }; std::vector all; all.reserve(tensors.size()); @@ -1067,6 +1132,29 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } + // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. 
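+        // e.g. two requested types that collapse to the same fallback produce identical
+        //      byte sizes here; only the lower-error copy of such a pair survives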
+ { + std::vector pruned; + pruned.reserve(info.candidate.size()); + // Sort by bytes asc, error asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); + + double best_err = std::numeric_limits::infinity(); + size_t last_bytes = std::numeric_limits::max(); + + for (const auto &c : info.candidate) { + if (c.error < best_err || c.bytes > last_bytes) { + pruned.push_back(c); + best_err = std::min(best_err, (double)c.error); + last_bytes = c.bytes; + } + } + info.candidate.swap(pruned); + } + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } From 5b6f1e9fde8dc6fd3456358c5b5c758b1f10b11c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 19:18:54 +0100 Subject: [PATCH 033/148] General code refactor --- src/llama-quant.cpp | 407 +++++++++++++++++++++----------------------- 1 file changed, 192 insertions(+), 215 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 03f8a4bd117..85191a66ae8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,10 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl -// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute -// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE), -// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias +// Returns per-tensor type overrides to meet target BPW at lowest ppl static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -609,9 +606,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * values_data, const std::unordered_map> * activations_data, const llama_model_quantize_params * params, - int nthread, - int sample_rows_per_expert = 512, - float bias_lambda = 1.0 + int nthread ) { struct candidate_types { ggml_type type; @@ -621,15 +616,15 @@ static std::unordered_map target_bpw_type( }; struct tensor_info { - const llama_model_loader::llama_tensor_weight * w; - std::vector candidate; + const llama_model_loader::llama_tensor_weight * w = nullptr; + std::vector candidate = {}; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; }; - const ggml_type k_candidates[] = { + constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, @@ -648,7 +643,7 @@ static std::unordered_map target_bpw_type( #endif }; - const ggml_type iq_candidates[] = { + constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, @@ -665,46 +660,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto name_tn = LLM_TN(model.arch); - float target_bpw = params->target_bpw; - - auto can_quantize = [&](const ggml_tensor * t) -> bool { - const std::string name = ggml_get_name(t); - bool q = name.rfind("weight") == name.size() - 6; - q &= ggml_n_dims(t) >= 2; - q &= name.find("_norm.weight") == std::string::npos; - q &= name.find("ffn_gate_inp.weight") == std::string::npos; - q &= name.find("altup") == std::string::npos; - q &= name.find("laurel") == 
std::string::npos; - q &= name.find("per_layer_model_proj") == std::string::npos; - q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); - q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); - q &= name.find("ssm_conv1d.weight") == std::string::npos; - q &= name.find("shortconv.conv.weight") == std::string::npos; - q &= name.find("time_mix_first.weight") == std::string::npos; - q &= name.find("time_mix_w0.weight") == std::string::npos; - q &= name.find("time_mix_w1.weight") == std::string::npos; - q &= name.find("time_mix_w2.weight") == std::string::npos; - q &= name.find("time_mix_v0.weight") == std::string::npos; - q &= name.find("time_mix_v1.weight") == std::string::npos; - q &= name.find("time_mix_v2.weight") == std::string::npos; - q &= name.find("time_mix_a0.weight") == std::string::npos; - q &= name.find("time_mix_a1.weight") == std::string::npos; - q &= name.find("time_mix_a2.weight") == std::string::npos; - q &= name.find("time_mix_g1.weight") == std::string::npos; - q &= name.find("time_mix_g2.weight") == std::string::npos; - q &= name.find("time_mix_decay_w1.weight") == std::string::npos; - q &= name.find("time_mix_decay_w2.weight") == std::string::npos; - q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= !params->only_copy; - // TODO: Exclude embeddings and output tensors? - // q &= params->quantize_output_tensor || name != "output.weight"; - q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); - - return q; - }; - auto get_values = [&](const std::string & tensor_name) -> const float * { if (!values_data) { return nullptr; } const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); @@ -719,7 +674,7 @@ static std::unordered_map target_bpw_type( return it->second.data(); }; - auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; @@ -729,8 +684,8 @@ static std::unordered_map target_bpw_type( auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { const int64_t nelem = ggml_nelements(t); - const size_t bytes = total_bytes(t, typ); - return bytes * 8.0 / nelem; + const size_t bytes = tensor_bytes(t, typ); + return (double)bytes * 8.0 / (double)nelem; }; auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { @@ -747,189 +702,220 @@ static std::unordered_map target_bpw_type( return GGML_TYPE_F16; }; + auto name_tn = LLM_TN(model.arch); + auto can_quantize = [&](const ggml_tensor * t) -> bool { + const std::string name = ggml_get_name(t); + bool q = name.rfind("weight") == name.size() - 6; + q &= ggml_n_dims(t) >= 2; + q &= name.find("_norm.weight") == std::string::npos; + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + q &= name.find("per_layer_model_proj") == std::string::npos; + q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + q &= name.find("attn_rel_b.weight") == std::string::npos; + q &= !params->only_copy; + // TODO: Exclude embeddings and output tensors? + // q &= params->quantize_output_tensor || name != "output.weight"; + q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); + + return q; + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, - const ggml_type typ, + const ggml_type quant_type, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, const float * values_sample, const float * activations_sample, - std::vector & qbuf, - std::vector & deq) -> double + std::vector & quantized_buffer, + std::vector & dequantized_buffer) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t nels = f32_sample.size(); - const size_t total_sampled_rows = nels / (size_t)n_per_row; - if (total_sampled_rows == 0) { return 0.0; } + const size_t sample_element_count = f32_sample.size(); + const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + if (sample_row_count == 0) { return 0.0; } - const size_t row_sz = ggml_row_size(typ, n_per_row); - const size_t need_q = row_sz * total_sampled_rows; - if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < nels) { deq.resize(nels); } + const size_t row_size = ggml_row_size(quant_type, n_per_row); + const size_t buffer_size = row_size * sample_row_count; + if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - // Precompute denominators: - // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 - // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise - std::vector x2_per_row(total_sampled_rows, 0.0); - std::vector bden_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); - const bool has_w = (values_sample != nullptr); - const bool has_a = (activations_sample != nullptr); - - // Precompute bden per slice (depends only on w,a) - if (has_a) { + // Precompute bias denominator per slice + const bool has_values = (values_sample != nullptr); + const bool has_activations = (activations_sample != nullptr); + if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; - const float * act = activations_sample + s * n_per_row; - double bden = 0.0; - if (has_w) { + const float * values = has_values ? values_sample + s * n_per_row : nullptr; + const float * activations = activations_sample + s * n_per_row; + double bias_denominator = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += (double) wv[j] * a * a; + const double a = activations[j]; + bias_denominator += values[j] * a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += a * a; + const double a = activations[j]; + bias_denominator += a * a; } } - bden_per_slice[s] = bden; + bias_denominator_per_slice[s] = bias_denominator; } } - // Precompute x2 per sampled row + // Compute squared norms of sampled rows { - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = sample_rows_per_slice[s]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - double x2 = 0.0; - if (has_w) { + const float * row = f32_sample.data() + offset; + double rsn = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; - const double xx = x[j]; - x2 += w * xx * xx; + const double v = values[j]; + const double x = row[j]; + rsn += v * x * x; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double xx = x[j]; - x2 += xx * xx; + const double x = row[j]; + rsn += x * x; } } - x2_per_row[row_idx] = x2; - off += (size_t)n_per_row; + row_sq_norm[row_idx] = rsn; + offset += (size_t)n_per_row; } } } - // Quantize sampled rows slice-by-slice into qbuf - size_t qoff = 0; - size_t foff = 0; + // Quantize sampled rows slice-by-slice into quantized_buffer + size_t quantised_offset = 0; + size_t floats_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); - qoff += row_sz * (size_t)rs; - foff += (size_t)rs * (size_t)n_per_row; + quantised_offset += row_size * (size_t)rs; + floats_offset += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq (row-wise if needed to avoid int overflow) + // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); } else { if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } size_t done = 0; - while (done < nels) { - const size_t chunk = std::min((size_t)n_per_row, nels - done); - traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + while (done < sample_element_count) { + const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); + traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); done += chunk; } } } // Compute error - const double eps = 1e-12; - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; - const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; - const double bden = has_a ? 
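// (sketch of the metric computed in the loop below, in this code's own notation) For each
// sampled row x with dequantized row y, error e = y - x, imatrix weights w and mean
// activations a:
//
//   row_err = [ sum_j w_j*e_j^2 + lambda * (sum_j w_j*e_j*a_j)^2 / (sum_j w_j*a_j^2 + eps) ]
//             / ( sum_j w_j*x_j^2 + eps )
//
// i.e. a weighted relative MSE plus a penalty for error that is systematically aligned
// with the activations; w_j is treated as 1 wherever no imatrix entry is present.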
bden_per_slice[slice] : 0.0; - + const float * values = has_values ? values_sample + slice * n_per_row : nullptr; + const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; + const double bias_denominator = has_activations ? bias_denominator_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - const float * y = deq.data() + off; - - double mse_w = 0.0; - double bnum = 0.0; - - if (wv && act) { + const float * x = f32_sample.data() + offset; + const float * y = dequantized_buffer.data() + offset; + double weighted_mse = 0.0; + double bias_numerator = 0.0; + if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += w * e * e; - bnum += w * e * a; + const double a = activations[j]; + weighted_mse += v * e * e; + bias_numerator += v * e * a; } - } else if (wv) { + } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - mse_w += w * e * e; + weighted_mse += v * e * e; } - } else if (act) { + } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += e * e; - bnum += e * a; + const double a = activations[j]; + weighted_mse += e * e; + bias_numerator += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - mse_w += e * e; + weighted_mse += e * e; } } - // corrected normalization: divide the full numerator by x2 - double numer = mse_w; - if (act && bias_lambda != 0.0) { - const double proj = bnum * bnum / (bden + eps); - numer += bias_lambda * proj; + double err_numerator = weighted_mse; + constexpr double epsilon = 1e-12; + constexpr float bias_lambda = 1.0; + //bias_lambda defines the weight of the bias term in the weigthed MSE error function + // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, + // 2.0 means twice as much weight for bias, etc + if (activations && bias_lambda != 0.0) { + const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); + err_numerator += bias_lambda * proj; } - const double denom = x2_per_row[row_idx] + eps; - const double row_err = numer / denom; - + const double err_denominator = row_sq_norm[row_idx] + epsilon; + const double row_err = err_numerator / err_denominator; slice_err += row_err; - off += (size_t)n_per_row; + offset += (size_t)n_per_row; } // scale to full rows (nrows) @@ -942,14 +928,11 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); - for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); @@ -959,37 +942,26 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - const int64_t nelem = ggml_nelements(t); - std::vector> f32_conv_buf; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); - // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
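// (note on the sampling that follows) Rows are taken per 2D slice at a fixed stride, and
// the RNG is seeded from a hash of the tensor name XOR'd with a constant, so repeated runs
// pick identical rows and candidate errors stay reproducible. A minimal standalone form of
// the seeding, assuming <string> and <random> and the surrounding variable names:
//
//   std::mt19937 rng(std::hash<std::string>{}(name) ^ 0xeabada55cafed00d);
//   const int64_t stride = std::max<int64_t>(1, nrows_total / sample_rows_max);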
t->ne[2] : 1; - const ggml_type src_type = t->type; - const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); - const bool src_is_quant = ggml_is_quantized(src_type); - const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - + // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute + int sample_rows_per_expert = 512; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - std::vector values_sample; - std::vector activations_sample; - std::vector sample_rows_per_slice(ne2, 0); - // deterministic sampling seed based on tensor name + fixed constant std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); - + std::vector sample_rows_per_slice(ne2, 0); const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); - - // Temporary buffer for one dequantized row - std::vector rowbuf((size_t)n_per_row); - + std::vector row_buffer(n_per_row); + const ggml_type src_type = t->type; + const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const bool src_is_quant = ggml_is_quantized(src_type); + const size_t src_row_sz = ggml_row_size(src_type, n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -1004,19 +976,19 @@ static std::unordered_map target_bpw_type( f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } - src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } @@ -1045,6 +1017,10 @@ static std::unordered_map target_bpw_type( } }; + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + std::vector values_sample; + std::vector activations_sample; if (values_all) { // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); @@ -1057,6 +1033,7 @@ static std::unordered_map 
target_bpw_type( copy_or_broadcast(activations_all, sz, activations_sample); } + const int64_t nelem = ggml_nelements(t); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1067,12 +1044,12 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); + quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); } else { - quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); + quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); } - // Compute maximum row size among compatible candidates (to size qbuf once) + // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; @@ -1092,21 +1069,20 @@ static std::unordered_map target_bpw_type( compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); // Now evaluate candidates - std::vector cand_out(compatible_candidates.size()); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - std::vector qbuf(max_row_sz * total_sampled_rows); - std::vector deq(f32_sample.size()); + std::vector eval_candidates(compatible_candidates.size()); + const float *values = values_sample.empty() ? nullptr : values_sample.data(); + const float *activations = activations_sample.empty() ? nullptr : activations_sample.data(); + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; eval_workers.reserve(n_eval_threads); - for (int ti = 0; ti < n_eval_threads; ++ti) { eval_workers.emplace_back([&] { // thread-local scratch - std::vector tl_qbuf(qbuf.size()); - std::vector tl_deq(deq.size()); + std::vector tl_quantized_buffer(quantized_buffer.size()); + std::vector tl_dequantised_buffer(dequantised_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); @@ -1114,15 +1090,16 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = total_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); - cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + const size_t bytes = tensor_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); } + for (auto &th : eval_workers) { th.join(); } - for (auto &c : cand_out) { + for (auto &c : eval_candidates) { if (c.bytes > 0) { info.candidate.push_back(c); } } @@ -1132,7 +1109,7 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. 
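// A minimal, self-contained sketch of the pruning rule applied here (assumes <vector>,
// <algorithm> and <limits>; `cand` is a stand-in for candidate_types): after sorting by
// bytes ascending with ties broken by error, a candidate survives only if its error is
// strictly lower than that of every candidate of smaller or equal size seen before it.
struct cand { size_t bytes; double error; };
static std::vector<cand> pareto_front(std::vector<cand> cs) {
    std::sort(cs.begin(), cs.end(), [](const cand & a, const cand & b) {
        return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
    });
    std::vector<cand> front;
    double best_err = std::numeric_limits<double>::infinity();
    for (const auto & c : cs) {
        if (c.error < best_err) { // dominated candidates (>= bytes and >= error) are dropped
            front.push_back(c);
            best_err = c.error;
        }
    }
    return front;
}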
+ // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. { std::vector pruned; pruned.reserve(info.candidate.size()); @@ -1155,36 +1132,37 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - // Collapse candidates with identical storage size (bytes) { - std::vector uniq; - uniq.reserve(info.candidate.size()); + std::vector unique; + unique.reserve(info.candidate.size()); + // Sort by bpw asc, error asc, bytes asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bpw != b.bpw) { return a.bpw < b.bpw; } + if (a.error != b.error) { return a.error < b.error; } + return a.bytes < b.bytes; + }); for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; + size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { best = info.candidate[j]; } + if (info.candidate[j].error < best.error) { + best = info.candidate[j]; + } ++j; } - uniq.push_back(best); + unique.push_back(best); i = j; } - info.candidate.swap(uniq); + info.candidate.swap(unique); } // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; - all.push_back(std::move(info)); } @@ -1196,6 +1174,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { b += ti.candidate[ti.choice].bytes; } + return b; }; @@ -1204,6 +1183,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { w += ti.n_elements; } + return w; }; @@ -1215,12 +1195,14 @@ static std::unordered_map target_bpw_type( // Precompute current bpw double bpw_now = current_bpw(); + float target_bpw = params->target_bpw; // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } @@ -1268,6 +1250,7 @@ static std::unordered_map target_bpw_type( best = upgrade{ i, j, err, delta_bytes, ratio }; } } + return best; }; @@ -1286,16 +1269,12 @@ static std::unordered_map target_bpw_type( } } - // We might still be below target but taking any single upgrade overshoots. - // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
+ // We might still be below target so we try to find the best upgrade one last time { - double under_gap = target_bpw - bpw_now; - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; double best_over_gap = 1e300; - + double under_gap = target_bpw - bpw_now; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } @@ -1305,19 +1284,16 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } size_t over_bytes = now_bytes + delta_bytes; double bpw_over = (double)over_bytes * 8.0 / (double)tw; - - double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); + double over_gap = std::abs(bpw_over - (double)target_bpw); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; @@ -1339,6 +1315,7 @@ static std::unordered_map target_bpw_type( __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } From ec0afbe79ff001af56846365f91f97240bd2dbf4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 01:46:09 +0100 Subject: [PATCH 034/148] Include embeddings and output tensors --- src/llama-quant.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 85191a66ae8..b9e3c19a89a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -733,9 +733,6 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; - // TODO: Exclude embeddings and output tensors? - // q &= params->quantize_output_tensor || name != "output.weight"; - q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 35c1504441eb03b126b15a6ddd4625f094dc7dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:01:57 +0100 Subject: [PATCH 035/148] Fix byte count for 3d or higher tensors --- src/llama-quant.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b9e3c19a89a..8cc5f221ea1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -676,10 +676,9 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); - return (size_t)ne2 * (size_t)nrows * row_sz; + const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = ggml_nrows(t); + return (size_t)nrows * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { From bb0d912c1f93de2ef1af4ef9fb467c4862012898 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:02:56 +0100 Subject: [PATCH 036/148] Update comments --- src/llama-quant.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8cc5f221ea1..4b846c7d0c7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -703,6 +703,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { + // This list should be kept in sync with llama_tensor_quantize_impl() const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -902,7 +903,7 @@ static std::unordered_map target_bpw_type( constexpr float bias_lambda = 1.0; //bias_lambda defines the weight of the bias term in the weigthed MSE error function // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc + // 2.0 means twice as much weight for bias, etc. Default is 1.0. if (activations && bias_lambda != 0.0) { const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); err_numerator += bias_lambda * proj; @@ -1192,7 +1193,7 @@ static std::unordered_map target_bpw_type( double bpw_now = current_bpw(); float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) + // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { From 2f13fee795639841de46b8f415a233062aa5d2b8 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:05:55 +0100 Subject: [PATCH 037/148] Parameterise type --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b846c7d0c7..e5e27da5096 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -760,8 +760,8 @@ static std::unordered_map target_bpw_type( if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); // Precompute bias denominator per slice const bool has_values = (values_sample != nullptr); From 47cdbe21552324cd79b9243485eeb455cab4673a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:11:11 +0100 Subject: [PATCH 038/148] Reduce sampling window to speedup process --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e5e27da5096..5460669e7ce 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -945,7 +945,7 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - int sample_rows_per_expert = 512; + constexpr int sample_rows_per_expert = 384; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); From 01c927fb94163ddb36365323683274071c034690 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:14:14 +0100 Subject: [PATCH 039/148] Improve pareto efficient candidate selection --- src/llama-quant.cpp | 49 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5460669e7ce..14d9087f53e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1106,54 +1106,33 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. { std::vector pruned; pruned.reserve(info.candidate.size()); - // Sort by bytes asc, error asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + + // Sort by bytes ascending, error ascending + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); double best_err = std::numeric_limits::infinity(); size_t last_bytes = std::numeric_limits::max(); - - for (const auto &c : info.candidate) { - if (c.error < best_err || c.bytes > last_bytes) { - pruned.push_back(c); - best_err = std::min(best_err, (double)c.error); + for (const auto & c : info.candidate) { + // Only keep the best error seen so far at strictly larger byte sizes + if (c.bytes != last_bytes) { + // first time we see this byte size last_bytes = c.bytes; - } - } - info.candidate.swap(pruned); - } - - // Collapse candidates with identical storage size (bytes) - { - std::vector unique; - unique.reserve(info.candidate.size()); - // Sort by bpw asc, error asc, bytes asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - - for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; - candidate_types best = info.candidate[i]; - // group same-byte entries, keep the one with the lowest error - while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { - best = info.candidate[j]; + if (c.error < best_err) { + pruned.push_back(c); + best_err = c.error; } - ++j; + } else { + // same bytes: we already sorted by error; skip } - unique.push_back(best); - i = j; } - info.candidate.swap(unique); + info.candidate.swap(pruned); } // Initialize choice at the smallest bpw candidate From 897decbe8a062ded079f1f1a866392571ed7f95f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:15:11 +0100 Subject: [PATCH 040/148] Show skipped IQ tensors --- src/llama-quant.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 14d9087f53e..c5c19f3c5f3 100644 --- 
a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,7 +1019,6 @@ static std::unordered_map target_bpw_type( std::vector values_sample; std::vector activations_sample; if (values_all) { - // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); copy_or_broadcast(values_all, sz, values_sample); @@ -1053,7 +1052,7 @@ static std::unordered_map target_bpw_type( compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); @@ -1214,13 +1213,11 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } double err = cur.error - nxt.error; err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; From f05c8483d8b138c58a41ecdf32f95947bb130be5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:17:58 +0100 Subject: [PATCH 041/148] Improve dequantized_buffer fill --- src/llama-quant.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c5c19f3c5f3..db4a0e1a20e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -843,12 +843,9 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } - - size_t done = 0; - while (done < sample_element_count) { - const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); - traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); - done += chunk; + const size_t row_size = ggml_row_size(quant_type, n_per_row); + for (size_t r = 0; r < sample_row_count; ++r) { + traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); } } } From fea99d051ad3a9f3cce3cdf084074e0655f47e14 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:57:58 +0100 Subject: [PATCH 042/148] Refactor and combine lambdas --- src/llama-quant.cpp | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db4a0e1a20e..10993e89c6a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -660,20 +660,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto get_values = [&](const std::string & tensor_name) -> const float * { - if (!values_data) { return nullptr; } - const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); - if (it == values_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto get_activations = [&](const std::string & tensor_name) -> const float * { - if (!activations_data) { return nullptr; } - const auto it = 
activations_data->find(remap_imatrix(tensor_name, mapped)); - if (it == activations_data->end()) { return nullptr; } - return it->second.data(); - }; - auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -991,6 +977,15 @@ static std::unordered_map target_bpw_type( sample_rows_per_slice[slice] = current_sampled_rows; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { + if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); + const auto it = m->find(key); + if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; + }; + + // Copy this row's side data (values and activations), or broadcasts to all slices auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); @@ -1005,26 +1000,17 @@ static std::unordered_map target_bpw_type( std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } } else { - // Mismatch – safer to skip using it for this tensor LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", __func__, name.c_str(), src_sz, (size_t)n_per_row, want); } }; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); + const auto [values_all, values_sz] = side_data(values_data, name); + const auto [activations_all, activations_sz] = side_data(activations_data, name); std::vector values_sample; std::vector activations_sample; - if (values_all) { - auto itv = values_data->find(remap_imatrix(name, mapped)); - const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); - copy_or_broadcast(values_all, sz, values_sample); - } - if (activations_all) { - auto ita = activations_data->find(remap_imatrix(name, mapped)); - const size_t sz = ita == activations_data->end() ? 0 : ita->second.size(); - copy_or_broadcast(activations_all, sz, activations_sample); - } + if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } + if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } const int64_t nelem = ggml_nelements(t); tensor_info info; From 6d17889addf3aa18000334e1dd958111104cdf3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:58:46 +0100 Subject: [PATCH 043/148] Log if override is from tensor-type or from bpw-target --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 10993e89c6a..721deaddad0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1049,8 +1049,8 @@ static std::unordered_map target_bpw_type( // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); - const float *values = values_sample.empty() ? nullptr : values_sample.data(); - const float *activations = activations_sample.empty() ? nullptr : activations_sample.data(); + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? 
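// (sketch of the side-data layout assumed by copy_or_broadcast above) It accepts either
// one set of per-column statistics shared by all expert slices (src_sz == n_per_row) or
// one set per slice (src_sz == ne2 * n_per_row). E.g. with ne2 = 8 experts and
// n_per_row = 4096, a 4096-entry imatrix row is replicated 8 times into a 32768-float
// buffer; any other size is ignored with a warning rather than risk misaligned weighting.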
nullptr : activations_sample.data(); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); @@ -1656,15 +1656,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); // get bpw override const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end()) { new_type = override->second; } - // unless the user specifies a type, and the tensor geometry will not require fallback quantisation + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); + new_type = override->second; + } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); const std::string tensor_name(tensor->name); for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } From 9a4b1154974d5ddbfb9d9d3f785f5a29bb202fac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:01 +0100 Subject: [PATCH 044/148] Explicitly adding include --- src/llama-quant.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 721deaddad0..d17b21d0086 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -4,6 +4,7 @@ #include "llama-model-loader.h" #include +#include #include #include #include From f75265f55bb1d4470dea57f4c9e3ad108cc343a1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:37 +0100 Subject: [PATCH 045/148] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d17b21d0086..6e3aa3f83d1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1535,7 +1535,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } From 73124a9921b967fe9e5afbb9f48924a3d48983a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:17:22 +0100 Subject: [PATCH 046/148] Refactor estimate_error() --- src/llama-quant.cpp | 131 ++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e3aa3f83d1..3c358fb67e2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -742,38 +742,33 @@ static std::unordered_map target_bpw_type( const size_t sample_row_count = sample_element_count / 
(size_t)n_per_row; if (sample_row_count == 0) { return 0.0; } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - const size_t buffer_size = row_size * sample_row_count; - if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); + const size_t buffer_sz = row_sz * sample_row_count; + + if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + const bool has_values = values_sample != nullptr; + const bool has_activations = activations_sample != nullptr; - // Precompute bias denominator per slice - const bool has_values = (values_sample != nullptr); - const bool has_activations = (activations_sample != nullptr); + // Bias denominators per slice (only needed if we have activations) + std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { const float * values = has_values ? values_sample + s * n_per_row : nullptr; const float * activations = activations_sample + s * n_per_row; - double bias_denominator = 0.0; - if (has_values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += values[j] * a * a; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += a * a; - } + double denom = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = activations[j]; + const double w = values ? values[j] : 1.0; + denom += w * a * a; } - bias_denominator_per_slice[s] = bias_denominator; + bias_denominator_per_slice[s] = denom; } } - // Compute squared norms of sampled rows + // Compute per-row squared norms with weighting (if values are provided) + std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; @@ -784,18 +779,18 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * row = f32_sample.data() + offset; + const float * x = f32_sample.data() + offset; double rsn = 0.0; - if (has_values) { + if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; - const double x = row[j]; - rsn += v * x * x; + const double v = values[j]; + const double xx = x[j]; + rsn += v * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double x = row[j]; - rsn += x * x; + const double xx = x[j]; + rsn += xx * xx; } } row_sq_norm[row_idx] = rsn; @@ -805,35 +800,44 @@ static std::unordered_map target_bpw_type( } // Quantize sampled rows slice-by-slice into quantized_buffer - size_t quantised_offset = 0; - size_t floats_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; - if (rs == 0) { continue; } + { + size_t q_offset = 0; + size_t f_offset = 0; + for (int64_t slice = 0; slice < ne2; ++slice) { + const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } - const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); + const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - quantised_offset += row_size * (size_t)rs; - floats_offset += (size_t)rs * (size_t)n_per_row; + q_offset += row_sz * (size_t)rs; + f_offset += (size_t)rs * (size_t)n_per_row; + } } // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return 1e35; - } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - for (size_t r = 0; r < sample_row_count; ++r) { - traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); + auto row_to_float = [&](size_t r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); + return false; + } + traits->to_float(src, dst, (int)n_per_row); } + + return true; + }; + + for (size_t r = 0; r < sample_row_count; ++r) { + if (!row_to_float(r)) { return 1e35; } } } @@ -847,20 +851,22 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denominator = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const double bias_denom = has_activations ? 
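// (interpretation, for the loop below) bias_num accumulates the w-weighted inner product
// <e, a> between the quantization error and the mean activations, and
// bias_num^2 / bias_denom is the squared length of e's projection onto a under that inner
// product. A candidate type can have a small weighted MSE yet a consistently signed error
// along a -- exactly the component that shifts the layer's output -- and this term charges
// for it.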
bias_denominator_per_slice[slice] : 0.0; + double slice_err = 0.0; + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; double weighted_mse = 0.0; - double bias_numerator = 0.0; + double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double v = values[j]; const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += v * e * e; - bias_numerator += v * e * a; + bias_num += v * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -873,7 +879,7 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += e * e; - bias_numerator += e * a; + bias_num += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -882,24 +888,19 @@ static std::unordered_map target_bpw_type( } } - double err_numerator = weighted_mse; + constexpr float bias_lambda = 1.75f; constexpr double epsilon = 1e-12; - constexpr float bias_lambda = 1.0; - //bias_lambda defines the weight of the bias term in the weigthed MSE error function - // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc. Default is 1.0. - if (activations && bias_lambda != 0.0) { - const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); - err_numerator += bias_lambda * proj; + double err_num = weighted_mse; + if (activations && bias_lambda != 0.0f) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += (double)bias_lambda * proj; } - const double err_denominator = row_sq_norm[row_idx] + epsilon; - const double row_err = err_numerator / err_denominator; - slice_err += row_err; + const double err_den = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / err_den; offset += (size_t)n_per_row; } - // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 68ae5e66cea41457a3ed11018374b64e2f94d3d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:50:55 +0100 Subject: [PATCH 047/148] Improve list of candidate types --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3c358fb67e2..392a23b5ca2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1023,21 +1023,20 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) - std::vector quant_candidates; - if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); - } else { - quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); - } + const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; + const size_t base_sz = is_iq(params->ftype) ? 
sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); - // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + std::vector compatible_candidates; - compatible_candidates.reserve(quant_candidates.size()); - for (ggml_type ts_type : quant_candidates) { + compatible_candidates.reserve(base_sz); + + for (size_t i = 0; i < base_sz; ++i) { + ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", + __func__, ggml_type_name(ts_type), name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); From decafae27060ed923c69ce3b89db505538a9b230 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 11:30:11 +0100 Subject: [PATCH 048/148] Adjust bias_lambda --- src/llama-quant.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 392a23b5ca2..4ce651723f8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -888,7 +888,9 @@ static std::unordered_map target_bpw_type( } } - constexpr float bias_lambda = 1.75f; + // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { @@ -1024,7 +1026,7 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); + const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; From 3856d60328349c5b2a4e381d6fdff20d272415ab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 14:45:07 +0100 Subject: [PATCH 049/148] Restrict quant types per family --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ce651723f8..7615376e31c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -628,11 +628,7 @@ static std::unordered_map target_bpw_type( constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, @@ -646,19 +642,12 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, - GGML_TYPE_IQ1_M, - GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, - // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
- GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -888,8 +877,8 @@ static std::unordered_map target_bpw_type( } } - // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; From 61c0e01f500ef2610904045c6a7852956c7ba6ba Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:36:03 +0100 Subject: [PATCH 050/148] Execute bpw_overrides() only if an imatrix file is provided --- src/llama-quant.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7615376e31c..4ed94540687 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1525,9 +1525,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::unordered_map bpw_overrides = {}; - if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + if (params->target_bpw != -1.0f && !params->only_copy) { + if (params->imatrix) { + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + } else { + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + } else { + LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + } } int cur_split = -1; From d4ac2106fb5b9e1a98d6aef8a0931e73e46f324e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:39:10 +0100 Subject: [PATCH 051/148] Improve logging and some minor code refactoring --- src/llama-quant.cpp | 26 +++++++++++++++----------- tools/quantize/quantize.cpp | 7 +------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ed94540687..407a63d887d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -132,7 +132,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< for (const auto & p : mapped) { if (p.second == blk) { - LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first); return new_name.replace(match.position(1), match.length(1), std::to_string(p.first)); } } @@ -1257,7 +1256,7 @@ static std::unordered_map target_bpw_type( // Build the override map std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw); + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); for (const auto & ti : all) { 
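    // each line below reports the chosen type, its bpw and the estimated error; the
    // headline bpw of the finished mix follows from the same fields. A minimal sketch,
    // illustrative only and not part of this patch (assumes every ti.choice is set):
    auto achieved_bpw = [](const std::vector<tensor_info> & v) -> double {
        size_t bytes = 0;
        size_t elems = 0;
        for (const auto & x : v) {
            bytes += x.candidate[x.choice].bytes; // bytes of the selected candidate
            elems += (size_t)x.n_elements;        // number of weights in this tensor
        }
        return elems > 0 ? (double)bytes * 8.0 / (double)elems : 0.0;
    };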
LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); @@ -1352,7 +1351,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size())); + LLAMA_LOG_INFO("================================ Have weights data with %d entries",int(values_data->size())); qs.has_imatrix = true; // check imatrix for nans or infs for (const auto & kv : *values_data) { @@ -1367,7 +1366,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->activations) { activations_data = static_cast>*>(params->activations); if (activations_data) { - LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size())); + LLAMA_LOG_INFO(" and %d activations",int(activations_data->size())); qs.has_activations = true; // check activations for nans or infs for (const auto & kv : *activations_data) { @@ -1379,6 +1378,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1655,12 +1655,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!params->pure && ggml_is_quantized(default_type)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - // get bpw override - const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end() && override->second != new_type) { - LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); - new_type = override->second; + + // get quantization type overrides targeting a given bits per weight budget + if (params->target_bpw != -1.0f && !bpw_overrides.empty()) { + const auto override = bpw_overrides.find(name); + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw override %s) ", ggml_type_name(new_type)); + new_type = override->second; + } } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); @@ -1668,7 +1672,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type override %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } @@ -1699,7 +1703,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (values_data) { auto it = values_data->find(remap_imatrix(tensor->name, mapped)); if (it == values_data->end()) { - LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s, ", __func__, tensor->name); } else { if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { imatrix = it->second.data(); diff --git 
a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index b907008cb4f..77fa6b90cea 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -399,12 +399,7 @@ static int prepare_imatrix(const std::string & imatrix_file, values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); } - if (!values_data.empty()) { - printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size())); - } - if (!activations_data.empty()) { - printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size())); - } + return m_last_call; } From 4286690019f21cae3abb92a7903c6675a3367e5e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 26 Aug 2025 21:39:40 +0100 Subject: [PATCH 052/148] Minor comment update --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 407a63d887d..cbbfdedfbd6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest ppl +// Returns per-tensor type overrides to meet target BPW at lowest error static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, From 04946114c9009cd04f665ed98b55304e376e19d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:01:03 +0100 Subject: [PATCH 053/148] Refactor epsilon into a function-wide variable --- src/llama-quant.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cbbfdedfbd6..da1267ddbc6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -649,6 +649,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q8_0 }; + constexpr double epsilon = 1e-12; + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -1193,7 +1195,7 @@ static std::unordered_map target_bpw_type( double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { + if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; } } @@ -1208,7 +1210,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + 1e-12) { + if (bpw_next <= target_bpw + epsilon) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1241,7 +1243,7 @@ static std::unordered_map target_bpw_type( double ratio = err / (double)(delta_bytes * 8ull); double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { + if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - best_over_gap) <= epsilon && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; } From 8df1d00ae4042a1eee38c1fc9ac06137d5ce5078 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:04:28 +0100 Subject: [PATCH 054/148] Add 
directional scaling --- src/llama-quant.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index da1267ddbc6..a9621eab8e1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -900,6 +900,27 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : 1e35; }; + auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { + if (!activations) { return 1.0f; } + // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) + // If no values, use v=1 + double sum_v = 0.0; + double sum_aw2 = 0.0; + double sum_a2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double v = values ? std::max(0.0f, values[j]) : 1.0; + const double a = activations[j]; + sum_v += v; + sum_aw2 += v * a * a; + sum_a2 += a * a; + } + const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); + const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); + const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; + + // Clamp to a reasonable range + return (float)std::clamp(scale, 0.5, 2.0); + }; std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 66aff8fa1ee1d34c7faaa0ff658a730a9554ef36 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:06:42 +0100 Subject: [PATCH 055/148] Add precise_lambda() --- src/llama-quant.cpp | 102 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a9621eab8e1..662760fbe9a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -921,6 +921,108 @@ static std::unordered_map target_bpw_type( // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; + + // Returns an adaptive lambda for this tensor using a small probe set + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + auto precise_lambda = [&](const ggml_tensor * t, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const float * values, + const float * activations, + const std::vector & compatible_candidates) -> float + { + // No activations => no projection term + if (!activations) { return 0.0f; } + + // pick a tiny probe set: try to spread around mid-range types + std::vector probes; + probes.reserve(3); + auto push_if = [&](const ggml_type tiny) { + if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { + probes.push_back(tiny); + } + }; + + // Prefer family-consistent probes; fall back to whatever exists + push_if(GGML_TYPE_Q4_K); + push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q5_K); + if (probes.empty() && !compatible_candidates.empty()) { + probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); + } + if (probes.size() == 1 && compatible_candidates.size() >= 2) { + probes.push_back(compatible_candidates.front()); + } + if (probes.empty()) { return 0.0f; } + + // Scratch buffers (reused) + const int64_t n_per_row = t->ne[0]; + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + size_t max_row_sz = 0; + for (auto pt : probes) { + max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); + } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector 
dequantized_buffer(f32_sample.size()); + + std::vector ratios; + ratios.reserve(probes.size()); + + for (const auto pt : probes) { + // err at lambda=0 => pure weighted MSE part + double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); + // err at lambda=1 => weighted MSE + projection penalty + const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); + + const double p = std::max(0.0, err1 - err0); // projection term contribution + const double m = std::max(0.0, err0); // MSE term contribution + if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { + ratios.push_back(m / p); + } + } + + if (ratios.empty()) { return 0.0f; } + + std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); + double lambda = ratios[ratios.size() / 2]; + + // activations directional scale + const float scale = directional_scale(values, activations, n_per_row); + lambda *= scale; + + // clamp to safe range + lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; + }; + + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + if (!activations) { return 0.0f; } + double s = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; + const double aw = std::sqrt(w) * activations[j]; + const double aw2 = aw * aw; + s += aw2; + s2 += aw2 * aw2; + } + if (s2 <= 0.0) { return 0.0f; } + const auto d = (double)n_per_row; + //const double p = s * s / (d * s2 + epsilon); + //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); + // Map p in (0,1] to lambda in [0,8] decreasing + double base = 1.0 - s * s / (d * s2 + epsilon); + base = std::clamp(base, 0.0, 1.0); + + // activations directional scale + const double scale = directional_scale(values, activations, n_per_row); + // clamp to safe range + const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + + return (float)lambda; + }; + std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 556f6b04fed2092568e31948708af8102c9e5433 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:08:08 +0100 Subject: [PATCH 056/148] Add --precise-lambda option --- include/llama.h | 1 + src/llama-quant.cpp | 27 +++++++++++++++++---------- tools/quantize/quantize.cpp | 6 +++++- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index 01c5b67c755..3a5bda32eab 100644 --- a/include/llama.h +++ b/include/llama.h @@ -357,6 +357,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 662760fbe9a..98fc11d8403 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -722,7 +722,8 @@ static std::unordered_map target_bpw_type( const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, - std::vector & dequantized_buffer) -> double + std::vector & dequantized_buffer, + float bias_lambda) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -878,10 
+879,6 @@ static std::unordered_map target_bpw_type( } } - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger - constexpr float bias_lambda = 1.5f; - constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { const double proj = bias_num * bias_num / (bias_denom + epsilon); @@ -1163,6 +1160,15 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); + // Compute adaptive bias_lambda for this tensor + float bias_lambda = 0.0f; + { + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); + bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : + fast_lambda(values, activations, n_per_row); + } + // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); const float * values = values_sample.empty() ? nullptr : values_sample.data(); @@ -1186,7 +1192,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1301,7 +1307,6 @@ static std::unordered_map target_bpw_type( }; auto recompute_best_upgrade = [&]() -> upgrade { - const double eps = 1e-12; upgrade best{ -1, -1, 0.0, 0, -1.0 }; for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; @@ -1653,10 +1658,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__); } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } + LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? 
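    // "precise" probes two or three mid-range candidate types and takes the median
    // MSE-to-projection ratio per tensor; "fast" is a closed-form estimate built from
    // the participation ratio of the sqrt(weight)-scaled activations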
"precise lambda (slow)" : "fast lambda"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1966,7 +1972,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.precise_lambda =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 77fa6b90cea..0c9460513c8 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,9 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -538,6 +540,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) { + params.precise_lambda = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From eab8708244db703c5c7219261b0c875c4b57825f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 30 Aug 2025 10:14:46 +0100 Subject: [PATCH 057/148] Minor factoring for efficiency and correctness --- src/llama-quant.cpp | 126 +++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 98fc11d8403..db688fdf02c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest error +// Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -650,6 +650,7 @@ static std::unordered_map target_bpw_type( }; constexpr double epsilon = 1e-12; + constexpr double infinity = std::numeric_limits::infinity(); auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -680,7 +681,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() + // This list should be kept in sync with 
llama_tensor_quantize_impl() to avoid drift const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -730,9 +731,15 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { return 0.0; } + size_t expected_rows = 0; + for (int64_t s = 0; s < ne2; ++s) { + expected_rows += (size_t)sample_rows_per_slice[s]; + } + if (expected_rows != sample_row_count) { return infinity; } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,15 +757,15 @@ static std::unordered_map target_bpw_type( const float * activations = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; const double a = activations[j]; - const double w = values ? values[j] : 1.0; denom += w * a * a; } bias_denominator_per_slice[s] = denom; } } - // Compute per-row squared norms with weighting (if values are provided) + // Per-row squared norms with weighting std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; @@ -768,15 +775,14 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; double rsn = 0.0; if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double xx = x[j]; - rsn += v * xx * xx; + rsn += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -790,7 +796,7 @@ static std::unordered_map target_bpw_type( } } - // Quantize sampled rows slice-by-slice into quantized_buffer + // Quantize sampled rows per slice -> quantized_buffer { size_t q_offset = 0; size_t f_offset = 0; @@ -800,35 +806,32 @@ static std::unordered_map target_bpw_type( const float * value = has_values ? 
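            // imatrix side data is stored per expert slice: ne2 contiguous blocks of
            // n_per_row importance weights, hence the slice * n_per_row offset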
values_sample + slice * n_per_row : nullptr; (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; f_offset += (size_t)rs * (size_t)n_per_row; } } - // Dequantize into dequantized_buffer + // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - auto row_to_float = [&](size_t r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return false; + + const bool is_fp16 = quant_type == GGML_TYPE_F16; + const bool is_bf16 = quant_type == GGML_TYPE_BF16; + if (!is_fp16 && !is_bf16 && traits && traits->to_float) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + } else { + for (size_t r = 0; r < sample_row_count; ++r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + if (is_fp16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { return infinity; } + traits->to_float(src, dst, (int)n_per_row); } - traits->to_float(src, dst, (int)n_per_row); } - - return true; - }; - - for (size_t r = 0; r < sample_row_count; ++r) { - if (!row_to_float(r)) { return 1e35; } } } @@ -836,6 +839,7 @@ static std::unordered_map target_bpw_type( size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; + for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -843,9 +847,7 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? 
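            // bias_denom is the precomputed sum of w * a^2 for this slice; dividing the
            // squared bias projection by it keeps the penalty scale-free in the activations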
bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -853,17 +855,17 @@ static std::unordered_map target_bpw_type( double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; const double a = activations[j]; - weighted_mse += v * e * e; - bias_num += v * e * a; + weighted_mse += w * e * e; + bias_num += w * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; - weighted_mse += v * e * e; + weighted_mse += w * e * e; } } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -881,26 +883,28 @@ static std::unordered_map target_bpw_type( double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += (double)bias_lambda * proj; + if (bias_denom > 0.0) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += bias_lambda * proj; + } } - const double err_den = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / err_den; + const double denom = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / denom; offset += (size_t)n_per_row; } const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; + if (!std::isfinite(total_err)) { return infinity; } } - return std::isfinite(total_err) ? total_err : 1e35; + return std::isfinite(total_err) ? total_err : infinity; }; + // Scaling factor to increase lambda when activations are concentrated auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { if (!activations) { return 1.0f; } - // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) - // If no values, use v=1 double sum_v = 0.0; double sum_aw2 = 0.0; double sum_a2 = 0.0; @@ -915,13 +919,10 @@ static std::unordered_map target_bpw_type( const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); const double scale = denom > 0.0 ? 
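        // i.e. scale = ||sqrt(w) .* a|| / (sqrt(sum(w)) * RMS(a)): roughly 1 when the
        // activations are evenly spread, larger when the imatrix mass concentrates on
        // strongly activated channels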
std::sqrt(sum_aw2) / denom : 1.0; - // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; - // Returns an adaptive lambda for this tensor using a small probe set - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -929,10 +930,8 @@ static std::unordered_map target_bpw_type( const float * activations, const std::vector & compatible_candidates) -> float { - // No activations => no projection term if (!activations) { return 0.0f; } - // pick a tiny probe set: try to spread around mid-range types std::vector probes; probes.reserve(3); auto push_if = [&](const ggml_type tiny) { @@ -941,7 +940,6 @@ static std::unordered_map target_bpw_type( } }; - // Prefer family-consistent probes; fall back to whatever exists push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); push_if(GGML_TYPE_Q5_K); @@ -953,19 +951,18 @@ static std::unordered_map target_bpw_type( } if (probes.empty()) { return 0.0f; } - // Scratch buffers (reused) + // Scratch buffers const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; for (auto pt : probes) { max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); - std::vector ratios; ratios.reserve(probes.size()); - for (const auto pt : probes) { // err at lambda=0 => pure weighted MSE part double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); @@ -984,17 +981,17 @@ static std::unordered_map target_bpw_type( std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); double lambda = ratios[ratios.size() / 2]; - // activations directional scale const float scale = directional_scale(values, activations, n_per_row); lambda *= scale; - - // clamp to safe range lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; }; + // Faster to compute but lower precision. Best option for the vast majority of models auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } + double s = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { @@ -1004,17 +1001,13 @@ static std::unordered_map target_bpw_type( s += aw2; s2 += aw2 * aw2; } + if (s2 <= 0.0) { return 0.0f; } const auto d = (double)n_per_row; - //const double p = s * s / (d * s2 + epsilon); - //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); - // Map p in (0,1] to lambda in [0,8] decreasing double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - // activations directional scale const double scale = directional_scale(values, activations, n_per_row); - // clamp to safe range const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; return (float)lambda; @@ -1036,13 +1029,13 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - // Dequantize only sampled rows into f32_sample + // Dequantize sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
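        // ne2 > 1 only for stacked 3D tensors (e.g. per-expert weights in MoE models);
        // rows are sampled per slice below so every expert contributes to the estimate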
t->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - constexpr int sample_rows_per_expert = 384; + // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute + constexpr int sample_rows_per_expert = 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); @@ -1096,6 +1089,7 @@ static std::unordered_map target_bpw_type( const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; }; @@ -1104,7 +1098,6 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); if (!src || src_sz == 0) { return; } - if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); @@ -1160,7 +1153,8 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); - // Compute adaptive bias_lambda for this tensor + // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. + // Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger float bias_lambda = 0.0f; { const float * values = values_sample.empty() ? nullptr : values_sample.data(); From 04c07b3272f067ba30d32fb82d693fb0013cc47d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 10 Sep 2025 18:00:56 +0100 Subject: [PATCH 058/148] Add better control over MSE and directional bias computation --- include/llama.h | 2 +- src/llama-quant.cpp | 41 +++++++++---------------------------- tools/quantize/quantize.cpp | 31 +++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/include/llama.h b/include/llama.h index d0ca37dc65a..ba6c185346c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate + int32_t bpw_bias; // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db688fdf02c..74ceb3de9cc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -902,26 +902,6 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : infinity; }; - // Scaling factor to increase lambda when activations are concentrated - auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { - if (!activations) { return 1.0f; } - double sum_v = 0.0; - double sum_aw2 = 0.0; - double sum_a2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values ? 
std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - sum_v += v; - sum_aw2 += v * a * a; - sum_a2 += a * a; - } - const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); - const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); - const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; - - return (float)std::clamp(scale, 0.5, 2.0); - }; - // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, @@ -979,11 +959,7 @@ static std::unordered_map target_bpw_type( if (ratios.empty()) { return 0.0f; } std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - double lambda = ratios[ratios.size() / 2]; - - const float scale = directional_scale(values, activations, n_per_row); - lambda *= scale; - lambda = std::clamp(lambda, 0.0, 8.0); + const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); return (float)lambda; }; @@ -1007,8 +983,7 @@ static std::unordered_map target_bpw_type( double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - const double scale = directional_scale(values, activations, n_per_row); - const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; return (float)lambda; }; @@ -1159,8 +1134,11 @@ static std::unordered_map target_bpw_type( { const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : - fast_lambda(values, activations, n_per_row); + if (params->bpw_bias == 1) { + bias_lambda = fast_lambda(values, activations, n_per_row); + } else if (params->bpw_bias == 2) { + bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); + } } // Now evaluate candidates @@ -1656,7 +1634,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? "precise lambda (slow)" : "fast lambda"); + const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"}; + LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1967,7 +1946,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.precise_lambda =*/ false + /*.bpw_bias =*/ 1 }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0c9460513c8..0fe65daea0d 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,7 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 16.0\n");
     printf("                            Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
-    printf("    --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n");
+    printf("    --bpw-bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n");
     printf("    --keep-split: will generate quantized model in the same shards as input\n");
     printf("    --override-kv KEY=TYPE:VALUE\n");
     printf("                            Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -496,6 +496,27 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
     return true;
 }
 
+static bool parse_bpw_bias(const char * data, int & bpw_bias) {
+    if (!data) {
+        printf("\n%s: error bias type not provided\n\n", __func__);
+        return false;
+    }
+
+    try {
+        bpw_bias = std::stoi(data);
+        if (bpw_bias < 0 || bpw_bias > 2) {
+            printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__);
+            return false;
+        }
+    }
+    catch (const std::exception & e) {
+        printf("\n%s: '%s' is not valid. Error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__, data);
+        return false;
+    }
+
+    return true;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -510,6 +531,7 @@ int main(int argc, char ** argv) {
     std::vector<tensor_quantization> tensor_types;
     std::vector<int> prune_layers;
     float target_bpw = -1.0f;
+    int bpw_bias = 1;
 
     for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
         if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -540,8 +562,11 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
                 usage(argv[0]);
             }
-        } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) {
-            params.precise_lambda = true;
+        } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) {
+            if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) {
+                usage(argv[0]);
+            }
+            params.bpw_bias = bpw_bias;
         } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
             if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                 usage(argv[0]);

From 886536d80ab5c227cd6c3f8813b8b5fbf5bea41d Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 13 Sep 2025 08:27:23 +0100
Subject: [PATCH 059/148] Increase error type precision

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 74ceb3de9cc..c4c525c68e4 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -612,7 +612,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         ggml_type type;
         float     bpw;
         size_t    bytes;
-        float     error;
+        double    error;
     };
 
     struct tensor_info {

From bc8762f27f185c5db1cbd0d8ec3bcc8e1771856d Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 13 Sep 2025 08:33:22 +0100
Subject: [PATCH 060/148] Capture surrounding function name

---
 src/llama-quant.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c4c525c68e4..cae908803be 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -651,6 +651,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     constexpr double epsilon = 1e-12;
     constexpr double infinity = std::numeric_limits<double>::infinity();
+    const char * func = __func__;
 
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
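        // bytes for the whole tensor at type `typ`: ggml_row_size() already accounts
        // for the block layout, so this is just bytes-per-row times the row count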
const int64_t n_per_row = t->ne[0]; @@ -1083,7 +1084,7 @@ static std::unordered_map target_bpw_type( } } else { LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + func, name.c_str(), src_sz, (size_t)n_per_row, want); } }; From 4dff85fbe54336130155a8e4fa5e7f4db48f4451 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:41:37 +0100 Subject: [PATCH 061/148] Improve precise_lambda() efficiency --- src/llama-quant.cpp | 126 ++++++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 40 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cae908803be..1677b242d9e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -725,7 +725,9 @@ static std::unordered_map target_bpw_type( const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda) -> double + float bias_lambda, + double * out_mse = nullptr, + double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -733,13 +735,23 @@ static std::unordered_map target_bpw_type( const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { return 0.0; } + if (sample_row_count == 0) { + if (out_mse) { *out_mse = 0.0; } + if (out_proj) { *out_proj = 0.0; } + + return 0.0; + } size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { expected_rows += (size_t)sample_rows_per_slice[s]; } - if (expected_rows != sample_row_count) { return infinity; } + if (expected_rows != sample_row_count) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,7 +762,7 @@ static std::unordered_map target_bpw_type( const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; - // Bias denominators per slice (only needed if we have activations) + // Bias denominators per slice std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { @@ -815,7 +827,6 @@ static std::unordered_map target_bpw_type( // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; const bool is_bf16 = quant_type == GGML_TYPE_BF16; if (!is_fp16 && !is_bf16 && traits && traits->to_float) { @@ -825,12 +836,19 @@ static std::unordered_map target_bpw_type( uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); - } else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { return infinity; } - traits->to_float(src, dst, (int)n_per_row); + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); + } + else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); + } + else { + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } + traits->to_float(src, dst, (int) n_per_row); } } } @@ -839,8 
+857,8 @@ static std::unordered_map target_bpw_type( // Compute error size_t offset = 0; size_t row_idx = 0; - double total_err = 0.0; - + double total_mse = 0.0; + double total_proj = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -848,7 +866,11 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; + std::vector row_mse_norm; + std::vector row_proj_norm; + row_mse_norm.reserve(rs); + if (activations) { row_proj_norm.reserve(rs); } + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -868,13 +890,6 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; weighted_mse += w * e * e; } - } else if (activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += e * e; - bias_num += e * a; - } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; @@ -882,28 +897,64 @@ static std::unordered_map target_bpw_type( } } - double err_num = weighted_mse; - if (activations && bias_lambda != 0.0f) { + const double denom_x = row_sq_norm[row_idx]; + double m_norm = weighted_mse / (denom_x + epsilon); + row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); + + if (activations) { + double p_norm = 0.0; if (bias_denom > 0.0) { const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += bias_lambda * proj; + p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - - const double denom = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / denom; offset += (size_t)n_per_row; } + // Trimmed sum to avoid outlier rows dominating the results + auto trimmed_sum = [&](std::vector & v) -> double { + if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); + if (n < 50) { + double s = 0.0; + for (const double z : v) { s += z; } + return s; + } + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side + k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + double s = 0.0; + for (int64_t i = k; i < n - k; ++i) { + s += v[i]; + } + + return s; + }; + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); - total_err += slice_err * scale_rows; - if (!std::isfinite(total_err)) { return infinity; } + + total_mse += trimmed_sum(row_mse_norm) * scale_rows; + if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } } + if (out_mse) { *out_mse = total_mse; } + if (out_proj) { *out_proj = total_proj; } + + const double total_err = total_mse + bias_lambda * total_proj; return std::isfinite(total_err) ? 
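        // any NaN or overflow in the accumulation collapses to +inf here, so a
        // numerically broken candidate can never win the per-tensor ranking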
total_err : infinity; }; - // Higher precision but much longer to compute + // Higher precision but longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -936,22 +987,17 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; - for (auto pt : probes) { - max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - } + for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); + std::vector ratios; ratios.reserve(probes.size()); for (const auto pt : probes) { - // err at lambda=0 => pure weighted MSE part - double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); - // err at lambda=1 => weighted MSE + projection penalty - const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); - - const double p = std::max(0.0, err1 - err0); // projection term contribution - const double m = std::max(0.0, err0); // MSE term contribution + double m = 0.0; + double p = 0.0; + (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { ratios.push_back(m / p); } From 7d85993f268d9fa35bea9178f6acf2d72833dffa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:44:41 +0100 Subject: [PATCH 062/148] Minor refactoring --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1677b242d9e..15ea36721e8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -617,7 +617,7 @@ static std::unordered_map target_bpw_type( struct tensor_info { const llama_model_loader::llama_tensor_weight * w = nullptr; - std::vector candidate = {}; + std::vector candidate; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; @@ -972,8 +972,8 @@ static std::unordered_map target_bpw_type( } }; - push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q5_K); if (probes.empty() && !compatible_candidates.empty()) { probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); @@ -1011,7 +1011,7 @@ static std::unordered_map target_bpw_type( return (float)lambda; }; - // Faster to compute but lower precision. Best option for the vast majority of models + // Faster to compute but may yield lower precision. Best option for the vast majority of cases auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } @@ -1057,12 +1057,10 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - constexpr int sample_rows_per_expert = 256; + const int sample_rows_per_expert = activations_data ? 
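    // with activations present the per-expert sampling budget is doubled (512 vs 256
    // rows); the extra rows mainly steady the bias-projection estimate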
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - // deterministic sampling seed based on tensor name + fixed constant - std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); std::vector sample_rows_per_slice(ne2, 0); const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); @@ -1072,6 +1070,7 @@ static std::unordered_map target_bpw_type( const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; int64_t offset = 0; if (stride > 1) { @@ -1084,11 +1083,11 @@ static std::unordered_map target_bpw_type( const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { @@ -1211,7 +1210,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); + const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1240,7 +1239,7 @@ static std::unordered_map target_bpw_type( return a.error < b.error; }); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_bytes = std::numeric_limits::max(); for (const auto & c : info.candidate) { // Only keep the best error seen so far at strictly larger byte sizes From 12e816b51199b38a6571141d5f1e5f1039ebe706 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:24:23 +0100 Subject: [PATCH 063/148] Replace greedy allocator with lagrangian relaxation --- src/llama-quant.cpp | 266 ++++++++++++++++++++++++++------------------ 1 file changed, 156 insertions(+), 110 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 15ea36721e8..a369d50ffe6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1266,152 +1266,198 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Greedy allocation from minimum bpw upward to reach 
target_bpw - auto current_total_bytes = [&]() -> size_t { - size_t b = 0; + // Lagrangian relaxation to minimise error subject to a bpw target constraint + auto total_bytes = [&]() -> size_t { + size_t tb = 0; for (const auto & ti : all) { - b += ti.candidate[ti.choice].bytes; + tb += ti.candidate[ti.choice].bytes; } - return b; + return tb; }; - auto total_weights = [&]() -> size_t { - size_t w = 0; - for (const auto & ti : all) { - w += ti.n_elements; - } - - return w; - }; + size_t total_elems = 0; + size_t min_bytes = 0; + size_t max_bytes = 0; + for (const auto & ti : all) { + total_elems += (size_t)ti.n_elements; + min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor + max_bytes += ti.candidate.back().bytes; // largest candidate per tensor + } - const size_t tw = total_weights(); - auto current_bpw = [&]() -> double { - return (double)current_total_bytes() * 8.0f / (double)tw; - }; + if (total_elems == 0) { return {}; } - // Precompute current bpw - double bpw_now = current_bpw(); + const double target_bpw = params->target_bpw; + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); - float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) - if (bpw_now >= target_bpw) { + auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func); for (const auto & ti : all) { + LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", + func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } return overrides; + }; + + if (budget_bytes <= min_bytes) { + for (auto & ti : all) { ti.choice = 0; } + + return emit_overrides(); } + if (budget_bytes >= max_bytes) { + for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } - struct upgrade { - int idx; - int next; - double err; - size_t delta_bytes; - double ratio; - }; + return emit_overrides(); + } - // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info & ti) -> int { - const auto & cand = ti.candidate; - const auto & cur = cand[ti.choice]; - int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { - ++j; - } + auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { + choice.resize(all.size()); + bytes = 0; + err = 0.0; + for (size_t i = 0; i < all.size(); ++i) { + const auto & cand = all[i].candidate; + int best_j = 0; + double best_val = infinity; + for (int j = 0; j < (int)cand.size(); ++j) { + const double bits = (double)cand[j].bytes * 8.0; + const double val = cand[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + best_val = val; + best_j = j; + } + } - return j < (int)cand.size() ? 
j : -1; + choice[i] = best_j; + bytes += cand[best_j].bytes; + err += cand[best_j].error; + } }; - auto recompute_best_upgrade = [&]() -> upgrade { - upgrade best{ -1, -1, 0.0, 0, -1.0 }; - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - const int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - double err = cur.error - nxt.error; - err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { - best = upgrade{ i, j, err, delta_bytes, ratio }; + size_t bytes_lo = 0; + size_t bytes_hi = 0; + size_t bytes_mid = 0; + double mu_lo = 0.0; + double mu_hi = 1.0; + double err_lo = 0.0; + double err_hi = 0.0; + double err_mid = 0.0; + std::vector choice_lo; + std::vector choice_hi; + std::vector choice_mid; + std::vector best_under_choice; + std::vector best_over_choice; + + lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo); + + // increase mu until we get under budget or hit a safety cap + { + int expand = 0; + while (true) { + lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); + if (bytes_hi <= budget_bytes) { + break; + } + mu_hi *= 2.0; + if (++expand > 60) { + break; } } + } - return best; - }; + double best_under_gap = infinity; + double best_over_gap = infinity; + double best_under_err = infinity; + double best_over_err = infinity; + for (int it = 0; it < 40; ++it) { + double mu = 0.5 * (mu_lo + mu_hi); + lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); - while (true) { - upgrade up = recompute_best_upgrade(); - if (up.idx < 0) { break; } + const double gap = std::abs((double)bytes_mid - (double)budget_bytes); - size_t now_bytes = current_total_bytes(); - size_t next_bytes = now_bytes + up.delta_bytes; - double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + epsilon) { - all[up.idx].choice = up.next; - bpw_now = bpw_next; + if (bytes_mid > budget_bytes) { + // Too big, need stronger penalty + mu_lo = mu; + + if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { + best_over_gap = gap; + best_over_err = err_mid; + best_over_choice = choice_mid; + } } else { - break; - } - } + // Under budget, good candidate + mu_hi = mu; - // We might still be below target so we try to find the best upgrade one last time - { - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; - double best_over_gap = 1e300; - double under_gap = target_bpw - bpw_now; - size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - size_t over_bytes = now_bytes + delta_bytes; - double bpw_over = (double)over_bytes * 8.0 / (double)tw; - double err = cur.error - nxt.error; - if (err < 0.0) { err = 0.0; } - double ratio = err / (double)(delta_bytes * 8ull); - - double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - 
best_over_gap) <= epsilon && ratio > best_over.ratio)) { - best_over_gap = over_gap; - best_over = upgrade{ i, j, err, delta_bytes, ratio }; + if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { + best_under_gap = gap; + best_under_err = err_mid; + best_under_choice = choice_mid; } } + } - if (best_over.idx >= 0) { - if (best_over_gap < under_gap) { - all[best_over.idx].choice = best_over.next; + if (!best_under_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_under_choice[i]; + } + } else if (!best_over_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_over_choice[i]; + } + } else { + // Pick whichever side we already have, or keep minimal + if (bytes_hi <= budget_bytes && !choice_hi.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = choice_hi[i]; + } + } else { + for (auto & ti : all) { + ti.choice = 0; } } } - // Build the override map - std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); - for (const auto & ti : all) { - LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", - __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); - overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + // Spend any remaining budget with best upgrades that still fit (one pass) + { + auto cur_bytes = total_bytes(); + while (true) { + int best_i = -1; + int best_j = -1; + double best_ratio = -1.0; + size_t best_delta = 0; + + for (int i = 0; i < (int)all.size(); ++i) { + const auto & ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { + continue; + } + + int j = ti.choice + 1; + while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } + if (j >= (int)ti.candidate.size()) { continue; } + + size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; + if (cur_bytes + delta > budget_bytes) { continue; } + + double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double ratio = err_gain / (double)(delta * 8); + if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { + best_ratio = ratio; + best_delta = delta; + best_i = i; + best_j = j; + } + } + + if (best_i < 0) { break; } + all[best_i].choice = best_j; + cur_bytes += best_delta; + } } - return overrides; + return emit_overrides(); } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { From 2b516068e2ef0e51373be32b1917eb7295bcfc54 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:41:52 +0100 Subject: [PATCH 064/148] "Convexify" candidate list --- src/llama-quant.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a369d50ffe6..955e6c12fe3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1257,6 +1257,32 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } + // Enforce convexity in (bytes, error) curve + { + const auto & c = info.candidate; + if (c.size() >= 3) { + std::vector convex; + convex.reserve(c.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) -> double { + const double dx = (double)b.bytes - (double)a.bytes; + if 
(dx <= 0.0) { return infinity; } + + return ((double)b.error - (double)a.error) / dx; + }; + + for (const auto & p : c) { + while (convex.size() >= 2) { + double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); + double s2 = slope(convex[convex.size() - 1], p); + if (s2 + epsilon < s1) { convex.pop_back(); } + else { break; } + } + convex.push_back(p); + } + info.candidate.swap(convex); + } + } + // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; From 8503d59ee44bc30b0d030cceb5e17590b334730d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 11:49:18 +0100 Subject: [PATCH 065/148] Increase IQ options --- src/llama-quant.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 955e6c12fe3..41fd819f86f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -641,12 +641,21 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + // TODO: find better way to handle F16/BF16 +#ifdef GGML_USE_METAL + GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; From c709e1a3353cbefbe58320c2eae1a1edafc0f618 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 14 Sep 2025 22:38:27 +0100 Subject: [PATCH 066/148] Fix MoE tensor estimation --- src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 41fd819f86f..1efb1c5eeed 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1021,27 +1021,38 @@ static std::unordered_map target_bpw_type( }; // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { if (!activations) { return 0.0f; } - double s = 0.0; - double s2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double aw = std::sqrt(w) * activations[j]; - const double aw2 = aw * aw; - s += aw2; - s2 += aw2 * aw2; - } + double accum = 0.0; + int ns = 0; + + for (int64_t s = 0; s < std::max(1, ne2); ++s) { + const float * v = values ? values + s * n_per_row : nullptr; + const float * a = activations + s * n_per_row; + + double s1 = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; + const double aw = std::sqrt(w) * a[j]; + const double aw2 = aw * aw; + s1 += aw2; + s2 += aw2 * aw2; + } - if (s2 <= 0.0) { return 0.0f; } - const auto d = (double)n_per_row; - double base = 1.0 - s * s / (d * s2 + epsilon); - base = std::clamp(base, 0.0, 1.0); + if (s1 > 0.0) { + const double n = (double)n_per_row; + double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + double lambda = 8.0 * (c / (c + 1.0)); + accum += std::clamp(lambda, 0.0, 8.0); + ++ns; + } + } - const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; + if (ns == 0) { return 0.0f; } - return (float)lambda; + return (float)(accum / ns); }; std::vector all; @@ -1190,7 +1201,7 @@ static std::unordered_map target_bpw_type( const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); if (params->bpw_bias == 1) { - bias_lambda = fast_lambda(values, activations, n_per_row); + bias_lambda = fast_lambda(values, activations, n_per_row, ne2); } else if (params->bpw_bias == 2) { bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); } From 14fae69a7bb932fadbc5dd62072a254866512650 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:31:31 +0100 Subject: [PATCH 067/148] General refactoring --- src/llama-quant.cpp | 75 +++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c6051a480c0..6e5562379cf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -729,19 +729,19 @@ static std::unordered_map target_bpw_type( auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, + const std::vector & rows_sample, const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda, + float tensor_bias_lambda, + const float * slice_bias_lambda, double * out_mse = nullptr, double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { @@ -753,8 +753,9 @@ static std::unordered_map target_bpw_type( size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { - expected_rows += (size_t)sample_rows_per_slice[s]; + expected_rows += (size_t)rows_sample[s]; } + if (expected_rows != sample_row_count) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -783,17 +784,18 @@ static std::unordered_map target_bpw_type( const double a = activations[j]; denom += w * a * a; } + bias_denominator_per_slice[s] = denom; } } - // Per-row squared norms with weighting + // Weighted per-row squared norms std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { - const int64_t rs = sample_rows_per_slice[s]; + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; @@ -823,7 +825,7 @@ static std::unordered_map target_bpw_type( size_t q_offset = 0; size_t f_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * value = has_values ? values_sample + slice * n_per_row : nullptr; @@ -843,21 +845,19 @@ static std::unordered_map target_bpw_type( } else { for (size_t r = 0; r < sample_row_count; ++r) { uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); - } - else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); - } - else { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } return infinity; } - traits->to_float(src, dst, (int) n_per_row); + traits->to_float(src, dst, (int)n_per_row); } } } @@ -1098,20 +1098,20 @@ static std::unordered_map target_bpw_type( offset = dist(rng); } - for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } @@ -1120,9 +1120,11 @@ static std::unordered_map target_bpw_type( } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } + ++current_sampled_rows; } - sample_rows_per_slice[slice] = current_sampled_rows; + + rows_sample[slice] = current_sampled_rows; } auto side_data = [&](const std::unordered_map> * m, 
const std::string & tensor_name) -> std::pair { @@ -1160,7 +1162,7 @@ static std::unordered_map target_bpw_type( if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } - const int64_t nelem = ggml_nelements(t); + const int64_t nelem = ggml_nelements(tensor); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1185,8 +1187,9 @@ static std::unordered_map target_bpw_type( __func__, ggml_type_name(ts_type), name.c_str()); continue; } - ggml_type tt = make_compatible(t, ts_type); - if (!is_compatible(t, tt)) { continue; } + + ggml_type tt = make_compatible(tensor, ts_type); + if (!is_compatible(tensor, tt)) { continue; } compatible_candidates.push_back(tt); max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); } @@ -1222,16 +1225,16 @@ static std::unordered_map target_bpw_type( // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantised_buffer(dequantised_buffer.size()); - for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } - const ggml_type tt = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = tensor_bytes(t, tt); - const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); - eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; + const ggml_type tensor_types = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_types); + const size_t bytes = tensor_bytes(tensor, tensor_types); + const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); } @@ -1244,8 +1247,8 @@ static std::unordered_map target_bpw_type( if (info.candidate.empty()) { // As a last resort, keep original type - float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); + float bpw = ggml_nbytes(tensor) * 8.0f / nelem; + info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. 
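[Illustrative aside, not part of the patch: the Pareto filter described in the comment above, shown as a self-contained C++ sketch. The simplified cand struct and the pareto_filter name are assumptions made for this example; the patch applies the same idea to the per-tensor candidate list.]

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct cand { size_t bytes; double error; };

    // Sort by bytes ascending (ties broken by error), then keep a candidate
    // only if its error strictly improves on every cheaper candidate.
    static std::vector<cand> pareto_filter(std::vector<cand> v) {
        std::sort(v.begin(), v.end(), [](const cand & a, const cand & b) {
            return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
        });
        std::vector<cand> out;
        double best_err = std::numeric_limits<double>::infinity();
        for (const auto & c : v) {
            if (c.error < best_err) {
                out.push_back(c);
                best_err = c.error;
            }
        }
        return out;
    }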
@@ -1274,6 +1277,7 @@ static std::unordered_map target_bpw_type( // same bytes: we already sorted by error; skip } } + info.candidate.swap(pruned); } @@ -1299,6 +1303,7 @@ static std::unordered_map target_bpw_type( } convex.push_back(p); } + info.candidate.swap(convex); } } @@ -1312,7 +1317,6 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Lagrangian relaxation to minimise error subject to a bpw target constraint auto total_bytes = [&]() -> size_t { size_t tb = 0; for (const auto & ti : all) { @@ -1359,6 +1363,7 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } + // Lagrangian relaxation to minimise error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); bytes = 0; @@ -1406,6 +1411,7 @@ static std::unordered_map target_bpw_type( if (bytes_hi <= budget_bytes) { break; } + mu_hi *= 2.0; if (++expand > 60) { break; @@ -1422,11 +1428,9 @@ static std::unordered_map target_bpw_type( lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); - if (bytes_mid > budget_bytes) { // Too big, need stronger penalty mu_lo = mu; - if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { best_over_gap = gap; best_over_err = err_mid; @@ -1435,7 +1439,6 @@ static std::unordered_map target_bpw_type( } else { // Under budget, good candidate mu_hi = mu; - if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { best_under_gap = gap; best_under_err = err_mid; From a36946997e2c365e9317062f14e298af6e9928a9 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:36:54 +0100 Subject: [PATCH 068/148] Replace fast_bias() for per slice version and remove precise_bias() --- src/llama-quant.cpp | 167 +++++++++++++++----------------------------- 1 file changed, 58 insertions(+), 109 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e5562379cf..fe10365772a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -868,8 +868,9 @@ static std::unordered_map target_bpw_type( size_t row_idx = 0; double total_mse = 0.0; double total_proj = 0.0; + double total_bias = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + slice * n_per_row : nullptr; @@ -918,21 +919,24 @@ static std::unordered_map target_bpw_type( } row_proj_norm.push_back(p_norm); } + offset += (size_t)n_per_row; } // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); if (n < 50) { double s = 0.0; for (const double z : v) { s += z; } + return s; } - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side + k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); double s = 0.0; @@ -944,11 +948,17 @@ static std::unordered_map target_bpw_type( }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); + const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; + const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; - total_mse += trimmed_sum(row_mse_norm) * scale_rows; - if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + total_mse += slice_mse; + total_proj += slice_proj; - if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + // per-slice lambda if provided, otherwise use scalar + const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + total_bias += bl * slice_proj; + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -959,100 +969,42 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = total_mse + bias_lambda * total_proj; + const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + return std::isfinite(total_err) ? 
total_err : infinity; }; - // Higher precision but longer to compute - auto precise_lambda = [&](const ggml_tensor * t, - const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, - const float * values, - const float * activations, - const std::vector & compatible_candidates) -> float + // Returns lambda per slice or 0.0 if no activations + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { - if (!activations) { return 0.0f; } - - std::vector probes; - probes.reserve(3); - auto push_if = [&](const ggml_type tiny) { - if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { - probes.push_back(tiny); - } - }; - - push_if(GGML_TYPE_Q3_K); - push_if(GGML_TYPE_Q4_K); - push_if(GGML_TYPE_Q5_K); - if (probes.empty() && !compatible_candidates.empty()) { - probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); - } - if (probes.size() == 1 && compatible_candidates.size() >= 2) { - probes.push_back(compatible_candidates.front()); - } - if (probes.empty()) { return 0.0f; } - - // Scratch buffers - const int64_t n_per_row = t->ne[0]; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; - size_t max_row_sz = 0; - for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - - std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantized_buffer(f32_sample.size()); - - std::vector ratios; - ratios.reserve(probes.size()); - for (const auto pt : probes) { - double m = 0.0; - double p = 0.0; - (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); - if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { - ratios.push_back(m / p); - } - } - - if (ratios.empty()) { return 0.0f; } - - std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); - - return (float)lambda; - }; - - // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { - if (!activations) { return 0.0f; } - - double accum = 0.0; - int ns = 0; + std::vector lambdas(std::max(1, ne2), 0.0f); + if (!activations) { return lambdas; } for (int64_t s = 0; s < std::max(1, ne2); ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; - double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; const double aw = std::sqrt(w) * a[j]; const double aw2 = aw * aw; s1 += aw2; s2 += aw2 * aw2; } + float l = 0.0f; if (s1 > 0.0) { - const double n = (double)n_per_row; - double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + const auto n = (double)n_per_row; + const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); double lambda = 8.0 * (c / (c + 1.0)); - accum += std::clamp(lambda, 0.0, 8.0); - ++ns; + l = (float)std::clamp(lambda, 0.0, 12.0); } - } - if (ns == 0) { return 0.0f; } + lambdas[(size_t)s] = l; + } - return (float)(accum / ns); + return lambdas; }; std::vector all; @@ -1060,32 +1012,33 @@ static std::unordered_map target_bpw_type( for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; - const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } + ggml_tensor * tensor = tw->tensor; + const std::string name = ggml_get_name(tensor); + if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); if (!ml.use_mmap) { - if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); } - t->data = buffer.data(); + if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } + tensor->data = buffer.data(); } - ml.load_data_for(t); + + ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample - const int64_t n_per_row = t->ne[0]; - const int64_t nrows_total = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows_total = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - const int sample_rows_per_expert = activations_data ? 512 : 256; + // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute + const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; - f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector sample_rows_per_slice(ne2, 0); - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); + std::vector rows_sample(ne2, 0); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); std::vector row_buffer(n_per_row); - const ggml_type src_type = t->type; + const ggml_type src_type = tensor->type; const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); @@ -1199,23 +1152,20 @@ static std::unordered_map target_bpw_type( // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. 
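[Illustrative aside, not part of the patch: a simplified, single-slice version of the estimate_lambda() heuristic introduced above. The slice_lambda name is an assumption made for this example; the patch computes the same quantity once per expert slice and keeps the results in a vector.]

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // lambda grows with how concentrated the weighted squared activations are:
    // c is a normalised dispersion term, and c / (c + 1) saturates it so the
    // result stays bounded after the 8x scale.
    static float slice_lambda(const float * values, const float * activations, int64_t n) {
        if (!activations || n <= 0) { return 0.0f; }
        constexpr double epsilon = 1e-12;
        double s1 = 0.0;
        double s2 = 0.0;
        for (int64_t j = 0; j < n; ++j) {
            const double w = values ? std::max(0.0f, values[j]) : 1.0;
            const double z = w * activations[j] * activations[j]; // (sqrt(w) * a)^2
            s1 += z;
            s2 += z * z;
        }
        if (s1 <= 0.0) { return 0.0f; }
        const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / (double)n);
        return (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0);
    }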
// Larger values favour quantisation types that produce smaller bias even if the MSE is slightly bigger
-        float bias_lambda = 0.0f;
-        {
-            const float * values = values_sample.empty() ? nullptr : values_sample.data();
-            const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-            if (params->bpw_bias == 1) {
-                bias_lambda = fast_lambda(values, activations, n_per_row, ne2);
-            } else if (params->bpw_bias == 2) {
-                bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates);
-            }
-        }
-
-    // Now evaluate candidates
-    std::vector<candidate_types> eval_candidates(compatible_candidates.size());
+    float tensor_lambda = 0.0f;
     const float * values = values_sample.empty() ? nullptr : values_sample.data();
     const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
+    auto lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+    double acc = 0.0;
+    int ns = 0;
+    for (float l : lambdas) { acc += l; ++ns; }
+    tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+
+    // Evaluate candidates
+    std::vector<candidate_types> eval_candidates(compatible_candidates.size());
     std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
     std::vector<float> dequantised_buffer(f32_sample.size());
+    const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
     int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size()));
     std::atomic<size_t> cidx{0};
     std::vector<std::thread> eval_workers;

From 9e74f8341120d5f26939267e96fbaba04451d516 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 20 Sep 2025 23:06:37 +0100
Subject: [PATCH 069/148] Replace --bpw-bias flag with --no-bias

---
 include/llama.h             |  2 +-
 src/llama-quant.cpp         | 18 +++++++++-------
 tools/quantize/quantize.cpp | 42 ++++++++-----------------------------
 3 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index ba6c185346c..502bedbb802 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -365,7 +365,7 @@ extern "C" {
         void * tensor_types;  // pointer to vector containing tensor types
         void * prune_layers;  // pointer to vector containing layer indices to prune
         float target_bpw;     // target bits per weight (bpw)
-        int32_t bpw_bias;     // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)
+        bool no_bias;         // use mean square error estimation only (no alignment bias)
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 9d7a9f97428..9e7d9d295cf 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1153,13 +1153,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         // Adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE.
         // Larger values favour quantisation types that produce smaller bias even if the MSE is slightly bigger
         float tensor_lambda = 0.0f;
+        std::vector<float> lambdas;
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
         const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-        auto lambdas = estimate_lambda(values, activations, n_per_row, ne2);
-        double acc = 0.0;
-        int ns = 0;
-        for (float l : lambdas) { acc += l; ++ns; }
-        tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+        if (!params->no_bias) {
+            double acc = 0.0;
+            int ns = 0;
+            lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+            for (float l : lambdas) { acc += l; ++ns; }
+            tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
+        }
@@ -1726,8 +1729,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             } else {
                 LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__);
             }
-            const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"};
-            LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]);
+            LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no alignment bias)" : "alignment bias (default)");
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
             bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
         } else {
@@ -2038,7 +2040,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.tensor_type =*/ nullptr,
         /*.prune_layers =*/ nullptr,
         /*.target_bpw =*/ -1.0f,
-        /*.bpw_bias =*/ 1
+        /*.no_bias =*/ false
     };

     return result;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 0fe65daea0d..03018cc3012 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -117,12 +117,12 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftyp

 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable);
+    printf("       [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
-    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf("  --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    printf("  --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf("  --pure: disable k-quant mixtures and quantize all tensors to the same type\n");
     printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
     printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
@@ -134,7 +134,8 @@ static void usage(const char * executable) {
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n");
    printf("      Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
-    printf("  --bpw_bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n");
+    printf("  --no-bias: use mean square error estimation only (no alignment bias)\n");
+    printf("      Advanced option to use MSE only and disable alignment bias error estimation\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -496,27 +497,6 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
     return true;
 }

-static bool parse_bpw_bias(const char * data, int & bpw_bias) {
-    if (!data) {
-        printf("\n%s: error bias type not provided\n\n", __func__);
-        return false;
-    }
-
-    try {
-        bpw_bias = std::stoi(data);
-        if (bpw_bias < 0 || bpw_bias > 2) {
-            printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__);
-            return false;
-        }
-    }
-    catch (const std::exception & e) {
-        printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); - return false; - } - - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -531,7 +511,6 @@ int main(int argc, char ** argv) { std::vector tensor_types; std::vector prune_layers; float target_bpw = -1.0f; - int bpw_bias = 1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -562,11 +541,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) { - if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) { - usage(argv[0]); - } - params.bpw_bias = bpw_bias; + } else if (strcmp(argv[arg_idx], "--no-bias") == 0) { + params.no_bias = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From e8e2aed17a4ade7b14021e05f2a55f9b8f26510f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:41:44 +0100 Subject: [PATCH 070/148] Refactor row sampling --- src/llama-quant.cpp | 49 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9e7d9d295cf..4a8c08e68f7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1029,7 +1029,6 @@ static std::unordered_map target_bpw_type( const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute const int rows_sample_per_expert = activations_data ? 
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); @@ -1037,11 +1036,30 @@ static std::unordered_map target_bpw_type( std::vector rows_sample(ne2, 0); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); - std::vector row_buffer(n_per_row); const ggml_type src_type = tensor->type; - const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); + + std::vector row_buffer(n_per_row); + auto row_to_fp32 = [&](const uint8_t * src, float * dst) { + if (src_type == GGML_TYPE_F32) { + std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); + } else if (src_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (src_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else if (src_is_quant) { + if (!src_traits || !src_traits->to_float) { + throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); + } + + src_traits->to_float(src, dst, (int)n_per_row); + } else { + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + } + }; + for (int64_t slice = 0; slice < ne2; ++slice) { std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; @@ -1052,31 +1070,18 @@ static std::unordered_map target_bpw_type( } for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } - src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), 
row_buffer.begin(), row_buffer.end()); } ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From bdefdb673c0d28b59c23d505307536b4f1724858 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:07 +0100 Subject: [PATCH 071/148] Refactor copy_or_broadcast() --- src/llama-quant.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4a8c08e68f7..b1302df431b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1087,6 +1087,7 @@ static std::unordered_map target_bpw_type( auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } @@ -1095,22 +1096,27 @@ static std::unordered_map target_bpw_type( }; // Copy this row's side data (values and activations), or broadcasts to all slices - auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { - const size_t want = (size_t)ne2 * (size_t)n_per_row; + auto copy_or_broadcast = [&](const float * src, size_t src_sz, std::vector & dst) { dst.clear(); if (!src || src_sz == 0) { return; } + + const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); - } else if (src_sz == (size_t)n_per_row) { + + return; + } + if (src_sz == (size_t)n_per_row) { dst.resize(want); for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - } else { - LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - func, name.c_str(), src_sz, (size_t)n_per_row, want); + + return; } + + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want); }; const auto [values_all, values_sz] = side_data(values_data, name); From 6b8cedf3bcd2282e9f31b00026178d6bb393fc3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:31 +0100 Subject: [PATCH 072/148] Refactor estimate_lambda() --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b1302df431b..ebacf688062 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -975,30 +975,29 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector - { - std::vector lambdas(std::max(1, ne2), 0.0f); + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + const int64_t ns = std::max(1, ne2); + std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } - for (int64_t s = 0; s < std::max(1, ne2); ++s) { + for (int64_t s = 0; s < ns; ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
std::max(0.0f, v[j]) : 1.0; - const double aw = std::sqrt(w) * a[j]; - const double aw2 = aw * aw; - s1 += aw2; - s2 += aw2 * aw2; + const double aw2 = std::sqrt(w) * a[j]; + const double z = aw2 * aw2; + s1 += z; + s2 += z * z; } float l = 0.0f; if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - double lambda = 8.0 * (c / (c + 1.0)); - l = (float)std::clamp(lambda, 0.0, 12.0); + l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From c466c53808e566f5eb81a654c9f131064246cdaf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:54 +0100 Subject: [PATCH 073/148] Refactor pareto pruning and convexification --- src/llama-quant.cpp | 91 +++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ebacf688062..ab6601a8bf9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1146,8 +1146,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", - __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } @@ -1214,60 +1213,54 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } - // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve { - std::vector pruned; - pruned.reserve(info.candidate.size()); - - // Sort by bytes ascending, error ascending - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - return a.error < b.error; - }); - - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : info.candidate) { - // Only keep the best error seen so far at strictly larger byte sizes - if (c.bytes != last_bytes) { - // first time we see this byte size - last_bytes = c.bytes; - if (c.error < best_err) { - pruned.push_back(c); - best_err = c.error; + auto & candidates = info.candidate; + if (!candidates.empty()) { + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + + return a.error < b.error; + }); + + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = infinity; + size_t last_bytes = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_bytes) { + last_bytes = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); + } } - } else { - // same bytes: we already sorted by error; skip } - } - info.candidate.swap(pruned); - } + candidates.swap(pareto); - // Enforce convexity in (bytes, error) curve - { - const auto & c = info.candidate; - if (c.size() >= 3) { - std::vector convex; - convex.reserve(c.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) -> double { - const double dx = (double)b.bytes - (double)a.bytes; - if (dx <= 0.0) { return infinity; } - 
- return ((double)b.error - (double)a.error) / dx; - }; - - for (const auto & p : c) { - while (convex.size() >= 2) { - double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); - double s2 = slope(convex[convex.size() - 1], p); - if (s2 + epsilon < s1) { convex.pop_back(); } - else { break; } + if (candidates.size() >= 3) { + std::vector hull; + hull.reserve(candidates.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + + return dx <= 0.0 ? infinity : (b.error - a.error) / dx; + }; + + for (const auto & p : candidates) { + while (hull.size() >= 2) { + double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) { hull.pop_back(); } + else { break; } + } + + hull.push_back(p); } - convex.push_back(p); - } - info.candidate.swap(convex); + candidates.swap(hull); + } } } From b433fd95472c39c4974892aa9100e3cdc7b9c63d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:43:09 +0100 Subject: [PATCH 074/148] Refactor last budget pass --- src/llama-quant.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ab6601a8bf9..e062b2dc6a3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1433,19 +1433,16 @@ static std::unordered_map target_bpw_type( double best_ratio = -1.0; size_t best_delta = 0; for (int i = 0; i < (int)all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { - continue; - } - + const auto &ti = all[i]; int j = ti.choice + 1; + // skip same-bytes entries while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; if (cur_bytes + delta > budget_bytes) { continue; } - double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); double ratio = err_gain / (double)(delta * 8); if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; @@ -1454,7 +1451,6 @@ static std::unordered_map target_bpw_type( best_j = j; } } - if (best_i < 0) { break; } all[best_i].choice = best_j; cur_bytes += best_delta; From b6c008fd8a12a9b1970c4810585cbd540bf0737e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:04:13 +0100 Subject: [PATCH 075/148] Refactor helper lambdas --- src/llama-quant.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e062b2dc6a3..d31552ea23a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -665,28 +665,23 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); - const int64_t nrows = ggml_nrows(t); - return (size_t)nrows * row_sz; + return (size_t)ggml_nrows(t) * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { - const int64_t nelem = ggml_nelements(t); const size_t bytes = tensor_bytes(t, typ); - return (double)bytes * 8.0 / (double)nelem; + return (double)bytes * 8.0 / (double)ggml_nelements(t); }; auto is_compatible = 
[&](const ggml_tensor * t, const ggml_type typ) -> bool { - const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } - return n_per_row % blck == 0; + return blck <= 1 || (t->ne[0] % blck) == 0; }; auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } + if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); - if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; + return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; auto name_tn = LLM_TN(model.arch); @@ -1080,7 +1075,7 @@ static std::unordered_map target_bpw_type( ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From 7386d4eadd64006ac7f0fbc992d7d4bcb195bd6c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:18:26 +0100 Subject: [PATCH 076/148] Refactor row sampling --- src/llama-quant.cpp | 83 +++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d31552ea23a..f2dab6a898a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,64 +1019,73 @@ static std::unordered_map target_bpw_type( ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample + const int rows_sample_per_expert = activations_data ? 512 : 256; const int64_t n_per_row = tensor->ne[0]; const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - - const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector rows_sample(ne2, 0); - const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); - const int64_t stride = std::max(1, nrows_total / rows_sample_max); const ggml_type src_type = tensor->type; const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - std::vector row_buffer(n_per_row); + // Convert a single row to fp32 auto row_to_fp32 = [&](const uint8_t * src, float * dst) { - if (src_type == GGML_TYPE_F32) { + const ggml_type t = src_type; + if (t == GGML_TYPE_F32) { std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); - } else if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (src_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else if (src_is_quant) { - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } + return; + } + if (t == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + return; + } + if (t == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + return; + } - src_traits->to_float(src, dst, (int)n_per_row); - } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + if (src_is_quant) { + GGML_ASSERT(src_traits && src_traits->to_float); + src_traits->to_float(src, dst, (int) n_per_row); + return; } + + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(t))); }; - for (int64_t slice = 0; slice < ne2; 
++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); - int64_t current_sampled_rows = 0; - int64_t offset = 0; - if (stride > 1) { - std::uniform_int_distribution dist(0, stride - 1); - offset = dist(rng); - } + // Sample rows randomly per slice + { + f32_sample.clear(); + std::vector row_buffer(n_per_row); + for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); + } - for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { - const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (src_type == GGML_TYPE_F32) { - auto src_f32 = (const float *)src_row; - f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); - } else { - row_to_fp32(src_row, row_buffer.data()); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + int64_t current = 0; + for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (src_type == GGML_TYPE_F32) { + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); + } else { + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + } + + ++current; } - ++current_sampled_rows; + rows_sample[slice] = current; } - - rows_sample[slice] = current_sampled_rows; } auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { From 08146fd67f5ec6b93e2406340afaaa5aa336596a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:03 +0100 Subject: [PATCH 077/148] Refactor side_data() and copy_or_broadcast() --- src/llama-quant.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f2dab6a898a..b8eb12690e3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1088,14 +1088,12 @@ static std::unordered_map target_bpw_type( } } - auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { - if (!m) { return {nullptr, 0}; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) { + if (!m) { return std::pair{nullptr, 0}; } const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); - if (it == m->end()) { return {nullptr, 0}; } - - return { it->second.data(), it->second.size() }; + return it == m->end() ? 
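+        // this tensor's side data (imatrix values or activations); {nullptr, 0} when none is available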
std::pair{nullptr, 0} : std::pair{ it->second.data(), it->second.size() }; }; // Copy this row's side data (values and activations), or broadcasts to all slices @@ -1105,9 +1103,7 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { - dst.resize(want); - std::memcpy(dst.data(), src, want * sizeof(float)); - + dst.assign(src, src + want); return; } if (src_sz == (size_t)n_per_row) { @@ -1115,7 +1111,6 @@ static std::unordered_map target_bpw_type( for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - return; } From 17be7615ce070af61cd1a0f80b38947c3fea5709 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:28 +0100 Subject: [PATCH 078/148] Refactor candidate types build --- src/llama-quant.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b8eb12690e3..beac311d50e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1133,19 +1133,17 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + size_t max_row_sz = 0; const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); - - size_t max_row_sz = 0; - const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; - std::vector compatible_candidates; compatible_candidates.reserve(base_sz); for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } From b09662f86aefb5750842c9d68dac42db9054e90c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:49 +0100 Subject: [PATCH 079/148] Refactor estimate_lambda() --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index beac311d50e..63779ded487 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -982,8 +982,8 @@ static std::unordered_map target_bpw_type( double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
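                // per-channel importance from the imatrix, clamped to non-negative; defaults to 1.0 without an imatrix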
std::max(0.0f, v[j]) : 1.0; - const double aw2 = std::sqrt(w) * a[j]; - const double z = aw2 * aw2; + const double aw = std::sqrt(w) * a[j]; + const double z = aw * aw; s1 += z; s2 += z * z; } @@ -992,7 +992,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a7ee915e19d9acd7a1187ba7d8d772d3a52a8f0d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:20:06 +0100 Subject: [PATCH 080/148] Refactor trimmed_sum() --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 63779ded487..67de29df872 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -920,26 +920,15 @@ static std::unordered_map target_bpw_type( // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { - if (v.empty()) { return 0.0; } - const int64_t n = (int64_t)v.size(); - if (n < 50) { - double s = 0.0; - for (const double z : v) { s += z; } - - return s; - } + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% + k = std::clamp(k, 0, n / 32); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - double s = 0.0; - for (int64_t i = k; i < n - k; ++i) { - s += v[i]; - } - - return s; + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); From 1a3e9ea4c88c40b7fea3a94ff45522531f31f005 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:00 +0100 Subject: [PATCH 081/148] Refactor estimate_error() --- src/llama-quant.cpp | 191 ++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 106 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 67de29df872..b3e4b3cbf7b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -737,12 +737,12 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { + const size_t sample_elems = f32_sample.size(); + const size_t sample_rows = n_per_row > 0 ? 
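+            // number of whole sampled rows actually held in the f32 sample buffer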
sample_elems / (size_t)n_per_row : 0; + + if (sample_rows == 0) { if (out_mse) { *out_mse = 0.0; } if (out_proj) { *out_proj = 0.0; } - return 0.0; } @@ -751,105 +751,102 @@ static std::unordered_map target_bpw_type( expected_rows += (size_t)rows_sample[s]; } - if (expected_rows != sample_row_count) { + if (expected_rows != sample_rows) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } const size_t row_sz = ggml_row_size(quant_type, n_per_row); - const size_t buffer_sz = row_sz * sample_row_count; + const size_t buf_sz = row_sz * sample_rows; - if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } - if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } + if (quantized_buffer.size() < buf_sz) { quantized_buffer.resize(buf_sz); } + if (dequantized_buffer.size() < sample_elems) { dequantized_buffer.resize(sample_elems); } const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; // Bias denominators per slice - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector bias_denom(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - const float * activations = activations_sample + s * n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - denom += w * a * a; + const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double aj = a[j]; + denom += w * aj * aj; } - bias_denominator_per_slice[s] = denom; + bias_denom[s] = denom; } } - // Weighted per-row squared norms - std::vector row_sq_norm(sample_row_count, 0.0); + // Row squared norms (weighted if values present) + std::vector row_sq_norm(sample_rows, 0.0); { - size_t offset = 0; - size_t row_idx = 0; + size_t off = 0; + size_t ridx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - double rsn = 0.0; - if (values) { + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + double sum = 0.0; + if (v) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); + const double w = std::max(0.0f, v[j]); const double xx = x[j]; - rsn += w * xx * xx; + sum += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double xx = x[j]; - rsn += xx * xx; + sum += xx * xx; } } - row_sq_norm[row_idx] = rsn; - offset += (size_t)n_per_row; + + row_sq_norm[ridx] = sum; + off += (size_t)n_per_row; } } } - // Quantize sampled rows per slice -> quantized_buffer + // Quantize per slice into quantized_buffer { - size_t q_offset = 0; - size_t f_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + size_t qoff = 0; + size_t foff = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; - f_offset += (size_t)rs * (size_t)n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + foff, quantized_buffer.data() + qoff, 0, rs, n_per_row, v); + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } } - // quantized_buffer -> dequantized_buffer + // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; - const bool is_bf16 = quant_type == GGML_TYPE_BF16; - if (!is_fp16 && !is_bf16 && traits && traits->to_float) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); } else { - for (size_t r = 0; r < sample_row_count; ++r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (is_fp16) { + if (quant_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (is_bf16) { + } else if (quant_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } traits->to_float(src, dst, (int)n_per_row); @@ -858,94 +855,77 @@ static std::unordered_map target_bpw_type( } } - // Compute error - size_t offset = 0; - size_t row_idx = 0; + // Compute error per slice with trimmed aggregation + auto trimmed_sum = [&](std::vector & v) -> double { + const int64_t n = (int64_t)v.size(); + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + k = std::clamp(k, 0, n / 32); // but no more than ~3% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + }; + + size_t off = 0; + size_t ridx = 0; double total_mse = 0.0; double total_proj = 0.0; double total_bias = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + slice * n_per_row : nullptr; - const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = has_activations ? activations_sample + s * n_per_row : nullptr; + const double denom_bias = has_activations ? 
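+                // normaliser for the bias projection: this slice's precomputed sum of w * a^2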
bias_denom[s] : 0.0; std::vector row_mse_norm; - std::vector row_proj_norm; row_mse_norm.reserve(rs); - if (activations) { row_proj_norm.reserve(rs); } + std::vector row_proj_norm; + if (a) { row_proj_norm.reserve(rs); } - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - const float * y = dequantized_buffer.data() + offset; - double weighted_mse = 0.0; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + const float * y = dequantized_buffer.data() + off; + double w_mse = 0.0; double bias_num = 0.0; - if (values && activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += w * e * e; - bias_num += w * e * a; - } - } else if (values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - weighted_mse += w * e * e; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - weighted_mse += e * e; - } + for (int64_t j = 0; j < n_per_row; ++j) { + const double wj = v ? std::max(0.0f, v[j]) : 1.0; + const double e = y[j] - x[j]; + w_mse += wj * e * e; + if (a) { bias_num += wj * e * a[j]; } } - const double denom_x = row_sq_norm[row_idx]; - double m_norm = weighted_mse / (denom_x + epsilon); + const double denom_x = row_sq_norm[ridx]; + const double m_norm = w_mse / (denom_x + epsilon); row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); - if (activations) { + if (a) { double p_norm = 0.0; - if (bias_denom > 0.0) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); + if (denom_bias > 0.0) { + const double proj = bias_num * bias_num / (denom_bias + epsilon); p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - offset += (size_t)n_per_row; + off += (size_t)n_per_row; } - // Trimmed sum to avoid outlier rows dominating the results - auto trimmed_sum = [&](std::vector & v) -> double { - const int64_t n = (int64_t)v.size(); - if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - - int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::clamp(k, 0, n / 32); // cap at ~3.125% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); - }; - const double scale_rows = (double)nrows / std::max(1.0, (double)rs); const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; - const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; + const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; total_mse += slice_mse; total_proj += slice_proj; - // per-slice lambda if provided, otherwise use scalar - const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + const double bl = slice_bias_lambda ? 
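+            // per-slice lambda when provided, otherwise the tensor-level scalar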
(double)std::max(0.0f, slice_bias_lambda[s]) : (double)tensor_bias_lambda; total_bias += bl * slice_proj; if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } } @@ -954,7 +934,6 @@ static std::unordered_map target_bpw_type( if (out_proj) { *out_proj = total_proj; } const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; - return std::isfinite(total_err) ? total_err : infinity; }; From 9a1656eb975fa9f1024a8de029e22a762e49719b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:35 +0100 Subject: [PATCH 082/148] Refactor pareto optimise and convexify --- src/llama-quant.cpp | 84 ++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 43 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b3e4b3cbf7b..751a26c63aa 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1179,55 +1179,53 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - { - auto & candidates = info.candidate; - if (!candidates.empty()) { - std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - - return a.error < b.error; - }); - - std::vector pareto; - pareto.reserve(candidates.size()); - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : candidates) { - if (c.bytes != last_bytes) { - last_bytes = c.bytes; - if (c.error < best_err) { - best_err = c.error; - pareto.push_back(c); - } - } - } + auto pareto_convex = [](std::vector & candidates) { + if (candidates.empty()) return; - candidates.swap(pareto); - - if (candidates.size() >= 3) { - std::vector hull; - hull.reserve(candidates.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - - for (const auto & p : candidates) { - while (hull.size() >= 2) { - double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) { hull.pop_back(); } - else { break; } - } + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); - hull.push_back(p); + // Pareto by bytes -> error + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = std::numeric_limits::infinity(); + size_t last_b = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_b) { + last_b = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); } + } + } - candidates.swap(hull); + candidates.swap(pareto); + if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull + + // Convex hull (lower envelope) + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + return dx <= 0.0 ? 
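+                // duplicate or non-increasing byte sizes are treated as infinitely steep segments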
infinity : (b.error - a.error) / dx; + }; + + std::vector hull; hull.reserve(candidates.size()); + for (const auto & p : candidates) { + while (hull.size() >= 2) { + const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + const double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) hull.pop_back(); + else { break; } } + + hull.push_back(p); } - } + candidates.swap(hull); + }; + + pareto_convex(info.candidate); // Initialize choice at the smallest bpw candidate info.choice = 0; From 0d5f18303e25e6b4e4dc21f963ca6672b9b12d0f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:22:00 +0100 Subject: [PATCH 083/148] Refactor lagrange_penalty() --- src/llama-quant.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 751a26c63aa..204fbfecad8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1288,21 +1288,21 @@ static std::unordered_map target_bpw_type( bytes = 0; err = 0.0; for (size_t i = 0; i < all.size(); ++i) { - const auto & cand = all[i].candidate; + const auto & candidate = all[i].candidate; int best_j = 0; double best_val = infinity; - for (int j = 0; j < (int)cand.size(); ++j) { - const double bits = (double)cand[j].bytes * 8.0; - const double val = cand[j].error + mu * bits; - if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + for (int j = 0; j < (int)candidate.size(); ++j) { + const double bits = (double)candidate[j].bytes * 8.0; + const double val = candidate[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) { best_val = val; best_j = j; } } choice[i] = best_j; - bytes += cand[best_j].bytes; - err += cand[best_j].error; + bytes += candidate[best_j].bytes; + err += candidate[best_j].error; } }; From 814f6b66be4b5ebbe286201eafe8361a37d39a98 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:45:09 +0100 Subject: [PATCH 084/148] Minor general refactoring --- src/llama-quant.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 204fbfecad8..93b5fb0ebad 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -860,7 +860,8 @@ static std::unordered_map target_bpw_type( const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side k = std::clamp(k, 0, n / 32); // but no more than ~3% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); @@ -1190,7 +1191,7 @@ static std::unordered_map target_bpw_type( // Pareto by bytes -> error std::vector pareto; pareto.reserve(candidates.size()); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_b = std::numeric_limits::max(); for (const auto & c : candidates) { if (c.bytes != last_b) { @@ -1273,12 +1274,10 @@ static std::unordered_map target_bpw_type( if (budget_bytes <= min_bytes) { for (auto & ti : all) { ti.choice = 0; } - return emit_overrides(); } if (budget_bytes >= max_bytes) { for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } - return emit_overrides(); } @@ -1327,14 +1326,10 @@ static std::unordered_map 
target_bpw_type( int expand = 0; while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); - if (bytes_hi <= budget_bytes) { - break; - } + if (bytes_hi <= budget_bytes) { break; } mu_hi *= 2.0; - if (++expand > 60) { - break; - } + if (++expand > 60) { break; } // safety cap } } From e92db008bc848b109f2931162a69c7010f675b70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:20:48 +0100 Subject: [PATCH 085/148] Refactor quantisation checks into its own function --- src/llama-quant.cpp | 140 ++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 83 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93b5fb0ebad..3544653a56b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -21,6 +21,60 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static bool is_quantizable(const std::string & name, const llm_arch arch, const llama_model_quantize_params * params) { + if (params->only_copy) { return false; } + + const auto tn = LLM_TN(arch); + + // This used to be a regex, but has an extreme cost to compile times. + bool q = name.size() >= 6 && name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // Do not quantize norm tensors + q &= name.find("_norm.weight") == std::string::npos; + + // Do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // These are very small (e.g. 4x4) + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + + // These are not too big so keep them as it is + q &= name.find("per_layer_model_proj") == std::string::npos; + + // Do not quantize positional embeddings and token types (BERT) + q &= name != tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // Do not quantize Jamba, Mamba, LFM2's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + + // Do not quantize ARWKV, RWKV's small yet 2D weights + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // Do not quantize relative position bias (T5) + q &= name.find("attn_rel_b.weight") == std::string::npos; + + return q; +} + static bool is_iq(const enum ggml_type t) { switch (t) { case GGML_TYPE_IQ1_S: @@ -684,40 +738,9 @@ static std::unordered_map target_bpw_type( return is_compatible(t, fb) ? 
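        // prefer the designated fallback type; F16 has block size 1 and is always compatible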
fb : GGML_TYPE_F16; }; - auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() to avoid drift - const std::string name = ggml_get_name(t); - bool q = name.rfind("weight") == name.size() - 6; - q &= ggml_n_dims(t) >= 2; - q &= name.find("_norm.weight") == std::string::npos; - q &= name.find("ffn_gate_inp.weight") == std::string::npos; - q &= name.find("altup") == std::string::npos; - q &= name.find("laurel") == std::string::npos; - q &= name.find("per_layer_model_proj") == std::string::npos; - q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); - q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); - q &= name.find("ssm_conv1d.weight") == std::string::npos; - q &= name.find("shortconv.conv.weight") == std::string::npos; - q &= name.find("time_mix_first.weight") == std::string::npos; - q &= name.find("time_mix_w0.weight") == std::string::npos; - q &= name.find("time_mix_w1.weight") == std::string::npos; - q &= name.find("time_mix_w2.weight") == std::string::npos; - q &= name.find("time_mix_v0.weight") == std::string::npos; - q &= name.find("time_mix_v1.weight") == std::string::npos; - q &= name.find("time_mix_v2.weight") == std::string::npos; - q &= name.find("time_mix_a0.weight") == std::string::npos; - q &= name.find("time_mix_a1.weight") == std::string::npos; - q &= name.find("time_mix_a2.weight") == std::string::npos; - q &= name.find("time_mix_g1.weight") == std::string::npos; - q &= name.find("time_mix_g2.weight") == std::string::npos; - q &= name.find("time_mix_decay_w1.weight") == std::string::npos; - q &= name.find("time_mix_decay_w2.weight") == std::string::npos; - q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= !params->only_copy; - - return q; + if (ggml_n_dims(t) < 2) { return false; } + return is_quantizable(ggml_get_name(t), model.arch, params); }; // Estimate error for a given type using a sampled subset of rows @@ -1747,57 +1770,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; - + bool quantize = ggml_n_dims(tensor) >= 2 && is_quantizable(name, model.arch, params); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 
4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba's small yet 2D weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d.weight") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; ggml_type new_type; void * new_data; From fecc472c6175bc65217d6f29855acf81477a5125 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:26:38 +0100 Subject: [PATCH 086/148] Fix typos in variable names --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3544653a56b..8a709ddfdd5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1165,7 +1165,7 @@ static std::unordered_map target_bpw_type( // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantised_buffer(f32_sample.size()); + std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? 
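        // optional per-slice bias weights passed to estimate_error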
nullptr : lambdas.data(); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; @@ -1175,7 +1175,7 @@ static std::unordered_map target_bpw_type( eval_workers.emplace_back([&] { // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); - std::vector tl_dequantised_buffer(dequantised_buffer.size()); + std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } @@ -1184,7 +1184,7 @@ static std::unordered_map target_bpw_type( const auto bpw = (float)tensor_bpw(tensor, tensor_types); const size_t bytes = tensor_bytes(tensor, tensor_types); const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda); eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); From 896cdc21217ab4d0b2bcb8b18938d3c0efc94dc1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:36 +0100 Subject: [PATCH 087/148] Refactor potential overflow --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8a709ddfdd5..52d7984e2a4 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1002,7 +1002,7 @@ static std::unordered_map target_bpw_type( const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } tensor->data = buffer.data(); From b748a1efa7dd0ab0d4064574530b4b045b27bbfc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:54 +0100 Subject: [PATCH 088/148] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 52d7984e2a4..2652f5c86e5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1709,7 +1709,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no aligment bias)" : "aligment bias (default)"); + LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? 
"MSE only (no alignment bias)" : "alignment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { From c855094dff509c97f6cc268e28f123262e67b6f7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:09:11 +0100 Subject: [PATCH 089/148] Exit loop if no better solution found --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2652f5c86e5..8ee052a8e55 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1347,9 +1347,12 @@ static std::unordered_map target_bpw_type( // increase mu until we get under budget or hit a safety cap { int expand = 0; + size_t prev_bytes_hi = std::numeric_limits::max(); while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); if (bytes_hi <= budget_bytes) { break; } + if (bytes_hi >= prev_bytes_hi) { break; } + prev_bytes_hi = bytes_hi; mu_hi *= 2.0; if (++expand > 60) { break; } // safety cap From 1fbc59f867b283d1f66a87a8b1f45d265cf69fca Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:10 +0100 Subject: [PATCH 090/148] Replace slope with cross product --- src/llama-quant.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8ee052a8e55..0b2f15f0a66 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1230,22 +1230,27 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - std::vector hull; hull.reserve(candidates.size()); - for (const auto & p : candidates) { + for (const auto & c : candidates) { + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; + while (hull.size() >= 2) { - const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - const double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) hull.pop_back(); - else { break; } + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + hull.pop_back(); + } else { + break; + } } - hull.push_back(p); + hull.push_back(c); } + candidates.swap(hull); }; From f184450806163bd1af0eecaff5c31639cf3eaf8f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:42 +0100 Subject: [PATCH 091/148] Fix minor logic flaw --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b2f15f0a66..4c0ec3063a9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -957,7 +957,7 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = slice_bias_lambda ? 
total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + const double total_err = total_mse + total_bias; return std::isfinite(total_err) ? total_err : infinity; }; From d79ade2e8e45057d9006b0b096888501ae639aab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:26 +0100 Subject: [PATCH 092/148] Adjust for small vector size --- src/llama-quant.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4c0ec3063a9..08e1c97185d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -885,9 +885,8 @@ static std::unordered_map target_bpw_type( if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, n / 32); // but no more than ~3% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 7ba6001ec8fda89e7d513ced2da7b9aa3532cb70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:54 +0100 Subject: [PATCH 093/148] Simplify candidates sorting --- src/llama-quant.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08e1c97185d..f4c0ea0fcd9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1209,6 +1209,10 @@ static std::unordered_map target_bpw_type( if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); + const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + return a.bytes == b.bytes; + }); + candidates.erase(last, candidates.end()); // Pareto by bytes -> error std::vector pareto; From d36ee0a0a86a65e1d730e788d735c1606ebeb49a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:41:56 +0100 Subject: [PATCH 094/148] Add comments to explain magic numbers --- src/llama-quant.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f4c0ea0fcd9..93007f281ea 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -739,7 +739,7 @@ static std::unordered_map target_bpw_type( }; auto can_quantize = [&](const ggml_tensor * t) -> bool { - if (ggml_n_dims(t) < 2) { return false; } + if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors return is_quantizable(ggml_get_name(t), model.arch, params); }; @@ -882,10 +882,10 @@ static std::unordered_map target_bpw_type( auto trimmed_sum = [&](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; @@ -1289,7 +1289,7 @@ static 
std::unordered_map target_bpw_type( if (total_elems == 0) { return {}; } const double target_bpw = params->target_bpw; - size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; @@ -1362,8 +1362,8 @@ static std::unordered_map target_bpw_type( if (bytes_hi >= prev_bytes_hi) { break; } prev_bytes_hi = bytes_hi; - mu_hi *= 2.0; - if (++expand > 60) { break; } // safety cap + mu_hi *= 2.0; // double the penalty multiplier to reduce tensor sizes + if (++expand > 60) { break; } // safety cap to prevent an infinite loop } } @@ -1371,8 +1371,8 @@ static std::unordered_map target_bpw_type( double best_over_gap = infinity; double best_under_err = infinity; double best_over_err = infinity; - for (int it = 0; it < 40; ++it) { - double mu = 0.5 * (mu_lo + mu_hi); + for (int it = 0; it < 40; ++it) { // binary search iterations for optimal Lagrange multiplier (40 ≈ 1e-12 precision) + double mu = 0.5 * (mu_lo + mu_hi); // midpoint of current bounds lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); @@ -1435,7 +1435,7 @@ static std::unordered_map target_bpw_type( if (cur_bytes + delta > budget_bytes) { continue; } double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - double ratio = err_gain / (double)(delta * 8); + double ratio = err_gain / (double)(delta * 8); // error reduction per bit if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; best_delta = delta; From 8eedcf74bc4df64eb7fe5b4935390dc9ad73d104 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:42:37 +0100 Subject: [PATCH 095/148] Increase scale multiplier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93007f281ea..0f05c8f9566 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -983,7 +983,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a74b410f5f6bd11ff42cc1f40fa93242d0f67940 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:49:47 +0100 Subject: [PATCH 096/148] Move is_iq() into a lambda and remove unused variables --- src/llama-quant.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f05c8f9566..af564ce03e0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -727,11 +727,28 @@ static std::unordered_map target_bpw_type( return (double)bytes * 8.0 / (double)ggml_nelements(t); }; - auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t blck = ggml_blck_size(typ); return blck <= 1 || (t->ne[0] % blck) == 0; }; + auto is_iq = [](const enum ggml_type t) { + switch (t) { + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_NL: 
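+            // (all i-quants; these candidates are evaluated only when a usable imatrix is available)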
+ case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } + }; + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); @@ -995,8 +1012,6 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { - std::vector workers; - workers.reserve(std::max(1, nthread)); ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } From dbdd179a92426c2031e4bee1ba0ccace45ea29fe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:50:20 +0100 Subject: [PATCH 097/148] Combine quant types --- src/llama-quant.cpp | 75 ++++++++------------------------------------- 1 file changed, 13 insertions(+), 62 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index af564ce03e0..f36b9202d53 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -75,43 +75,6 @@ static bool is_quantizable(const std::string & name, const llm_arch arch, const return q; } -static bool is_iq(const enum ggml_type t) { - switch (t) { - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - return true; - default: - return false; - } -} - -static bool is_iq(const enum llama_ftype t) { - switch (t) { - case LLAMA_FTYPE_MOSTLY_IQ1_S: - case LLAMA_FTYPE_MOSTLY_IQ1_M: - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: - case LLAMA_FTYPE_MOSTLY_IQ2_XS: - case LLAMA_FTYPE_MOSTLY_IQ2_S: - case LLAMA_FTYPE_MOSTLY_IQ2_M: - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: - case LLAMA_FTYPE_MOSTLY_IQ3_XS: - case LLAMA_FTYPE_MOSTLY_IQ3_S: - case LLAMA_FTYPE_MOSTLY_IQ3_M: - case LLAMA_FTYPE_MOSTLY_IQ4_XS: - case LLAMA_FTYPE_MOSTLY_IQ4_NL: - return true; - default: - return false; - } -} - static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -678,33 +641,21 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - constexpr ggml_type k_quants[] = { - GGML_TYPE_Q2_K, - GGML_TYPE_Q3_K, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -// TODO: find better way to handle F16/BF16 -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif - }; - - constexpr ggml_type iq_quants[] = { + // subset of quantization types with the best accuracy/size tradeoff + constexpr ggml_type quant_types[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, - GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_S, + GGML_TYPE_Q2_K, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, - // TODO: find better way to handle F16/BF16 #ifdef GGML_USE_METAL GGML_TYPE_F16 #else @@ -896,7 +847,7 @@ static std::unordered_map target_bpw_type( } // Compute error per slice with trimmed aggregation - auto trimmed_sum = [&](std::vector & v) -> double { + auto trimmed_sum = [](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets @@ -978,7 +929,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, 
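+        // values: per-channel imatrix importance; activations: per-channel activation averages (derived from the in_sum/counts data)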
const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1141,8 +1092,8 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; size_t max_row_sz = 0; - const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); + const ggml_type * base_arr = quant_types; + const size_t base_sz = std::size(quant_types); std::vector compatible_candidates; compatible_candidates.reserve(base_sz); From dd4f4bd0b88c4d59613033ba941d85e7ce1d9547 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:23:48 +0100 Subject: [PATCH 098/148] Reduce bpw range --- src/llama-quant.cpp | 7 +------ tools/quantize/quantize.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f36b9202d53..03863520147 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,12 +655,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif + GGML_TYPE_Q8_0 }; constexpr double epsilon = 1e-12; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 03018cc3012..69e03179b3b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-bias: use mean square error estimation only (no aligment bias)\n"); printf(" Advanced option use MSE only and disable aligment bias error estimation\n"); @@ -484,13 +484,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 16.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 8.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); return false; } From d16945730eac146d87d158a97ef053f845921f01 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:25:29 +0100 Subject: [PATCH 099/148] Refactor outlier trimming --- src/llama-quant.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 03863520147..df36a705c2f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -847,8 +847,7 @@ static std::unordered_map target_bpw_type( if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 + int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 87cba659089342ef4e4c2209d9a750555ae140e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:26:30 +0100 Subject: [PATCH 100/148] Tighten worker allocator --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index df36a705c2f..90931f25e7b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1136,7 +1136,7 @@ static std::unordered_map target_bpw_type( std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { - const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); if (i >= compatible_candidates.size()) { break; } const ggml_type tensor_types = compatible_candidates[i]; From 8a2c71f471842a9b2dcc0bc33592cd7adb8b8dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:29 +0100 Subject: [PATCH 101/148] Check for direction reversal --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 90931f25e7b..601b9ada427 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1204,7 +1204,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance hull.pop_back(); } else { break; From 3d75b14c0f2fc605fb39a3cb425c4c2482b8d8f5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:58 +0100 Subject: [PATCH 102/148] Simplify dequantisation --- src/llama-quant.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 601b9ada427..316dd35fa86 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -819,25 +819,16 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); - } else { - for (size_t r = 0; r < sample_rows; ++r) { - 
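Note on the trimming above: the per-row errors are aggregated with a fixed fraction dropped from each tail, so a handful of pathological rows cannot dominate a tensor's error estimate. A minimal standalone sketch of the idea follows; the names are illustrative, not the PR's code, and it assumes the per-row errors arrive as a plain std::vector<double>:

    #include <algorithm>
    #include <cmath>
    #include <numeric>
    #include <vector>

    // Sum the middle of the distribution, dropping `tail` of the samples
    // from each end. Small inputs are summed whole, since trimming a
    // handful of samples is mostly noise.
    static double trimmed_sum_sketch(std::vector<double> v, double tail = 0.025) {
        const int64_t n = (int64_t)v.size();
        if (n == 0) { return 0.0; }
        if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); }
        const int64_t k = (int64_t)std::floor(tail * (double)n); // samples dropped per tail
        std::sort(v.begin(), v.end());
        return std::accumulate(v.begin() + k, v.end() - k, 0.0);
    }

PATCH 115 later revisits this and divides by the retained count, turning the trimmed sum into a trimmed mean.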
From 87cba659089342ef4e4c2209d9a750555ae140e3 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:26:30 +0100
Subject: [PATCH 100/148] Tighten worker allocator

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index df36a705c2f..90931f25e7b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1136,7 +1136,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
                 std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
                 for (;;) {
-                    const size_t i = cidx.fetch_add(1, std::memory_order_relaxed);
+                    const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
                     if (i >= compatible_candidates.size()) { break; }
 
                     const ggml_type tensor_types = compatible_candidates[i];

From 8a2c71f471842a9b2dcc0bc33592cd7adb8b8dfe Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:27:29 +0100
Subject: [PATCH 101/148] Check for direction reversal

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 90931f25e7b..601b9ada427 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1204,7 +1204,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             };
 
             while (hull.size() >= 2) {
-                if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) {
+                if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance
                     hull.pop_back();
                 } else {
                     break;
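The sign test in PATCH 101 is the standard monotone-chain convexity check over (bytes, error) candidate points; the commit's change to a small negative tolerance keeps nearly collinear points instead of dropping them. A self-contained sketch of the underlying construction, with a hypothetical point type (the PR's candidate struct carries more fields):

    #include <vector>

    struct pt { double x; double y; }; // x = bytes, y = estimated error

    // > 0: c turns left of a->b; <= 0: b is redundant for the lower hull.
    static double cross_product(const pt & a, const pt & b, const pt & c) {
        return (b.x - a.x) * (c.y - a.y) - (b.y - a.y) * (c.x - a.x);
    }

    // Lower-hull pass over points already sorted by x: pop points that
    // would create a concave corner, keep the rest.
    static std::vector<pt> lower_hull(const std::vector<pt> & sorted_by_x) {
        std::vector<pt> hull;
        for (const pt & c : sorted_by_x) {
            while (hull.size() >= 2 &&
                   cross_product(hull[hull.size() - 2], hull.back(), c) <= 0.0) {
                hull.pop_back();
            }
            hull.push_back(c);
        }
        return hull;
    }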
From 3d75b14c0f2fc605fb39a3cb425c4c2482b8d8f5 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:27:58 +0100
Subject: [PATCH 102/148] Simplify dequantisation

---
 src/llama-quant.cpp | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 601b9ada427..316dd35fa86 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -819,25 +819,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         // Dequantize into dequantized_buffer
         {
             const ggml_type_traits * traits = ggml_get_type_traits(quant_type);
-            if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) {
-                traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row));
-            } else {
-                for (size_t r = 0; r < sample_rows; ++r) {
-                    const uint8_t * src = quantized_buffer.data() + r * row_sz;
-                    float * dst = dequantized_buffer.data() + r * (size_t)n_per_row;
-                    if (quant_type == GGML_TYPE_F16) {
-                        ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
-                    } else if (quant_type == GGML_TYPE_BF16) {
-                        ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
-                    } else {
-                        if (!traits || !traits->to_float) {
-                            if (out_mse) { *out_mse = infinity; }
-                            if (out_proj) { *out_proj = 0.0; }
-                            return infinity;
-                        }
-                        traits->to_float(src, dst, (int)n_per_row);
-                    }
-                }
+            if (!traits || !traits->to_float) {
+                if (out_mse) { *out_mse = infinity; }
+                if (out_proj) { *out_proj = 0.0; }
+                return infinity;
+            }
+
+            for (size_t r = 0; r < sample_rows; ++r) {
+                const uint8_t * src = quantized_buffer.data() + r * row_sz;
+                float * dst = dequantized_buffer.data() + r * (size_t)n_per_row;
+                traits->to_float(src, dst, (int)n_per_row);
             }
         }

From e49e241d37e7fd7f25142ee514c9e129c304083b Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 27 Sep 2025 17:28:39 +0100
Subject: [PATCH 103/148] Calculate bpw over all tensors

---
 src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 316dd35fa86..699264553ac 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1219,6 +1219,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     if (all.empty()) { return {}; }
 
+    // Compute total elements across all tensors and bytes for non-quantizable tensors
+    size_t nq_elements = 0;
+    size_t nq_bytes = 0;
+    for (const auto & it : ml.weights_map) {
+        const ggml_tensor * tensor = it.second.tensor;
+        const std::string name = it.first;
+        nq_elements += (size_t)ggml_nelements(tensor);
+        if (!is_quantizable(name, model.arch, params)) {
+            nq_bytes += ggml_nbytes(tensor);
+        }
+    }
+
     auto total_bytes = [&]() -> size_t {
         size_t tb = 0;
         for (const auto & ti : all) {
@@ -1228,19 +1240,20 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return tb;
     };
 
-    size_t total_elems = 0;
+    size_t q_elements = 0;
     size_t min_bytes = 0;
     size_t max_bytes = 0;
     for (const auto & ti : all) {
-        total_elems += (size_t)ti.n_elements;
+        q_elements += (size_t)ti.n_elements;
         min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor
         max_bytes += ti.candidate.back().bytes;  // largest candidate per tensor
     }
-    if (total_elems == 0) { return {}; }
+    if (q_elements == 0) { return {}; }
 
     const double target_bpw = params->target_bpw;
-    size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes
+    size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0);
+    size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes;
 
     auto emit_overrides = [&]() -> std::unordered_map<std::string, ggml_type> {
         std::unordered_map<std::string, ggml_type> overrides;
@@ -1374,29 +1387,35 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             int best_i = -1;
             int best_j = -1;
             double best_ratio = -1.0;
-            size_t best_delta = 0;
+            double best_gain = -1.0;
+
             for (int i = 0; i < (int)all.size(); ++i) {
                 const auto &ti = all[i];
                 int j = ti.choice + 1;
-                // skip same-bytes entries
                 while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; }
                 if (j >= (int)ti.candidate.size()) { continue; }
-                size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes;
-                if (cur_bytes + delta > budget_bytes) { continue; }
+                size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes;
+                if (cur_bytes + delta_bytes > budget_bytes) { continue; }
 
                 double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error);
-                double ratio = err_gain / (double)(delta * 8); // error reduction per bit
-                if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) {
+                if (err_gain < epsilon) { continue; } // no real improvement
+
+                double ratio = err_gain / (double)delta_bytes; // error reduction per byte
+                // For tie-breaking, prioritize the largest absolute error improvement.
+                if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) {
                     best_ratio = ratio;
-                    best_delta = delta;
+                    best_gain = err_gain;
                     best_i = i;
                     best_j = j;
                 }
             }
-            if (best_i < 0) { break; }
+
+            if (best_i < 0) { break; } // no more upgrades within budget found
+
+            size_t upgrade_cost = all[best_i].candidate[best_j].bytes - all[best_i].candidate[all[best_i].choice].bytes;
            all[best_i].choice = best_j;
-            cur_bytes += best_delta;
+            cur_bytes += upgrade_cost;
         }
     }
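The byte-budget arithmetic introduced in PATCH 103 is easiest to sanity-check with concrete numbers: the target bpw is applied to every element in the model, and the bytes already committed to non-quantizable tensors are then subtracted. A sketch with made-up figures:

    #include <cstdio>

    int main() {
        const double target_bpw   = 4.5;    // requested average bits per weight
        const double all_elements = 8.0e9;  // every element, quantizable or not (illustrative)
        const double fixed_bytes  = 1.2e9;  // bytes spent on tensors that will not be touched

        // Whole-model byte target, then the budget left for quantizable tensors.
        const double total_budget = target_bpw * all_elements / 8.0; // 4.5e9 bytes
        const double quant_budget = total_budget - fixed_bytes;      // 3.3e9 bytes

        printf("total %.0f bytes, %.0f left for quantizable tensors\n", total_budget, quant_budget);
        return 0;
    }

This is why the commit is titled "Calculate bpw over all tensors": before it, the budget was derived from quantizable elements only, so the final file's effective bpw drifted above the request.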
From b3b8a111a58a8a1586c763382463ccdf9bba3f6a Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 28 Sep 2025 18:45:25 +0100
Subject: [PATCH 104/148] Compute rows based on tensor shape and slice count

---
 src/llama-quant.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 699264553ac..7bfb8751aeb 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -650,9 +650,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_IQ3_XXS,
         GGML_TYPE_Q3_K,
         GGML_TYPE_IQ4_XS,
-        GGML_TYPE_Q4_1,
         GGML_TYPE_Q4_K,
-        GGML_TYPE_Q5_1,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
         GGML_TYPE_Q8_0
@@ -961,10 +959,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         ml.load_data_for(tensor);
 
         // Dequantize sampled rows into f32_sample
-        const int rows_sample_per_expert = activations_data ? 512 : 256;
         const int64_t n_per_row = tensor->ne[0];
         const int64_t nrows_total = tensor->ne[1];
         const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1;
+
+        // Compute rows based on tensor shape and slice count
+        auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t {
+            const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024;
+            const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrows_total
+            const double slice_budget = tensor_budget * scale_rows / std::max<int64_t>(1, n2);
+            const int64_t min_rows = has_acts ? 128 : 64;
+            const int64_t max_rows = 4096;
+            int64_t total_rows = std::llround(slice_budget / std::max<int64_t>(1, n));
+            total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows)));
+            if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors
+            return total_rows;
+        };
+
+        const int64_t rows_sample_per_expert = sample_rows(n_per_row, nrows_total, ne2, activations_data != nullptr);
         std::vector<float> f32_sample;
         f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row);
         std::vector<int64_t> rows_sample(ne2, 0);

From f5d8811ddde7533c561ad77d358d1d509a57ff9f Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Wed, 1 Oct 2025 19:04:43 +0100
Subject: [PATCH 105/148] Prioritise important tensors

---
 src/llama-quant.cpp | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7bfb8751aeb..a93d982e634 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -656,6 +656,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_Q8_0
     };
 
+    const char * important_tensors[] = {
+        ".output.weight",
+        ".attn_output.weight",
+        ".ffn_down.weight",
+        ".ffn_down_shexp.weight"
+    };
+
     constexpr double epsilon = 1e-12;
     constexpr double infinity = std::numeric_limits<double>::infinity();
     const char * func = __func__;
@@ -1288,6 +1295,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }
 
+    auto is_important = [&](const std::string & tensor_name) -> bool {
+        return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char * imp) {
+                return tensor_name.find(imp) != std::string::npos;
+            }
+        );
+    };
+
     // Lagrangian relaxation to minimise error subject to a bpw target constraint
     auto lagrange_penalty = [&](const double mu, std::vector<int> & choice, size_t & bytes, double & err) {
         choice.resize(all.size());
@@ -1295,11 +1309,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         err = 0.0;
         for (size_t i = 0; i < all.size(); ++i) {
             const auto & candidate = all[i].candidate;
+            const std::string tensor_name = ggml_get_name(all[i].w->tensor);
+            double effective_mu = mu;
+            if (is_important(tensor_name)) { effective_mu *= 0.1; } // important tensors get 10x lower penalty
+
             int best_j = 0;
             double best_val = infinity;
             for (int j = 0; j < (int)candidate.size(); ++j) {
                 const double bits = (double)candidate[j].bytes * 8.0;
-                const double val = candidate[j].error + mu * bits;
+                const double val = candidate[j].error + effective_mu * bits;
                 if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) {
                     best_val = val;
                     best_j = j;
@@ -1402,18 +1420,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         double best_gain = -1.0;
 
         for (int i = 0; i < (int)all.size(); ++i) {
-            const auto &ti = all[i];
+            const auto & ti = all[i];
+            const std::string tensor_name = ggml_get_name(ti.w->tensor);
             int j = ti.choice + 1;
             while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; }
-            if (j >= (int)ti.candidate.size()) { continue; }
+            if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available
             size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes;
-            if (cur_bytes + delta_bytes > budget_bytes) { continue; }
+            if (cur_bytes + delta_bytes > budget_bytes) { continue; } // won't fit in budget
 
             double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error);
-            if (err_gain < epsilon) { continue; } // no real improvement
+            if (err_gain < epsilon) { continue; } // no error improvement
 
             double ratio = err_gain / (double)delta_bytes; // error reduction per byte
+            if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost
+
             // For tie-breaking, prioritize the largest absolute error improvement.
             if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) {
                 best_ratio = ratio;
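PATCH 105's "effective mu" folds tensor importance into the usual Lagrangian trade-off error + mu * bits: scaling mu down for important tensors makes extra bits cheaper for them, so they settle on larger, more accurate types at the same global penalty. A compact sketch of one penalty evaluation, with hypothetical types (the real code also tie-breaks on bytes and sweeps mu until the byte total fits the budget):

    #include <cstddef>
    #include <vector>

    struct cand { double error; double bits; };

    // Pick, for one tensor, the candidate minimising error + mu * bits.
    static size_t pick_candidate(const std::vector<cand> & cands, double mu, bool important) {
        const double effective_mu = important ? mu * 0.1 : mu; // bits are 10x cheaper here
        size_t best = 0;
        double best_val = cands[0].error + effective_mu * cands[0].bits;
        for (size_t j = 1; j < cands.size(); ++j) {
            const double val = cands[j].error + effective_mu * cands[j].bits;
            if (val < best_val) { best_val = val; best = j; }
        }
        return best;
    }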
From 940db63144d7369f88145a099370cf1bd33a45b7 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Fri, 3 Oct 2025 11:08:02 +0100
Subject: [PATCH 106/148] Select quantization type if target_bpw is set unless
 user specifies type and threads

---
 tools/quantize/quantize.cpp | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 69e03179b3b..89cf0fbf80b 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -497,6 +497,24 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
     return true;
 }
 
+static const char * get_ftype(const float bpw) {
+    const std::map<float, const char *> quant_bpw = {
+        {1.5625, "IQ1_S"},
+        {1.7500, "IQ1_M"},
+        {2.0625, "IQ2_XXS"},
+        {2.6250, "Q2_K"},
+        {3.0625, "IQ3_XXS"},
+        {3.4375, "Q3_K"},
+        {4.2500, "IQ4_XS"},
+        {4.5000, "Q4_K"},
+        {5.5000, "Q5_K"},
+        {6.5625, "Q6_K"},
+        {8.5000, "Q8_0"}
+    };
+
+    return quant_bpw.lower_bound(bpw)->second;
+}
+
 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);
@@ -655,6 +673,7 @@ int main(int argc, char ** argv) {
 
     std::string ftype_str;
     std::string suffix = ".gguf";
+    std::vector<char *> tmp_argv(argv, argv + argc);
     if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
         std::string fpath;
         const size_t pos = fname_inp.find_last_of("/\\");
@@ -678,7 +697,21 @@ int main(int argc, char ** argv) {
     }
 
     arg_idx++;
-    if (argc <= arg_idx) {
+    // select quantization type if target_bpw is set unless user specifies type and threads
+    if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) {
+        auto * ftype = const_cast<char *>(get_ftype(params.target_bpw));
+        if (argc == arg_idx) {
+            tmp_argv.push_back(ftype);
+            tmp_argv.push_back(nullptr);
+            argv = const_cast<char **>(tmp_argv.data());
+            argc++;
+        } else {
+            tmp_argv.insert(tmp_argv.end() - 1, ftype);
+            tmp_argv.push_back(nullptr);
+            argv = const_cast<char **>(tmp_argv.data());
+            argc++;
+        }
+    } else if (argc <= arg_idx) {
         fprintf(stderr, "%s: missing ftype\n", __func__);
         return 1;
     }
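get_ftype() relies on std::map::lower_bound returning the first bucket whose key is >= the requested bpw; because parse_target_bpw caps the target at 8.0 and the largest key is 8.5, the iterator can never be end(). A small usage sketch of the same pattern (reduced bucket set for brevity):

    #include <cstdio>
    #include <map>

    int main() {
        const std::map<float, const char *> quant_bpw = {
            {2.6250f, "Q2_K"}, {4.5000f, "Q4_K"}, {8.5000f, "Q8_0"},
        };
        // 4.1 bpw is not a bucket boundary; lower_bound rounds up to Q4_K.
        const auto it = quant_bpw.lower_bound(4.1f);
        printf("4.1 bpw -> %s\n", it->second);
        return 0;
    }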
From 66d4aed173aba8b3b4e05c6d7b46ca8911ec7ddf Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 4 Oct 2025 08:21:01 +0100
Subject: [PATCH 107/148] Minor refactoring

---
 tools/quantize/quantize.cpp | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 89cf0fbf80b..d355f972742 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -700,17 +700,11 @@ int main(int argc, char ** argv) {
     // select quantization type if target_bpw is set unless user specifies type and threads
     if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) {
         auto * ftype = const_cast<char *>(get_ftype(params.target_bpw));
-        if (argc == arg_idx) {
-            tmp_argv.push_back(ftype);
-            tmp_argv.push_back(nullptr);
-            argv = const_cast<char **>(tmp_argv.data());
-            argc++;
-        } else {
-            tmp_argv.insert(tmp_argv.end() - 1, ftype);
-            tmp_argv.push_back(nullptr);
-            argv = const_cast<char **>(tmp_argv.data());
-            argc++;
-        }
+        if (argc == arg_idx) { tmp_argv.push_back(ftype); }
+        else { tmp_argv.insert(tmp_argv.end() - 1, ftype); }
+        tmp_argv.push_back(nullptr);
+        argv = const_cast<char **>(tmp_argv.data());
+        argc++;
     } else if (argc <= arg_idx) {
         fprintf(stderr, "%s: missing ftype\n", __func__);
         return 1;

From 560e8c9d70964320a0283936b0d8e9fd198356ee Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 14:41:42 +0100
Subject: [PATCH 108/148] Relax lambda clamping

---
 src/llama-quant.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a93d982e634..422c929f0c8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -701,7 +701,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     };
 
     auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type {
-        if (is_compatible(t, typ)) return typ;
+        if (is_compatible(t, typ)) { return typ; }
         ggml_type fb = fallback_type(typ);
         return is_compatible(t, fb) ? fb : GGML_TYPE_F16;
     };
@@ -941,7 +941,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (s1 > 0.0) {
                 const auto n = (double)n_per_row;
                 const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n);
-                l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0);
+                l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 16.0);
             }
 
             lambdas[(size_t)s] = l;
@@ -1035,7 +1035,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) {
                 const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
                 if (src_type == GGML_TYPE_F32) {
-                    auto src_f32 = (const float *)src_row;
+                    const auto * src_f32 = (const float *)src_row;
                    f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row);
                 } else {
                     row_to_fp32(src_row, row_buffer.data());
@@ -1173,7 +1173,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     // Keep only the pareto-optimal candidates and enforce convexity in (bytes, error) curve
     auto pareto_convex = [](std::vector<candidate_types> & candidates) {
-        if (candidates.empty()) return;
+        if (candidates.empty()) { return; }
 
         std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
             if (a.bytes != b.bytes) { return a.bytes < b.bytes; }

From 533cda3076b5ae26d120f04b7aaa813f7b7a5ac7 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:16:33 +0100
Subject: [PATCH 109/148] Add signal handler

---
 src/llama-quant.cpp | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 422c929f0c8..50c8dbf4238 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include <csignal>
 #include 
 #include 
 #include 
@@ -613,6 +614,12 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
     return new_size;
 }
 
+static std::atomic<bool> bpw_stop{ false };
+
+static void signal_handler(int) {
+    bpw_stop.store(true, std::memory_order_relaxed);
+}
+
 // Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
     llama_model_loader & ml,
@@ -711,6 +718,22 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return is_quantizable(ggml_get_name(t), model.arch, params);
     };
 
+    auto install_signal_handlers = [] {
+        static std::once_flag once;
+        std::call_once(once, [] {
+            std::signal(SIGINT, signal_handler);
+            std::signal(SIGTERM, signal_handler);
+        });
+    };
+
+    auto uninstall_signal_handlers = [] {
+        static std::once_flag once;
+        std::call_once(once, [] {
+            std::signal(SIGINT, SIG_DFL);
+            std::signal(SIGTERM, SIG_DFL);
+        });
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From e48ca32f19095ba0c47058dc7a703c1bb52977e0 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:17:27 +0100
Subject: [PATCH 110/148] Add save_bpw_state()

---
 src/llama-quant.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 50c8dbf4238..3080b0ed715 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -734,6 +734,56 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         });
     };
 
+    // Saved state per tensor
+    struct saved_info {
+        std::vector<candidate_types> candidate;
+        int choice = -1;
+        float min_bpw = 0.0f;
+        float max_bpw = 0.0f;
+        size_t n_elements = 0;
+    };
+
+    auto save_bpw_state = [&](const std::vector<tensor_info> & all_vec) {
+        const std::string tmp = checkpoint_file + ".tmp";
+        std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
+        if (!ofs) { return; } // best-effort
+        const float target_bpw = params->target_bpw;
+        const uint8_t bias_mode = params->no_bias ? 1 : 0;
+        ofs.write((const char *)&file_magic, sizeof(file_magic));
+        ofs.write((const char *)&target_bpw, sizeof(target_bpw));
+        ofs.write((const char *)&bias_mode, sizeof(bias_mode));
+        const uint64_t n = all_vec.size();
+        ofs.write((const char *)&n, sizeof(n));
+        for (const auto & ti : all_vec) {
+            const std::string name = ggml_get_name(ti.w->tensor);
+            const uint32_t len = (uint32_t)name.size();
+            ofs.write((const char *)&len, sizeof(len));
+            ofs.write(name.data(), len);
+
+            const uint64_t cn = ti.candidate.size();
+            ofs.write((const char *)&cn, sizeof(cn));
+            ofs.write((const char *)&ti.choice, sizeof(ti.choice));
+            ofs.write((const char *)&ti.min_bpw, sizeof(ti.min_bpw));
+            ofs.write((const char *)&ti.max_bpw, sizeof(ti.max_bpw));
+            const uint64_t ne = ti.n_elements;
+            ofs.write((const char *)&ne, sizeof(ne));
+
+            for (const auto & c : ti.candidate) {
+                const int32_t t = c.type;
+                const uint64_t b = c.bytes;
+                ofs.write((const char *)&t, sizeof(t));
+                ofs.write((const char *)&c.bpw, sizeof(c.bpw));
+                ofs.write((const char *)&b, sizeof(b));
+                ofs.write((const char *)&c.error, sizeof(c.error));
+            }
+        }
+
+        ofs.close();
+        std::remove(checkpoint_file.c_str()); // TODO: handle errors
+        std::rename(tmp.c_str(), checkpoint_file.c_str());
+        LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From 02c3073b81cc7fa26219419c517331b3e3243379 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:18:36 +0100
Subject: [PATCH 111/148] Add load_bpw_state()

---
 src/llama-quant.cpp | 64 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3080b0ed715..4d0dc6a36e3 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -672,7 +672,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     constexpr double epsilon = 1e-12;
     constexpr double infinity = std::numeric_limits<double>::infinity();
+    constexpr uint32_t file_magic = 0x42505731; // BPW1
     const char * func = __func__;
+    const std::string checkpoint_file = ml.arch_name + ".bpw_state";
 
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
         const int64_t n_per_row = t->ne[0];
@@ -784,6 +786,68 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
     };
 
+    auto load_bpw_state = [&]() -> std::unordered_map<std::string, saved_info> {
+        std::unordered_map<std::string, saved_info> out;
+        std::ifstream ifs(checkpoint_file, std::ios::binary);
+        if (!ifs) { return out; }
+
+        uint32_t magic = 0;
+        float target_bpw = 0.0f;
+        uint8_t bias_mode = 0;
+        ifs.read((char *)&magic, sizeof(magic));
+        ifs.read((char *)&target_bpw, sizeof(target_bpw));
+        ifs.read((char *)&bias_mode, sizeof(bias_mode));
+        if (magic != file_magic) {
+            LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str());
+            return out;
+        }
+        if (target_bpw != params->target_bpw) {
+            LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str());
+            return out;
+        }
+        if (bias_mode != (params->no_bias ? 1 : 0)) {
+            LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str());
+            return out;
+        }
+
+        uint64_t n = 0;
+        ifs.read((char *)&n, sizeof(n));
+        for (uint64_t i = 0; i < n; ++i) {
+            uint32_t len = 0;
+            ifs.read((char *)&len, sizeof(len));
+            std::string name(len, '\0');
+            ifs.read(name.data(), len);
+
+            uint64_t cn = 0;
+            ifs.read((char *)&cn, sizeof(cn));
+
+            saved_info si;
+            ifs.read((char *)&si.choice, sizeof(si.choice));
+            ifs.read((char *)&si.min_bpw, sizeof(si.min_bpw));
+            ifs.read((char *)&si.max_bpw, sizeof(si.max_bpw));
+            uint64_t ne = 0;
+            ifs.read((char *)&ne, sizeof(ne));
+            si.n_elements = (size_t)ne;
+
+            si.candidate.resize(cn);
+            for (size_t j = 0; j < si.candidate.size(); ++j) {
+                int32_t t = 0;
+                uint64_t b = 0;
+                ifs.read((char *)&t, sizeof(t));
+                si.candidate[j].type = (ggml_type)t;
+                ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw));
+                ifs.read((char *)&b, sizeof(b));
+                si.candidate[j].bytes = (size_t)b;
+                ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error));
+            }
+
+            out.emplace(std::move(name), std::move(si));
+        }
+
+        LLAMA_LOG_INFO("%s: loaded bpw state for %lu tensors from %s\n", func, out.size(), checkpoint_file.c_str());
+        return out;
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From 74c62ed4e63e4e95f031875b6ead5718f5fb900a Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:19:03 +0100
Subject: [PATCH 112/148] Add delete_bpw_state()

---
 src/llama-quant.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 4d0dc6a36e3..9212c885632 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -848,6 +848,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return out;
     };
 
+    auto delete_bpw_state = [&] {
+        LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str());
+        std::remove(checkpoint_file.c_str());
+    };
+
+    auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
+        if (bpw_stop.load(std::memory_order_relaxed)) {
+            LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+            save_bpw_state(all_vec);
+            throw std::runtime_error("user interrupted the process");
+        }
+    };
+
     // Estimate error for a given type using a sampled subset of rows
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,

From 46706cec28ad83b8ab10781493b84343b5b0f048 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:20:28 +0100
Subject: [PATCH 113/148] Persist progress

---
 src/llama-quant.cpp | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 9212c885632..640672aec73 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1100,12 +1100,28 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return lambdas;
     };
 
+    install_signal_handlers();
+    auto bpw_data = load_bpw_state();
     std::vector<tensor_info> all;
     all.reserve(tensors.size());
     for (const auto * tw : tensors) {
         ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
         if (!can_quantize(tensor)) { continue; }
+        check_signal_handler(all);
+
+        // If we already have fully evaluated this tensor then reuse it
+        if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) {
+            tensor_info info;
+            info.w = tw;
+            info.candidate = it_saved->second.candidate;
+            info.choice = it_saved->second.choice;
+            info.min_bpw = it_saved->second.min_bpw;
+            info.max_bpw = it_saved->second.max_bpw;
+            info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor);
+            all.push_back(std::move(info));
+            continue;
+        }
 
         LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor));
         if (!ml.use_mmap) {
@@ -1296,6 +1312,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
             std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
             for (;;) {
+                if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived
                 const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
                 if (i >= compatible_candidates.size()) { break; }
 
@@ -1311,6 +1328,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         for (auto &th : eval_workers) { th.join(); }
 
+        // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry
+        if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) {
+            check_signal_handler(all);
+        }
+
         for (auto &c : eval_candidates) {
             if (c.bytes > 0) { info.candidate.push_back(c); }
         }
@@ -1384,6 +1406,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.min_bpw = info.candidate.front().bpw;
         info.max_bpw = info.candidate.back().bpw;
         all.push_back(std::move(info));
+        check_signal_handler(all); // save after each tensor
     }
 
     if (all.empty()) { return {}; }
@@ -1441,7 +1464,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }
     if (budget_bytes >= max_bytes) {
-        for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; }
+        for (auto & ti : all) { ti.choice = (int)ti.candidate.size() - 1; }
         return emit_overrides();
     }

From 84ada44894dec721124613820bf640b97ac3e784 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 5 Oct 2025 20:20:56 +0100
Subject: [PATCH 114/148] Uninstall signal handler and cleanup

---
 src/llama-quant.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 640672aec73..eb5c9124b5f 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1625,6 +1625,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
     }
 
+    delete_bpw_state(); // we're done, clear any checkpoint
+    uninstall_signal_handlers();
+
     return emit_overrides();
 }

From 044fa783c7e5e87bddf667fbe7396628e827b455 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Mon, 6 Oct 2025 21:40:37 +0100
Subject: [PATCH 115/148] Fix trimming logic

---
 src/llama-quant.cpp | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index eb5c9124b5f..aeb1542607c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -849,8 +849,12 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     };
 
     auto delete_bpw_state = [&] {
-        LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str());
-        std::remove(checkpoint_file.c_str());
+        std::ifstream ifs(checkpoint_file);
+        if (ifs.good()) {
+            LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str());
+            std::remove(checkpoint_file.c_str());
+        }
+
     };
 
     auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
@@ -988,14 +992,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     }
 
     // Compute error per slice with trimmed aggregation
-    auto trimmed_sum = [](std::vector<double> & v) -> double {
+    auto trimmed_mean = [](std::vector<double> & v) -> double {
         const int64_t n = (int64_t)v.size();
         if (n == 0) { return 0.0; }
-        if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets
-
-        int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution
+        double sum = std::accumulate(v.begin(), v.end(), 0.0);
+        if (n < 50) { return sum / (double)n; } // too few elements to trim
+        int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 5% (2.5% each side)
         std::sort(v.begin(), v.end());
-        return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0);
+        const auto num = (double)(n - 2 * k);
+        sum = std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0);
+        return sum / std::max(1.0, num);
     };
 
     size_t off = 0;
@@ -1028,7 +1034,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         const double denom_x = row_sq_norm[ridx];
-        const double m_norm = w_mse / (denom_x + epsilon); 
+        const double m_norm = w_mse / (denom_x + epsilon);
         row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity);
 
         if (a) {
@@ -1044,9 +1050,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             off += (size_t)n_per_row;
         }
 
-        const double scale_rows = (double)nrows / std::max(1.0, (double)rs);
-        const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows;
-        const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0;
+        const double slice_mse = trimmed_mean(row_mse_norm) * (double)nrows;
+        const double slice_proj = a ? trimmed_mean(row_proj_norm) * (double)nrows : 0.0;
 
         total_mse += slice_mse;
         total_proj += slice_proj;

From c11184a3c11917aba2c3d360a9cbb3bf3ebaf38a Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Thu, 9 Oct 2025 11:58:01 +0100
Subject: [PATCH 116/148] Generate model ID hash

---
 src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index aeb1542607c..5388d5a072a 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -674,7 +674,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     constexpr double infinity = std::numeric_limits<double>::infinity();
     constexpr uint32_t file_magic = 0x42505731; // BPW1
     const char * func = __func__;
-    const std::string checkpoint_file = ml.arch_name + ".bpw_state";
 
     auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t {
         const int64_t n_per_row = t->ne[0];
@@ -745,6 +744,26 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t n_elements = 0;
     };
 
+    auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t {
+        uint64_t h = 5381;
+        for (size_t i = 0; i < n; ++i) {
+            h = (h << 5) + h + data[i];
+        }
+        return h ? h : 0xeabada55cafed00d;
+    };
+
+    auto metadata_id = [&](const gguf_context * ctx) -> uint64_t {
+        const size_t sz = gguf_get_meta_size(ctx);
+        std::vector<uint8_t> buf(sz);
+        gguf_get_meta_data(ctx, buf.data());
+        return djb2_hash(buf.data(), buf.size());
+    };
+
+    char hex[17];
+    const uint64_t model_id = metadata_id(ml.meta.get());
+    std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id);
+    const std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state";
+
     auto save_bpw_state = [&](const std::vector<tensor_info> & all_vec) {
         const std::string tmp = checkpoint_file + ".tmp";
         std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
@@ -752,6 +771,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const float target_bpw = params->target_bpw;
         const uint8_t bias_mode = params->no_bias ? 1 : 0;
         ofs.write((const char *)&file_magic, sizeof(file_magic));
+        ofs.write((const char *)&model_id, sizeof(model_id));
         ofs.write((const char *)&target_bpw, sizeof(target_bpw));
         ofs.write((const char *)&bias_mode, sizeof(bias_mode));
         const uint64_t n = all_vec.size();
@@ -781,9 +801,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         }
 
         ofs.close();
-        std::remove(checkpoint_file.c_str()); // TODO: handle errors
+        std::remove(checkpoint_file.c_str());
         std::rename(tmp.c_str(), checkpoint_file.c_str());
-        LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+        LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
     };
 
     auto load_bpw_state = [&]() -> std::unordered_map<std::string, saved_info> {
@@ -792,22 +812,27 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (!ifs) { return out; }
 
         uint32_t magic = 0;
-        float target_bpw = 0.0f;
-        uint8_t bias_mode = 0;
+        uint64_t id = 0;
+        float bpw = 0.0f;
+        uint8_t bias = 0;
         ifs.read((char *)&magic, sizeof(magic));
-        ifs.read((char *)&target_bpw, sizeof(target_bpw));
-        ifs.read((char *)&bias_mode, sizeof(bias_mode));
+        ifs.read((char *)&id, sizeof(id));
+        ifs.read((char *)&bpw, sizeof(bpw));
+        ifs.read((char *)&bias, sizeof(bias));
         if (magic != file_magic) {
             LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
-        }
-        if (target_bpw != params->target_bpw) {
-            LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str());
+        } else if (id != model_id) {
+            LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
-        }
-        if (bias_mode != (params->no_bias ? 1 : 0)) {
+        } else if (bpw != params->target_bpw) {
+            LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str());
+            return out;
+        } else if (bias != (params->no_bias ? 1 : 0)) {
             LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
+        } else {
+            LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func);
         }
 
         uint64_t n = 0;
         ifs.read((char *)&n, sizeof(n));
@@ -859,7 +884,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     auto check_signal_handler = [&](const std::vector<tensor_info> & all_vec) {
         if (bpw_stop.load(std::memory_order_relaxed)) {
-            LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
+            LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str());
             save_bpw_state(all_vec);
             throw std::runtime_error("user interrupted the process");
         }
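PATCH 116 keys the checkpoint name on a djb2 hash of the GGUF metadata, so a state file is only resumed against the exact model that produced it (the 0xeabada55cafed00d fallback simply avoids a zero ID). The hash itself, isolated as a runnable sketch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static uint64_t djb2(const uint8_t * data, size_t n) {
        uint64_t h = 5381;
        for (size_t i = 0; i < n; ++i) {
            h = (h << 5) + h + data[i]; // h * 33 + byte
        }
        return h ? h : 0xeabada55cafed00dULL; // never return a zero ID
    }

    int main() {
        const char * meta = "example metadata blob"; // stand-in for the GGUF metadata bytes
        printf("%016llx\n", (unsigned long long)djb2((const uint8_t *)meta, strlen(meta)));
        return 0;
    }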
From 3a3d807fc3aacc01715047bcc893f925f5343c6b Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Fri, 10 Oct 2025 13:10:42 +0100
Subject: [PATCH 117/148] Remove bias mode computation

---
 src/llama-quant.cpp | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5388d5a072a..7b3e956193b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -769,11 +769,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
         if (!ofs) { return; } // best-effort
         const float target_bpw = params->target_bpw;
-        const uint8_t bias_mode = params->no_bias ? 1 : 0;
         ofs.write((const char *)&file_magic, sizeof(file_magic));
         ofs.write((const char *)&model_id, sizeof(model_id));
         ofs.write((const char *)&target_bpw, sizeof(target_bpw));
-        ofs.write((const char *)&bias_mode, sizeof(bias_mode));
         const uint64_t n = all_vec.size();
         ofs.write((const char *)&n, sizeof(n));
         for (const auto & ti : all_vec) {
@@ -814,11 +812,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         uint32_t magic = 0;
         uint64_t id = 0;
         float bpw = 0.0f;
-        uint8_t bias = 0;
         ifs.read((char *)&magic, sizeof(magic));
         ifs.read((char *)&id, sizeof(id));
         ifs.read((char *)&bpw, sizeof(bpw));
-        ifs.read((char *)&bias, sizeof(bias));
         if (magic != file_magic) {
             LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
@@ -828,9 +824,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         } else if (bpw != params->target_bpw) {
             LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str());
             return out;
-        } else if (bias != (params->no_bias ? 1 : 0)) {
-            LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str());
-            return out;
         } else {
             LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func);
         }
@@ -1319,13 +1312,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> lambdas;
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
         const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-        if (!params->no_bias) {
-            double acc = 0.0;
-            int ns = 0;
-            lambdas = estimate_lambda(values, activations, n_per_row, ne2);
-            for (float l : lambdas) { acc += l; ++ns; }
-            tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
-        }
+        double acc = 0.0;
+        int ns = 0;
+        lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+        for (float l : lambdas) { acc += l; ++ns; }
+        tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
 
         // Evaluate candidates
         std::vector<candidate_types> eval_candidates(compatible_candidates.size());
@@ -1925,11 +1916,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     if (params->target_bpw != -1.0f && !params->only_copy) {
         if (params->imatrix) {
             if (params->activations) {
-                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__);
+                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__);
             } else {
-                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__);
+                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
             }
-            LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no alignment bias)" : "alignment bias (default)");
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
             bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
         } else {

From c93131cef6dbb4e415fd2b3625f644c6714e7465 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Fri, 10 Oct 2025 13:26:51 +0100
Subject: [PATCH 118/148] Remove --no-bias option

---
 include/llama.h             | 1 -
 src/llama-quant.cpp         | 3 +--
 tools/quantize/quantize.cpp | 6 +-----
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 16f61247272..1df8f96920c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -365,7 +365,6 @@ extern "C" {
         void * tensor_types; // pointer to vector containing tensor types
         void * prune_layers; // pointer to vector containing layer indices to prune
         float target_bpw; // target bits per weight (bpw)
-        bool no_bias; // use mean square error estimation only (no alignment bias)
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7b3e956193b..4ad5124d1ab 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -2180,8 +2180,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.kv_overrides =*/ nullptr,
         /*.tensor_type =*/ nullptr,
         /*.prune_layers =*/ nullptr,
-        /*.target_bpw =*/ -1.0f,
-        /*.no_bias =*/ false
+        /*.target_bpw =*/ -1.0f
     };
 
     return result;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index d355f972742..c254c3f6b24 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 [[noreturn]]
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable);
-    printf("       [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@@ -134,8 +134,6 @@ static void usage(const char * executable) {
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n");
     printf("      Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
-    printf("  --no-bias: use mean square error estimation only (no alignment bias)\n");
-    printf("      Advanced option to use MSE only and disable alignment bias error estimation\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -559,8 +557,6 @@ int main(int argc, char ** argv) {
         if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
             usage(argv[0]);
         }
-    } else if (strcmp(argv[arg_idx], "--no-bias") == 0) {
-        params.no_bias = true;
     } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
         if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
             usage(argv[0]);

From 5b0d3f6d5ad46596e0f30c967c00e2dc2b93d8da Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 11 Oct 2025 10:04:48 +0100
Subject: [PATCH 119/148] Automatically determine if bias error is significant

---
 src/llama-quant.cpp | 52 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 4ad5124d1ab..07a88f0fd68 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -637,6 +637,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         float bpw;
         size_t bytes;
         double error;
+        double mse = 0.0;
+        double proj = 0.0;
     };
 
     struct tensor_info {
@@ -1340,9 +1342,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     const ggml_type tensor_types = compatible_candidates[i];
                     const auto bpw = (float)tensor_bpw(tensor, tensor_types);
                     const size_t bytes = tensor_bytes(tensor, tensor_types);
+                    double mse = 0.0;
+                    double proj = 0.0;
                     const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
-                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda);
-                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err };
+                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
+                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
                 }
             });
         }
@@ -1354,8 +1358,48 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             check_signal_handler(all);
         }
 
-        for (auto &c : eval_candidates) {
-            if (c.bytes > 0) { info.candidate.push_back(c); }
+        // Check if biasing is needed
+        bool bias_needed = false;
+        if (!lambdas.empty()) {
+            int min_mse = -1;
+            int min_bias = -1;
+            {
+                double best_mse = std::numeric_limits<double>::infinity();
+                double best_err = std::numeric_limits<double>::infinity();
+                for (int i = 0; i < (int)eval_candidates.size(); ++i) {
+                    const auto & c = eval_candidates[i];
+                    if (c.bytes == 0) { continue; }
+                    if (c.mse < best_mse) {
+                        best_mse = c.mse;
+                        min_mse = i;
+                    }
+                    if (c.error < best_err) {
+                        best_err = c.error;
+                        min_bias = i;
+                    }
+                }
+            }
+
+            if (min_mse != min_bias) {
+                bias_needed = true;
+            } else {
+                double max_rel_bias = 0.0;
+                for (const auto & c : eval_candidates) {
+                    if (c.bytes == 0) { continue; }
+                    const double mse = std::max(c.mse, epsilon);
+                    const double bias_term = std::max(0.0, c.error - c.mse);
+                    const double rel = bias_term / mse;
+                    max_rel_bias = std::max(rel, max_rel_bias);
+                }
+
+                bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE?
+            }
+        }
+
+        for (auto & c : eval_candidates) {
+            if (c.bytes == 0) { continue; }
+            const double final_err = bias_needed ? c.error : c.mse;
+            info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj });
         }
 
         if (info.candidate.empty()) {
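The heuristic in PATCH 119 can be read as: keep the bias-adjusted error only when it would change a decision (a different per-tensor argmin) or when the bias term is a large fraction of the MSE for some candidate. A minimal sketch of the second test, using an illustrative struct rather than the PR's:

    #include <algorithm>
    #include <vector>

    struct eval { double error; double mse; }; // error = mse + bias term

    // True when, for any candidate, the bias term reaches 50% of its MSE.
    static bool bias_is_significant(const std::vector<eval> & cands, double eps = 1e-12) {
        double max_rel = 0.0;
        for (const eval & c : cands) {
            const double mse  = std::max(c.mse, eps);
            const double bias = std::max(0.0, c.error - c.mse);
            max_rel = std::max(max_rel, bias / mse);
        }
        return max_rel >= 0.5;
    }

When the test fails, the per-candidate errors collapse to plain MSE, which keeps the later knapsack stage from chasing noise in the bias estimate.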
From 12e0524f3a24d4d5c8a81546fff83fee81e0d3e1 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 12 Oct 2025 15:12:15 +0100
Subject: [PATCH 120/148] Reduce compute time by parallelising tensor
 processing - courtesy of https://github.com/ddh0

---
 src/llama-quant.cpp | 189 +++++++++++++++++++++++---------------------
 1 file changed, 101 insertions(+), 88 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 07a88f0fd68..c607651b05b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include <optional>
 
 // Quantization types. Changes to this struct must be replicated in quantize.cpp
 struct tensor_quantization {
@@ -623,7 +624,6 @@ static void signal_handler(int) {
 // Returns tensor type overrides to meet a global bpw target
 static std::unordered_map<std::string, ggml_type> target_bpw_type(
     llama_model_loader & ml,
-    std::vector<no_init<uint8_t>> & buffer,
     const llama_model & model,
     const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
     const std::map & mapped,
@@ -659,6 +659,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         GGML_TYPE_IQ3_XXS,
         GGML_TYPE_Q3_K,
         GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
         GGML_TYPE_Q4_K,
         GGML_TYPE_Q5_K,
         GGML_TYPE_Q6_K,
@@ -1127,16 +1128,22 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     install_signal_handlers();
     auto bpw_data = load_bpw_state();
-    std::vector<tensor_info> all;
-    all.reserve(tensors.size());
-    for (const auto * tw : tensors) {
+
+    // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0
+    auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw,
+                              std::vector<no_init<uint8_t>> & thread_local_buffer,
+                              std::mutex & loader_mutex,
+                              std::mutex & log_mutex) -> std::optional<tensor_info>
+    {
         ggml_tensor * tensor = tw->tensor;
         const std::string name = ggml_get_name(tensor);
-        if (!can_quantize(tensor)) { continue; }
-        check_signal_handler(all);
+        if (bpw_stop.load(std::memory_order_relaxed)) {
+            return std::nullopt;
+        }
 
-        // If we already have fully evaluated this tensor then reuse it
-        if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) {
+        // check for pre-computed results from a checkpoint file.
+        auto it_saved = bpw_data.find(name);
+        if (it_saved != bpw_data.end()) {
             tensor_info info;
             info.w = tw;
             info.candidate = it_saved->second.candidate;
@@ -1144,17 +1151,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             info.min_bpw = it_saved->second.min_bpw;
             info.max_bpw = it_saved->second.max_bpw;
             info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor);
-            all.push_back(std::move(info));
-            continue;
+            return info;
+        }
+        {
+            std::lock_guard<std::mutex> lock(log_mutex);
+            LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor));
         }
-
-        LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor));
 
         if (!ml.use_mmap) {
-            if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); }
-            tensor->data = buffer.data();
+            if (thread_local_buffer.size() < ggml_nbytes(tensor)) { thread_local_buffer.resize(ggml_nbytes(tensor)); }
+            tensor->data = thread_local_buffer.data();
+        }
+        {
+            std::lock_guard<std::mutex> lock(loader_mutex);
+            ml.load_data_for(tensor);
         }
-
-        ml.load_data_for(tensor);
 
         // Dequantize sampled rows into f32_sample
         const int64_t n_per_row = tensor->ne[0];
@@ -1170,7 +1181,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             const int64_t max_rows = 4096;
             int64_t total_rows = std::llround(slice_budget / std::max<int64_t>(1, n));
             total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows)));
-            if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors
+            if (rows <= min_rows * 2) { total_rows = rows; }
             return total_rows;
         };
@@ -1191,17 +1202,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 return;
             }
             if (t == GGML_TYPE_F16) {
-                ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row);
+                ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
                 return;
             }
             if (t == GGML_TYPE_BF16) {
-                ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row);
+                ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
                 return;
             }
-
             if (src_is_quant) {
                 GGML_ASSERT(src_traits && src_traits->to_float);
-                src_traits->to_float(src, dst, (int) n_per_row);
+                src_traits->to_float(src, dst, (int)n_per_row);
                 return;
             }
@@ -1266,6 +1276,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 return;
             }
 
+            std::lock_guard<std::mutex> lock(log_mutex);
             LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want);
         };
@@ -1276,12 +1287,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); }
         if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); }
 
-        const int64_t nelem = ggml_nelements(tensor);
         tensor_info info;
         info.w = tw;
-        info.n_elements = nelem;
-
-        // Prepare scratch buffers sized for the largest candidate row size
+        info.n_elements = ggml_nelements(tensor);
         size_t total_sampled_rows = f32_sample.size() / n_per_row;
 
         // Build list of candidate types first (compatible ones)
@@ -1295,7 +1303,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         for (size_t i = 0; i < base_sz; ++i) {
             ggml_type ts_type = base_arr[i];
             if (is_iq(ts_type) && !has_valid_imatrix) {
-                LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str());
+                std::lock_guard<std::mutex> lock(log_mutex);
+                LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str());
                 continue;
             }
@@ -1325,58 +1334,38 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
         std::vector<float> dequantized_buffer(f32_sample.size());
         const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data();
-        int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size()));
-        std::atomic<size_t> cidx{0};
-        std::vector<std::thread> eval_workers;
-        eval_workers.reserve(n_eval_threads);
-        for (int ti = 0; ti < n_eval_threads; ++ti) {
-            eval_workers.emplace_back([&] {
-                // thread-local scratch
-                std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
-                std::vector<float> tl_dequantized_buffer(dequantized_buffer.size());
-                for (;;) {
-                    if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived
-                    const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel);
-                    if (i >= compatible_candidates.size()) { break; }
-
-                    const ggml_type tensor_types = compatible_candidates[i];
-                    const auto bpw = (float)tensor_bpw(tensor, tensor_types);
-                    const size_t bytes = tensor_bytes(tensor, tensor_types);
-                    double mse = 0.0;
-                    double proj = 0.0;
-                    const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
-                        tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
-                    eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
-                }
-            });
-        }
+        for (size_t i = 0; i < compatible_candidates.size(); ++i) {
+            if (bpw_stop.load(std::memory_order_relaxed)) { break; }
 
-        for (auto &th : eval_workers) { th.join(); }
-
-        // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry
-        if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) {
-            check_signal_handler(all);
+            const ggml_type tensor_types = compatible_candidates[i];
+            const auto bpw = (float)tensor_bpw(tensor, tensor_types);
+            const size_t bytes = tensor_bytes(tensor, tensor_types);
+            double mse = 0.0;
+            double proj = 0.0;
+            const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
+                quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj);
+            eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj };
         }
 
+        if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
+
         // Check if biasing is needed
         bool bias_needed = false;
         if (!lambdas.empty()) {
             int min_mse = -1;
             int min_bias = -1;
-            {
-                double best_mse = std::numeric_limits<double>::infinity();
-                double best_err = std::numeric_limits<double>::infinity();
-                for (int i = 0; i < (int)eval_candidates.size(); ++i) {
-                    const auto & c = eval_candidates[i];
-                    if (c.bytes == 0) { continue; }
-                    if (c.mse < best_mse) {
-                        best_mse = c.mse;
-                        min_mse = i;
-                    }
-                    if (c.error < best_err) {
-                        best_err = c.error;
-                        min_bias = i;
-                    }
+            double best_mse = std::numeric_limits<double>::infinity();
+            double best_err = std::numeric_limits<double>::infinity();
+            for (int i = 0; i < (int)eval_candidates.size(); ++i) {
+                const auto & c = eval_candidates[i];
+                if (c.bytes == 0) { continue; }
+                if (c.mse < best_mse) {
+                    best_mse = c.mse;
+                    min_mse = i;
+                }
+                if (c.error < best_err) {
+                    best_err = c.error;
+                    min_bias = i;
                 }
             }
@@ -1388,8 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 if (c.bytes == 0) { continue; }
                 const double mse = std::max(c.mse, epsilon);
                 const double bias_term = std::max(0.0, c.error - c.mse);
-                const double rel = bias_term / mse;
-                max_rel_bias = std::max(rel, max_rel_bias);
+                max_rel_bias = std::max(bias_term / mse, max_rel_bias);
             }
 
             bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE?
@@ -1404,7 +1392,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         if (info.candidate.empty()) {
             // As a last resort, keep original type
-            float bpw = ggml_nbytes(tensor) * 8.0f / nelem;
+            float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements;
             info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 });
         }
@@ -1416,26 +1404,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
             return a.error < b.error;
         });
-        const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
+        candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) {
             return a.bytes == b.bytes;
-        });
-        candidates.erase(last, candidates.end());
-
-        // Pareto by bytes -> error
+        }), candidates.end());
         std::vector<candidate_types> pareto;
         pareto.reserve(candidates.size());
         double best_err = infinity;
-        size_t last_b = std::numeric_limits<size_t>::max();
         for (const auto & c : candidates) {
-            if (c.bytes != last_b) {
-                last_b = c.bytes;
-                if (c.error < best_err) {
-                    best_err = c.error;
-                    pareto.push_back(c);
-                }
+            if (c.error < best_err) {
+                best_err = c.error;
+                pareto.push_back(c);
             }
         }
-
         candidates.swap(pareto);
 
         if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull
@@ -1470,10 +1450,43 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.choice = 0;
         info.min_bpw = info.candidate.front().bpw;
         info.max_bpw = info.candidate.back().bpw;
-        all.push_back(std::move(info));
-        check_signal_handler(all); // save after each tensor
+
+        return info;
+    };
+
+    std::vector<tensor_info> all; // this vector will be populated by the parallel workers
+    {
+        std::atomic<size_t> tensor_idx{0}; // shared work queue index for all threads
+        const size_t num_tensors_to_process = tensors.size();
+        std::mutex loader_mutex;
+        std::mutex log_mutex;
+        std::mutex results_mutex;
+        std::vector<std::thread> workers;
+        int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process));
+
+        for (int i = 0; i < num_threads_to_spawn; ++i) {
+            workers.emplace_back([&]() {
+                std::vector<no_init<uint8_t>> thread_local_buffer;
+                while (true) {
+                    const size_t current_idx = tensor_idx.fetch_add(1);
+                    if (current_idx >= num_tensors_to_process) { break; }
+                    const auto * tw = tensors[current_idx];
+                    if (!can_quantize(tw->tensor)) { continue; }
+                    // Execute the main processing logic for this tensor
+                    std::optional<tensor_info> result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex);
+                    if (result_info) {
+                        std::lock_guard<std::mutex> lock(results_mutex);
+                        all.push_back(std::move(*result_info));
+                    }
+                }
+            });
+        }
+
+        for (auto & w : workers) { w.join(); }
     }
 
+    check_signal_handler(all);
+
     if (all.empty()) { return {}; }
 
     // Compute total elements across all tensors and bytes for non-quantizable tensors
@@ -1965,7 +1978,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
         }
         LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
+        bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
     } else {
         LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__);
     }
00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 16:30:35 +0100 Subject: [PATCH 121/148] Add quant types --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c607651b05b..56e63f9bb76 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,8 +655,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -1155,7 +1158,7 @@ static std::unordered_map target_bpw_type( } { std::lock_guard lock(log_mutex); - LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", func, name.c_str(), ggml_nelements(tensor)); } if (!ml.use_mmap) { @@ -1457,19 +1460,19 @@ static std::unordered_map target_bpw_type( std::vector all; // this vector will be populated by the parallel workers { std::atomic tensor_idx{0}; // shared work queue index for all threads - const size_t num_tensors_to_process = tensors.size(); + const size_t tensors_to_process = tensors.size(); std::mutex loader_mutex; std::mutex log_mutex; std::mutex results_mutex; std::vector workers; - int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process)); + int threads_to_spawn = std::max(1, std::min(nthread, (int)tensors_to_process)); - for (int i = 0; i < num_threads_to_spawn; ++i) { + for (int i = 0; i < threads_to_spawn; ++i) { workers.emplace_back([&]() { std::vector> thread_local_buffer; while (true) { const size_t current_idx = tensor_idx.fetch_add(1); - if (current_idx >= num_tensors_to_process) { break; } + if (current_idx >= tensors_to_process) { break; } const auto * tw = tensors[current_idx]; if (!can_quantize(tw->tensor)) { continue; } // Execute the main processing logic for this tensor From ca282302b5cde95945f8337e6df264d92e878501 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 18:23:23 +0100 Subject: [PATCH 122/148] Add --keep-bpw-state option --- include/llama.h | 1 + src/llama-quant.cpp | 16 +++++----------- tools/quantize/quantize.cpp | 5 ++++- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/llama.h b/include/llama.h index 14e12d7c518..f745e2110b7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,6 +366,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool keep_bpw_state; // keep bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 56e63f9bb76..4b243f1f557 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -659,7 +659,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, - GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -773,11 +772,9 @@ static std::unordered_map target_bpw_type( auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); - if (!ofs) { return; } // best-effort - const float target_bpw = params->target_bpw; + if (!ofs) { return; } ofs.write((const char 
*)&file_magic, sizeof(file_magic)); ofs.write((const char *)&model_id, sizeof(model_id)); - ofs.write((const char *)&target_bpw, sizeof(target_bpw)); const uint64_t n = all_vec.size(); ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { @@ -817,19 +814,14 @@ static std::unordered_map target_bpw_type( uint32_t magic = 0; uint64_t id = 0; - float bpw = 0.0f; ifs.read((char *)&magic, sizeof(magic)); ifs.read((char *)&id, sizeof(id)); - ifs.read((char *)&bpw, sizeof(bpw)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (bpw != params->target_bpw) { - LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); - return out; } else { LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } @@ -874,7 +866,7 @@ static std::unordered_map target_bpw_type( auto delete_bpw_state = [&] { std::ifstream ifs(checkpoint_file); - if (ifs.good()) { + if (ifs.good() && !params->keep_bpw_state) { LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } @@ -1489,6 +1481,7 @@ static std::unordered_map target_bpw_type( } check_signal_handler(all); + if (params->keep_bpw_state) { save_bpw_state(all); } if (all.empty()) { return {}; } @@ -2240,7 +2233,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.keep_bpw_state =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c254c3f6b24..ad2563a48d2 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -134,6 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -557,6 +558,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { + params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From b1b58e67df30453edd64706abda76d3c42f0bb03 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 14:54:32 +0100 Subject: [PATCH 123/148] Refactor signal handlers --- src/llama-quant.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b243f1f557..d1fa4295530 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -632,6 +632,22 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { + // RAII guard for signal handlers + bpw_stop.store(false, std::memory_order_relaxed); + struct signal_scope_guard { + using handler_t = void (*)(int); + handler_t prev_int = SIG_DFL; + handler_t prev_term = SIG_DFL; + signal_scope_guard() { + prev_int = std::signal(SIGINT, signal_handler); + prev_term = std::signal(SIGTERM, signal_handler); + } + ~signal_scope_guard() { + std::signal(SIGINT, prev_int); + std::signal(SIGTERM, prev_term); + } + } _signal_guard; + struct candidate_types { ggml_type type; float bpw; @@ -724,22 +740,6 @@ static std::unordered_map target_bpw_type( return is_quantizable(ggml_get_name(t), model.arch, params); }; - auto install_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - }); - }; - - auto uninstall_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, SIG_DFL); - std::signal(SIGTERM, SIG_DFL); - }); - }; - // Saved state per tensor struct saved_info { std::vector candidate; @@ -1121,7 +1121,6 @@ static std::unordered_map target_bpw_type( return lambdas; }; - install_signal_handlers(); auto bpw_data = load_bpw_state(); // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 @@ -1700,7 +1699,6 @@ static std::unordered_map target_bpw_type( } delete_bpw_state(); // we're done, clear any checkpoint - uninstall_signal_handlers(); return emit_overrides(); } From cd734b89ce3b2af611fd168975a5921f33b475eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 15:15:23 +0100 Subject: [PATCH 124/148] Update quant types --- src/llama-quant.cpp | 3 ++- tools/quantize/quantize.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d1fa4295530..7543ec69618 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -681,7 +681,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + 
GGML_TYPE_F16 }; const char * important_tensors[] = { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ad2563a48d2..e67649beb97 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -501,6 +501,8 @@ static const char * get_ftype(const float bpw) { {1.5625, "IQ1_S"}, {1.7500, "IQ1_M"}, {2.0625, "IQ2_XXS"}, + {2.3125, "IQ2_XS"}, + {2.5625, "IQ2_S"}, {2.6250, "Q2_K"}, {3.0625, "IQ3_XXS"}, {3.4375, "Q3_K"}, From b7911f14314387e4101957d4eb4df9650660c877 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 17:46:45 +0100 Subject: [PATCH 125/148] Minor refactoring --- src/llama-quant.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7543ec69618..0f256eface7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1122,9 +1122,9 @@ static std::unordered_map target_bpw_type( return lambdas; }; - auto bpw_data = load_bpw_state(); + const auto bpw_data = load_bpw_state(); - // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1330,7 +1330,7 @@ static std::unordered_map target_bpw_type( std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data(); for (size_t i = 0; i < compatible_candidates.size(); ++i) { - if (bpw_stop.load(std::memory_order_relaxed)) { break; } + if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } const ggml_type tensor_types = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(tensor, tensor_types); @@ -1383,6 +1383,8 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? 
c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); + // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", + // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1426,7 +1428,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); } else { break; @@ -1670,7 +1672,6 @@ static std::unordered_map target_bpw_type( const auto & ti = all[i]; const std::string tensor_name = ggml_get_name(ti.w->tensor); int j = ti.choice + 1; - while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; From a6853ea2ae7d828e535874e6f2244786921df594 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:20:24 +0100 Subject: [PATCH 126/148] Add tensor type and depth heuristics --- src/llama-quant.cpp | 94 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f256eface7..38d20e3d0f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -16,6 +16,7 @@ #include #include #include +#include // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { @@ -685,13 +686,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_F16 }; - const char * important_tensors[] = { - ".output.weight", - ".attn_output.weight", - ".ffn_down.weight", - ".ffn_down_shexp.weight" - }; - constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 @@ -1544,11 +1538,89 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto is_important = [&](const std::string & tensor_name) -> bool { - return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char* imp) { - return tensor_name.find(imp) != std::string::npos; + auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + std::unordered_map scores; + for (const auto & ti : all_vec) { + const std::string name = ggml_get_name(ti.w->tensor); + float total_score = 0.0f; + float depth_score = 0.0f; + float type_score = 0.0f; + + // Depth component: output, embeddings & early/late layers are important + if (name.find("output.weight") != std::string::npos || + name.find("token_embd.weight") != std::string::npos) { + depth_score = 1.0f; + } + else if (name.find(".attn_output.weight") != std::string::npos) { + depth_score = 0.9f; + } else { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + depth_score = 0.2f + 0.6f * center_dist; + } + } + + // Type component: certain tensor types are more important + if (name.find("output.weight") != std::string::npos) { + type_score = 1.0f; + } else if 
(name.find(".attn_output.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".ffn_down.weight") != std::string::npos || + name.find(".ffn_down_shexp.weight") != std::string::npos || + name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.8f; + } else if (name.find(".attn_q.weight") != std::string::npos || + name.find(".attn_k.weight") != std::string::npos || + name.find(".attn_v.weight") != std::string::npos || + name.find(".attn_qkv.weight") != std::string::npos) { + type_score = 0.7f; + } else if (name.find(".ffn_up.weight") != std::string::npos || + name.find(".ffn_gate.weight") != std::string::npos || + name.find(".ffn_up_shexp.weight") != std::string::npos || + name.find(".ffn_gate_shexp.weight") != std::string::npos || + name.find(".ffn_up_exps.weight") != std::string::npos || + name.find(".ffn_gate_exps.weight") != std::string::npos) { + type_score = 0.6f; + } else if (name.find("token_embd.weight") != std::string::npos) { + type_score = 0.5f; } - ); + + // Weighted combination + total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth + scores[name] = total_score; + } + + return scores; + }; + + auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { + const auto scores = tensor_importance(all_vec); + + // Sort by score + std::vector> sorted_scores(scores.begin(), scores.end()); + std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); + + // Select top percentile + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + + std::unordered_set important; + for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { + important.insert(sorted_scores[i].first); + //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + } + + LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + return important; + }; + + const auto important_set = select_tensors(all); + + auto is_important = [&](const std::string & tensor_name) -> bool { + return important_set.count(tensor_name) > 0; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From 0b3e930d5204d3c4be96179835f5378811814247 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:41:26 +0100 Subject: [PATCH 127/148] Add option to override bpw state file name --- include/llama.h | 1 + src/llama-quant.cpp | 21 +++++++++++++++++++-- tools/quantize/quantize.cpp | 15 +++++++++++---- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/llama.h b/include/llama.h index f745e2110b7..ce04011e191 100644 --- a/include/llama.h +++ b/include/llama.h @@ -367,6 +367,7 @@ extern "C" { void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file + void * bpw_state; // pointer to bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 38d20e3d0f3..1dee52d58d9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -762,7 +762,23 @@ static std::unordered_map target_bpw_type( char hex[17]; const uint64_t model_id = metadata_id(ml.meta.get()); std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); - const std::string checkpoint_file = ml.arch_name + "-" + 
std::string(hex) + ".bpw_state"; + std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + if (params->keep_bpw_state && params->bpw_state) { + const auto * filename = static_cast(params->bpw_state); + std::ifstream ifs(filename, std::ios::binary); + if (ifs.good()) { + checkpoint_file = std::string(filename); + } else { + std::ofstream ofs(filename, std::ios::binary | std::ios::app); + if (ofs.is_open()) { + checkpoint_file = std::string(filename); + ofs.close(); + std::remove(checkpoint_file.c_str()); + } else { + LLAMA_LOG_WARN("%s: %s is not a valid file name. Using %s instead\n", func, filename, checkpoint_file.c_str()); + } + } + } auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; @@ -2306,7 +2322,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.keep_bpw_state =*/ false + /*.keep_bpw_state =*/ false, + /*.bpw_state =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index e67649beb97..945acbe2887 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,8 +117,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); + printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -128,13 +128,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); + printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. Example: --tensor-type attn_q=q8_0\n"); printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); + printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); + printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -562,6 +563,12 @@ int main(int argc, char ** argv) { } } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; + } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { + if (arg_idx < argc-1) { + params.bpw_state = argv[++arg_idx]; + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From a5103933bb4eec23b71bd8ccaae3b80710a1a82a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 15:11:48 +0100 Subject: [PATCH 128/148] Minor refactoring --- src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1dee52d58d9..b8391a4f2c2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -647,7 +647,7 @@ static std::unordered_map target_bpw_type( std::signal(SIGINT, prev_int); std::signal(SIGTERM, prev_term); } - } _signal_guard; + } signal_guard; struct candidate_types { ggml_type type; @@ -683,7 +683,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, +#ifdef GGML_USE_METAL GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; @@ -1004,17 +1008,30 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (!traits || !traits->to_float) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } - - for (size_t r = 0; r < sample_rows; ++r) { - const uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - traits->to_float(src, dst, (int)n_per_row); + if (quant_type == GGML_TYPE_F16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_fp16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_fp16_to_fp32_row(src, dst, (int)n_per_row); + } + } else if (quant_type == GGML_TYPE_BF16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_bf16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_bf16_to_fp32_row(src, dst, (int)n_per_row); + } + } else { + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + return infinity; + } + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + traits->to_float(src, dst, (int)n_per_row); + } } } @@ -1500,13 +1517,11 @@ static 
std::unordered_map target_bpw_type( // Compute total elements across all tensors and bytes for non-quantizable tensors size_t nq_elements = 0; size_t nq_bytes = 0; - for (const auto & it : ml.weights_map) { - const ggml_tensor * tensor = it.second.tensor; - const std::string name = it.first; + for (const auto * it : tensors) { + const ggml_tensor * tensor = it->tensor; + const std::string name = ggml_get_name(tensor); nq_elements += (size_t)ggml_nelements(tensor); - if (!is_quantizable(name, model.arch, params)) { - nq_bytes += ggml_nbytes(tensor); - } + if (!can_quantize(tensor)) { nq_bytes += ggml_nbytes(tensor); } } auto total_bytes = [&]() -> size_t { From fa1df81d49a0512cb4dc6b9b2afc10e7af86bcf2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 20:52:23 +0100 Subject: [PATCH 129/148] Finetune heuristics --- src/llama-quant.cpp | 51 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 91b127789cc..5e3893151c6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1577,13 +1577,9 @@ static std::unordered_map target_bpw_type( float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output, embeddings & early/late layers are important - if (name.find("output.weight") != std::string::npos || - name.find("token_embd.weight") != std::string::npos) { + // Depth component: output & early/late layers are important + if (name == "output.weight") { depth_score = 1.0f; - } - else if (name.find(".attn_output.weight") != std::string::npos) { - depth_score = 0.9f; } else { static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); std::smatch match; @@ -1591,38 +1587,40 @@ static std::unordered_map target_bpw_type( const int layer = std::stoi(match[1]); const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.2f + 0.6f * center_dist; + depth_score = 0.9f * center_dist; } } - // Type component: certain tensor types are more important - if (name.find("output.weight") != std::string::npos) { + // Type component: certain tensor types have more impact on model quality + if (name == "output.weight") { type_score = 1.0f; - } else if (name.find(".attn_output.weight") != std::string::npos) { - type_score = 0.9f; } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_shexp.weight") != std::string::npos || name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".attn_output.weight") != std::string::npos || + name.find(".time_mix_output.weight") != std::string::npos || + name.find(".attn_o.weight") != std::string::npos) { type_score = 0.8f; - } else if (name.find(".attn_q.weight") != std::string::npos || - name.find(".attn_k.weight") != std::string::npos || - name.find(".attn_v.weight") != std::string::npos || - name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.7f; } else if (name.find(".ffn_up.weight") != std::string::npos || name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_shexp.weight") != std::string::npos || - name.find(".ffn_gate_shexp.weight") != std::string::npos || name.find(".ffn_up_exps.weight") != std::string::npos || name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.6f; + type_score = 0.3f; + } else if (name.find(".attn_q.weight") != std::string::npos || + name.find(".attn_k.weight") 
!= std::string::npos || + name.find(".attn_v.weight") != std::string::npos || + name.find(".attn_qkv.weight") != std::string::npos) { + type_score = 0.2f; } else if (name.find("token_embd.weight") != std::string::npos) { - type_score = 0.5f; + type_score = 0.1f; } // Weighted combination - total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth - scores[name] = total_score; + total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + if (total_score != 0.0f) { + scores[name] = total_score; + LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); + } } return scores; @@ -1636,15 +1634,16 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { important.insert(sorted_scores[i].first); - //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); } - LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size(); + LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); return important; }; From 00ddf039b306882a8a15761624bcdd673f666f71 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:38:49 +0100 Subject: [PATCH 130/148] Update usage --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 945acbe2887..f994999e591 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); From 543b5a99db2b74e2b74cb87a222a25586479bd9b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:57:03 +0100 Subject: [PATCH 131/148] Fix lambda capture --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5e3893151c6..e6c9bfa7f0c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [](std::vector & candidates) { + auto pareto_convex = [epsilon](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 27bf25e93c9309b96a151c1d8c4eef8fdad0cb21 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 22:04:35 +0100 Subject: [PATCH 132/148] Fix lambda capture --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e6c9bfa7f0c..08f1b302934 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - constexpr double epsilon = 1e-12; + const double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; @@ -1118,7 +1118,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [epsilon](std::vector & candidates) { + auto pareto_convex = [&](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 04561d5782b930e781627eee5ffcbb6b06e8b558 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 21 Oct 2025 12:53:26 +0100 Subject: [PATCH 133/148] Update epsilon specifier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08f1b302934..5280b9a02af 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - const double epsilon = 1e-12; + constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; From d6ccd5649ac6db0ad87156cf92f036737cf82be3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 25 Oct 2025 12:09:20 +0100 Subject: [PATCH 134/148] Finetune heuristics --- src/llama-quant.cpp | 81 ++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
5280b9a02af..617c7d94737 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -838,7 +838,7 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else { - LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } uint64_t n = 0; @@ -1569,54 +1569,59 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + auto tensor_depth = [&](const std::string & name) -> float { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + + // Depth component: output, embeddings & early/late layers are important + if (name == "output.weight" || name == "token_embd.weight") { + return 1.0f; + } + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + return 0.01f + 0.9f * center_dist; + } + + return 0.0f; + }; + + auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { std::unordered_map scores; - for (const auto & ti : all_vec) { - const std::string name = ggml_get_name(ti.w->tensor); + for (const auto & t : all_tensors) { + const std::string name = ggml_get_name(t.w->tensor); float total_score = 0.0f; float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output & early/late layers are important + // Type component: certain tensor types have more impact on model quality + const std::vector>> tensor_scores = { + {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, + {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, + {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, + {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, + {0.2f, {"token_embd.weight"}} + }; if (name == "output.weight") { - depth_score = 1.0f; + type_score = 1.0f; } else { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.9f * center_dist; + for (const auto& ts : tensor_scores) { + const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { + return name.find(pattern) != std::string::npos; + }); + if (found) { + type_score = ts.first; + break; + } } } - - // Type component: certain tensor types have more impact on model quality - if (name == "output.weight") { - type_score = 1.0f; - } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_exps.weight") != std::string::npos) { - type_score = 0.9f; - } else if (name.find(".attn_output.weight") != std::string::npos || - name.find(".time_mix_output.weight") != std::string::npos || - name.find(".attn_o.weight") != std::string::npos) { - type_score = 0.8f; - } else if (name.find(".ffn_up.weight") != std::string::npos || - name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_exps.weight") != std::string::npos || - 
name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.3f; - } else if (name.find(".attn_q.weight") != std::string::npos || - name.find(".attn_k.weight") != std::string::npos || - name.find(".attn_v.weight") != std::string::npos || - name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.2f; - } else if (name.find("token_embd.weight") != std::string::npos) { - type_score = 0.1f; + if (type_score > 0.0f) { + depth_score = tensor_depth(name); } // Weighted combination - total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth if (total_score != 0.0f) { scores[name] = total_score; LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); @@ -1634,7 +1639,7 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { From 5303212324c90745eb82c3e5f5abb32b184cb7fa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 26 Oct 2025 17:40:52 +0000 Subject: [PATCH 135/148] Simplify tensor selection --- src/llama-quant.cpp | 99 +++++---------------------------------------- 1 file changed, 11 insertions(+), 88 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 617c7d94737..04f4ff341af 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -11,11 +11,12 @@ #include #include #include +#include +#include #include #include #include #include -#include #include // Quantization types. 
Changes to this struct must be replicated in quantize.cpp @@ -1151,7 +1152,7 @@ static std::unordered_map target_bpw_type( const auto bpw_data = load_bpw_state(); - // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Parallelize tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1569,93 +1570,15 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_depth = [&](const std::string & name) -> float { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - - // Depth component: output, embeddings & early/late layers are important - if (name == "output.weight" || name == "token_embd.weight") { - return 1.0f; - } - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - return 0.01f + 0.9f * center_dist; - } - - return 0.0f; - }; - - auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { - std::unordered_map scores; - for (const auto & t : all_tensors) { - const std::string name = ggml_get_name(t.w->tensor); - float total_score = 0.0f; - float depth_score = 0.0f; - float type_score = 0.0f; - - // Type component: certain tensor types have more impact on model quality - const std::vector>> tensor_scores = { - {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, - {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, - {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, - {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, - {0.2f, {"token_embd.weight"}} - }; - if (name == "output.weight") { - type_score = 1.0f; - } else { - for (const auto& ts : tensor_scores) { - const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { - return name.find(pattern) != std::string::npos; - }); - if (found) { - type_score = ts.first; - break; - } - } - } - if (type_score > 0.0f) { - depth_score = tensor_depth(name); - } - - // Weighted combination - total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth - if (total_score != 0.0f) { - scores[name] = total_score; - LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); - } - } - - return scores; - }; - - auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { - const auto scores = tensor_importance(all_vec); - - // Sort by score - std::vector> sorted_scores(scores.begin(), scores.end()); - std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); - - // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front - - std::unordered_set important; - for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { - important.insert(sorted_scores[i].first); - LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); - } - - const auto pct = 100.0 * (double)important.size() 
/ (double)sorted_scores.size(); - LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); - return important; - }; - - const auto important_set = select_tensors(all); - + // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - return important_set.count(tensor_name) > 0; + const auto important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + return important; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From f8863b9a80822bb58e7406fd35d4452a97c4639a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 28 Oct 2025 15:22:32 +0000 Subject: [PATCH 136/148] Minor refactoring --- src/llama-quant.cpp | 48 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 04f4ff341af..fdce1f4285b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -694,6 +694,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 + constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -731,7 +732,7 @@ static std::unordered_map target_bpw_type( auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) { return typ; } - ggml_type fb = fallback_type(typ); + const ggml_type fb = fallback_type(typ); return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; @@ -754,7 +755,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; } - return h ? h : 0xeabada55cafed00d; + return h ? 
h : arbitrary_magic; }; auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { @@ -795,7 +796,7 @@ static std::unordered_map target_bpw_type( ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { const std::string name = ggml_get_name(ti.w->tensor); - const uint32_t len = (uint32_t)name.size(); + const auto len = (uint32_t)name.size(); ofs.write((const char *)&len, sizeof(len)); ofs.write(name.data(), len); @@ -835,13 +836,14 @@ static std::unordered_map target_bpw_type( if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (id != model_id) { + } + if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else { - LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); + uint64_t n = 0; ifs.read((char *)&n, sizeof(n)); for (uint64_t i = 0; i < n; ++i) { @@ -862,15 +864,15 @@ static std::unordered_map target_bpw_type( si.n_elements = (size_t)ne; si.candidate.resize(cn); - for (size_t j = 0; j < si.candidate.size(); ++j) { + for (auto & s : si.candidate) { int32_t t = 0; uint64_t b = 0; ifs.read((char *)&t, sizeof(t)); - si.candidate[j].type = (ggml_type)t; - ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw)); + s.type = (ggml_type)t; + ifs.read((char *)&s.bpw, sizeof(s.bpw)); ifs.read((char *)&b, sizeof(b)); - si.candidate[j].bytes = (size_t)b; - ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error)); + s.bytes = (size_t)b; + ifs.read((char *)&s.error, sizeof(s.error)); } out.emplace(std::move(name), std::move(si)); @@ -886,7 +888,6 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } - }; auto check_signal_handler = [&](const std::vector & all_vec) { @@ -1198,10 +1199,10 @@ static std::unordered_map target_bpw_type( // Compute rows based on tensor shape and slice count auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t { const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024; - const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrt + const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large tensors const double slice_budget = tensor_budget * scale_rows / std::max(1, n2); const int64_t min_rows = has_acts ? 
128 : 64; - const int64_t max_rows = 4096; + constexpr int64_t max_rows = 4096; // row limit to avoid excessive memory use int64_t total_rows = std::llround(slice_budget / std::max(1, n)); total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); if (rows <= min_rows * 2) { total_rows = rows; } @@ -1246,7 +1247,7 @@ static std::unordered_map target_bpw_type( f32_sample.clear(); std::vector row_buffer(n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + std::mt19937 rng(std::hash{}(name) ^ arbitrary_magic ^ slice); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); int64_t offset = 0; @@ -1411,8 +1412,6 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); - // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", - // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1445,16 +1444,15 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; std::vector hull; hull.reserve(candidates.size()); for (const auto & c : candidates) { - auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { - const double dx1 = (double)h1.bytes - (double)h0.bytes; - const double dy1 = h1.error - h0.error; - const double dx2 = (double)p.bytes - (double)h0.bytes; - const double dy2 = p.error - h0.error; - return dx1 * dy2 - dx2 * dy1; - }; - while (hull.size() >= 2) { if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); From 6e32244a06b1ffe513b1694ee647e92c09904dac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 21:53:07 +0000 Subject: [PATCH 137/148] Read statistics from imatrix --- include/llama.h | 1 + src/llama-quant.cpp | 28 ++++++++++----- tools/quantize/quantize.cpp | 68 +++++++++++++++++++++++++++++-------- 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/include/llama.h b/include/llama.h index ce04011e191..517ef5e0fbe 100644 --- a/include/llama.h +++ b/include/llama.h @@ -368,6 +368,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fdce1f4285b..a8153494f92 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,6 +631,7 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, + const std::unordered_map> * statistics_data, const 
llama_model_quantize_params * params, int nthread ) { @@ -1815,6 +1816,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; + const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1845,6 +1847,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + if (params->statistics) { + statistics_data = static_cast>*>(params->statistics); + if (statistics_data) { + LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); + } + } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1999,15 +2007,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__); - } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); - } + const char* base_msg = params->activations + ? (params->statistics + ? "imatrix with activations and statistics provided, process will be more accurate\n" + : "imatrix with activations provided, process will be accurate\n") + : "imatrix without activations provided, process will be less accurate\n"; + if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } + else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); } else { - LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } } @@ -2269,7 +2280,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, - /*.bpw_state =*/ nullptr + /*.bpw_state =*/ nullptr, + /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index f994999e591..0b2b05b60a6 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,7 +221,8 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -256,24 +257,28 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; + const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; if (name.empty()) { continue; } - if (string_remove_suffix(name, sums2_suffix)) { - // in_sum2 + if (string_remove_suffix(name, sums_suffix)) { + // in_sum std::get<0>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, sums2_suffix)) { + // in_sum2 + std::get<1>(sums_counts_for[std::move(name)]) = cur; } else if (string_remove_suffix(name, counts_suffix)) { // counts - std::get<1>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, sums_suffix)) { - // in_sum std::get<2>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, stats_suffix)) { + // stats + std::get<3>(sums_counts_for[std::move(name)]) = cur; } else { // ignore other tensors @@ -282,11 +287,12 @@ static int load_imatrix(const std::string & imatrix_file, for (const auto & sc : sums_counts_for) { const std::string & name = sc.first; - const struct ggml_tensor * sums = std::get<2>(sc.second); - const struct ggml_tensor * sums2 = std::get<0>(sc.second); - const struct ggml_tensor * counts = std::get<1>(sc.second); + const struct ggml_tensor * sums = std::get<0>(sc.second); + const struct ggml_tensor * sums2 = std::get<1>(sc.second); + const struct ggml_tensor * counts = std::get<2>(sc.second); + const struct ggml_tensor * stats = std::get<3>(sc.second); - // check that sums, sums2 and counts have the same shape + // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); gguf_free(ctx_gguf); @@ -302,6 +308,19 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } + if (stats) { + auto & statistics = statistics_data[name]; + statistics.resize(ggml_nelements(stats)); + if (stats->type == GGML_TYPE_F32) { + std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); + } else { + fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", + __func__, ggml_type_name(stats->type), name.c_str()); + statistics.clear(); + statistics_data.erase(name); + } + + } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -354,10 +373,11 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); } if (values_data.empty()) { return m_last_call; } @@ -380,11 +400,20 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } + for (auto st = statistics_data.begin(); st != statistics_data.end();) { + auto pos = st->first.find(name); + if (pos != std::string::npos) { + st = statistics_data.erase(st); + } else { + ++st; + } + } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> 
tmp_activations; + std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -398,9 +427,16 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } + for (auto & s : statistics_data) { + auto pos = s.first.find(name); + if (pos != std::string::npos) { + tmp_statistics.emplace(std::move(s)); + } + } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); + statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -617,7 +653,8 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); + std::unordered_map> statistics_data; + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -657,6 +694,9 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } + if (!statistics_data.empty()) { + params.statistics = &statistics_data; + } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From c59bb6d49d025765091d7c83a9b95528395de283 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 22:11:40 +0000 Subject: [PATCH 138/148] Add Euclidean-Cosine score to identify important tensors --- src/llama-quant.cpp | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a8153494f92..957dd5f3677 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,12 +1571,25 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - const auto important = tensor_name == "output.weight" || - tensor_name.find(".ffn_down.weight") != std::string::npos || - tensor_name.find(".ffn_down_exps.weight") != std::string::npos || - tensor_name.find(".attn_output.weight") != std::string::npos || - tensor_name.find(".time_mix_output.weight") != std::string::npos || - tensor_name.find(".attn_o.weight") != std::string::npos; + bool important = false; + + if (statistics_data) { + float ecs = 0.0f; // Euclidean-Cosine score + const std::string key = remap_imatrix(tensor_name, mapped); + const auto tstats = statistics_data->find(key); + if (tstats != statistics_data->end() && !tstats->second.empty()) { + ecs = tstats->second.front(); + important = ecs == 100.0f; // mark as important if ecs is 100% + } + } else { + important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + } + return important; }; From ac8cfbdd12eb2207098e3bcc4aee9347aa8366bc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 17 Nov 2025 18:03:09 +0000 Subject: [PATCH 139/148] Improved is_important() logic --- src/llama-quant.cpp | 19 +++++++++++++++---- 1 file changed, 15 
insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 739172c70f4..1e8a2cda9c9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,8 +635,8 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { - // RAII guard for signal handlers bpw_stop.store(false, std::memory_order_relaxed); + // Signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1574,12 +1574,23 @@ static std::unordered_map target_bpw_type( bool important = false; if (statistics_data) { - float ecs = 0.0f; // Euclidean-Cosine score const std::string key = remap_imatrix(tensor_name, mapped); const auto tstats = statistics_data->find(key); if (tstats != statistics_data->end() && !tstats->second.empty()) { - ecs = tstats->second.front(); - important = ecs == 100.0f; // mark as important if ecs is 100% + float ecs = 0.0f; // Euclidean-Cosine score + float l2 = 0.0f; // L2 Euclidean Distance + float cs = 0.0f; // Cosine Similarity + try { + // ecs = tstats->second.at(0); + l2 = tstats->second.at(1); + cs = tstats->second.at(2); + } catch (std::out_of_range &) { + LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); + return false; + } + ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) + // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); + important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% } } else { important = tensor_name == "output.weight" || From a0ba913613235c1639f92877f09e82c3db6fef47 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Nov 2025 11:19:44 +0000 Subject: [PATCH 140/148] Fix lambda capture bug in Windows and initialise candidate_types struct --- src/llama-quant.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e8a2cda9c9..86ca165b6cb 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -652,10 +652,10 @@ static std::unordered_map target_bpw_type( } signal_guard; struct candidate_types { - ggml_type type; - float bpw; - size_t bytes; - double error; + ggml_type type = GGML_TYPE_COUNT; + float bpw = 0.0f; + size_t bytes = 0; + double error = 0.0; double mse = 0.0; double proj = 0.0; }; @@ -751,7 +751,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t { + auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t { uint64_t h = 5381; for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; From 9ec3e6e2629d294e7ae95ee58634c360475e67d7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:49:53 +0000 Subject: [PATCH 141/148] Remove processing statistics_data --- include/llama.h | 1 - src/llama-quant.cpp | 19 ++---------- tools/quantize/quantize.cpp | 61 ++++++------------------------------- 3 files changed, 12 insertions(+), 69 deletions(-) diff --git a/include/llama.h b/include/llama.h index 3515ee1a13b..c82a4147f4c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,6 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct 
llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 86ca165b6cb..99759a27c8f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,7 +631,6 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, - const std::unordered_map> * statistics_data, const llama_model_quantize_params * params, int nthread ) { @@ -1840,7 +1839,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; - const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1871,12 +1869,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (params->statistics) { - statistics_data = static_cast>*>(params->statistics); - if (statistics_data) { - LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); - } - } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -2031,16 +2023,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - const char* base_msg = params->activations - ? (params->statistics - ? "imatrix with activations and statistics provided, process will be more accurate\n" - : "imatrix with activations provided, process will be accurate\n") - : "imatrix without activations provided, process will be less accurate\n"; - if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } - else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); + + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } @@ -2305,7 +2291,6 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0b2b05b60a6..aabcd73986f 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,8 +221,7 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -257,10 +256,9 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; - const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; @@ -276,11 +274,7 @@ static int load_imatrix(const std::string & imatrix_file, } else if (string_remove_suffix(name, counts_suffix)) { // counts std::get<2>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, stats_suffix)) { - // stats - std::get<3>(sums_counts_for[std::move(name)]) = cur; - } - else { + } else { // ignore other tensors } } @@ -290,7 +284,6 @@ static int load_imatrix(const std::string & imatrix_file, const struct ggml_tensor * sums = std::get<0>(sc.second); const struct ggml_tensor * sums2 = std::get<1>(sc.second); const struct ggml_tensor * counts = std::get<2>(sc.second); - const struct ggml_tensor * stats = std::get<3>(sc.second); // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { @@ -308,19 +301,6 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } - if (stats) { - auto & statistics = statistics_data[name]; - statistics.resize(ggml_nelements(stats)); - if (stats->type == GGML_TYPE_F32) { - std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); - } else { - fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", - __func__, ggml_type_name(stats->type), name.c_str()); - statistics.clear(); - statistics_data.erase(name); - } - - } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -373,23 +353,22 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); } if (values_data.empty()) { return m_last_call; } if (!excluded_weights.empty()) { for (const auto & name : excluded_weights) { - for (auto it = values_data.begin(); it != values_data.end();) { - auto pos = it->first.find(name); + for (auto vt = values_data.begin(); vt != values_data.end();) { + auto pos = vt->first.find(name); if (pos != std::string::npos) { - it = values_data.erase(it); + vt = values_data.erase(vt); } else { - ++it; + ++vt; } } for (auto at = activations_data.begin(); at != activations_data.end();) { @@ -400,20 +379,11 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } - for (auto st = statistics_data.begin(); st != statistics_data.end();) { - auto pos = st->first.find(name); - if (pos != std::string::npos) { - st = statistics_data.erase(st); - } else { - ++st; - } - } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> tmp_activations; - std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -427,16 +397,9 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } - for (auto & s : statistics_data) { - auto 
pos = s.first.find(name); - if (pos != std::string::npos) { - tmp_statistics.emplace(std::move(s)); - } - } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); - statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -653,8 +616,7 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - std::unordered_map> statistics_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -694,9 +656,6 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } - if (!statistics_data.empty()) { - params.statistics = &statistics_data; - } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From 1c9993e13198a28db1b5a8e7cd0fcb5d6bcf89eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:51:04 +0000 Subject: [PATCH 142/148] Add --disable-tensor-importance option --- include/llama.h | 1 + src/llama-quant.cpp | 39 ++++++++++++++----------------------- tools/quantize/quantize.cpp | 4 ++++ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/include/llama.h b/include/llama.h index c82a4147f4c..1f5b2e8a2b2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,6 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + bool disable_tensor_importance; // treat all tensors equally during quantization } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 99759a27c8f..2b9aba091b9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1570,29 +1570,10 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - bool important = false; - - if (statistics_data) { - const std::string key = remap_imatrix(tensor_name, mapped); - const auto tstats = statistics_data->find(key); - if (tstats != statistics_data->end() && !tstats->second.empty()) { - float ecs = 0.0f; // Euclidean-Cosine score - float l2 = 0.0f; // L2 Euclidean Distance - float cs = 0.0f; // Cosine Similarity - try { - // ecs = tstats->second.at(0); - l2 = tstats->second.at(1); - cs = tstats->second.at(2); - } catch (std::out_of_range &) { - LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); - return false; - } - ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) - // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); - important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% - } - } else { - important = tensor_name == "output.weight" || + bool important = tensor_name == "output.weight"; + if (!important && !params->disable_tensor_importance) { + important = tensor_name.find(".attn_v.weight") != std::string::npos || + tensor_name.find(".time_mix_value.weight") != std::string::npos || 
tensor_name.find(".ffn_down.weight") != std::string::npos || tensor_name.find(".ffn_down_exps.weight") != std::string::npos || tensor_name.find(".attn_output.weight") != std::string::npos || @@ -2023,7 +2004,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__); + } else { + LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); + } + if (params->disable_tensor_importance) { + LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + } else { + LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); @@ -2291,6 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, + /*.disable_tensor_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index aabcd73986f..4fee8c91a1c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,6 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); + printf(" Advanced option to disable allocating more bpw budget to important tensors. 
It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -560,6 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { + params.disable_tensor_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 661600842096145db52a4c631bfe0303a5d454ee Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 24 Nov 2025 18:26:45 +0000 Subject: [PATCH 143/148] Use more descriptive option naming --- include/llama.h | 2 +- src/llama-quant.cpp | 10 +++++----- tools/quantize/quantize.cpp | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1f5b2e8a2b2..50e61d49761 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - bool disable_tensor_importance; // treat all tensors equally during quantization + bool no_importance; // allocate target bpw budget equitably across all tensors } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2b9aba091b9..c468a3e4fc9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,7 +1571,7 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { bool important = tensor_name == "output.weight"; - if (!important && !params->disable_tensor_importance) { + if (!important && !params->no_importance) { important = tensor_name.find(".attn_v.weight") != std::string::npos || tensor_name.find(".time_mix_value.weight") != std::string::npos || tensor_name.find(".ffn_down.weight") != std::string::npos || @@ -2009,10 +2009,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); } - if (params->disable_tensor_importance) { - LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + if (params->no_importance) { + LLAMA_LOG_INFO("%s: distributing bpw budget equitably across all tensors\n", __func__); } else { - LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + LLAMA_LOG_INFO("%s: assigning more bpw budget to important tensors\n", __func__); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); @@ -2281,7 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.disable_tensor_importance =*/ false + /*.no_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 4fee8c91a1c..dd4b860e1b9 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,9 +117,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, 
llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); - printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); + printf(" [--target-bpw n] [--no-importance] [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -134,8 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); - printf(" Advanced option to disable allocating more bpw budget to important tensors. It may increase quality for some models\n"); + printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); + printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -562,8 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { - params.disable_tensor_importance = true; + } else if (strcmp(argv[arg_idx], "--no-importance") == 0) { + params.no_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 69a32b6f508a4d0d38f52cf91cc8cd5b42a4bf62 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:28:43 +0000 Subject: [PATCH 144/148] Relax target bpw range --- tools/quantize/quantize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index dd4b860e1b9..ebeea653365 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. 
May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); @@ -485,13 +485,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 8.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 16.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); return false; } From 5b557ca958d3b0cb4293e12aafe21135c0c12142 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:30:20 +0000 Subject: [PATCH 145/148] Minor refactoring --- src/llama-quant.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c468a3e4fc9..2cb58d46bdb 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,7 +635,7 @@ static std::unordered_map target_bpw_type( int nthread ) { bpw_stop.store(false, std::memory_order_relaxed); - // Signal handlers + // SIGINT/SIGTERM signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1361,14 +1361,14 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < compatible_candidates.size(); ++i) { if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } - const ggml_type tensor_types = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(tensor, tensor_types); - const size_t bytes = tensor_bytes(tensor, tensor_types); + const ggml_type tensor_type = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_type); + const size_t bytes = tensor_bytes(tensor, tensor_type); double mse = 0.0; double proj = 0.0; - const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + const auto err = estimate_error(tensor, tensor_type, f32_sample, rows_sample, values, activations, quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; + eval_candidates[i] = candidate_types{ tensor_type, bpw, bytes, err, mse, proj }; } if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } From 229109f329c498078f84da39b2c1ebb807e60646 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:31:39 +0000 Subject: [PATCH 146/148] Increase importance boost for final pass --- src/llama-quant.cpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2cb58d46bdb..44f84ec949d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1714,7 +1714,7 @@ static std::unordered_map target_bpw_type( if (err_gain < epsilon) { continue; } // no error improvement double ratio = err_gain / (double)delta_bytes; // error reduction per byte - if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost + if (is_important(tensor_name)) { ratio *= 5.0; } // important tensors get 5x boost // For tie-breaking, prioritize the largest absolute error improvement. if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { From b97cda628960d66a9fcc301062a1dc3925feae9f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 23:52:51 +0000 Subject: [PATCH 147/148] Add B/F16 to get_ftype() --- tools/quantize/quantize.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ebeea653365..a1426ea4a3f 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -512,7 +512,12 @@ static const char * get_ftype(const float bpw) { {4.5000, "Q4_K"}, {5.5000, "Q5_K"}, {6.5625, "Q6_K"}, - {8.5000, "Q8_0"} + {8.5000, "Q8_0"}, +#ifdef GGML_USE_METAL + {16.0000, "F16"} +#else + {16.0000, "BF16"} +#endif }; return quant_bpw.lower_bound(bpw)->second; From 37cf51ebd032e63c7901835cdd85a0e7e9109e25 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 30 Nov 2025 00:29:35 +0000 Subject: [PATCH 148/148] Process bpw targets up to B/F16 --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44f84ec949d..6c6926dee85 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -2089,7 +2089,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && ggml_is_quantized(default_type)) { + if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
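
The sketches below restate, in standalone form, the core mechanisms this series touches. They are illustrative reductions under stated assumptions, not the in-tree implementations; any name not quoted from the hunks above is hypothetical.

First, the lower-envelope pruning whose cross_product lambda is hoisted out of the candidate loop in the first diff above. A minimal sketch, assuming candidates arrive sorted by ascending bytes and keeping only the two fields the hull pass reads:

#include <cstddef>
#include <cstdio>
#include <vector>

struct candidate { size_t bytes; double error; };

// Monotone lower hull over (bytes, error): a candidate lying on or above the
// segment joining its neighbours can never win the error-per-byte trade-off,
// so it is dropped before the greedy allocation runs.
static std::vector<candidate> lower_envelope(const std::vector<candidate> & sorted_by_bytes, const double epsilon) {
    auto cross_product = [](const candidate & h0, const candidate & h1, const candidate & p) -> double {
        const double dx1 = (double)h1.bytes - (double)h0.bytes;
        const double dy1 = h1.error - h0.error;
        const double dx2 = (double)p.bytes - (double)h0.bytes;
        const double dy2 = p.error - h0.error;
        return dx1 * dy2 - dx2 * dy1;
    };
    std::vector<candidate> hull;
    hull.reserve(sorted_by_bytes.size());
    for (const auto & c : sorted_by_bytes) {
        while (hull.size() >= 2 && cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) {
            hull.pop_back();
        }
        hull.push_back(c);
    }
    return hull;
}

int main() {
    // The middle point sits above the chord from (100, 1.0) to (300, 0.2),
    // so the envelope keeps only the two endpoints.
    const std::vector<candidate> cands = { { 100, 1.0 }, { 200, 0.9 }, { 300, 0.2 } };
    for (const auto & c : lower_envelope(cands, 1e-12)) {
        std::printf("%zu bytes -> error %.2f\n", c.bytes, c.error);
    }
    return 0;
}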
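
The sample_rows budget in that same diff decides how many rows to sample per expert slice. A worked reconstruction follows; the extraction dropped template arguments from the hunk, so std::max<int64_t> is assumed where the text shows std::max(1, n):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// n is the row width in elements, rows the total row count, n2 the number of
// slices (experts), has_acts whether activations are available.
static int64_t sample_rows(const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) {
    const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024;
    const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large tensors
    const double slice_budget = tensor_budget * scale_rows / (double)std::max<int64_t>(1, n2);
    const int64_t min_rows = has_acts ? 128 : 64;
    constexpr int64_t max_rows = 4096; // row limit to avoid excessive memory use
    int64_t total_rows = std::llround(slice_budget / (double)std::max<int64_t>(1, n));
    total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows)));
    if (rows <= min_rows * 2) { total_rows = rows; }
    return total_rows;
}

int main() {
    // A 4096-wide tensor with 4096 rows and one slice, activations available:
    // budget = 1 MiB, scale = 1.0, so roughly 1 MiB / 4096 = 256 rows.
    std::printf("%lld\n", (long long)sample_rows(4096, 4096, 1, true));
    return 0;
}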
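
Patch 139's Euclidean-Cosine score, removed again in patch 141 in favour of the name-based importance list, reduces to the following. The formula and the 99.99 threshold are verbatim from the patch; relative to the general form in its comment, p = 10 and q = 1:

#include <cmath>
#include <cstdio>

// ecs = 100 - (100 / (1 + (l2/p)^2)) * |cs|^q with p = 10, q = 1; l2 and cs
// are read from the per-tensor .stats vector of the imatrix.
static float ecs_score(const float l2, const float cs) {
    return 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs));
}

int main() {
    // A small distance with aligned directions scores near 0 (not boosted)...
    std::printf("ecs(l2 = 1, cs = 0.99)   = %.2f\n", ecs_score(1.0f, 0.99f));   // 1.98
    // ...while a large distance with low similarity approaches 100, clearing
    // the 99.99 threshold that marks a tensor as important in the patch.
    std::printf("ecs(l2 = 500, cs = 0.01) = %.4f\n", ecs_score(500.0f, 0.01f)); // 99.9996
    return 0;
}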
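
Patches 142 and 146 together shape one greedy selection step of the allocator: error reduction per extra byte, multiplied by 5 for important tensors (raised from 2 in patch 146), with ties broken on absolute gain. A reduced sketch; the upgrade struct and its fields are illustrative, not the in-tree types:

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

struct upgrade {
    double err_gain;    // error reduction if this tensor moves to its next candidate
    size_t delta_bytes; // extra bytes that upgrade costs
    bool   important;   // result of is_important()
};

// Returns the index of the best upgrade, or -1 when nothing improves.
static int pick_best(const std::vector<upgrade> & options, const double epsilon) {
    int best = -1;
    double best_ratio = 0.0;
    double best_gain = 0.0;
    for (size_t i = 0; i < options.size(); ++i) {
        const upgrade & u = options[i];
        if (u.delta_bytes == 0 || u.err_gain < epsilon) { continue; } // no error improvement
        double ratio = u.err_gain / (double)u.delta_bytes;            // error reduction per byte
        if (u.important) { ratio *= 5.0; }                            // important tensors get 5x boost
        // For tie-breaking, prioritize the largest absolute error improvement.
        if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && u.err_gain > best_gain)) {
            best = (int)i;
            best_ratio = ratio;
            best_gain = u.err_gain;
        }
    }
    return best;
}

int main() {
    const std::vector<upgrade> opts = {
        { 0.010, 1000, false }, // 1.0e-5 per byte
        { 0.008, 1000, true  }, // 0.8e-5 per byte, boosted to 4.0e-5
    };
    std::printf("%d\n", pick_best(opts, 1e-12)); // prints 1
    return 0;
}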
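
Finally, patch 147's get_ftype() snaps a bpw value to the first preset at or above it via std::map::lower_bound(); because patch 144 caps --target-bpw at 16.0 and 16.0 is the largest key, the lookup cannot return end(). A sketch abridged to the entries visible in the hunk:

#include <cstdio>
#include <map>

int main() {
    static const std::map<float, const char *> quant_bpw = {
        { 4.5000f,  "Q4_K" },
        { 5.5000f,  "Q5_K" },
        { 6.5625f,  "Q6_K" },
        { 8.5000f,  "Q8_0" },
        { 16.0000f, "BF16" }, // F16 when built with GGML_USE_METAL
    };
    // lower_bound() returns the first preset whose effective bpw is >= target.
    std::printf("%s\n", quant_bpw.lower_bound(5.1f)->second);  // Q5_K
    std::printf("%s\n", quant_bpw.lower_bound(16.0f)->second); // BF16
    return 0;
}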