From 33c6bf9d74d7924ff2b289a9b30e969e4c3c17fb Mon Sep 17 00:00:00 2001
From: "John W. Leimgruber III"
Date: Sat, 12 Apr 2025 16:28:30 -0400
Subject: [PATCH] WIP: compute per-layer LIM scores during imatrix

*WARNING*: This is mostly vibe code. Hope I'm not wasting y'all's time.

Compute Layer Importance Modification (LIM) Scores

The goal of this PR is to rank the layers of a given tensor type by their
sensitivity to quantization error. Now that `llama-quantize --custom-q ...`
accepts regex rules, these LIM scores could be used to decide which layers
of a given tensor to quantize more or less aggressively, in an attempt to
preserve generation quality (e.g. low perplexity) while reducing the memory
footprint, compared to using the same quant size across all layers of that
tensor.
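As a sketch of how that could look (the layer numbers, regex, and quant
types below are made up for illustration, and the exact `--custom-q` rule
syntax should be double-checked against `llama-quantize --help`): if the
scores flagged layers 3-5 of `ffn_down` as the most sensitive, something
like

```
./llama-quantize \
    --custom-q "blk\.(3|4|5)\.ffn_down\.weight=q6_K" \
    model-f16.gguf model-iq4_k.gguf iq4_k
```

would hold just those layers at `q6_K` while the rest of the tensor drops
to `iq4_k`.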
This experimental PR was motivated by this comment and PR:
https://github.com/ggml-org/llama.cpp/pull/12718

I may force-push this after more testing and experimenting, to see whether
it is actually doing the right thing and whether the output is actually
useful for improving quantization quality, e.g. PPL per GiB... This may
just be a big mistake, lol.

This is built on the existing imatrix computation and assumes that the
values of `x[j]` are the "activations" coming right in/out of the given
tensor layer.
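For reference, the LIM score from the paper is just the negative cosine
similarity between what goes into a layer and what comes out of it; in this
patch the "output" of layer l is approximated by the activations captured
at the input of layer l+1 (the x^(l) notation below is mine, not the
paper's):

```
\mathrm{LIM}_{\ell}
  = -\cos\left(x^{(\ell)},\, x^{(\ell+1)}\right)
  = -\frac{x^{(\ell)} \cdot x^{(\ell+1)}}
          {\lVert x^{(\ell)} \rVert \, \lVert x^{(\ell+1)} \rVert}
```

A layer that barely changes its input scores near -1 (a candidate for a
smaller quant); a layer that strongly transforms its input scores higher
(worth keeping at a bigger quant).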
I don't know GGML, and I generally work in Python or vanilla C, not so much
C++. A lot of this was vibe coded while running the
[ubergarm/DeepSeek-V3-0324-GGUF IQ4_K_R4 quant](https://huggingface.co/ubergarm/DeepSeek-V3-0324-GGUF/tree/main/DeepSeek-V3-0324-IQ4_K_R4),
so this is partially an experiment in actually using an LLM instead of just
enjoying the meta of manual quantization min-maxing.

```
@misc{dumitru2024layerwisequantizationpragmaticeffective,
      title={Layer-Wise Quantization: A Pragmatic and Effective Method for Quantizing LLMs Beyond Integer Bit-Levels},
      author={Razvan-Gabriel Dumitru and Vikas Yadav and Rishabh Maheshwary and Paul-Ioan Clotan and Sathwik Tejaswi Madhusudhan and Mihai Surdeanu},
      year={2024},
      eprint={2406.17415},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2406.17415},
      code={https://github.com/RazvanDu/LayerwiseQuant/},
}
```
---
 common/common.cpp            |   4 ++
 common/common.h              |   1 +
 examples/imatrix/imatrix.cpp | 109 ++++++++++++++++++++++++++++++++++-
 3 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index f0c618e0e..dc4f9ceb5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1419,6 +1419,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.compute_ppl = false;
         return true;
     }
+    if (arg == "--no-lim") {
+        params.compute_lim = false;
+        return true;
+    }
     if (arg == "--chunk" || arg == "--from-chunk") {
         CHECK_ARG
         params.i_chunk = std::stoi(argv[i]);
diff --git a/common/common.h b/common/common.h
index b4f75236d..eb23c5079 100644
--- a/common/common.h
+++ b/common/common.h
@@ -270,6 +270,7 @@ struct gpt_params {
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
+    bool compute_lim = true; // whether to compute and show Layer Importance Scores https://arxiv.org/pdf/2406.17415

     // cvector-generator params
     int n_pca_batch = 100;
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index d8a430492..7c1e7b4f8 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -13,6 +13,7 @@
 #include <sstream>
 #include <thread>
 #include <mutex>
+#include <algorithm>
 #include <vector>
 #include <fstream>
 #include <unordered_map>
@@ -30,12 +31,13 @@
 static void print_usage(int argc, char ** argv, const gpt_params & params) {
     LOG_TEE("\nexample usage:\n");
     LOG_TEE("\n    %s \\\n"
             "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
-            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            "       [--no-ppl] [--no-lim] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
             "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
     LOG_TEE("\n");
 }

 struct Stats {
+    std::vector<float> activations;
     std::vector<float> values;
     std::vector<int> counts;
     int ncall = 0;
@@ -48,6 +50,7 @@ class IMatrixCollector {
     void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
+    void compute_lim();
     bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
@@ -131,6 +134,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         ++e.ncall;

         if (e.values.empty()) {
+            e.activations.resize(src1->ne[0]*n_as, 0);
             e.values.resize(src1->ne[0]*n_as, 0);
             e.counts.resize(src1->ne[0]*n_as, 0);
             e.n_as = n_as;
@@ -162,6 +166,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
                 for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    e.activations[e_start + j] = x[j];
                     e.values[e_start + j] += x[j]*x[j];
                     e.counts[e_start + j]++;
                     if (!std::isfinite(e.values[e_start + j])) {
@@ -183,7 +188,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
     } else {
         auto & e = m_stats[wname];
+
         if (e.values.empty()) {
+            e.activations.resize(src1->ne[0], 0);
             e.values.resize(src1->ne[0], 0);
             e.counts.resize(src1->ne[0], 0);
         }
@@ -198,6 +205,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
             const float * x = data + row * src1->ne[0];
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                e.activations[j] = x[j];
                 e.values[j] += x[j]*x[j];
                 e.counts[j]++;
                 if (!std::isfinite(e.values[j])) {
@@ -396,6 +404,99 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
     return true;
 }

+// Extract the layer number from keys like "blk.17.ffn_gate.weight" (-1 if there is none)
+static int extract_layer(const std::string & name) {
+    if (name.compare(0, 4, "blk.") != 0) {
+        return -1; // not a per-layer tensor, e.g. "output.weight"
+    }
+    size_t p1 = name.find('.') + 1; // skip "blk."
+    size_t p2 = name.find('.', p1); // find next "."
+    return std::stoi(name.substr(p1, p2 - p1));
+}
+
+void IMatrixCollector::compute_lim() {
+    if (m_stats.empty()) {
+        fprintf(stderr, "%s: no data collected - cannot compute LIM scores\n", __func__);
+        return;
+    }
+    printf("\n===\n");
+    printf("Computing Layer Importance Modification (LIM) Scores...\n");
+
+    // Convert to a vector and sort by layer number
+    std::vector<std::pair<std::string, Stats>> sorted_pairs(m_stats.begin(), m_stats.end());
+    std::sort(sorted_pairs.begin(), sorted_pairs.end(),
+        [](const auto & a, const auto & b) {
+            return extract_layer(a.first) < extract_layer(b.first);
+        }
+    );
+
+    // Group activations by tensor type (e.g. ffn_gate, attn_k, etc.)
+    std::unordered_map<std::string, std::vector<std::pair<int, const std::vector<float> *>>> tensor_groups;
+
+    for (const auto & pair : sorted_pairs) {
+        const std::string & full_name = pair.first;
+        const int layer = extract_layer(full_name);
+        if (layer < 0) {
+            continue; // skip tensors that do not belong to a layer
+        }
+        size_t p2 = full_name.find('.', full_name.find('.') + 1); // the "." after the layer number
+        std::string tensor_name = full_name.substr(p2 + 1, full_name.rfind('.') - p2 - 1);
+
+        tensor_groups[tensor_name].emplace_back(layer, &pair.second.activations);
+    }
+
+    // Calculate LIM scores for each tensor type
+    for (const auto & group : tensor_groups) {
+        const std::string & tensor_name = group.first;
+        const auto & layers = group.second;
+
+        printf("\nTensor: %s\n", tensor_name.c_str());
+        printf("Layer\tLIM Score\n");
+        printf("-----\t---------\n");
+
+        // Need at least 2 layers to compute LIM scores
+        if (layers.size() < 2) {
+            printf("(Need at least 2 layers to compute LIM scores)\n");
+            continue;
+        }
+
+        // For each layer, compare its input with the next layer's input (i.e. this layer's output)
+        for (size_t i = 0; i < layers.size() - 1; i++) {
+            const int layer = layers[i].first;
+            const std::vector<float> & input_acts  = *layers[i].second;
+            const std::vector<float> & output_acts = *layers[i+1].second;
+
+            // Check that the activation sizes match
+            if (input_acts.size() != output_acts.size()) {
+                printf("%d\t(skipped - dimension mismatch: %zu vs %zu)\n",
+                        layer, input_acts.size(), output_acts.size());
+                continue;
+            }
+
+            // Calculate the dot product and the magnitudes
+            float dot_product      = 0.0f;
+            float input_magnitude  = 0.0f;
+            float output_magnitude = 0.0f;
+
+            for (size_t j = 0; j < input_acts.size(); j++) {
+                dot_product      += input_acts[j] * output_acts[j];
+                input_magnitude  += input_acts[j] * input_acts[j];
+                output_magnitude += output_acts[j] * output_acts[j];
+            }
+
+            input_magnitude  = sqrtf(input_magnitude);
+            output_magnitude = sqrtf(output_magnitude);
+
+            // Avoid division by zero
+            if (input_magnitude == 0.0f || output_magnitude == 0.0f) {
+                printf("%d\t(skipped - zero magnitude)\n", layer);
+                continue;
+            }
+
+            // The LIM score is the negative cosine similarity between input and output
+            float cosine_sim = dot_product / (input_magnitude * output_magnitude);
+            float lim_score  = -cosine_sim;
+
+            printf("%d\t%.4f\n", layer, lim_score);
+        }
+    }
+}
+
 static IMatrixCollector g_collector;

 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -683,10 +784,16 @@ int main(int argc, char ** argv) {

     llama_print_timings(ctx);

+    if (params.compute_lim) {
+        g_collector.compute_lim();
+    }
+
     llama_free(ctx);
     llama_free_model(model);

     llama_backend_free();

+    return 0;
 }