diff --git a/common/common.cpp b/common/common.cpp index f0c618e0e..dc4f9ceb5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1419,6 +1419,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.compute_ppl = false; return true; } + if (arg == "--no-lim") { + params.compute_lim = false; + return true; + } if (arg == "--chunk" || arg == "--from-chunk") { CHECK_ARG params.i_chunk = std::stoi(argv[i]); diff --git a/common/common.h b/common/common.h index b4f75236d..eb23c5079 100644 --- a/common/common.h +++ b/common/common.h @@ -270,6 +270,7 @@ struct gpt_params { bool process_output = false; // collect data for the output tensor bool compute_ppl = true; // whether to compute perplexity + bool compute_lim = true; // whether to compute and show Layer Importance Scores https://arxiv.org/pdf/2406.17415 // cvector-generator params int n_pca_batch = 100; diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index d8a430492..7c1e7b4f8 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -30,12 +31,13 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s \\\n" " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" - " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" + " [--no-ppl] [--no-lim] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); LOG_TEE("\n"); } struct Stats { + std::vector activations; std::vector values; std::vector counts; int ncall = 0; @@ -48,6 +50,7 @@ class IMatrixCollector { void set_params(gpt_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; + void compute_lim(); bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; @@ -131,6 +134,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * ++e.ncall; if (e.values.empty()) { + e.activations.resize(src1->ne[0]*n_as, 0); e.values.resize(src1->ne[0]*n_as, 0); e.counts.resize(src1->ne[0]*n_as, 0); e.n_as = n_as; @@ -162,6 +166,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]); for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.activations[e_start + j] = x[j]; e.values[e_start + j] += x[j]*x[j]; e.counts[e_start + j]++; if (!std::isfinite(e.values[e_start + j])) { @@ -183,7 +188,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } else { auto & e = m_stats[wname]; + if (e.values.empty()) { + e.activations.resize(src1->ne[0], 0); e.values.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0); } @@ -198,6 +205,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) { const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.activations[j] = x[j]; e.values[j] += x[j]*x[j]; e.counts[j]++; if (!std::isfinite(e.values[j])) { @@ -396,6 +404,99 @@ bool IMatrixCollector::load_imatrix(const char * fname) { return true; } +// Extract layer number from keys like "blk.17.ffn_gate.weight" +int extract_layer(const std::string& name) { + size_t p1 = name.find('.') + 1; // Skip "blk." + size_t p2 = name.find('.', p1); // Find next "." + return std::stoi(name.substr(p1, p2 - p1)); +} + +void IMatrixCollector::compute_lim() { + if (m_stats.empty()) { + fprintf(stderr, "%s: no data collected - cannot compute LIM scores\n", __func__); + return; + } + printf("\n===\n"); + printf("Computing Layer Importance Modification (LIM) Scores...\n"); + + // Convert to vector and sort by layer number + std::vector> sorted_pairs(m_stats.begin(), m_stats.end()); + std::sort(sorted_pairs.begin(), sorted_pairs.end(), + [](const auto& a, const auto& b) { + return extract_layer(a.first) < extract_layer(b.first); + } + ); + + // Group activations by tensor type (e.g., ffn_gate, attn_k, etc.) + std::unordered_map*>>> tensor_groups; + + for (const auto& pair : sorted_pairs) { + std::string full_name = pair.first; + size_t p1 = full_name.find('.') + 1; // Skip "blk." + size_t p2 = full_name.find('.', p1); // Find next "." + int layer = std::stoi(full_name.substr(p1, p2 - p1)); + std::string tensor_name = full_name.substr(p2 + 1, full_name.rfind('.') - p2 - 1); + + tensor_groups[tensor_name].emplace_back(layer, &pair.second.activations); + } + + // Calculate LIM scores for each tensor type + for (const auto& group : tensor_groups) { + const std::string& tensor_name = group.first; + const auto& layers = group.second; + + printf("\nTensor: %s\n", tensor_name.c_str()); + printf("Layer\tLIM Score\n"); + printf("-----\t---------\n"); + + // Need at least 2 layers to compute LIM scores + if (layers.size() < 2) { + printf("(Need at least 2 layers to compute LIM scores)\n"); + continue; + } + + // For each layer, compare with next layer's input (current layer's output) + for (size_t i = 0; i < layers.size() - 1; i++) { + int layer = layers[i].first; + const std::vector& input_acts = *layers[i].second; + const std::vector& output_acts = *layers[i+1].second; + + // Check if activation sizes match + if (input_acts.size() != output_acts.size()) { + printf("%d\t(skipped - dimension mismatch: %zu vs %zu)\n", + layer, input_acts.size(), output_acts.size()); + continue; + } + + // Calculate dot product and magnitudes + float dot_product = 0.0f; + float input_magnitude = 0.0f; + float output_magnitude = 0.0f; + + for (size_t j = 0; j < input_acts.size(); j++) { + dot_product += input_acts[j] * output_acts[j]; + input_magnitude += input_acts[j] * input_acts[j]; + output_magnitude += output_acts[j] * output_acts[j]; + } + + input_magnitude = sqrtf(input_magnitude); + output_magnitude = sqrtf(output_magnitude); + + // Avoid division by zero + if (input_magnitude == 0 || output_magnitude == 0) { + printf("%d\t(skipped - zero magnitude)\n", layer); + continue; + } + + // Calculate cosine similarity and LIM score + float cosine_sim = dot_product / (input_magnitude * output_magnitude); + float lim_score = -cosine_sim; + + printf("%d\t%.4f\n", layer, lim_score); + } + } +} + static IMatrixCollector g_collector; static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { @@ -683,10 +784,16 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); + if (params.compute_lim) { + g_collector.compute_lim(); + } + llama_free(ctx); llama_free_model(model); llama_backend_free(); + + return 0; }