4 changes: 4 additions & 0 deletions common/common.cpp
@@ -1419,6 +1419,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.compute_ppl = false;
return true;
}
if (arg == "--no-lim") {
params.compute_lim = false;
return true;
}
if (arg == "--chunk" || arg == "--from-chunk") {
CHECK_ARG
params.i_chunk = std::stoi(argv[i]);
1 change: 1 addition & 0 deletions common/common.h
@@ -270,6 +270,7 @@ struct gpt_params {

bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
bool compute_lim = true; // whether to compute and show Layer Importance Modification (LIM) scores, https://arxiv.org/pdf/2406.17415

// cvector-generator params
int n_pca_batch = 100;
109 changes: 108 additions & 1 deletion examples/imatrix/imatrix.cpp
@@ -13,6 +13,7 @@
#include <cstring>
#include <ctime>
#include <sstream>
#include <string>
#include <thread>
#include <mutex>
#include <vector>
@@ -30,12 +31,13 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--no-ppl] [--no-lim] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n");
}

struct Stats {
std::vector<float> activations;
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
@@ -48,6 +50,7 @@ class IMatrixCollector {
void set_params(gpt_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
void compute_lim();
bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
@@ -131,6 +134,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
++e.ncall;

if (e.values.empty()) {
e.activations.resize(src1->ne[0]*n_as, 0);
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
e.n_as = n_as;
@@ -162,6 +166,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);

for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.activations[e_start + j] = x[j];
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
@@ -183,7 +188,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
} else {
auto & e = m_stats[wname];

if (e.values.empty()) {
e.activations.resize(src1->ne[0], 0);
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
}
@@ -198,6 +205,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int row = 0; row < (int)(src1->ne[1]*src1->ne[2]); ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.activations[j] = x[j];
@ikawrakow (Owner) commented on Apr 13, 2025:

So, activations gets overwritten each time we get called with a new set of activations. It also gets overwritten as we go over the rows of the activation matrix. At the end of the run, the compute_lim() function gets called. Which means that we get the LIM computed with just the very last token processed in the imatrix run, not an actual statistical evaluation of cosine similarities between inputs to tensors of the same type in subsequent layers.
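
One way to address this, sketched below: accumulate the inputs instead of overwriting them, and let compute_lim() compare per-element mean activations across layers. This is a minimal illustrative sketch, not part of this PR; the field name activations_sum and the mean_activations() helper are assumptions, and the cosine of mean activations is itself only a proxy for the mean of per-token cosine similarities.

    #include <vector>

    // Illustrative sketch only (not part of this PR): accumulate the inputs
    // instead of overwriting them, so compute_lim() can average over every
    // processed token rather than seeing only the last one.
    struct Stats {
        std::vector<float> activations_sum; // running sum of inputs per element (assumed name)
        std::vector<float> values;          // sum of squared inputs (imatrix data)
        std::vector<int>   counts;          // tokens seen per element
        int ncall = 0;
    };

    // In collect_imatrix(), the per-token update would then become:
    //     e.activations_sum[j] += x[j];    // was: e.activations[j] = x[j];

    // In compute_lim(), derive the mean activations before comparing layers:
    static std::vector<float> mean_activations(const Stats & e) {
        std::vector<float> mean(e.activations_sum.size(), 0.0f);
        for (size_t j = 0; j < mean.size(); ++j) {
            if (e.counts[j] > 0) {
                mean[j] = e.activations_sum[j] / e.counts[j];
            }
        }
        return mean;
    }

Even with this change, a faithful per-token statistic would require accumulating the dot product and squared norms between consecutive layers' inputs token by token, since the tensors are visited in separate callback invocations.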

e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
@@ -396,6 +404,99 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
return true;
}

// Extract the layer number from keys like "blk.17.ffn_gate.weight";
// returns -1 for names that do not follow the "blk.<n>." pattern.
static int extract_layer(const std::string& name) {
if (name.compare(0, 4, "blk.") != 0) {
return -1; // e.g. "output.weight" has no layer number
}
size_t p1 = 4; // first character after "blk."
size_t p2 = name.find('.', p1); // "." that ends the layer number
return std::stoi(name.substr(p1, p2 - p1));
}

void IMatrixCollector::compute_lim() {
if (m_stats.empty()) {
fprintf(stderr, "%s: no data collected - cannot compute LIM scores\n", __func__);
return;
}
printf("\n===\n");
printf("Computing Layer Importance Modification (LIM) Scores...\n");

// Convert to vector and sort by layer number
std::vector<std::pair<std::string, Stats>> sorted_pairs(m_stats.begin(), m_stats.end());
std::sort(sorted_pairs.begin(), sorted_pairs.end(),
[](const auto& a, const auto& b) {
return extract_layer(a.first) < extract_layer(b.first);
}
);

// Group activations by tensor type (e.g. ffn_gate, attn_k, ...)
std::unordered_map<std::string, std::vector<std::pair<int, const std::vector<float>*>>> tensor_groups;

for (const auto& pair : sorted_pairs) {
const std::string& full_name = pair.first;
const int layer = extract_layer(full_name);
if (layer < 0) {
continue; // not a "blk.<n>." tensor - no layer to attribute it to
}
// "blk.17.ffn_gate.weight" -> "ffn_gate"
size_t p2 = full_name.find('.', 4);
std::string tensor_name = full_name.substr(p2 + 1, full_name.rfind('.') - p2 - 1);

tensor_groups[tensor_name].emplace_back(layer, &pair.second.activations);
}

// Calculate LIM scores for each tensor type
for (const auto& group : tensor_groups) {
const std::string& tensor_name = group.first;
const auto& layers = group.second;

printf("\nTensor: %s\n", tensor_name.c_str());
printf("Layer\tLIM Score\n");
printf("-----\t---------\n");

// Need at least 2 layers to compute LIM scores
if (layers.size() < 2) {
printf("(Need at least 2 layers to compute LIM scores)\n");
continue;
}

// For each layer, compare with next layer's input (current layer's output)
for (size_t i = 0; i < layers.size() - 1; i++) {
int layer = layers[i].first;
const std::vector<float>& input_acts = *layers[i].second;
const std::vector<float>& output_acts = *layers[i+1].second;

// Check if activation sizes match
if (input_acts.size() != output_acts.size()) {
printf("%d\t(skipped - dimension mismatch: %zu vs %zu)\n",
layer, input_acts.size(), output_acts.size());
continue;
}

// Calculate dot product and magnitudes
float dot_product = 0.0f;
float input_magnitude = 0.0f;
float output_magnitude = 0.0f;

for (size_t j = 0; j < input_acts.size(); j++) {
dot_product += input_acts[j] * output_acts[j];
input_magnitude += input_acts[j] * input_acts[j];
output_magnitude += output_acts[j] * output_acts[j];
}

input_magnitude = sqrtf(input_magnitude);
output_magnitude = sqrtf(output_magnitude);

// Avoid division by zero
if (input_magnitude == 0 || output_magnitude == 0) {
printf("%d\t(skipped - zero magnitude)\n", layer);
continue;
}

// Calculate cosine similarity and LIM score
float cosine_sim = dot_product / (input_magnitude * output_magnitude);
float lim_score = -cosine_sim;

printf("%d\t%.4f\n", layer, lim_score);
}
}
}

static IMatrixCollector g_collector;

static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -683,10 +784,16 @@ int main(int argc, char ** argv) {

llama_print_timings(ctx);

if (params.compute_lim) {
g_collector.compute_lim();
}

llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}