Commit 05dbbea

Authored by Iwan Kawrakow (ikawrakow)
imatrix: collect layer influence statistics (#328)
* imatrix: collect layer influence statistics

* imatrix: collect layer influence statistics also for the last layer

  For the last layer we need to use the input for the output.weight tensor.
  Last layer(s) tend(s) to be important, so it is useful to also have its influence metric.

* imatrix: separate metric for attention and ffn importance

* Use stripped tensor name, not src0->name

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
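Background note (not part of the commit message): as I read the diff below, a layer's "influence" is the mean cosine similarity between the activations entering that layer and the activations entering the next layer, accumulated over all rows (tokens) of every batch; the attention metric compares the layer input against the input of the layer's first ffn tensor, and the ffn metric compares that ffn input against the next layer's input. For the last layer, the input to the output.weight tensor stands in for the "next layer" input, as the message above states. A lower mean similarity means the block changes the hidden state more and is therefore ranked as more influential. The following self-contained C++ sketch (toy data, illustrative names only) shows the arithmetic:

#include <cmath>
#include <cstdio>
#include <vector>

// Same arithmetic as the cosine_similarity() helper added in this commit.
static double cosine_similarity(int n, const float * x, const float * y) {
    double sumxy = 0, sumx2 = 0, sumy2 = 0;
    for (int j = 0; j < n; ++j) {
        sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
    }
    return sumx2 > 0 && sumy2 > 0 ? sumxy/std::sqrt(sumx2*sumy2) : 0;
}

int main() {
    // Toy "hidden states" for 3 tokens of width 4: the activations entering a
    // layer and the activations entering the next layer (i.e. the layer's output).
    const int n = 4, nrow = 3;
    std::vector<float> layer_in  = { 1, 0, 0, 0,      0, 1, 0, 0,      0, 0, 1, 0 };
    std::vector<float> layer_out = { 1, 0.1f, 0, 0,   0, 1, 0.1f, 0,   0.5f, 0.5f, 0.5f, 0 };
    double sum = 0;
    for (int row = 0; row < nrow; ++row) {
        sum += cosine_similarity(n, layer_in.data() + row*n, layer_out.data() + row*n);
    }
    // A lower mean cosine similarity means the layer changes the activations
    // more, i.e. it is considered more influential.
    printf("<cos_sim> = %g\n", sum/nrow);
    return 0;
}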
Parent: 028e0cf · Commit: 05dbbea

File tree: 1 file changed, +169 −2 lines

examples/imatrix/imatrix.cpp

Lines changed: 169 additions & 2 deletions
@@ -19,6 +19,8 @@
 #include <fstream>
 #include <unordered_map>
 #include <algorithm>
+#include <optional>
+#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -49,13 +51,59 @@ class IMatrixCollector {
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
     bool load_imatrix(const char * file_name);
+    void set_collect_lsim(bool yes_or_no) { m_collect_lsim = yes_or_no; }
+    void print_layer_importance();
 private:
     std::unordered_map<std::string, Stats> m_stats;
     gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
+    int m_last_layer = 9999;
+    int m_last_ffn = -1;
     std::vector<float> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
+    std::vector<float> m_last_input;
+    std::vector<float> m_ffn_input;
+    std::vector<std::pair<double,int>> m_layer_sim;
+    std::vector<std::pair<double,int>> m_attn_sim;
+    std::vector<std::pair<double,int>> m_ffn_sim;
+    bool m_collect_lsim = false;
+
+    std::optional<int> layer_index(const std::string& name) const {
+        if (name == m_params.output_tensor_name && m_last_layer < 199) {
+            return m_last_layer + 1;
+        }
+        if (auto pos = name.find("blk."); pos == 0) {
+            pos += 4;
+            if (auto pos1 = name.find('.', pos); pos1 != std::string::npos) {
+                auto index_str = name.substr(pos, pos1 - pos);
+                std::istringstream str(index_str);
+                int index; str >> index;
+                if (!str.fail()) return index;
+            }
+        }
+        return std::nullopt;
+    }
+
+    static inline double cosine_similarity(int n, const float * x, const float * y) {
+        double sumxy = 0, sumx2 = 0, sumy2 = 0;
+        for (int j = 0; j < n; ++j) {
+            sumxy += x[j]*y[j]; sumx2 += x[j]*x[j]; sumy2 += y[j]*y[j];
+        }
+        double cos_sim = sumx2 > 0 && sumy2 > 0 ? sumxy/sqrt(sumx2*sumy2) : 0;
+        return cos_sim;
+    }
+
+    static inline void collect_cos_similarity(int nrow, int n, const float * x, const float * y, std::pair<double, int>& p) {
+        for (int row = 0; row < nrow; ++row) {
+            p.first += cosine_similarity(n, x, y);
+            p.second += 1;
+            x += n;
+            y += n;
+        }
+    }
+
+    static void print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim);
 };
 
 // remove any prefix and suffixes from the name
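Side note (mine, not part of the commit): layer_index() above maps a stripped tensor name to a block index by parsing the number after the "blk." prefix, and maps the output tensor (m_params.output_tensor_name) to m_last_layer + 1 so that the last layer also gets an influence entry. A minimal standalone sketch of the same name parsing, assuming the usual llama.cpp "blk.<index>.<suffix>" naming convention (the example tensor names are illustrative):

#include <cstdio>
#include <optional>
#include <sstream>
#include <string>

// Same parsing idea as IMatrixCollector::layer_index(), as a free function.
static std::optional<int> parse_block_index(const std::string & name) {
    if (auto pos = name.find("blk."); pos == 0) {   // names like "blk.7.attn_norm.weight"
        pos += 4;
        if (auto pos1 = name.find('.', pos); pos1 != std::string::npos) {
            std::istringstream str(name.substr(pos, pos1 - pos));
            int index; str >> index;
            if (!str.fail()) return index;
        }
    }
    return std::nullopt;
}

int main() {
    for (const char * name : { "blk.7.attn_norm.weight", "blk.31.ffn_down.weight", "token_embd.weight" }) {
        auto idx = parse_block_index(name);
        if (idx) printf("%-24s -> layer %d\n", name, *idx);
        else     printf("%-24s -> (no layer index)\n", name);
    }
    return 0;
}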
@@ -77,6 +125,45 @@ static std::string filter_tensor_name(const char * name) {
     return wname;
 }
 
+void IMatrixCollector::print_layer_importance(const char * msg, const std::vector<std::pair<double, int>>& sim) {
+    if (sim.empty()) return;
+    std::vector<std::pair<float, int>> layers;
+    layers.reserve(sim.size());
+    for (int i = 0; i < int(sim.size()); ++i) {
+        if (sim[i].second > 0) layers.emplace_back(float(std::abs(sim[i].first/sim[i].second)), i);
+    }
+    if (layers.empty()) return;
+    std::sort(layers.begin(), layers.end());
+    printf("%s\n", msg);
+    //printf("======================== sorted layer importances\n");
+    int j = 0;
+    for (auto& p : layers) {
+        int i = p.second;
+        printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, sim[i].first/sim[i].second);
+    }
+}
+
+void IMatrixCollector::print_layer_importance() {
+    print_layer_importance("\n======================== sorted layer importances", m_layer_sim);
+    print_layer_importance("\n======================== sorted attention importances", m_attn_sim);
+    print_layer_importance("\n======================== sorted ffn importances", m_ffn_sim);
+    //printf("%s: have %d layers\n", __func__, int(m_layer_sim.size()));
+    //if (m_layer_sim.empty()) return;
+    //std::vector<std::pair<float, int>> layers;
+    //layers.reserve(m_layer_sim.size());
+    //for (int i = 0; i < int(m_layer_sim.size()); ++i) {
+    //    if (m_layer_sim[i].second > 0) layers.emplace_back(float(std::abs(m_layer_sim[i].first/m_layer_sim[i].second)), i);
+    //}
+    //if (layers.empty()) return;
+    //std::sort(layers.begin(), layers.end());
+    //printf("======================== sorted layer importances\n");
+    //int j = 0;
+    //for (auto& p : layers) {
+    //    int i = p.second;
+    //    printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, m_layer_sim[i].first/m_layer_sim[i].second);
+    //}
+}
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
@@ -92,7 +179,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         // why are small batches ignored (<16 tokens)?
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
         //printf("wname = %s\n", wname.c_str());
-        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == m_params.output_tensor_name))) return false;
+        if (!(wname.substr(0, 4) == "blk." || ((m_params.process_output || m_collect_lsim) && wname == m_params.output_tensor_name))) return false;
         return true;
     }
 
@@ -108,6 +195,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
 
+    if (m_collect_lsim) {
+        if (wname.find(".ffn_") != std::string::npos) {
+            if (auto index = layer_index(wname); index.has_value() && *index == m_last_layer && *index != m_last_ffn) {
+                int n = src1->ne[0];
+                int nrow = t->op == GGML_OP_MUL_MAT_ID ? src1->ne[2] : src1->ne[1];
+                if (t->op == GGML_OP_MUL_MAT_ID) {
+                    GGML_ASSERT(src1->ne[1] == 1);
+                }
+                if (m_ffn_input.empty()) {
+                    m_ffn_input.resize(nrow*n);
+                } else {
+                    if ((int)m_ffn_input.size() != nrow*n) {
+                        printf("Oops, inconsistent ffn size\n"); exit(1);
+                    }
+                }
+                std::memcpy(m_ffn_input.data(), data, nrow*n*sizeof(float));
+                if (m_ffn_input.size() != m_last_input.size()) {
+                    printf("Oops, inconsistent ffn vs last_input size\n"); exit(1);
+                }
+                if (m_attn_sim.size() < *index + 1) m_attn_sim.resize(*index + 1);
+                auto& p = m_attn_sim[*index];
+                collect_cos_similarity(nrow, n, m_ffn_input.data(), m_last_input.data(), p);
+                m_last_ffn = *index;
+            }
+        }
+    }
+
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
     // ref: https://github.com/ggerganov/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -182,6 +296,39 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
         }
     } else {
+        if (m_collect_lsim) {
+            // We only need to do it here and not in the MoE branch above because the first tensor in a layer
+            // never is a MoE tensor
+            if (auto index = layer_index(wname); index.has_value()) {
+                if (*index != m_last_layer) {
+                    if (*index > 0) {
+                        if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
+                            printf("Oops: different size (%d vs %d). Tensor name was %s, m_last_layer = %d\n",
+                                    (int)(src1->ne[0]*src1->ne[1]), (int)m_last_input.size(), src0->name, m_last_layer);
+                            exit(1);
+                        }
+                        if (*index > m_layer_sim.size()) m_layer_sim.resize(*index);
+                        auto& p = m_layer_sim[*index - 1];
+                        collect_cos_similarity(src1->ne[1], src1->ne[0], m_last_input.data(), (const float *)data, p);
+                        if (*index == m_last_ffn + 1) {
+                            if (*index > m_ffn_sim.size()) m_ffn_sim.resize(*index);
+                            auto& p1 = m_ffn_sim[*index-1];
+                            collect_cos_similarity(src1->ne[1], src1->ne[0], m_ffn_input.data(), (const float *)data, p1);
+                        }
+                    }
+                    m_last_layer = *index;
+                    if (m_last_input.empty()) {
+                        m_last_input.resize(src1->ne[0]*src1->ne[1]);
+                    } else {
+                        if (m_last_input.size() != src1->ne[0]*src1->ne[1]) {
+                            printf("Oops\n"); exit(1);
+                        }
+                    }
+                    //printf("Copying src1 to m_last_input\n");
+                    std::memcpy(m_last_input.data(), data, src1->ne[0]*src1->ne[1]*sizeof(float));
+                }
+            }
+        }
         auto & e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
@@ -622,14 +769,33 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     params.verbosity = 1;
 
-    if (!gpt_params_parse(argc, argv, params)) {
+    bool lsim = false;
+    //
+    // Do not pollute common with totally imatrix specific arguments as it was done in mainline.
+    // Instead, parse imatrix specific args here, push unknown args into a new array of args,
+    // and pass that to gpt_params_parse().
+    //
+    std::vector<char*> args;
+    args.reserve(argc);
+    args.push_back(argv[0]);
+    for (int i = 1; i < argc; ++i) {
+        std::string arg{argv[i]};
+        if (arg == "-lsim" || arg == "--layer-similarity") {
+            lsim = true;
+        } else {
+            args.push_back(argv[i]);
+        }
+    }
+
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
         print_usage(argc, argv, params);
         return 1;
     }
 
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     g_collector.set_params(params);
+    g_collector.set_collect_lsim(lsim);
 
     for (const auto & in_file : params.in_files) {
         printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
@@ -680,6 +846,7 @@ int main(int argc, char ** argv) {
     }
 
     g_collector.save_imatrix();
+    g_collector.print_layer_importance();
 
     llama_print_timings(ctx);
 
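Usage note (mine, not part of the commit text): the new statistics are opt-in. Passing -lsim or --layer-similarity on the imatrix command line, alongside the usual model and data arguments, turns on collection; the flag is stripped from the argument list before gpt_params_parse() sees the remaining args, exactly as the comment in main() describes. When enabled, three sorted tables are printed right after save_imatrix(): whole-layer, attention, and ffn importances, with the layers whose mean cosine similarity is lowest (i.e. the ones that change the activations the most) listed first. The exact binary name and any other flags depend on your build of the imatrix example.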