1919#include < fstream>
2020#include < unordered_map>
2121#include < algorithm>
22+ #include < optional>
23+ #include < sstream>
2224
2325#if defined(_MSC_VER)
2426#pragma warning(disable: 4244 4267) // possible loss of data
@@ -49,13 +51,59 @@ class IMatrixCollector {
4951 bool collect_imatrix (struct ggml_tensor * t, bool ask, void * user_data);
5052 void save_imatrix (int ncall = -1 ) const ;
5153 bool load_imatrix (const char * file_name);
54+ void set_collect_lsim (bool yes_or_no) { m_collect_lsim = yes_or_no; }
55+ void print_layer_importance ();
5256private:
5357 std::unordered_map<std::string, Stats> m_stats;
5458 gpt_params m_params;
5559 std::mutex m_mutex;
5660 int m_last_call = 0 ;
61+ int m_last_layer = 9999 ;
62+ int m_last_ffn = -1 ;
5763 std::vector<float > m_src1_data;
5864 std::vector<char > m_ids; // the expert ids from ggml_mul_mat_id
65+ std::vector<float > m_last_input;
66+ std::vector<float > m_ffn_input;
67+ std::vector<std::pair<double ,int >> m_layer_sim;
68+ std::vector<std::pair<double ,int >> m_attn_sim;
69+ std::vector<std::pair<double ,int >> m_ffn_sim;
70+ bool m_collect_lsim = false ;
71+
72+ std::optional<int > layer_index (const std::string& name) const {
73+ if (name == m_params.output_tensor_name && m_last_layer < 199 ) {
74+ return m_last_layer + 1 ;
75+ }
76+ if (auto pos = name.find (" blk." ); pos == 0 ) {
77+ pos += 4 ;
78+ if (auto pos1 = name.find (' .' , pos); pos1 != std::string::npos) {
79+ auto index_str = name.substr (pos, pos1 - pos);
80+ std::istringstream str (index_str);
81+ int index; str >> index;
82+ if (!str.fail ()) return index;
83+ }
84+ }
85+ return std::nullopt ;
86+ }
87+
// Cosine similarity of two float vectors of length n.
//
//   n  number of elements in each vector (n <= 0 is treated as empty)
//   x  first vector,  at least n readable floats
//   y  second vector, at least n readable floats
//
// Returns sum(x*y) / sqrt(sum(x*x) * sum(y*y)), or 0 when either vector has
// zero norm (including the empty case).
//
// Every element is widened to double *before* it is multiplied. Multiplying
// two floats yields a float, so the previous form could overflow to inf
// (result NaN) or underflow to 0 (vector wrongly reported as zero) before the
// value ever reached the double accumulators.
static inline double cosine_similarity(int n, const float * x, const float * y) {
    double sumxy = 0, sumx2 = 0, sumy2 = 0;
    for (int j = 0; j < n; ++j) {
        const double xj = x[j];
        const double yj = y[j];
        sumxy += xj*yj;
        sumx2 += xj*xj;
        sumy2 += yj*yj;
    }
    if (!(sumx2 > 0 && sumy2 > 0)) {
        return 0;
    }
    return sumxy/sqrt(sumx2*sumy2);
}
96+
97+ static inline void collect_cos_similarity (int nrow, int n, const float * x, const float * y, std::pair<double , int >& p) {
98+ for (int row = 0 ; row < nrow; ++row) {
99+ p.first += cosine_similarity (n, x, y);
100+ p.second += 1 ;
101+ x += n;
102+ y += n;
103+ }
104+ }
105+
106+ static void print_layer_importance (const char * msg, const std::vector<std::pair<double , int >>& sim);
59107};
60108
61109// remove any prefix and suffixes from the name
@@ -77,6 +125,45 @@ static std::string filter_tensor_name(const char * name) {
77125 return wname;
78126}
79127
128+ void IMatrixCollector::print_layer_importance (const char * msg, const std::vector<std::pair<double , int >>& sim) {
129+ if (sim.empty ()) return ;
130+ std::vector<std::pair<float , int >> layers;
131+ layers.reserve (sim.size ());
132+ for (int i = 0 ; i < int (sim.size ()); ++i) {
133+ if (sim[i].second > 0 ) layers.emplace_back (float (std::abs (sim[i].first /sim[i].second )), i);
134+ }
135+ if (layers.empty ()) return ;
136+ std::sort (layers.begin (), layers.end ());
137+ printf (" %s\n " , msg);
138+ // printf("======================== sorted layer importances\n");
139+ int j = 0 ;
140+ for (auto & p : layers) {
141+ int i = p.second ;
142+ printf (" %3d: Layer %3d, <cos_sim> = %g\n " , j++, i, sim[i].first /sim[i].second );
143+ }
144+ }
145+
146+ void IMatrixCollector::print_layer_importance () {
147+ print_layer_importance (" \n ======================== sorted layer importances" , m_layer_sim);
148+ print_layer_importance (" \n ======================== sorted attention importances" , m_attn_sim);
149+ print_layer_importance (" \n ======================== sorted ffn importances" , m_ffn_sim);
150+ // printf("%s: have %d layers\n", __func__, int(m_layer_sim.size()));
151+ // if (m_layer_sim.empty()) return;
152+ // std::vector<std::pair<float, int>> layers;
153+ // layers.reserve(m_layer_sim.size());
154+ // for (int i = 0; i < int(m_layer_sim.size()); ++i) {
155+ // if (m_layer_sim[i].second > 0) layers.emplace_back(float(std::abs(m_layer_sim[i].first/m_layer_sim[i].second)), i);
156+ // }
157+ // if (layers.empty()) return;
158+ // std::sort(layers.begin(), layers.end());
159+ // printf("======================== sorted layer importances\n");
160+ // int j = 0;
161+ // for (auto& p : layers) {
162+ // int i = p.second;
163+ // printf("%3d: Layer %3d, <cos_sim> = %g\n", j++, i, m_layer_sim[i].first/m_layer_sim[i].second);
164+ // }
165+ }
166+
80167bool IMatrixCollector::collect_imatrix (struct ggml_tensor * t, bool ask, void * user_data) {
81168 GGML_UNUSED (user_data);
82169
@@ -92,7 +179,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
92179 // why are small batches ignored (<16 tokens)?
93180 if (src1->ne [1 ] < 16 || src1->type != GGML_TYPE_F32) return false ;
94181 // printf("wname = %s\n", wname.c_str());
95- if (!(wname.substr (0 , 4 ) == " blk." || (m_params.process_output && wname == m_params.output_tensor_name ))) return false ;
182+ if (!(wname.substr (0 , 4 ) == " blk." || (( m_params.process_output || m_collect_lsim) && wname == m_params.output_tensor_name ))) return false ;
96183 return true ;
97184 }
98185
@@ -108,6 +195,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
108195
109196 const float * data = is_host ? (const float *) src1->data : m_src1_data.data ();
110197
198+ if (m_collect_lsim) {
199+ if (wname.find (" .ffn_" ) != std::string::npos) {
200+ if (auto index = layer_index (wname); index.has_value () && *index == m_last_layer && *index != m_last_ffn) {
201+ int n = src1->ne [0 ];
202+ int nrow = t->op == GGML_OP_MUL_MAT_ID ? src1->ne [2 ] : src1->ne [1 ];
203+ if (t->op == GGML_OP_MUL_MAT_ID) {
204+ GGML_ASSERT (src1->ne [1 ] == 1 );
205+ }
206+ if (m_ffn_input.empty ()) {
207+ m_ffn_input.resize (nrow*n);
208+ } else {
209+ if ((int )m_ffn_input.size () != nrow*n) {
210+ printf (" Oops, inconsistent ffn size\n " ); exit (1 );
211+ }
212+ }
213+ std::memcpy (m_ffn_input.data (), data, nrow*n*sizeof (float ));
214+ if (m_ffn_input.size () != m_last_input.size ()) {
215+ printf (" Oops, inconsistent ffn vs last_input size\n " ); exit (1 );
216+ }
217+ if (m_attn_sim.size () < *index + 1 ) m_attn_sim.resize (*index + 1 );
218+ auto & p = m_attn_sim[*index];
219+ collect_cos_similarity (nrow, n, m_ffn_input.data (), m_last_input.data (), p);
220+ m_last_ffn = *index;
221+ }
222+ }
223+ }
224+
111225 // this has been adapted to the new format of storing merged experts in a single 3d tensor
112226 // ref: https://github.com/ggerganov/llama.cpp/pull/6387
113227 if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -182,6 +296,39 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
182296 }
183297 }
184298 } else {
299+ if (m_collect_lsim) {
300+ // We only need to do it here and not in the MoE branch above because the first tensor in a layer
301+ // never is a MoE tensor
302+ if (auto index = layer_index (wname); index.has_value ()) {
303+ if (*index != m_last_layer) {
304+ if (*index > 0 ) {
305+ if (m_last_input.size () != src1->ne [0 ]*src1->ne [1 ]) {
306+ printf (" Oops: different size (%d vs %d). Tensor name was %s, m_last_layer = %d\n " ,
307+ (int )(src1->ne [0 ]*src1->ne [1 ]), (int )m_last_input.size (), src0->name , m_last_layer);
308+ exit (1 );
309+ }
310+ if (*index > m_layer_sim.size ()) m_layer_sim.resize (*index);
311+ auto & p = m_layer_sim[*index - 1 ];
312+ collect_cos_similarity (src1->ne [1 ], src1->ne [0 ], m_last_input.data (), (const float *)data, p);
313+ if (*index == m_last_ffn + 1 ) {
314+ if (*index > m_ffn_sim.size ()) m_ffn_sim.resize (*index);
315+ auto & p1 = m_ffn_sim[*index-1 ];
316+ collect_cos_similarity (src1->ne [1 ], src1->ne [0 ], m_ffn_input.data (), (const float *)data, p1);
317+ }
318+ }
319+ m_last_layer = *index;
320+ if (m_last_input.empty ()) {
321+ m_last_input.resize (src1->ne [0 ]*src1->ne [1 ]);
322+ } else {
323+ if (m_last_input.size () != src1->ne [0 ]*src1->ne [1 ]) {
324+ printf (" Oops\n " ); exit (1 );
325+ }
326+ }
327+ // printf("Copying src1 to m_last_input\n");
328+ std::memcpy (m_last_input.data (), data, src1->ne [0 ]*src1->ne [1 ]*sizeof (float ));
329+ }
330+ }
331+ }
185332 auto & e = m_stats[wname];
186333 if (e.values .empty ()) {
187334 e.values .resize (src1->ne [0 ], 0 );
@@ -622,14 +769,33 @@ int main(int argc, char ** argv) {
622769 params.logits_all = true ;
623770 params.verbosity = 1 ;
624771
625- if (!gpt_params_parse (argc, argv, params)) {
772+ bool lsim = false ;
773+ //
774+ // Do not pollute common with totally imatrix specific arguments as it was done in mainline.
775+ // Instead, parse imatrix specific args here, push unknown args into a new array of args,
776+ // and pass that to gpt_params_parse().
777+ //
778+ std::vector<char *> args;
779+ args.reserve (argc);
780+ args.push_back (argv[0 ]);
781+ for (int i = 1 ; i < argc; ++i) {
782+ std::string arg{argv[i]};
783+ if (arg == " -lsim" || arg == " --layer-similarity" ) {
784+ lsim = true ;
785+ } else {
786+ args.push_back (argv[i]);
787+ }
788+ }
789+
790+ if (!gpt_params_parse (args.size (), args.data (), params)) {
626791 print_usage (argc, argv, params);
627792 return 1 ;
628793 }
629794
630795 params.n_batch = std::min (params.n_batch , params.n_ctx );
631796
632797 g_collector.set_params (params);
798+ g_collector.set_collect_lsim (lsim);
633799
634800 for (const auto & in_file : params.in_files ) {
635801 printf (" %s : loading imatrix from '%s'\n " , __func__, in_file.c_str ());
@@ -680,6 +846,7 @@ int main(int argc, char ** argv) {
680846 }
681847
682848 g_collector.save_imatrix ();
849+ g_collector.print_layer_importance ();
683850
684851 llama_print_timings (ctx);
685852
0 commit comments