@@ -166,6 +166,7 @@ static std::vector<float> compute_tensor_averages(const Stats & tstats) {
166166static bool compute_vector_statistics (std::vector<tensor_statistics> & tstats, const std::string & name, const Stats & e) {
167167 const size_t n_mat = e.counts .size ();
168168 const size_t len = e.activations .empty () ? e.values .size () : e.activations .size ();
169+ const bool legacy = e.activations .empty ();
169170 if (n_mat == 0 ) {
170171 LOG_ERR (" %s: there are no activations for tensor %s. The imatrix may be suboptimal\n " , __func__, name.c_str ());
171172 return false ;
@@ -174,91 +175,91 @@ static bool compute_vector_statistics(std::vector<tensor_statistics> & tstats, c
174175 LOG_ERR (" %s: activation size mismatch for tensor %s (len=%zu, counts=%zu)\n " , __func__, name.c_str (), len, n_mat);
175176 return false ;
176177 }
178+ if (!legacy && e.values .size () != len) {
179+ LOG_ERR (" %s: activations/values size mismatch for tensor %s (act=%zu, val=%zu)\n " , __func__, name.c_str (), len, e.values .size ());
180+ return false ;
181+ }
177182
178183 const size_t row_size = len / n_mat;
179- std::vector<float > activations;
180- activations.reserve (len);
181-
184+ double mean = 0.0 ;
185+ double M2 = 0.0 ;
186+ double sum = 0.0 ;
187+ float vmin = std::numeric_limits<float >::infinity ();
188+ float vmax = -std::numeric_limits<float >::infinity ();
189+ double energy_sum = 0.0 ;
190+ size_t valid_n = 0 ;
182191 for (size_t i = 0 ; i < n_mat; ++i) {
183192 const auto c = (float )e.counts [i];
193+ if (c <= 0 .0f ) { continue ; } // skip experts with zero count
184194 const size_t off = i * row_size;
185- if (c <= 0 .0f ) {
186- activations.insert (activations.end (), row_size, 0 .0f );
187- continue ;
188- }
189- if (e.activations .empty ()) {
190- for (size_t j = 0 ; j < row_size; ++j) {
191- activations.push_back (e.values [off + j] / c); // mean-of-squares
192- }
193- } else {
194- for (size_t j = 0 ; j < row_size; ++j) {
195- activations.push_back (e.activations [off + j] / c); // mean
196- }
195+
196+ for (size_t j = 0 ; j < row_size; ++j) {
197+ const double v_avg = legacy ? 0.0 : (double )e.activations [off + j] / (double )c; // E[x]
198+ const double v_energy = (double )e.values [off + j] / (double )c; // E[x^2]
199+ const double v = legacy ? v_energy : v_avg;
200+
201+ ++valid_n;
202+ sum += v;
203+ vmin = std::min (vmin, (float )v);
204+ vmax = std::max (vmax, (float )v);
205+
206+ const double delta = v - mean;
207+ mean += delta / (double )valid_n;
208+ M2 += delta * (v - mean);
209+ energy_sum += std::max (0.0 , v_energy);
197210 }
198211 }
199212
200- if (activations. empty () ) {
201- LOG_ERR (" %s: computed empty activation vector for tensor %s\n " , __func__, name.c_str ());
213+ if (valid_n == 0 ) {
214+ LOG_ERR (" %s: there are no activations for tensor %s. The imatrix may be suboptimal \n " , __func__, name.c_str ());
202215 return false ;
203216 }
204217
205- double sum = 0.0 ;
206- float vmax = activations[0 ];
207- float vmin = activations[0 ];
208- for (float v : activations) {
209- sum += v;
210- vmax = std::max (vmax, v);
211- vmin = std::min (vmin, v);
212- }
213-
214- const auto mean = (float )(sum / (double )activations.size ());
215- double sqr_sum = 0.0 ;
216- for (const float v : activations) { sqr_sum += (double )v * (double )v; }
217- double variance = sqr_sum / (double )activations.size () - (double )mean * (double )mean;
218- variance = std::max (variance, 0.0 );
219- const float std_deviation = std::sqrt ((float )variance);
220-
218+ float std_deviation = 0 .0f ;
221219 float entropy = 0 .0f ;
222- if (e.activations .empty ()) {
223- double energy_sum = 0.0 ;
224- for (float v : activations) { energy_sum += (double )std::max (0 .0f , v); }
225- if (energy_sum > 0.0 ) {
226- for (const float v : activations) {
227- const double p = std::max (0.0 , (double )v) / energy_sum;
228- if (p > 0.0 ) { entropy -= (float )(p * std::log2 (p)); }
229- }
230- }
231- } else {
232- double energy_sum = 0.0 ;
233- for (const float v : activations) { energy_sum += (double )v * (double )v; }
234- if (energy_sum > 0.0 ) {
235- for (const float v : activations) {
236- const double p = (double )v * (double )v / energy_sum;
220+ double zd_count = 0.0 ;
221+ double variance = valid_n > 1 ? M2 / ((double )valid_n - 1 ) : 0.0 ;
222+ variance = std::max (variance, 0.0 );
223+ std_deviation = std::sqrt ((float )variance);
224+ if (energy_sum > 0.0 ) {
225+ for (size_t i = 0 ; i < n_mat; ++i) {
226+ const auto c = (float )e.counts [i];
227+ if (c <= 0 .0f ) { continue ; }
228+ const size_t off = i * row_size;
229+ for (size_t j = 0 ; j < row_size; ++j) {
230+ const double v_energy = (double )e.values [off + j] / (double )c; // E[x^2]
231+ const double w = std::max (0.0 , v_energy);
232+ const double p = w / energy_sum;
237233 if (p > 0.0 ) { entropy -= (float )(p * std::log2 (p)); }
238234 }
239235 }
240236 }
241-
242- // ZD score: fraction with |z| > 1
243- double zd_count = 0.0 ;
244237 if (std_deviation > 0 .0f ) {
245- for (const float v : activations) {
246- const float z = (v - mean) / std_deviation;
247- if (std::fabs (z) > 1 .0f ) { zd_count += 1.0 ; }
238+ for (size_t i = 0 ; i < n_mat; ++i) {
239+ const float c = (float )e.counts [i];
240+ if (c <= 0 .0f ) { continue ; }
241+ const size_t off = i * row_size;
242+ for (size_t j = 0 ; j < row_size; ++j) {
243+ const double v_avg = legacy ? 0.0 : (double )e.activations [off + j] / (double )c; // E[x]
244+ const double v_energy = (double )e.values [off + j] / (double )c; // E[x^2]
245+ const float v = (float )(legacy ? v_energy : v_avg);
246+ const float z = (v - (float )mean) / std_deviation;
247+ if (std::fabs (z) > 1 .0f ) { zd_count += 1.0 ; }
248+ }
248249 }
249250 }
250251
251252 auto & ts = tstats.emplace_back ();
252253 ts.tensor = name;
253254 ts.stats = e;
254255 ts.sum_values = (float )sum;
255- ts.mean_values = mean;
256+ ts.mean_values = ( float ) mean;
256257 ts.max_values = vmax;
257258 ts.min_values = vmin;
258- ts.elements = ( int )activations. size () ;
259+ ts.elements = valid_n ;
259260 ts.std_deviation = std_deviation;
260261 ts.entropy = entropy;
261- ts.zd_score = ts. elements > 0 ? (float )(zd_count / (double )ts. elements ) : 0 . 0f ;
262+ ts.zd_score = (float )(zd_count / (double )valid_n) ;
262263
263264 return e.activations .empty ();
264265}
@@ -267,7 +268,7 @@ static void compute_tensor_statistics(std::vector<tensor_statistics> & tstats) {
267268 static const std::regex pattern (R"( blk\.(\d+)\.)" );
268269 for (auto & ts : tstats) {
269270 ts.cossim = 0 .0f ;
270- ts.l2_norm = 0 .0f ;
271+ ts.l2_dist = 0 .0f ;
271272
272273 if (std::smatch match; std::regex_search (ts.tensor , match, pattern)) {
273274 const int blk = std::stoi (match[1 ]);
@@ -309,7 +310,7 @@ static void compute_tensor_statistics(std::vector<tensor_statistics> & tstats) {
309310 ts.cossim = cs;
310311
311312 // Compute L2 Norm (Euclidean Distance)
312- ts.l2_norm = std::sqrt (l2_dist_sq);
313+ ts.l2_dist = std::sqrt (l2_dist_sq);
313314 }
314315 }
315316}
0 commit comments