@@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
     GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
 
-    // TODO: 4d? (is that even used in practice?)
-    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
-    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
-        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
-        GGML_ASSERT(false);
-    }
-
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
     // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
         GGML_ASSERT(ids->ne[1] == src1->ne[2]);
 
+        // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
+        if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
+            LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
+            GGML_ASSERT(false);
+        }
+
         m_ids.resize(ggml_nbytes(ids));
         ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
 
@@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
     } else {
         auto & e = m_stats[wname];
-        const int64_t n_mat = src1->ne[2] * src1->ne[3];
-
+        const int64_t n_mat = src0->ne[2] * src0->ne[3];
+
+        // use a single count per dense tensor
+        // (necessary when merging older GGUF-imatrix files with 3d tensors)
+        if (e.counts.size() > 1) {
+            bool all_equal = true;
+            for (size_t i = 1; i < e.counts.size(); ++i) {
+                if (e.counts[0] != e.counts[i]) {
+                    all_equal = false;
+                    break;
+                }
+            }
+            if (all_equal) {
+                e.counts.resize(1);
+            }
+        }
         if (e.values.empty()) {
             e.values.resize(src1->ne[0] * n_mat, 0);
-            e.counts.resize(n_mat, 0);
+            e.counts.resize(1, 0);
         }
         else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
             LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
             exit(1); // GGML_ABORT("fatal error");
         }
-        else if (e.counts.size() != (size_t)n_mat) {
-            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
-            exit(1); // GGML_ABORT("fatal error");
-        }
         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
+
         for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
             for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
-                const int64_t mat_id = i3 * src1->ne[2] + i2;
+                // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
+                const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
                 const int64_t mat_start = mat_id * src1->ne[0];
 
                 for (int64_t row = 0; row < src1->ne[1]; ++row) {
-                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
-                    e.counts[mat_id]++;
+                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
                     for (int64_t j = 0; j < src1->ne[0]; ++j) {
                         e.values[mat_start + j] += x[j] * x[j];
                         if (!std::isfinite((float)e.values[j])) {
@@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                         }
                     }
                 }
-                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
-                if (n_chunk > m_last_chunk) {
-                    const int32_t chunk_step = n_chunk - m_last_chunk;
-                    m_last_chunk = n_chunk;
-                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
-                        save_imatrix();
-                    }
-                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
-                        save_imatrix(m_last_chunk);
-                    }
+            }
+        }
+        // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
+        for (size_t i = 0; i < e.counts.size(); ++i) {
+            e.counts[i] += ggml_nrows(src1) / n_mat;
+            const int32_t n_chunk = e.counts[i] / chunk_size;
+            if (n_chunk > m_last_chunk) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
+                m_last_chunk = n_chunk;
+                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
+                    save_imatrix();
+                }
+                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                    save_imatrix(m_last_chunk);
+                }
                 }
             }
         }
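
As a reading aid, not part of the patch: a minimal, self-contained sketch of what the new mat_id expression does. Activation indices i2/i3 from src1 wrap around the model tensor's (src0) ne[2]/ne[3], so 3D+ activations paired with a dense 2D weight all flatten to mat_id 0, which is consistent with keeping a single count per dense tensor. The shapes below are hypothetical and chosen only for illustration.

// illustrative sketch with made-up shapes, not code from the patch
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t src1_ne2 = 4, src1_ne3 = 2; // activation slices (3d/4d src1)
    const int64_t src0_ne2 = 1, src0_ne3 = 1; // dense 2d model tensor

    for (int64_t i3 = 0; i3 < src1_ne3; ++i3) {
        for (int64_t i2 = 0; i2 < src1_ne2; ++i2) {
            // same expression as in the patch: wrap the activation indices
            // around the model tensor's ne[2]/ne[3]
            const int64_t mat_id = (i3 % src0_ne3) * src0_ne2 + (i2 % src0_ne2);
            printf("i3=%lld i2=%lld -> mat_id=%lld\n",
                   (long long) i3, (long long) i2, (long long) mat_id);
        }
    }
    return 0;
}

With a 2D weight every iteration prints mat_id=0; with a 3D expert tensor (ne[2] > 1) each expert keeps its own slice of e.values, as before.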