Skip to content

Commit 0a2f549

Browse files
authored
imatrix : fix 3d activation handling for hybrid and recurrent models (#14994)
* imatrix : use a single count for dense 3d tensors
* imatrix : fix 3d activations when model tensor is 2d
* imatrix : fix 3d tensor counts
1 parent 11a3811 commit 0a2f549

File tree

1 file changed

+41
-27
lines changed

1 file changed

+41
-27
lines changed

tools/imatrix/imatrix.cpp

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
250250
const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
251251
GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
252252

253-
// TODO: 4d? (is that even used in practice?)
254-
// the extra dimension would need to be stored somewhere to be reflected in the imatrix file
255-
if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
256-
LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
257-
GGML_ASSERT(false);
258-
}
259-
260253
// this has been adapted to the new format of storing merged experts in a single 3d tensor
261254
// ref: https://github.com/ggml-org/llama.cpp/pull/6387
262255
if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
272265

273266
GGML_ASSERT(ids->ne[1] == src1->ne[2]);
274267

268+
// the extra dimension would need to be stored somewhere to be reflected in the imatrix file
269+
if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
270+
LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
271+
GGML_ASSERT(false);
272+
}
273+
275274
m_ids.resize(ggml_nbytes(ids));
276275
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
277276

@@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
335334
}
336335
} else {
337336
auto & e = m_stats[wname];
338-
const int64_t n_mat = src1->ne[2] * src1->ne[3];
339-
337+
const int64_t n_mat = src0->ne[2] * src0->ne[3];
338+
339+
// use a single count per dense tensor
340+
// (necessary when merging older GGUF-imatrix files with 3d tensors)
341+
if (e.counts.size() > 1) {
342+
bool all_equal = true;
343+
for (size_t i = 1; i < e.counts.size(); ++i) {
344+
if (e.counts[0] != e.counts[i]) {
345+
all_equal = false;
346+
break;
347+
}
348+
}
349+
if (all_equal) {
350+
e.counts.resize(1);
351+
}
352+
}
340353
if (e.values.empty()) {
341354
e.values.resize(src1->ne[0] * n_mat, 0);
342-
e.counts.resize(n_mat, 0);
355+
e.counts.resize(1, 0);
343356
}
344357
else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
345358
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
346359
exit(1); //GGML_ABORT("fatal error");
347360
}
348-
else if (e.counts.size() != (size_t)n_mat) {
349-
LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
350-
exit(1); //GGML_ABORT("fatal error");
351-
}
352361
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
362+
353363
for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
354364
for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
355-
const int64_t mat_id = i3 * src1->ne[2] + i2;
365+
// handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
366+
const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
356367
const int64_t mat_start = mat_id * src1->ne[0];
357368

358369
for (int64_t row = 0; row < src1->ne[1]; ++row) {
359-
const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
360-
e.counts[mat_id]++;
370+
const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
361371
for (int64_t j = 0; j < src1->ne[0]; ++j) {
362372
e.values[mat_start + j] += x[j] * x[j];
363373
if (!std::isfinite((float)e.values[j])) {
@@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
366376
}
367377
}
368378
}
369-
const int32_t n_chunk = e.counts[mat_id] / chunk_size;
370-
if (n_chunk > m_last_chunk) {
371-
const int32_t chunk_step = n_chunk - m_last_chunk;
372-
m_last_chunk = n_chunk;
373-
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
374-
save_imatrix();
375-
}
376-
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
377-
save_imatrix(m_last_chunk);
378-
}
379+
}
380+
}
381+
// only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
382+
for (size_t i = 0; i < e.counts.size(); ++i) {
383+
e.counts[i] += ggml_nrows(src1) / n_mat;
384+
const int32_t n_chunk = e.counts[i] / chunk_size;
385+
if (n_chunk > m_last_chunk) {
386+
const int32_t chunk_step = n_chunk - m_last_chunk;
387+
m_last_chunk = n_chunk;
388+
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
389+
save_imatrix();
390+
}
391+
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
392+
save_imatrix(m_last_chunk);
379393
}
380394
}
381395
}

0 commit comments

Comments (0)