From 6c7a441161080551ce8a52ba32563b6295067192 Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sun, 3 Aug 2025 07:23:57 -0500
Subject: [PATCH 1/6] vulkan: Use coopmat2 for conv2d (#14982)

---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 13 +++-
 .../ggml-vulkan/vulkan-shaders/conv2d_mm.comp | 62 ++++++++++++++++---
 .../vulkan-shaders/vulkan-shaders-gen.cpp | 3 +
 3 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index e095b26a48471..3682ee3804784 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -3096,6 +3096,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     uint32_t conv2d_SHMEM_PAD = 4;
     bool conv2d_UNROLL = true;
 
+    if (device->coopmat2) {
+        conv2d_SHMEM_PAD = 8; // 8 float16_t
+    }
+
     if (device->vendor_id == VK_VENDOR_ID_INTEL) {
         conv2d_SHMEM_PAD = 0;
         conv2d_UNROLL = false;
@@ -3154,7 +3158,14 @@ static void ggml_vk_load_shaders(vk_device& device) {
         std::array wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
         std::vector spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
 
-        if (conv2d_UNROLL) {
+        if (device->coopmat2) {
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_cm2_len, conv2d_f32_cm2_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+            ggml_vk_create_pipeline(
+                device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_cm2_len, conv2d_f16_f32_cm2_data, "main", 3,
+                sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
+        } else if (conv2d_UNROLL) {
             ggml_vk_create_pipeline(
                 device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
                 sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
index 04a10c012f4fe..86bafba4a4398 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp
@@ -1,6 +1,11 @@
 #version 450
 
 #extension GL_EXT_control_flow_attributes : enable
+#ifdef COOPMAT2
+#extension GL_NV_cooperative_matrix2 : enable
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_KHR_memory_scope_semantics : enable
+#endif
 
 #ifdef USE_COLLECTIVES
 # extension GL_KHR_shader_subgroup_shuffle : enable
@@ -91,6 +96,12 @@ uint32_t n_elems_out = K * NPQ;
 
 // Number of blocktiles per input
 uint32_t NB_CRS = splitWork(CRS, BS_CRS);
 
+#ifdef COOPMAT2
+#define SHMEM_TYPE float16_t
+#else
+#define SHMEM_TYPE float
+#endif
+
 const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
 const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
@@ -100,8 +111,8 @@ const uint32_t Bsh_numel = BS_CRS * BS_NPQ;
 const uint32_t Ash_len = BS_K * Ash_stride;
 const uint32_t Bsh_len = BS_CRS * Bsh_stride;
 
-shared float Ash[Ash_len]; // K x CRS
-shared float Bsh[Bsh_len]; // CRS x NPQ
+shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
+shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
 
 // Threadtile sizes
 const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
@@ -110,10 +121,6 @@ const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
 const uint32_t NT_K = BS_K / TS_K;
 const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
 
-float regA[TS_K];
-float regB[TS_NPQ];
-float regC[TS_K][TS_NPQ];
-
 /*
 Compute
 KxCRS @ CRSxNPQ = K x NPQ
@@ -145,12 +152,36 @@ uint fastdiv(uint n, uint mp, uint L) {
     return (msbs + n) >> L;
 }
 
+#ifdef COOPMAT2
+#define ACC_TYPE float16_t
+
+ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem)
+{
+    uint32_t K_idx = B_idx_K * BS_K + r;
+    uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c;
+    uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
+    uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
+    uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
+    uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
+    if (K_idx < K && NPQ_idx < NPQ) {
+        dst_data[dst_idx] = D_TYPE(elem);
+    }
+    return elem;
+}
+#endif
+
 void main() {
+#ifdef COOPMAT2
+    coopmat matC;
+    matC = coopmat(0.0);
+#else
+    float regC[TS_K][TS_NPQ];
     for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
         for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
             regC[T_ly][T_lx] = 0.0;
         }
     }
+#endif
     /* Advance block in CRS dim */
     for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
         uint32_t CRS_idx_a;
@@ -199,7 +230,7 @@ void main() {
             if (K_idx >= K || CRS_idx_a >= CRS) {
                 val = 0.0;
             }
-            Ash[B_ly * Ash_stride + B_lx] = val;
+            Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
         }
         /* Load input to B_block: (BS_CRS x BS_NPQ) */
         UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
@@ -244,11 +275,21 @@ void main() {
             if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
                 val = 0.0;
             }
-            Bsh[B_ly * Bsh_stride + B_lx] = val;
+            Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
         }
         barrier();
+#ifdef COOPMAT2
+        coopmat matA;
+        coopmat matB;
+
+        coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
+        coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
+        matC = coopMatMulAdd(matA, matB, matC);
+#else
         if (T_y * TS_K < K) {
             UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
+                float regA[TS_K];
+                float regB[TS_NPQ];
                 for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
                     regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
                 }
@@ -262,9 +303,13 @@ void main() {
                 }
             }
         }
+#endif
         barrier();
     }
     /* Save C* */
+#ifdef COOPMAT2
+    coopMatPerElementNV(matC, matC, perElemOpStore);
+#else
     if (T_y * TS_K < K) {
         for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
             for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
@@ -280,4 +325,5 @@ void main() {
             }
         }
     }
+#endif
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index b634e52d64d37..83e4a7c723d32 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -661,6 +661,9 @@ void process_shaders() {
 
     string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
     string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
+    string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
+    string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
+
     string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
     string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));

From 83bc2f288c0e08e676d9beca9c4669197e920593 Mon Sep 17 00:00:00 2001
From: Gabriel Larson <55459720+gabriellarson@users.noreply.github.com>
Date: Sun, 3 Aug 2025 09:56:25 -0500
Subject: [PATCH 2/6] model : add text-only support for Kimi-VL (and find special tokens in text_config) (#15051)

* basic kimi-vl textmodel conversion

* check config["text_config"] for special tokens
---
 convert_hf_to_gguf.py | 8 ++++++++
 gguf-py/gguf/vocab.py | 6 +++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f13f8558b2ec4..5f15c8257cbef 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6059,6 +6059,7 @@ def prepare_tensors(self):
 
 @ModelBase.register("DeepseekV2ForCausalLM")
 @ModelBase.register("DeepseekV3ForCausalLM")
+@ModelBase.register("KimiVLForConditionalGeneration")
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
@@ -6161,6 +6162,13 @@ def set_gguf_parameters(self):
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # skip vision tensors and remove "language_model." for Kimi-VL
+        if "vision_tower" in name or "multi_modal_projector" in name:
+            return []
+
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
         # rename e_score_correction_bias tensors
         if name.endswith("e_score_correction_bias"):
             name = name.replace("e_score_correction_bias", "e_score_correction.bias")
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index e1d5aaf47ac46..7111557bfdd8c 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -312,7 +312,11 @@ def _try_load_from_config_json(self, path: Path) -> bool:
 
         with open(config_file, encoding = 'utf-8') as f:
             config = json.load(f)
         for typ in self.special_token_types:
-            self._set_special_token(typ, config.get(f'{typ}_token_id'))
+            token_id = config.get(f'{typ}_token_id')
+            # If not found at root, check in text_config (for multimodal models like Kimi-VL)
+            if token_id is None and 'text_config' in config:
+                token_id = config['text_config'].get(f'{typ}_token_id')
+            self._set_special_token(typ, token_id)
         return True
 

From 97366dc6abdd0bdc74260bd3c42bd06f0feb7428 Mon Sep 17 00:00:00 2001
From: Csaba Kecskemeti
Date: Sun, 3 Aug 2025 12:38:18 -0700
Subject: [PATCH 3/6] vocab : JetBrains Mellum pre-tokenizer (#15045)

---
 convert_hf_to_gguf.py | 3 +++
 convert_hf_to_gguf_update.py | 1 +
 src/llama-vocab.cpp | 3 ++-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5f15c8257cbef..9303a047694f5 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -852,6 +852,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
             res = "exaone4"
+        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
+            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
+            res = "mellum"
 
         if res is None:
             logger.warning("\n")
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 211b81ff34088..226805f1e1ff8 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -138,6 +138,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
     {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
     {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
+    {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 7b7a93566027a..959c86a14745f 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1856,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "gigachat" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "a.x-4.0") {
+                tokenizer_pre == "a.x-4.0" ||
+                tokenizer_pre == "mellum") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                 tokenizer_pre == "jina-v1-en" ||

From 11a3811164ef2d75393c6b0a632f4c608e3e3dd2 Mon Sep 17 00:00:00 2001
From: compilade
Date: Sun, 3 Aug 2025 15:43:07 -0400
Subject: [PATCH 4/6] memory : handle kv_unified for hybrid models (#15050)

---
 src/llama-memory-hybrid.cpp | 3 ++-
 src/llama-memory-hybrid.h | 1 +
 src/llama-model.cpp | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index d8e2086c87514..e98b4e3546959 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
                     /* common */
             uint32_t n_seq_max,
                 bool offload,
+                bool unified,
                     /* layer filters */
     layer_filter_cb && filter_attn,
     layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
             type_v,
             v_trans,
             offload,
-            1,
+            unified,
             kv_size,
             n_seq_max,
             n_pad,
diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h
index 4ac318175785e..c2d56cd541594 100644
--- a/src/llama-memory-hybrid.h
+++ b/src/llama-memory-hybrid.h
@@ -39,6 +39,7 @@ class llama_memory_hybrid : public llama_memory_i {
                     /* common */
             uint32_t n_seq_max,
                 bool offload,
+                bool unified,
                     /* layer filters */
     layer_filter_cb && filter_attn = nullptr,
     layer_filter_cb && filter_recr = nullptr);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 6b58fb8a059f4..60a615c159a51 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -17598,6 +17598,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
                         /* n_seq_max */ cparams.n_seq_max,
                         /* offload */ cparams.offload_kqv,
+                        /* unified */ cparams.kv_unified,
                         /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
                         /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
             } else {

From 0a2f5496bef9e54e5f42d6c2c3ad9eb7b379aed0 Mon Sep 17 00:00:00 2001
From: compilade
Date: Sun, 3 Aug 2025 15:49:13 -0400
Subject: [PATCH 5/6] imatrix : fix 3d activation handling for hybrid and recurrent models (#14994)

* imatrix : use a single count for dense 3d tensors

* imatrix : fix 3d activations when model tensor is 2d

* imatrix : fix 3d tensor counts
---
 tools/imatrix/imatrix.cpp | 68 +++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index 9aad3711bae54..f5262e5e83da9 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
     GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
 
-    // TODO: 4d? (is that even used in practice?)
-    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
-    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
-        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
-        GGML_ASSERT(false);
-    }
-
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
     // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 
         GGML_ASSERT(ids->ne[1] == src1->ne[2]);
 
+        // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
+        if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
+            LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
+            GGML_ASSERT(false);
+        }
+
         m_ids.resize(ggml_nbytes(ids));
         ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
 
@@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
     } else {
         auto & e = m_stats[wname];
-        const int64_t n_mat = src1->ne[2] * src1->ne[3];
-
+        const int64_t n_mat = src0->ne[2] * src0->ne[3];
+
+        // use a single count per dense tensor
+        // (necessary when merging older GGUF-imatrix files with 3d tensors)
+        if (e.counts.size() > 1) {
+            bool all_equal = true;
+            for (size_t i = 1; i < e.counts.size(); ++i) {
+                if (e.counts[0] != e.counts[i]) {
+                    all_equal = false;
+                    break;
+                }
+            }
+            if (all_equal) {
+                e.counts.resize(1);
+            }
+        }
         if (e.values.empty()) {
             e.values.resize(src1->ne[0] * n_mat, 0);
-            e.counts.resize(n_mat, 0);
+            e.counts.resize(1, 0);
         }
         else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
            exit(1); //GGML_ABORT("fatal error");
        }
-        else if (e.counts.size() != (size_t)n_mat) {
-           LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
-           exit(1); //GGML_ABORT("fatal error");
-        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
+
        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
-                const int64_t mat_id = i3 * src1->ne[2] + i2;
+                // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
+                const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
                const int64_t mat_start = mat_id * src1->ne[0];
                for (int64_t row = 0; row < src1->ne[1]; ++row) {
-                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
-                    e.counts[mat_id]++;
+                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[mat_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[j])) {
@@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                        }
                    }
                }
-                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
-                if (n_chunk > m_last_chunk) {
-                    const int32_t chunk_step = n_chunk - m_last_chunk;
-                    m_last_chunk = n_chunk;
-                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
-                        save_imatrix();
-                    }
-                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
-                        save_imatrix(m_last_chunk);
-                    }
+            }
+        }
+        // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
+        for (size_t i = 0; i < e.counts.size(); ++i) {
+            e.counts[i] += ggml_nrows(src1) / n_mat;
+            const int32_t n_chunk = e.counts[i] / chunk_size;
+            if (n_chunk > m_last_chunk) {
+                const int32_t chunk_step = n_chunk - m_last_chunk;
+                m_last_chunk = n_chunk;
+                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
+                    save_imatrix();
+                }
+                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
+                    save_imatrix(m_last_chunk);
+                }
            }
        }
    }

From d31192b4ee1441bbbecd3cbf9e02633368bdc4f5 Mon Sep 17 00:00:00 2001
From: compilade
Date: Sun, 3 Aug 2025 16:00:05 -0400
Subject: [PATCH 6/6] imatrix : use GGUF by default (#14842)

* imatrix : use GGUF by default

* imatrix : use GGUF regardless of the output filename

The legacy format can only be produced with --output-format dat
---
 common/arg.cpp | 9 +++++++++
 common/common.h | 1 +
 tools/imatrix/README.md | 12 +++++++++---
 tools/imatrix/imatrix.cpp | 8 ++++----
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index cd853119131e9..0b216ec0d0c02 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2647,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_out_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--output-format"}, "{gguf,dat}",
+        string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "gguf") { params.imat_dat = false; }
+            else if (value == "dat") { params.imat_dat = true; }
+            else { throw std::invalid_argument("invalid output format"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--save-frequency"}, "N",
         string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
diff --git a/common/common.h b/common/common.h
index b8b01a7e99790..6b900b795f438 100644
--- a/common/common.h
+++ b/common/common.h
@@ -439,6 +439,7 @@ struct common_params {
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
+    bool imat_dat = false; // whether the legacy imatrix.dat format should be output
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md
index 7417a2dec9e6c..4505cb4ce8c7d 100644
--- a/tools/imatrix/README.md
+++ b/tools/imatrix/README.md
@@ -7,7 +7,7 @@ More information is available in
 =2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
 * `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf".
 * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
 * `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 * `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
@@ -45,14 +46,19 @@ Recent versions of `llama-imatrix` store data in GGUF format by default. For the
 
 ```bash
 # generate and save the imatrix using legacy format
-./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -o imatrix-legcy-format.dat -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legcy-format.dat -ngl 99
 ```
 
 ```bash
-# covert legacy (binary) imatrix format to new (GGUF) format
+# convert legacy (binary) imatrix format to new (GGUF) format
 ./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
 ```
 
+```bash
+# convert new (GGUF) imatrix format to legacy (binary) format
+./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat
+```
+
 ```bash
 # combine existing imatrices
 ./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf
diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index f5262e5e83da9..9ceceb478df4f 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -26,7 +26,7 @@ static void print_usage(int, char ** argv) {
     LOG("\nexample usage:\n");
     LOG("\n %s \\\n"
-        " -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
+        " -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
         " [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
         " [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
         " [--show-statistics] [...]\n" , argv[0]);
@@ -506,13 +506,13 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
 
 void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
     auto fname = m_params.out_file;
+    bool use_legacy_format = m_params.imat_dat;
 
-    // TODO: use the new format in more cases
-    if (!string_ends_with(fname, ".gguf")) {
-        LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
+    if (use_legacy_format) {
         this->save_imatrix_legacy(n_chunk);
         return;
     }
+    // else, default to GGUF imatrix
 
     if (n_chunk > 0) {
         fname += ".at_";