fix mul_mat_id out-of-bounds read

ngxson · ngxson · commit 40ab1ab30755 · 2025-03-30T14:51:29.000+02:00
diff --git a/examples/tts/convert_csm_to_gguf.py b/examples/tts/convert_csm_to_gguf.py
@@ -179,10 +179,10 @@ def rename_transformer(name: str) -> str:
             is_decoder = True
             name = "audio_head.weight"
             if component == "decoder":
-                # add padding at the beginning so that build_lora_mm_id can be used
+                # add zero padding at the beginning and at the end so that build_lora_mm_id can be used (the trailing pad keeps mul_mat_id from reading out of bounds)
                 zero_tensor = torch.zeros(1, 1024, 2051)
-                data_torch = torch.cat([zero_tensor, data_torch], dim=0)
-                assert data_torch.shape == (32, 1024, 2051)
+                data_torch = torch.cat([zero_tensor, data_torch, zero_tensor], dim=0)
+                assert data_torch.shape == (33, 1024, 2051)
                 # then, transpose it
                 data_torch = data_torch.transpose(1, 2)
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -1662,15 +1662,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 {
                     // TODO: maybe store these in gguf metadata
                     int64_t csm_audio_cbook_size = 2051; // audio codebook size
-                    int64_t csm_acoustic_tokens  = 32;   // equal to number of acoutic tokens for Mimi
+                    int64_t csm_audio_tokens     = 32;   // equal to number of audio tokens for Mimi
                     //int64_t csm_n_audio_vocab    = csm_audio_cbook_size*csm_acoustic_tokens;
 
                     csm_output_cbook = create_tensor(tn(LLM_TENSOR_CSM_CBOOK_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size}, TENSOR_NOT_REQUIRED);
 
                     bool is_backbone = csm_output_cbook != nullptr;
 
                     csm_output_audio = is_backbone ? nullptr
-                        : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size, csm_acoustic_tokens}, 0);
+                        : create_tensor(tn(LLM_TENSOR_CSM_AUDIO_OUTPUT, "weight"), {n_embd, csm_audio_cbook_size, csm_audio_tokens+1}, 0);
 
                     tok_embd = is_backbone
                         ? create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,     "weight"), {n_embd, n_vocab}, 0)