### Name and Version

```console
$ ./build/bin/llama-cli --version
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA GeForce RTX 4070, compute capability 8.9, VMM: yes
register_backend: registered backend CUDA (1 devices)
register_device: registered device CUDA0 (NVIDIA GeForce RTX 4070)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (12th Gen Intel(R) Core(TM) i7-1260P)
version: 4476 (504af20e)
built with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
```

### Operating systems

Linux
### Which llama.cpp modules do you know to be affected?
libllama (core library)
### Command line

```console
$ build/bin/llama-tts -m ./models/outetts-0.2-0.5B-q8_0.gguf -ngl 99 -mv ./models/wavtokenizer-large-75-f16.gguf -p "What is LoRA"
...
print_info: file format = GGUF V3 (latest)
print_info: file type = F16
print_info: file size = 124.15 MiB (16.03 BPW)
print_info: arch = wavtokenizer-dec
print_info: vocab_only = 0
print_info: n_ctx_train = 8192
print_info: n_embd = 1282
print_info: n_layer = 12
print_info: n_head = 1
print_info: n_head_kv = 1
print_info: n_rot = 1282
print_info: n_swa = 0
print_info: n_embd_head_k = 1282
print_info: n_embd_head_v = 1282
print_info: n_gqa = 1
print_info: n_embd_k_gqa = 1282
print_info: n_embd_v_gqa = 1282
print_info: f_norm_eps = 1.0e-06
print_info: f_norm_rms_eps = 0.0e+00
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: n_ff = 2304
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 0
print_info: pooling type = 0
print_info: rope type = -1
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 8192
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 0
print_info: ssm_d_inner = 0
print_info: ssm_d_state = 0
print_info: ssm_dt_rank = 0
print_info: ssm_dt_b_c_rms = 0
print_info: model type = ?B
print_info: model params = 64.98 M
print_info: general.name = WavTokenizer Large Speech 75token
print_info: vocab type = no vocab
print_info: n_vocab = 0
print_info: n_merges = 0
print_info: max token length = 0
llama_model_load: error loading model: check_tensor_dims: tensor 'token_embd.weight' has wrong shape; expected 512, 0, got 512, 4096, 1, 1
llama_model_load_from_file: failed to load model
common_init_from_params: failed to load model './models/wavtokenizer-large-75-f16.gguf'
```

### Problem description & steps to reproduce
This voice decoder model does not have a vocabulary: there is no `tokenizer.ggml.tokens` array, so the value returned from `model.vocab.n_tokens()` will be zero.

But the model does have the following property: `wavtokenizer-dec.vocab_size`. And this matches the expected size of `tok_embd`:

```
111: 2097152 | 512, 4096, 1, 1 | F16 | token_embd.weight
```

So perhaps this value should be set as an hparam when these types of models are loaded.
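For reference, both observations can be confirmed directly with ggml's public gguf API. This is only a minimal standalone sketch, not part of any proposed fix; it assumes a recent ggml where these declarations live in `gguf.h` (in older trees they are in `ggml.h`), and uses the model path from the repro command:

```cpp
#include <cstdio>
#include "gguf.h"

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
    struct gguf_context * ctx = gguf_init_from_file("./models/wavtokenizer-large-75-f16.gguf", params);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to open gguf file\n");
        return 1;
    }

    // no tokenizer.ggml.tokens array -> the vocab loader ends up with zero tokens
    printf("tokenizer.ggml.tokens present: %s\n",
        gguf_find_key(ctx, "tokenizer.ggml.tokens") >= 0 ? "yes" : "no");

    // ...but the architecture-specific vocab size KV is present
    const int64_t kid = gguf_find_key(ctx, "wavtokenizer-dec.vocab_size");
    if (kid >= 0) {
        printf("wavtokenizer-dec.vocab_size = %u\n", gguf_get_val_u32(ctx, kid));
    }

    gguf_free(ctx);
    return 0;
}
```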
But a change will also be needed in `llama_decode_impl`, where `n_tokens` is used as well:
```cpp
if (batch.token) {
    for (uint32_t i = 0; i < n_tokens_all; ++i) {
        if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
            return -1;
        }
    }
}
```

There might be other places where a similar situation arises, and perhaps these types of models need to be considered in relation to how `model.vocab.n_tokens()` works.
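One less architecture-specific alternative would be to validate against an effective vocab size, so the bounds check is kept even for models without a tokenizer vocabulary. This is only a sketch (not the change that was tested below); `wav_n_vocab` refers to the hparam introduced in the diff below:

```cpp
// Sketch: fall back to the architecture-provided vocab size when the
// tokenizer vocabulary is empty, instead of skipping validation entirely.
// wav_n_vocab is the hparam added in the diff below.
const uint32_t n_vocab = model.vocab.n_tokens() > 0
    ? model.vocab.n_tokens()
    : model.hparams.wav_n_vocab;

if (batch.token) {
    for (uint32_t i = 0; i < n_tokens_all; ++i) {
        if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= n_vocab) {
            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
            return -1;
        }
    }
}
```

That would keep token ids bounds-checked on the WavTokenizer path rather than accepting arbitrary values.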
As a quick test, the following changes get the `tts` example running again:
```diff
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 1fe45410..c641ff2a 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -45,6 +45,7 @@ struct llama_hparams {
     // for WavTokenizer
     struct llama_hparams_posnet posnet;
     struct llama_hparams_convnext convnext;
+    uint32_t wav_n_vocab = 0;
 
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index f90f5e74..5b30154c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -421,6 +421,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
                 ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
+                ml.get_key(LLM_KV_VOCAB_SIZE, hparams.wav_n_vocab);
             }
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
@@ -3247,7 +3248,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
-                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, hparams.wav_n_vocab}, 0);
 
                conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
                conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
diff --git a/src/llama.cpp b/src/llama.cpp
index daf1b7c9..15769dc1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8471,8 +8471,10 @@ static int llama_decode_impl(
     if (batch.token) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
-                return -1;
+                if (model.arch != LLM_ARCH_WAVTOKENIZER_DEC) {
+                    LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
+                    return -1;
+                }
             }
         }
     }
```
### First Bad Commit
No response
### Relevant log output
No response