
Commit 5fe27ef

(try) fixing problem with long text
1 parent eaba2bf commit 5fe27ef


examples/tts/tts-csm.cpp (22 additions, 7 deletions)
@@ -13,7 +13,7 @@
 // For more details on how this works, see: https://github.com/ggml-org/llama.cpp/pull/12648
 
 static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
+    LOG("\nExample usage:\n");
     LOG("\n By default, model will be downloaded from https://huggingface.co/ggml-org/sesame-csm-1b-GGUF");
     LOG("\n %s -p \"[0]I have a dream that one day every valley shall be exalted\" -o output.wav", argv[0]);
     LOG("\n");
@@ -22,6 +22,11 @@ static void print_usage(int, char ** argv) {
     LOG("\n");
     LOG("\n Note: the model need 2 files to run, one ends with '-backbone-<quant>.gguf' and the other ends with '-decoder<quant>.gguf'");
     LOG("\n");
+    LOG("\nPrompt format:");
+    LOG("\n Each line must start with speaker ID in square brackets, followed by the text. A full stop is recommended at the end of each turn");
+    LOG("\n Example: [0]Hello world.");
+    LOG("\n If you want to enter long text, use -f file.txt to read from file");
+    LOG("\n");
 }
 
 // greedy sampling with custom n_vocab
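The new usage text documents the prompt format: one turn per line, each line opening with a speaker ID in square brackets and ideally ending with a full stop. For reference, a file passed via -f file.txt would follow the same format; the content below is a hypothetical illustration, not from the patch:

    [0]I have a dream that one day every valley shall be exalted.
    [1]And a second speaker can take the next turn.
    [0]Longer scripts go in a file instead of the -p flag.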
@@ -61,7 +66,7 @@ int main(int argc, char ** argv) {
     params.model = "sesame-csm-backbone.gguf";
     params.vocoder.model = "kyutai-mimi.gguf";
     params.out_file = "output.wav";
-    params.prompt = "[0]Hello from Sesame.";
+    params.prompt = "";
     params.n_predict = 2048; // CSM's max trained seq length
 
     // HF model
@@ -75,6 +80,11 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
+    if (params.prompt.empty()) {
+        LOG_ERR("prompt is empty\n");
+        return 1;
+    }
+
     std::vector<float> embd;
     params.cb_eval = ggml_callback;
     params.cb_eval_user_data = &embd;
@@ -167,7 +177,7 @@ int main(int argc, char ** argv) {
         // printf("\n");
 
         llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc));
-        printf("%d,", semantic_tok);
+        printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok);
         generated_codes.push_back(semantic_tok);
 
         // for (size_t i = 0; i < 10; ++i) {
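The reworked log line also prints which frame the semantic token opens: each completed frame appears to contribute 32 entries to generated_codes, so 1+(int)generated_codes.size()/32 is a 1-based frame index. As a worked example of the arithmetic (mine, not output from the patch): after two full frames, size() is 64, and the line prints frame 3.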
@@ -200,7 +210,7 @@ int main(int argc, char ** argv) {
         // then, decode the semantic_tok to generate acoustic tokens
         llama_token tok = semantic_tok;
         int n_codes = 32;
-        int sum_codes = 0; // to check if all codes are 0
+        int sum_codes = semantic_tok; // to check if all codes are 0
         for (int i = 0; i < n_codes; ++i) {
             common_batch_clear(batch_token);
             // encoder vocab is further divided into 32 codebooks, each with 2051 entries
@@ -228,9 +238,12 @@ int main(int argc, char ** argv) {
             }
 
             // do progressive hsum of embeddings
-            GGML_ASSERT(inp_past_embd.size() == embd.size());
-            for (size_t i = 0; i < inp_past_embd.size(); ++i) {
-                inp_past_embd[i] += embd[i];
+            // skip first semantic code
+            if (i > 0) {
+                GGML_ASSERT(inp_past_embd.size() == embd.size());
+                for (size_t i = 0; i < inp_past_embd.size(); ++i) {
+                    inp_past_embd[i] += embd[i];
+                }
             }
         }
         printf("\n");
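The new guard changes what enters the running sum: the embedding captured on the i == 0 pass, which corresponds to the semantic code, is now excluded. A minimal self-contained sketch of the resulting accumulation step (the function name and the inner index j are my own; the patch itself reuses i for the inner loop, which legally shadows the outer loop variable):

    #include <cassert>
    #include <vector>

    // Accumulate one code's embedding into the running per-frame sum,
    // skipping the first (semantic) code, as the patch now does.
    static void accumulate_embd(std::vector<float> & inp_past_embd,
                                const std::vector<float> & embd,
                                int i /* code index within the frame */) {
        if (i > 0) { // skip first semantic code
            assert(inp_past_embd.size() == embd.size());
            for (size_t j = 0; j < embd.size(); ++j) {
                inp_past_embd[j] += embd[j];
            }
        }
    }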
@@ -253,6 +266,8 @@ int main(int argc, char ** argv) {
         // printf("\n");
 
         if (is_stop) {
+            // remove last 32 codes since they will be all zeros
+            generated_codes.resize(generated_codes.size() - 32);
             break;
         }
     }
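Taken together, the sum_codes and trimming changes tighten the stop handling: sum_codes now starts from the semantic code, so a frame only reads as a stop frame when the semantic code and every decoded code are zero, and that trailing all-zero frame is dropped rather than vocoded. A hedged sketch of the implied bookkeeping (the helper and its placement are assumptions; the diff only shows the initialization and the trim):

    #include <vector>

    // One frame is 32 codes, semantic code first. Returns true if this
    // was an all-zero stop frame, in which case its codes are removed
    // from the output since they carry no audio.
    static bool push_frame_check_stop(std::vector<int> & generated_codes,
                                      const std::vector<int> & frame) {
        int sum_codes = 0; // codes are non-negative, so sum == 0 means all zero
        for (int code : frame) {
            generated_codes.push_back(code);
            sum_codes += code;
        }
        const bool is_stop = (sum_codes == 0);
        if (is_stop) {
            // remove last 32 codes since they will be all zeros
            generated_codes.resize(generated_codes.size() - frame.size());
        }
        return is_stop;
    }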
