Commit c681257

ability to do multi-turns
1 parent e9dc476 commit c681257

2 files changed: +171 -127 lines


examples/tts/csm-demo.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+[0]Hey how are you doing.
+[1]Pretty good, pretty good.
+[0]I'm great, so happy to be speaking to you.
+What about you?
+[1]Me too, this is some cool stuff huh?

examples/tts/tts-csm.cpp

Lines changed: 166 additions & 127 deletions
@@ -5,6 +5,7 @@
 #include "mimi-model.h"
 
 #include <vector>
+#include <regex>
 #include <fstream>
 #include <float.h>
 #include <cstring> // memcpy and strcmp
@@ -23,12 +24,39 @@ static void print_usage(int, char ** argv) {
     LOG("\n Note: the model need 2 files to run, one ends with '-backbone-<quant>.gguf' and the other ends with '-decoder<quant>.gguf'");
     LOG("\n");
     LOG("\nPrompt format:");
-    LOG("\n Each line must start with speaker ID in square brackets, followed by the text. A full stop is recommended at the end of each turn");
-    LOG("\n Example: [0]Hello world.");
+    LOG("\n Each line must start with speaker ID in square brackets, followed by the text. One turn per line. A full stop is recommended at the end of each turn");
+    LOG("\n Example:");
+    LOG("\n [0]Hey how are you doing.");
+    LOG("\n [1]Pretty good, pretty good.");
     LOG("\n If you want to enter long text, use -f file.txt to read from file");
     LOG("\n");
 }
 
+// split text containing "[N]..." into speaker turns
+static std::vector<std::string> get_speaker_turns(const std::string & input) {
+    if (input.empty()) {
+        LOG_ERR("Empty input\n");
+        return {};
+    }
+    if (input[0] != '[') {
+        LOG_ERR("Invalid input format: missing speaker ID\n");
+        return {};
+    }
+    std::regex re(R"((\[\d+\][\s\S]*?)(?=\[\d+\]|$))");
+    std::smatch match;
+    std::vector<std::string> turns;
+    std::string::const_iterator searchStart(input.cbegin());
+    while (std::regex_search(searchStart, input.cend(), match, re)) {
+        std::string turn = match[1].str();
+        if (turn.empty()) {
+            continue;
+        }
+        turns.push_back(turn);
+        searchStart = match.suffix().first;
+    }
+    return turns;
+}
+
 // sampling with custom n_vocab
 // modified version of llama_sampler_sample()
 static llama_token sample_token(struct llama_sampler * smpl, const float * logits, int n_vocab) {
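
Note: the splitting above relies on a lazy match plus a lookahead, so each turn starts at "[N]" and runs until the next "[N]" or the end of the input, and a turn may span several lines. A minimal standalone sketch of the same idea (not part of the commit; the main() wrapper, includes and printing are illustrative only), applied to the transcript from examples/tts/csm-demo.txt:

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    // same transcript as examples/tts/csm-demo.txt; speaker 0's second turn
    // continues on a following line without a new "[N]" tag
    std::string input =
        "[0]Hey how are you doing.\n"
        "[1]Pretty good, pretty good.\n"
        "[0]I'm great, so happy to be speaking to you.\n"
        "What about you?\n"
        "[1]Me too, this is some cool stuff huh?\n";

    // same pattern as get_speaker_turns(): capture from "[N]" lazily up to the position
    // where the next "[N]" (or the end of input) begins; [\s\S] also matches newlines
    std::regex re(R"((\[\d+\][\s\S]*?)(?=\[\d+\]|$))");
    std::smatch match;
    std::vector<std::string> turns;
    auto search_start = input.cbegin();
    while (std::regex_search(search_start, input.cend(), match, re)) {
        turns.push_back(match[1].str());
        search_start = match.suffix().first;
    }

    // expected: 4 turns, with "What about you?" attached to the preceding [0] turn
    for (size_t i = 0; i < turns.size(); ++i) {
        std::cout << "turn " << i << ": " << turns[i];
    }
    return 0;
}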
@@ -81,9 +109,11 @@ int main(int argc, char ** argv) {
     params.sampling.top_k = 50; // default param from CSM python code
     params.sampling.temp = 0.9; // default param from CSM python code
 
-    // HF model
-    params.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf";
-    params.vocoder.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf";
+    // HF model (hack: we temporary reuse speculative.model as the decoder model, only to get it downloaded)
+    params.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf";
+    params.speculative.model.path = "sesame-csm-decoder.gguf";
+    params.speculative.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-decoder.gguf";
+    params.vocoder.model.url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf";
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
         return 1;
@@ -125,32 +155,15 @@ int main(int argc, char ** argv) {
 
     mimi_model mimi(params.vocoder.model.path.c_str(), true);
 
-    // tokenize the prompt
-    const llama_vocab * vocab = llama_model_get_vocab(model_bb);
-    llama_tokens prompt_tokens = common_tokenize(vocab, params.prompt, false, true);
-    prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab));
-    prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab));
-
     // init sampler
     // the python implementation only has top-k and temperature sampling, so we'll use just that
     llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
     llama_sampler_chain_add(sampler.get(), llama_sampler_init_top_k(params.sampling.top_k));
     llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(params.sampling.temp));
     llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(params.sampling.seed));
 
-    printf("prompt tokens: \n");
-    for (size_t i = 0; i < prompt_tokens.size(); ++i) {
-        printf("%d, ", prompt_tokens[i]);
-    }
-    printf("\n");
-
-    llama_pos n_past_bb = 0;
     llama_batch batch_prompt = llama_batch_init(params.n_batch, 0, 1);
-    common_batch_clear(batch_prompt);
-    for (size_t i = 0; i < prompt_tokens.size(); ++i) {
-        common_batch_add(batch_prompt, prompt_tokens[i], n_past_bb++, { 0 }, false);
-    }
-    batch_prompt.logits[batch_prompt.n_tokens - 1] = true;
+    llama_pos n_past_bb = 0;
 
     // inp_past_embd is the "squashed" embeddings from the decoder
     std::vector<float> inp_past_embd(2048, 0.0f);
@@ -162,128 +175,154 @@ int main(int argc, char ** argv) {
     int64_t t_dc = 0; // decoder time
     int64_t n_dc_gen = 0; // decoder generation count
 
-    bool is_stop = false;
     std::vector<int> generated_codes;
 
-    // backbone generation loop
-    for (int k = 0; k < params.n_predict; ++k) {
-        bool is_prompt_processing = k == 0;
-
-        if (!is_prompt_processing) {
-            // generate the next RVQ semantic token
-            batch_past_embd.n_tokens = 1;
-            batch_past_embd.pos[0] = n_past_bb++;
-            batch_past_embd.seq_id[0][0] = 0;
-            batch_past_embd.n_seq_id[0] = 1;
-            batch_past_embd.logits[0] = true;
-            std::memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float));
-        }
+    auto turns = get_speaker_turns(params.prompt);
+
+    for (const std::string & turn : turns) {
+        // tokenize the turn
+        llama_tokens prompt_tokens;
+        {
+            printf("\n---\nturn: %s\n\n", turn.c_str());
+            const llama_vocab * vocab = llama_model_get_vocab(model_bb);
+            prompt_tokens = common_tokenize(vocab, turn, false, true);
+            prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab));
+            prompt_tokens.insert(prompt_tokens.end(), llama_vocab_eos(vocab));
+
+            printf("prompt (%zu tokens): \n", prompt_tokens.size());
+            for (size_t i = 0; i < prompt_tokens.size(); ++i) {
+                printf("%d, ", prompt_tokens[i]);
+            }
+            printf("\n");
 
-        int64_t t_bb_start = ggml_time_ms();
-        if (llama_decode(ctx_bb, is_prompt_processing ? batch_prompt : batch_past_embd) != 0) {
-            LOG_ERR("%s: backbone llama_decode() failed\n", __func__);
-            return 1;
+            common_batch_clear(batch_prompt);
+            for (size_t i = 0; i < prompt_tokens.size(); ++i) {
+                common_batch_add(batch_prompt, prompt_tokens[i], n_past_bb++, { 0 }, false);
+            }
+            batch_prompt.logits[batch_prompt.n_tokens - 1] = true;
         }
-        n_bb_gen++;
-        t_bb += ggml_time_ms() - t_bb_start;
 
-        auto vocab_dc = llama_model_get_vocab(model_dc);
-        auto logits = llama_get_logits_ith(ctx_bb, is_prompt_processing ? (batch_prompt.n_tokens - 1) : 0);
-        // for (size_t i = 0; i < 10; ++i) {
-        //     printf("%4.2f, ", logits[i]);
-        // }
-        // printf("\n");
+        // backbone generation loop
+        bool is_end_of_turn = false;
+        for (int k = 0; k < params.n_predict; ++k) {
+            bool is_prompt_processing = k == 0;
+
+            if (!is_prompt_processing) {
+                // generate the next RVQ semantic token
+                batch_past_embd.n_tokens = 1;
+                batch_past_embd.pos[0] = n_past_bb++;
+                batch_past_embd.seq_id[0][0] = 0;
+                batch_past_embd.n_seq_id[0] = 1;
+                batch_past_embd.logits[0] = true;
+                std::memcpy(batch_past_embd.embd, inp_past_embd.data(), inp_past_embd.size() * sizeof(float));
+            }
 
-        llama_token semantic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc));
-        printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok);
-        generated_codes.push_back(semantic_tok);
+            int64_t t_bb_start = ggml_time_ms();
+            if (llama_decode(ctx_bb, is_prompt_processing ? batch_prompt : batch_past_embd) != 0) {
+                LOG_ERR("%s: backbone llama_decode() failed\n", __func__);
+                return 1;
+            }
+            n_bb_gen++;
+            t_bb += ggml_time_ms() - t_bb_start;
 
-        // for (size_t i = 0; i < 10; ++i) {
-        //     printf("%4.2f, ", embd[i]);
-        // }
-        // printf("\n");
+            auto vocab_dc = llama_model_get_vocab(model_dc);
+            auto logits = llama_get_logits_ith(ctx_bb, is_prompt_processing ? (batch_prompt.n_tokens - 1) : 0);
+            // for (size_t i = 0; i < 10; ++i) {
+            //     printf("%4.2f, ", logits[i]);
+            // }
+            // printf("\n");
 
+            llama_token semantic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc));
+            printf("Sem token %5d : %d,", 1+(int)generated_codes.size()/32, semantic_tok);
+            generated_codes.push_back(semantic_tok);
 
-        // decoder generation loop
-        inp_past_embd = std::vector<float>(inp_past_embd.size(), 0.0f);
-        {
-            llama_kv_self_clear(ctx_dc);
-            llama_batch batch_embd = llama_batch_init(1, embd.size(), 1);
-            llama_batch batch_token = llama_batch_init(1, 0, 1);
+            // for (size_t i = 0; i < 10; ++i) {
+            //     printf("%4.2f, ", embd[i]);
+            // }
+            // printf("\n");
 
-            // first "token" is the latent embeddings from backbone
-            {
-                batch_embd.n_tokens = 1;
-                batch_embd.pos[0] = 0;
-                batch_embd.seq_id[0][0] = 0;
-                batch_embd.n_seq_id[0] = 1;
-                batch_embd.logits[0] = false;
-                std::memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float));
-            }
-            if (llama_decode(ctx_dc, batch_embd) != 0) {
-                LOG_ERR("%s: decoder llama_decode(embd) failed\n", __func__);
-                return 1;
-            }
+            // decoder generation loop
+            inp_past_embd = std::vector<float>(inp_past_embd.size(), 0.0f);
+            {
+                llama_kv_self_clear(ctx_dc);
+                llama_batch batch_embd = llama_batch_init(1, embd.size(), 1);
+                llama_batch batch_token = llama_batch_init(1, 0, 1);
+
+                // first "token" is the latent embeddings from backbone
+                {
+                    batch_embd.n_tokens = 1;
+                    batch_embd.pos[0] = 0;
+                    batch_embd.seq_id[0][0] = 0;
+                    batch_embd.n_seq_id[0] = 1;
+                    batch_embd.logits[0] = false;
+                    std::memcpy(batch_embd.embd, embd.data(), embd.size() * sizeof(float));
                 }
-            // then, decode the semantic_tok to generate acoustic tokens
-            llama_token tok = semantic_tok;
-            int n_codes = 32;
-            int sum_codes = semantic_tok; // to check if all codes are 0
-            for (int i = 0; i < n_codes; ++i) {
-                common_batch_clear(batch_token);
-                // encoder vocab is further divided into 32 codebooks, each with 2051 entries
-                llama_token inp_tok = tok + 2051*i;
-                common_batch_add(batch_token, inp_tok, i+1, { 0 }, true);
-
-                int64_t t_bb_start = ggml_time_ms();
-                if (llama_decode(ctx_dc, batch_token) != 0) {
-                    LOG_ERR("%s: decoder llama_decode(token) failed\n", __func__);
-                    return 1;
-                }
-                n_dc_gen++;
-                t_dc += ggml_time_ms() - t_bb_start;
-
-                // sample the acoustic token
-                auto logits = llama_get_logits_ith(ctx_dc, 0);
-                llama_token acoustic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc));
-
-                // discard last code (only for embeddings)
-                if (i < n_codes - 1) {
-                    printf("%d,", acoustic_tok);
-                    tok = acoustic_tok; // next input token
-                    sum_codes += acoustic_tok;
-                    generated_codes.push_back(acoustic_tok);
+                if (llama_decode(ctx_dc, batch_embd) != 0) {
+                    LOG_ERR("%s: decoder llama_decode(embd) failed\n", __func__);
+                    return 1;
                 }
 
-                // do progressive hsum of embeddings
-                GGML_ASSERT(inp_past_embd.size() == embd.size());
-                for (size_t i = 0; i < inp_past_embd.size(); ++i) {
-                    inp_past_embd[i] += embd[i];
+                // then, decode the semantic_tok to generate acoustic tokens
+                llama_token tok = semantic_tok;
+                int n_codes = 32;
+                int sum_codes = semantic_tok; // to check if all codes are 0
+                for (int i = 0; i < n_codes; ++i) {
+                    common_batch_clear(batch_token);
+                    // encoder vocab is further divided into 32 codebooks, each with 2051 entries
+                    llama_token inp_tok = tok + 2051*i;
+                    common_batch_add(batch_token, inp_tok, i+1, { 0 }, true);
+
+                    int64_t t_bb_start = ggml_time_ms();
+                    if (llama_decode(ctx_dc, batch_token) != 0) {
+                        LOG_ERR("%s: decoder llama_decode(token) failed\n", __func__);
+                        return 1;
+                    }
+                    n_dc_gen++;
+                    t_dc += ggml_time_ms() - t_bb_start;
+
+                    // sample the acoustic token
+                    auto logits = llama_get_logits_ith(ctx_dc, 0);
+                    llama_token acoustic_tok = sample_token(sampler.get(), logits, llama_vocab_n_tokens(vocab_dc));
+
+                    // discard last code (only for embeddings)
+                    if (i < n_codes - 1) {
+                        printf("%d,", acoustic_tok);
+                        tok = acoustic_tok; // next input token
+                        sum_codes += acoustic_tok;
+                        generated_codes.push_back(acoustic_tok);
+                    }
+
+                    // do progressive hsum of embeddings
+                    GGML_ASSERT(inp_past_embd.size() == embd.size());
+                    for (size_t i = 0; i < inp_past_embd.size(); ++i) {
+                        inp_past_embd[i] += embd[i];
+                    }
                 }
-            }
-            printf("\n");
+                printf("\n");
 
-            llama_batch_free(batch_embd);
-            llama_batch_free(batch_token);
+                llama_batch_free(batch_embd);
+                llama_batch_free(batch_token);
 
-            // if all codes are 0, then we are done
-            is_stop = sum_codes == 0;
-        }
+                // if all codes are 0, then we are done
+                is_end_of_turn = sum_codes == 0;
+            }
 
-        // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb);
-        // for (size_t i = 0; i < inp_past_embd.size(); ++i) {
-        //     printf("%4.4f, ", inp_past_embd[i]);
-        //     if (i == 2) {
-        //         printf("... ");
-        //         i = inp_past_embd.size() - 4;
-        //     }
-        // }
-        // printf("\n");
-
-        if (is_stop) {
-            // remove last 32 codes since they will be all zeros
-            generated_codes.resize(generated_codes.size() - 32);
-            break;
+            // printf("inp_past_embd, n_past_bb = %d\n", n_past_bb);
+            // for (size_t i = 0; i < inp_past_embd.size(); ++i) {
+            //     printf("%4.4f, ", inp_past_embd[i]);
+            //     if (i == 2) {
+            //         printf("... ");
+            //         i = inp_past_embd.size() - 4;
+            //     }
+            // }
+            // printf("\n");
+
+            if (is_end_of_turn) {
+                // remove last 32 codes since they will be all zeros
+                generated_codes.resize(generated_codes.size() - 32);
+                break;
+            }
         }
     }
 
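
Note on the decoder loop above: the decoder treats its vocabulary as 32 RVQ codebooks of 2051 entries laid out flat, which is why the sampled code is offset with "tok + 2051*i" before being fed back for codebook i, and why the per-step embeddings are summed into inp_past_embd (the "squashed" input for the backbone's next frame). A minimal sketch of just the flattening arithmetic (the constants come from the code above; the helper name and main() are made up for illustration, not part of the commit):

#include <cassert>
#include <cstdio>

static const int N_CODEBOOKS   = 32;   // RVQ codebooks per audio frame (from the example)
static const int CODEBOOK_SIZE = 2051; // entries per codebook in the decoder vocab (from the example)

// map (codebook index, code within that codebook) -> flattened decoder token id
static int flatten_code(int codebook, int code) {
    assert(codebook >= 0 && codebook < N_CODEBOOKS);
    assert(code     >= 0 && code     < CODEBOOK_SIZE);
    return code + CODEBOOK_SIZE * codebook;
}

int main() {
    // e.g. code 1234 in codebook 3 becomes token 1234 + 2051*3 = 7387
    printf("flattened id: %d\n", flatten_code(3, 1234));

    // and a flattened id can be split back into (codebook, code)
    int id = 7387;
    printf("codebook %d, code %d\n", id / CODEBOOK_SIZE, id % CODEBOOK_SIZE);
    return 0;
}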