
Commit 7ecce76

working now
1 parent b97fd3e commit 7ecce76

4 files changed: +50 additions, -15 deletions

examples/tts/CMakeLists.txt

Lines changed: 12 additions & 8 deletions
```diff
@@ -4,15 +4,19 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-set(TARGET llama-tts-csm)
-add_executable(${TARGET} tts-csm.cpp)
+add_library(mimi-model mimi-model.h mimi-model.cpp)
+target_link_libraries(mimi-model PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+# for using C++ designated initializers, TODO: can be changed back to C++17 in the future
+target_compile_features(mimi-model PRIVATE cxx_std_20)
+
+set(TARGET llama-mimi)
+add_executable(${TARGET} mimi.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-set(TARGET llama-mimi)
-add_executable(${TARGET} mimi.cpp mimi-model.cpp)
+set(TARGET llama-tts-csm)
+add_executable(${TARGET} tts-csm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
-# for using C++ designated initializers, TODO: can be changed back to C++17 in the future
-target_compile_features(${TARGET} PRIVATE cxx_std_20)
+target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
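The reordering above turns the Mimi decoder into a standalone `mimi-model` library target that both the new `llama-mimi` executable and `llama-tts-csm` link against, instead of compiling `mimi-model.cpp` directly into `llama-mimi`. A side effect worth noting: the C++20 requirement (needed only for designated initializers, per the comment) is now confined to the library, so both executables stay on C++17.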

examples/tts/convert_mimi_to_gguf.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -27,7 +27,7 @@ def __init__(self,
         endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.gguf_writer = gguf.GGUFWriter(
             path=None,
-            arch="if you see this, you are using the wrong file",
+            arch="this model cannot be used as LLM, use it via --model-vocoder in TTS examples",
             endianess=endianess)
 
         assert self.mimi_model.config.architectures[0] == "MimiModel"
```
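The `arch` field here is a deliberate sentinel: a Mimi GGUF must never be loaded as a regular LLM, and the error a user sees when that fails presumably surfaces this string, so the new wording turns the failure into a usable hint (pass the file via `--model-vocoder` instead).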

examples/tts/mimi-model.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -716,7 +716,7 @@ std::vector<int> mimi_model::transpose_input(const std::vector<int> & codes) {
     int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
     GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd");
 
-    std::vector<int> codes_T(n_codes_per_embd * n_codes);
+    std::vector<int> codes_T(n_codes);
     for (int i = 0; i < n_codes / n_codes_per_embd; i++) {
         for (int j = 0; j < n_codes_per_embd; j++) {
             int src_idx = i * n_codes_per_embd + j;
```
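The one-line fix corrects an over-allocation: a transpose only reorders elements, so the output buffer needs exactly `n_codes` entries, not `n_codes_per_embd * n_codes` of them. For illustration, a self-contained sketch of the reindexing; the diff shows only `src_idx`, so the destination index below is an assumption based on a conventional frame-major to codebook-major transpose:

```cpp
#include <cassert>
#include <vector>

// Reorder flattened RVQ codes from frame-major [frame][codebook]
// to codebook-major [codebook][frame]. A sketch of the transpose in
// the diff above; the dst index formula is an assumption, since the
// diff only shows the src index.
static std::vector<int> transpose_codes(const std::vector<int> & codes, int n_codes_per_embd) {
    const int n_codes  = (int) codes.size();
    const int n_frames = n_codes / n_codes_per_embd;
    assert(n_codes % n_codes_per_embd == 0);

    std::vector<int> codes_T(n_codes); // same element count, just reordered
    for (int i = 0; i < n_frames; i++) {
        for (int j = 0; j < n_codes_per_embd; j++) {
            codes_T[j * n_frames + i] = codes[i * n_codes_per_embd + j];
        }
    }
    return codes_T;
}
```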

examples/tts/tts-csm.cpp

Lines changed: 36 additions & 5 deletions
```diff
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "arg.h"
+#include "mimi-model.h"
 
 #include <vector>
 #include <fstream>
@@ -13,7 +14,13 @@
 
 static void print_usage(int, char ** argv) {
     LOG("\nexample usage:\n");
-    LOG("\n %s TODO ", argv[0]);
+    LOG("\n By default, model will be downloaded from https://huggingface.co/ggml-org/sesame-csm-1b-GGUF");
+    LOG("\n %s -p \"[0]I have a dream that one day every valley shall be exalted\" -o output.wav", argv[0]);
+    LOG("\n");
+    LOG("\n To use a local model, specify the path to the model file:");
+    LOG("\n %s -p ... -m sesame-csm-backbone.gguf -mv kyutai-mimi.gguf -o output.wav", argv[0]);
+    LOG("\n");
+    LOG("\n Note: the model need 2 files to run, one ends with '-backbone-<quant>.gguf' and the other ends with '-decoder<quant>.gguf'");
     LOG("\n");
 }
 
@@ -51,10 +58,15 @@ static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) {
 int main(int argc, char ** argv) {
     common_params params;
 
-    params.model = "sesame-csm-backbone.gguf";
-    params.out_file = "output.wav";
-    params.prompt = "[0]Hello from Sesame.";
-    params.n_predict = 2048; // CSM's max trained seq length
+    params.model         = "sesame-csm-backbone.gguf";
+    params.vocoder.model = "kyutai-mimi.gguf";
+    params.out_file      = "output.wav";
+    params.prompt        = "[0]Hello from Sesame.";
+    params.n_predict     = 2048; // CSM's max trained seq length
+
+    // HF model
+    params.model_url         = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf";
+    params.vocoder.model_url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf";
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
         return 1;
@@ -71,6 +83,9 @@ int main(int argc, char ** argv) {
     common_params params_decoder(params); // duplicate the params
     params_decoder.n_ctx = 64; // we never use more than this
     string_replace_all(params_decoder.model, "-backbone", "-decoder");
+    if (!params_decoder.model_url.empty()) {
+        string_replace_all(params_decoder.model_url, "-backbone", "-decoder");
+    }
 
     common_init_result llama_backbone = common_init_from_params(params);
     llama_model * model_bb = llama_backbone.model.get();
@@ -88,6 +103,8 @@ int main(int argc, char ** argv) {
         return ENOENT;
     }
 
+    mimi_model mimi(params.vocoder.model.c_str(), true);
+
     const llama_vocab * vocab = llama_model_get_vocab(model_bb);
     llama_tokens prompt_tokens = common_tokenize(vocab, params.prompt, false, true);
     prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab));
@@ -118,6 +135,7 @@ int main(int argc, char ** argv) {
     int64_t n_dc_gen = 0; // decoder generation count
 
     bool is_stop = false;
+    std::vector<int> generated_codes;
 
     // backbone generation loop
     for (int k = 0; k < params.n_predict; ++k) {
@@ -150,6 +168,7 @@ int main(int argc, char ** argv) {
 
         llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc));
         printf("%d,", semantic_tok);
+        generated_codes.push_back(semantic_tok);
 
         // for (size_t i = 0; i < 10; ++i) {
         //     printf("%4.2f, ", embd[i]);
@@ -205,6 +224,7 @@ int main(int argc, char ** argv) {
             printf("%d,", acoustic_tok);
             tok = acoustic_tok; // next input token
             sum_codes += acoustic_tok;
+            generated_codes.push_back(acoustic_tok);
         }
 
         // do progressive hsum of embeddings
@@ -246,5 +266,16 @@ int main(int argc, char ** argv) {
     llama_batch_free(batch_prompt);
     llama_batch_free(batch_past_embd);
 
+    printf("decode %zu RVQ tokens into wav...\n", generated_codes.size());
+    generated_codes = mimi.transpose_input(generated_codes);
+    std::vector<float> wav_data = mimi.decode(generated_codes);
+
+    if (!save_wav16(params.out_file.c_str(), wav_data, mimi.get_sample_rate())) {
+        LOG_ERR("Failed to save wav file\n");
+        return 1;
+    }
+
+    printf("\n");
+
     return 0;
 }
```
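With these changes the example runs end to end: the backbone/decoder pair streams out RVQ codes in generation order (one semantic token, then the acoustic tokens, per frame), and the Mimi vocoder turns the collected codes into PCM that `save_wav16` writes out. This is also why `transpose_input` is called first: the codes are collected frame-major but the vocoder wants them grouped per codebook. Below is a minimal sketch of just that vocoder tail as a standalone helper, using only the `mimi_model` calls visible in this diff; the constructor's boolean argument is copied from the diff with its meaning assumed, and the header declaring `save_wav16` is a guess:

```cpp
#include "mimi-model.h" // mimi_model, added by this commit
#include "common.h"     // assumed to provide save_wav16
#include <vector>

// Decode flattened RVQ codes (collected frame-major, as generated)
// into a 16-bit WAV file. A sketch of the tail of tts-csm.cpp above,
// not a drop-in implementation.
static int codes_to_wav(const char * vocoder_gguf, std::vector<int> codes, const char * out_path) {
    mimi_model mimi(vocoder_gguf, true); // boolean flag as in the diff; semantics assumed

    codes = mimi.transpose_input(codes);              // frame-major -> codebook-major
    std::vector<float> wav_data = mimi.decode(codes); // RVQ codes -> PCM float samples

    if (!save_wav16(out_path, wav_data, mimi.get_sample_rate())) {
        return 1; // could not write the file
    }
    return 0;
}
```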
