
Commit 7ecce76

working now
1 parent b97fd3e commit 7ecce76

4 files changed: +50 additions, -15 deletions

examples/tts/CMakeLists.txt

Lines changed: 12 additions & 8 deletions
```diff
@@ -4,15 +4,19 @@ install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-set(TARGET llama-tts-csm)
-add_executable(${TARGET} tts-csm.cpp)
+add_library(mimi-model mimi-model.h mimi-model.cpp)
+target_link_libraries(mimi-model PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+# for using C++ designated initializers, TODO: can be changed back to C++17 in the future
+target_compile_features(mimi-model PRIVATE cxx_std_20)
+
+set(TARGET llama-mimi)
+add_executable(${TARGET} mimi.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-set(TARGET llama-mimi)
-add_executable(${TARGET} mimi.cpp mimi-model.cpp)
+set(TARGET llama-tts-csm)
+add_executable(${TARGET} tts-csm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
-# for using C++ designated initializers, TODO: can be changed back to C++17 in the future
-target_compile_features(${TARGET} PRIVATE cxx_std_20)
+target_link_libraries(${TARGET} PRIVATE llama common mimi-model ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
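The reordering above turns the Mimi decoder into a standalone `mimi-model` library target that both the new `llama-mimi` executable and `llama-tts-csm` link against, instead of compiling `mimi-model.cpp` directly into `llama-mimi`. A side effect worth noting: the C++20 requirement (needed only for designated initializers, per the comment) is now confined to the library, so both executables stay on C++17.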

examples/tts/convert_mimi_to_gguf.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -27,7 +27,7 @@ def __init__(self,
         endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.gguf_writer = gguf.GGUFWriter(
             path=None,
-            arch="if you see this, you are using the wrong file",
+            arch="this model cannot be used as LLM, use it via --model-vocoder in TTS examples",
             endianess=endianess)
 
         assert self.mimi_model.config.architectures[0] == "MimiModel"
```
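The `arch` field here is a deliberate sentinel: a Mimi GGUF must never be loaded as a regular LLM, and the error a user sees when that fails presumably surfaces this string, so the new wording turns the failure into a usable hint (pass the file via `--model-vocoder` instead).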

examples/tts/mimi-model.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -716,7 +716,7 @@ std::vector<int> mimi_model::transpose_input(const std::vector<int> & codes) {
     int n_codes_per_embd = mimi_config.n_semantic_components + mimi_config.n_acoustic_components;
     GGML_ASSERT(n_codes % n_codes_per_embd == 0 && "number of codes must be a multiply of n_codes_per_embd");
 
-    std::vector<int> codes_T(n_codes_per_embd * n_codes);
+    std::vector<int> codes_T(n_codes);
     for (int i = 0; i < n_codes / n_codes_per_embd; i++) {
         for (int j = 0; j < n_codes_per_embd; j++) {
             int src_idx = i * n_codes_per_embd + j;
```
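The one-line fix corrects an over-allocation: a transpose only reorders elements, so the output buffer needs exactly `n_codes` entries, not `n_codes_per_embd * n_codes` of them. For illustration, a self-contained sketch of the reindexing; the diff shows only `src_idx`, so the destination index below is an assumption based on a conventional frame-major to codebook-major transpose:

```cpp
#include <cassert>
#include <vector>

// Reorder flattened RVQ codes from frame-major [frame][codebook]
// to codebook-major [codebook][frame]. A sketch of the transpose in
// the diff above; the dst index formula is an assumption, since the
// diff only shows the src index.
static std::vector<int> transpose_codes(const std::vector<int> & codes, int n_codes_per_embd) {
    const int n_codes  = (int) codes.size();
    const int n_frames = n_codes / n_codes_per_embd;
    assert(n_codes % n_codes_per_embd == 0);

    std::vector<int> codes_T(n_codes); // same element count, just reordered
    for (int i = 0; i < n_frames; i++) {
        for (int j = 0; j < n_codes_per_embd; j++) {
            codes_T[j * n_frames + i] = codes[i * n_codes_per_embd + j];
        }
    }
    return codes_T;
}
```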

examples/tts/tts-csm.cpp

Lines changed: 36 additions & 5 deletions
```diff
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "arg.h"
+#include "mimi-model.h"
 
 #include <vector>
 #include <fstream>
@@ -13,7 +14,13 @@
 
 static void print_usage(int, char ** argv) {
     LOG("\nexample usage:\n");
-    LOG("\n %s TODO ", argv[0]);
+    LOG("\n By default, model will be downloaded from https://huggingface.co/ggml-org/sesame-csm-1b-GGUF");
+    LOG("\n %s -p \"[0]I have a dream that one day every valley shall be exalted\" -o output.wav", argv[0]);
+    LOG("\n");
+    LOG("\n To use a local model, specify the path to the model file:");
+    LOG("\n %s -p ... -m sesame-csm-backbone.gguf -mv kyutai-mimi.gguf -o output.wav", argv[0]);
+    LOG("\n");
+    LOG("\n Note: the model need 2 files to run, one ends with '-backbone-<quant>.gguf' and the other ends with '-decoder<quant>.gguf'");
     LOG("\n");
 }
 
@@ -51,10 +58,15 @@ static bool ggml_callback(struct ggml_tensor * t, bool ask, void * user_data) {
 int main(int argc, char ** argv) {
     common_params params;
 
-    params.model = "sesame-csm-backbone.gguf";
-    params.out_file = "output.wav";
-    params.prompt = "[0]Hello from Sesame.";
-    params.n_predict = 2048; // CSM's max trained seq length
+    params.model         = "sesame-csm-backbone.gguf";
+    params.vocoder.model = "kyutai-mimi.gguf";
+    params.out_file      = "output.wav";
+    params.prompt        = "[0]Hello from Sesame.";
+    params.n_predict     = 2048; // CSM's max trained seq length
+
+    // HF model
+    params.model_url         = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/sesame-csm-backbone.gguf";
+    params.vocoder.model_url = "https://huggingface.co/ggml-org/sesame-csm-1b-GGUF/resolve/main/kyutai-mimi.gguf";
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
         return 1;
@@ -71,6 +83,9 @@ int main(int argc, char ** argv) {
     common_params params_decoder(params); // duplicate the params
     params_decoder.n_ctx = 64; // we never use more than this
     string_replace_all(params_decoder.model, "-backbone", "-decoder");
+    if (!params_decoder.model_url.empty()) {
+        string_replace_all(params_decoder.model_url, "-backbone", "-decoder");
+    }
 
     common_init_result llama_backbone = common_init_from_params(params);
     llama_model * model_bb = llama_backbone.model.get();
@@ -88,6 +103,8 @@ int main(int argc, char ** argv) {
         return ENOENT;
     }
 
+    mimi_model mimi(params.vocoder.model.c_str(), true);
+
     const llama_vocab * vocab = llama_model_get_vocab(model_bb);
     llama_tokens prompt_tokens = common_tokenize(vocab, params.prompt, false, true);
     prompt_tokens.insert(prompt_tokens.begin(), llama_vocab_bos(vocab));
@@ -118,6 +135,7 @@ int main(int argc, char ** argv) {
     int64_t n_dc_gen = 0; // decoder generation count
 
     bool is_stop = false;
+    std::vector<int> generated_codes;
 
     // backbone generation loop
     for (int k = 0; k < params.n_predict; ++k) {
@@ -150,6 +168,7 @@ int main(int argc, char ** argv) {
 
         llama_token semantic_tok = sample_greedy(logits, llama_vocab_n_tokens(vocab_dc));
         printf("%d,", semantic_tok);
+        generated_codes.push_back(semantic_tok);
 
         // for (size_t i = 0; i < 10; ++i) {
         //     printf("%4.2f, ", embd[i]);
@@ -205,6 +224,7 @@ int main(int argc, char ** argv) {
             printf("%d,", acoustic_tok);
             tok = acoustic_tok; // next input token
             sum_codes += acoustic_tok;
+            generated_codes.push_back(acoustic_tok);
         }
 
         // do progressive hsum of embeddings
@@ -246,5 +266,16 @@ int main(int argc, char ** argv) {
     llama_batch_free(batch_prompt);
     llama_batch_free(batch_past_embd);
 
+    printf("decode %zu RVQ tokens into wav...\n", generated_codes.size());
+    generated_codes = mimi.transpose_input(generated_codes);
+    std::vector<float> wav_data = mimi.decode(generated_codes);
+
+    if (!save_wav16(params.out_file.c_str(), wav_data, mimi.get_sample_rate())) {
+        LOG_ERR("Failed to save wav file\n");
+        return 1;
+    }
+
+    printf("\n");
+
     return 0;
 }
```
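With these changes the example runs end to end: the backbone/decoder pair streams out RVQ codes in generation order (one semantic token, then the acoustic tokens, per frame), and the Mimi vocoder turns the collected codes into PCM that `save_wav16` writes out. This is also why `transpose_input` is called first: the codes are collected frame-major but the vocoder wants them grouped per codebook. Below is a minimal sketch of just that vocoder tail as a standalone helper, using only the `mimi_model` calls visible in this diff; the constructor's boolean argument is copied from the diff with its meaning assumed, and the header declaring `save_wav16` is a guess:

```cpp
#include "mimi-model.h" // mimi_model, added by this commit
#include "common.h"     // assumed to provide save_wav16
#include <vector>

// Decode flattened RVQ codes (collected frame-major, as generated)
// into a 16-bit WAV file. A sketch of the tail of tts-csm.cpp above,
// not a drop-in implementation.
static int codes_to_wav(const char * vocoder_gguf, std::vector<int> codes, const char * out_path) {
    mimi_model mimi(vocoder_gguf, true); // boolean flag as in the diff; semantics assumed

    codes = mimi.transpose_input(codes);              // frame-major -> codebook-major
    std::vector<float> wav_data = mimi.decode(codes); // RVQ codes -> PCM float samples

    if (!save_wav16(out_path, wav_data, mimi.get_sample_rate())) {
        return 1; // could not write the file
    }
    return 0;
}
```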
