Skip to content

Commit 52606e9

Browse files
committed
tts cpp model is now loadable in kcpp
1 parent 9935ac0 commit 52606e9

File tree

14 files changed

+125
-217
lines changed

14 files changed

+125
-217
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,7 @@ mainvk: tools/main/main.cpp common/arg.cpp build-info.h ggml_v4_vulkan.o ggml-cp
729729
$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
730730
embedding: examples/embedding/embedding.cpp common/arg.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
731731
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
732-
ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/tts.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
732+
ttscppmain: otherarch/ttscpp/cli/cli.cpp otherarch/ttscpp/cli/playback.cpp otherarch/ttscpp/cli/playback.h otherarch/ttscpp/cli/write_file.cpp otherarch/ttscpp/cli/write_file.h otherarch/ttscpp/cli/vad.cpp otherarch/ttscpp/cli/vad.h otherarch/ttscpp/src/ttscpp.cpp otherarch/ttscpp/src/ttstokenizer.cpp otherarch/ttscpp/src/ttssampler.cpp otherarch/ttscpp/src/parler_model.cpp otherarch/ttscpp/src/dac_model.cpp otherarch/ttscpp/src/ttsutil.cpp otherarch/ttscpp/src/ttsargs.cpp otherarch/ttscpp/src/ttst5_encoder_model.cpp otherarch/ttscpp/src/phonemizer.cpp otherarch/ttscpp/src/tts_model.cpp otherarch/ttscpp/src/kokoro_model.cpp otherarch/ttscpp/src/dia_model.cpp otherarch/ttscpp/src/orpheus_model.cpp otherarch/ttscpp/src/snac_model.cpp otherarch/ttscpp/src/general_neural_audio_codec.cpp ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
733733
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
734734

735735
ggml/src/ggml-vulkan-shaders.cpp:

examples/diffusion/CMakeLists.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

ggml/src/ggml-webgpu/CMakeLists.txt

Lines changed: 0 additions & 54 deletions
This file was deleted.

ggml/src/ggml-zdnn/CMakeLists.txt

Lines changed: 0 additions & 36 deletions
This file was deleted.

koboldcpp.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1826,8 +1826,8 @@ def whisper_generate(genparams):
18261826
def tts_load_model(ttc_model_filename,cts_model_filename):
18271827
global args
18281828
inputs = tts_load_model_inputs()
1829-
inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
1830-
inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
1829+
inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8") if ttc_model_filename else "".encode("UTF-8")
1830+
inputs.cts_model_filename = cts_model_filename.encode("UTF-8") if cts_model_filename else "".encode("UTF-8")
18311831
inputs.gpulayers = (999 if args.ttsgpu else 0)
18321832
inputs.flash_attention = args.flashattention
18331833
thds = args.threads
@@ -5602,7 +5602,7 @@ def export_vars():
56025602
args.embeddingsmaxctx = (0 if embeddings_ctx_var.get()=="" else int(embeddings_ctx_var.get()))
56035603
args.embeddingsgpu = (embeddings_gpu_var.get()==1)
56045604

5605-
if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
5605+
if tts_model_var.get() != "":
56065606
args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
56075607
args.ttsmodel = tts_model_var.get()
56085608
args.ttswavtokenizer = wavtokenizer_var.get()
@@ -7201,8 +7201,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
72017201
exit_with_error(3,"Could not load whisper model: " + whispermodel)
72027202

72037203
#handle tts model
7204-
if args.ttsmodel and args.ttsmodel!="" and args.ttswavtokenizer and args.ttswavtokenizer!="":
7205-
if not os.path.exists(args.ttsmodel) or not os.path.exists(args.ttswavtokenizer):
7204+
if args.ttsmodel and args.ttsmodel!="":
7205+
if not os.path.exists(args.ttsmodel) or (args.ttswavtokenizer and args.ttswavtokenizer!="" and not os.path.exists(args.ttswavtokenizer)):
72067206
if args.ignoremissing:
72077207
print("Ignoring missing TTS model files!")
72087208
args.ttsmodel = None
@@ -7214,7 +7214,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
72147214
ttsmodelpath = args.ttsmodel
72157215
ttsmodelpath = os.path.abspath(ttsmodelpath)
72167216
wavtokpath = args.ttswavtokenizer
7217-
wavtokpath = os.path.abspath(wavtokpath)
7217+
if wavtokpath:
7218+
wavtokpath = os.path.abspath(wavtokpath)
72187219
loadok = tts_load_model(ttsmodelpath,wavtokpath)
72197220
print("Load TTS Model OK: " + str(loadok))
72207221
if not loadok:

model_adapter.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,20 @@ bool gguf_tensor_exists(const std::string & gguf_filename, std::string tensor_na
115115
return found;
116116
}
117117

118+
std::string gguf_get_model_arch(const std::string & gguf_filename)
119+
{
120+
struct gguf_init_params ggufparams;
121+
ggufparams.no_alloc = true;
122+
ggufparams.ctx = NULL;
123+
struct gguf_context * ctx = gguf_init_from_file(gguf_filename.c_str(), ggufparams);
124+
if (!ctx) return "";
125+
auto keyidx = gguf_find_key(ctx, "general.architecture");
126+
std::string modelarch = "";
127+
if (keyidx != -1) { modelarch = gguf_get_val_str(ctx, keyidx); }
128+
gguf_free(ctx);
129+
return modelarch;
130+
}
131+
118132
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
119133
FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta)
120134
{

model_adapter.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<in
132132
int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
133133
const bool useSmartContext, const bool requireFullSubset);
134134
bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch);
135+
std::string gguf_get_model_arch(const std::string & filename);
135136

136137
size_t gpttype_calc_new_state_kv();
137138
size_t gpttype_calc_new_state_tokencount();

otherarch/sdcpp/util.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ void pretty_progress(int step, int steps, float time) {
357357
}
358358
}
359359
progress += "|";
360-
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
360+
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s ",
361361
progress.c_str(), step, steps,
362362
time > 1.0f || time == 0 ? time : (1.0f / time));
363363
fflush(stdout); // for linux

otherarch/tts_adapter.cpp

Lines changed: 91 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
#endif
2727

2828
//imports required for tts.cpp to work
29-
#include "tts.cpp"
29+
#include "ttscommon.h"
30+
#include "ttscpp.cpp"
3031
#include "ttstokenizer.cpp"
3132
#include "ttssampler.cpp"
3233
#include "parler_model.cpp"
@@ -497,6 +498,10 @@ static int code_terminate_id = 151670;
497498
static int nthreads = 4;
498499
static int tts_max_len = 4096;
499500

501+
//ttscpp specific
502+
static generation_configuration * ttscpp_config = nullptr;
503+
static struct tts_runner * ttscpp_runner = nullptr;
504+
500505
int total_tts_gens = 0;
501506

502507
bool ttstype_load_model(const tts_load_model_inputs inputs)
@@ -532,81 +537,103 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
532537

533538
std::string modelfile_ttc = inputs.ttc_model_filename;
534539
std::string modelfile_cts = inputs.cts_model_filename;
535-
printf("\nLoading TTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str());
540+
std::string detectedarch = gguf_get_model_arch(modelfile_ttc);
541+
542+
bool is_ttscpp_file = false;
543+
if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
544+
is_ttscpp_file = true;
545+
printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str());
546+
}else{
547+
printf("\nLoading OuteTTS Model, OuteTTS: %s \nWavTokenizer: %s \n",modelfile_ttc.c_str(),modelfile_cts.c_str());
548+
if(modelfile_ttc=="" || modelfile_cts=="")
549+
{
550+
printf("\nWarning: KCPP OuteTTS missing a file! Make sure both TTS and WavTokenizer models are loaded.\n");
551+
return false;
552+
}
553+
}
536554

537555
ttsdebugmode = inputs.debugmode;
538556

539557
// tts init
540-
llama_model_params tts_model_params = llama_model_default_params();
541-
llama_context_params tts_ctx_params = llama_context_default_params();
542-
543-
nthreads = inputs.threads;
544-
545-
tts_max_len = inputs.ttsmaxlen;
546-
547-
tts_model_params.use_mmap = false;
548-
tts_model_params.use_mlock = false;
549-
tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
550-
tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
551-
int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
552-
tts_model_params.main_gpu = kcpp_parseinfo_maindevice;
553-
tts_ctx_params.n_ctx = 8192;
554-
tts_ctx_params.offload_kqv = true;
555-
tts_ctx_params.n_batch = 8192;
556-
tts_ctx_params.n_ubatch = 512;
557-
tts_ctx_params.n_threads = nthreads;
558-
tts_ctx_params.n_threads_batch = nthreads;
559-
tts_ctx_params.flash_attn = inputs.flash_attention;
560-
tts_ctx_params.kv_unified = true;
561-
562-
llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
563-
ttc_ctx = llama_init_from_model(ttcmodel, tts_ctx_params);
564-
565-
if (ttc_ctx == nullptr) {
566-
printf("\nTTS Load Error: Failed to initialize ttc context!\n");
567-
return false;
568-
}
558+
if (is_ttscpp_file) {
559+
ttscpp_config = new generation_configuration("af_alloy", 50, 1.0, 1.0, true, "", 0, 1.0);
560+
ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true);
561+
if (ttscpp_runner == nullptr) {
562+
printf("\nTTS Load Error: Failed to initialize TTSCPP!\n");
563+
return false;
564+
}
565+
} else { //outetts only
566+
llama_model_params tts_model_params = llama_model_default_params();
567+
llama_context_params tts_ctx_params = llama_context_default_params();
568+
569+
nthreads = inputs.threads;
570+
571+
tts_max_len = inputs.ttsmaxlen;
572+
573+
tts_model_params.use_mmap = false;
574+
tts_model_params.use_mlock = false;
575+
tts_model_params.n_gpu_layers = inputs.gpulayers; //offload if possible
576+
tts_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
577+
int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
578+
tts_model_params.main_gpu = kcpp_parseinfo_maindevice;
579+
tts_ctx_params.n_ctx = 8192;
580+
tts_ctx_params.offload_kqv = true;
581+
tts_ctx_params.n_batch = 8192;
582+
tts_ctx_params.n_ubatch = 512;
583+
tts_ctx_params.n_threads = nthreads;
584+
tts_ctx_params.n_threads_batch = nthreads;
585+
tts_ctx_params.flash_attn = inputs.flash_attention;
586+
tts_ctx_params.kv_unified = true;
587+
588+
llama_model * ttcmodel = llama_model_load_from_file(modelfile_ttc.c_str(), tts_model_params);
589+
ttc_ctx = llama_init_from_model(ttcmodel, tts_ctx_params);
590+
591+
if (ttc_ctx == nullptr) {
592+
printf("\nTTS Load Error: Failed to initialize ttc context!\n");
593+
return false;
594+
}
569595

570-
llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
596+
llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
571597

572-
tts_ctx_params.embeddings = true; //this requires embeddings instead
573-
tts_ctx_params.n_ubatch = tts_ctx_params.n_batch;
574-
cts_ctx = llama_init_from_model(ctsmodel, tts_ctx_params);
598+
tts_ctx_params.embeddings = true; //this requires embeddings instead
599+
tts_ctx_params.n_ubatch = tts_ctx_params.n_batch;
600+
cts_ctx = llama_init_from_model(ctsmodel, tts_ctx_params);
575601

576-
if (cts_ctx == nullptr) {
577-
printf("\nTTS Load Error: Failed to initialize cts context!\n");
578-
return false;
579-
}
602+
if (cts_ctx == nullptr) {
603+
printf("\nTTS Load Error: Failed to initialize cts context!\n");
604+
return false;
605+
}
580606

581-
std::vector<int> tmp = {1, 2, 3, 4};
582-
llama_memory_clear(llama_get_memory(ttc_ctx),true);
583-
auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
584-
if(er!=0)
585-
{
586-
printf("\nTTS Eval returned nonzero: %d\n",er);
587-
return false;
588-
}
607+
std::vector<int> tmp = {1, 2, 3, 4};
608+
llama_memory_clear(llama_get_memory(ttc_ctx),true);
609+
auto er = llama_decode(ttc_ctx, llama_batch_get_one(tmp.data(), tmp.size()));
610+
if(er!=0)
611+
{
612+
printf("\nTTS Eval returned nonzero: %d\n",er);
613+
return false;
614+
}
589615

590-
const llama_vocab * ttcvocab = llama_model_get_vocab(ttcmodel);
591-
llama_tokens testoks = common_tokenize(ttcvocab,"<|space|>",false,true);
592-
if (testoks.size() == 1) {
593-
ttsver = TTS_VER_3;
594-
printf("\nUsing v0.3 mode");
595-
//note that the final word does NOT have a space at the end.
596-
space_id = testoks[0];
597-
testoks = common_tokenize(ttcvocab,"<|audio_end|>",false,true);
616+
const llama_vocab * ttcvocab = llama_model_get_vocab(ttcmodel);
617+
llama_tokens testoks = common_tokenize(ttcvocab,"<|space|>",false,true);
598618
if (testoks.size() == 1) {
599-
code_terminate_id = testoks[0];
619+
ttsver = TTS_VER_3;
620+
printf("\nUsing v0.3 mode");
621+
//note that the final word does NOT have a space at the end.
622+
space_id = testoks[0];
623+
testoks = common_tokenize(ttcvocab,"<|audio_end|>",false,true);
624+
if (testoks.size() == 1) {
625+
code_terminate_id = testoks[0];
626+
}
627+
} else {
628+
ttsver = TTS_VER_2;
629+
printf("\nUsing v0.2 mode");
600630
}
601-
} else {
602-
ttsver = TTS_VER_2;
603-
printf("\nUsing v0.2 mode");
604-
}
605631

606-
//determine offset of <|0|>
607-
testoks = common_tokenize(ttcvocab,"<|0|>",false,true);
608-
if (testoks.size() == 1) {
609-
cts_offset = testoks[0];
632+
//determine offset of <|0|>
633+
testoks = common_tokenize(ttcvocab,"<|0|>",false,true);
634+
if (testoks.size() == 1) {
635+
cts_offset = testoks[0];
636+
}
610637
}
611638

612639
printf("\nTTS Load Complete.\n");

otherarch/ttscpp/cli/cli.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#include "tts.h"
1+
#include "ttscpp.h"
22
#include "ttsargs.h"
33
#include "ttscommon.h"
44
#include "playback.h"

0 commit comments

Comments
 (0)