tts.cpp merged and working in kcpp!

LostRuins · LostRuins · commit bcaf37950949 · 2025-08-17T18:09:28.000+08:00
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -5333,13 +5333,13 @@ def toggletaesd(a,b,c):
     audio_tab = tabcontent["Audio"]
     makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
     whisper_model_var.trace_add("write", gui_changed_modelfile)
-    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech Required):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
+    makefileentry(audio_tab, "TTS Model (Text-To-Speech):", "Select TTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a TTS GGUF model file on disk to be loaded for Narration.")
     tts_model_var.trace_add("write", gui_changed_modelfile)
-    makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
-    makelabelentry(audio_tab, "OuteTTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
+    makelabelentry(audio_tab, "TTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
+    makelabelentry(audio_tab, "TTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
     makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
     ttsgpu_var.trace_add("write", gui_changed_modelfile)
-    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech Required):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
+    makefileentry(audio_tab, "WavTokenizer Model (Required for OuteTTS):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
     wavtokenizer_var.trace_add("write", gui_changed_modelfile)
 
     admin_tab = tabcontent["Admin"]
@@ -7610,7 +7610,7 @@ def range_checker(arg: str):
     whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
 
     ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
-    ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
+    ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the TTS Text-To-Speech GGUF model.", default="")
     ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
     ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
     ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.",  type=int, default=default_ttsmaxlen)
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
@@ -499,6 +499,7 @@ static int nthreads = 4;
 static int tts_max_len = 4096;
 
 //ttscpp specific
+static bool is_ttscpp_file = false;
 static generation_configuration * ttscpp_config = nullptr;
 static struct tts_runner * ttscpp_runner = nullptr;
 
@@ -539,7 +540,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
     std::string modelfile_cts = inputs.cts_model_filename;
     std::string detectedarch = gguf_get_model_arch(modelfile_ttc);
 
-    bool is_ttscpp_file = false;
+    is_ttscpp_file = false;
     if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
         is_ttscpp_file = true;
         printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str());
@@ -556,7 +557,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
 
     // tts init
     if (is_ttscpp_file) {
-        ttscpp_config = new generation_configuration("af_alloy", 50, 1.0, 1.0, true, "", 0, 1.0);
+        ttscpp_config = new generation_configuration("am_adam", 50, 1.0, 1.0, true, "", 0, 1.0);
         ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true);
         if (ttscpp_runner == nullptr) {
             printf("\nTTS Load Error: Failed to initialize TTSCPP!\n");
@@ -640,7 +641,72 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
     return true;
 }
 
-tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
+// Runs one TTS.CPP generation and returns the audio encoded by save_wav16_base64.
+// inputs.speaker_seed 1-5 selects a preset voice; other values keep "am_adam".
+// Yields status 1 with audio data on success, status 0 with empty data otherwise.
+static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_inputs inputs)
+{
+    tts_generation_outputs output;
+    if(ttscpp_runner==nullptr || ttscpp_config==nullptr)
+    {
+        printf("\nWarning: KCPP TTSCPP not initialized! Make sure TTS model is loaded successfully.\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+    int speaker_seed = inputs.speaker_seed;
+    std::string voiceused = "am_adam";
+    std::string prompt = inputs.prompt;
+    timer_start();
+    switch(speaker_seed)
+    {
+        case 1:
+            voiceused = "am_adam";
+            break;
+        case 2:
+            voiceused = "af_alloy";
+            break;
+        case 3:
+            voiceused = "af_jessica";
+            break;
+        case 4:
+            voiceused = "bm_george";
+            break;
+        case 5:
+            voiceused = "bf_isabella";
+            break;
+    }
+    if(ttsdebugmode==1 && !tts_is_quiet)
+    {
+        printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());
+        printf("\nInput: %s\n", prompt.c_str());
+    }
+    ttscpp_config->voice = voiceused;
+    tts_response response_data;
+    int errorres = generate(ttscpp_runner, prompt, &response_data, ttscpp_config);
+    if(errorres!=0) // generate() reports success as 0
+    {
+        printf("\nError: TTSCPP generation failed\n");
+        output.data = "";
+        output.status = 0;
+        return output;
+    }
+    double ttstime = timer_check();
+    // Only the elapsed time is printed: the old format string had a %d with no matching argument.
+    printf("\nTTS Generated audio in %.2fs.\n",ttstime);
+    std::vector<float> wavdat(response_data.data, response_data.data + response_data.n_outputs);
+    // output.data points into last_generated_audio, which is declared outside this function.
+    last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate);
+    output.data = last_generated_audio.c_str();
+    output.status = 1;
+    last_generation_settings_audio_seed = 0;
+    last_generation_settings_speaker_seed = speaker_seed;
+    last_generation_settings_prompt = std::string(prompt);
+    total_tts_gens += 1;
+    return output;
+}
+
+static tts_generation_outputs ttstype_generate_outetts(const tts_generation_inputs inputs)
 {
     tts_generation_outputs output;
 
@@ -1051,3 +1117,12 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
         return output;
     }
 }
+
+// Non-static TTS entry point: forwards the request to whichever backend the
+// is_ttscpp_file flag selects (TTS.CPP when true, OuteTTS when false).
+tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
+{
+    return is_ttscpp_file
+        ? ttstype_generate_ttscpp(inputs)
+        : ttstype_generate_outetts(inputs);
+}
diff --git a/otherarch/ttscpp/src/kokoro_model.cpp b/otherarch/ttscpp/src/kokoro_model.cpp
@@ -1389,7 +1389,8 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st
 
 int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
 	if (model->voices.find(voice) == model->voices.end()) {
-		TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str());
+		fprintf(stdout,"\nFailed to find Kokoro voice '%s' aborting.\n", voice.c_str());
+		return -1;
     } else {
     	// if the language changed then we should change the phonemization voice
     	if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) {
diff --git a/otherarch/ttscpp/src/orpheus_model.cpp b/otherarch/ttscpp/src/orpheus_model.cpp
@@ -409,7 +409,8 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons
     // it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
     // surpass the default size.
     if (batch.tokens.size() > model->max_context_length) {
-        TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
+        fprintf(stdout,"The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
+        return -1;
     }
     octx->reset();
     generation_sampler->reset();
@@ -427,7 +428,8 @@ void orpheus_runner::configure_generation(generation_configuration * config) {
     generation_sampler->top_k = config->top_k;
     generation_sampler->top_p = config->top_p;
     if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
-        TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
+        fprintf(stdout,"Voice '%s' is not a valid voice for Orpheus. Defaulting to zoe.", config->voice.c_str());
+        config->voice = "zoe";
     }
     octx->voice = config->voice;
 }
diff --git a/otherarch/ttscpp/src/ttscpp.cpp b/otherarch/ttscpp/src/ttscpp.cpp
@@ -162,6 +162,7 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g
     }
 }
 
+//returns 0 on success
 int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
     switch(runner->arch) {
         case PARLER_TTS_ARCH:

Original file line number	Diff line number	Diff line change
`@@ -409,7 +409,8 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons`
`409`	`409`	`// it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will`
`410`	`410`	`// surpass the default size.`
`411`	`411`	`if (batch.tokens.size() > model->max_context_length) {`
`412`		`- TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");`
	`412`	`+ fprintf(stdout,"The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");`
	`413`	`+ return -1;`
`413`	`414`	`}`
`414`	`415`	`octx->reset();`
`415`	`416`	`generation_sampler->reset();`
`@@ -427,7 +428,8 @@ void orpheus_runner::configure_generation(generation_configuration * config) {`
`427`	`428`	`generation_sampler->top_k = config->top_k;`
`428`	`429`	`generation_sampler->top_p = config->top_p;`
`429`	`430`	`if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {`
`430`		`- TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());`
	`431`	`+ fprintf(stdout,"Voice '%s' is not a valid voice for Orpheus. Defaulting to zoe.", config->voice.c_str());`
	`432`	`+ config->voice = "zoe";`
`431`	`433`	`}`
`432`	`434`	`octx->voice = config->voice;`
`433`	`435`	`}`
Original file line number	Diff line number	Diff line change
`@@ -162,6 +162,7 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g`
`162`	`162`	`}`
`163`	`163`	`}`
`164`	`164`
	`165`	`+//returns 0 on success`
`165`	`166`	`int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {`
`166`	`167`	`switch(runner->arch) {`
`167`	`168`	`case PARLER_TTS_ARCH:`