Skip to content

Commit bcaf379

Browse files
committed
tts.cpp merged and working in kcpp!
1 parent 52606e9 commit bcaf379

File tree

5 files changed

+90
-11
lines changed

5 files changed

+90
-11
lines changed

koboldcpp.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5333,13 +5333,13 @@ def toggletaesd(a,b,c):
53335333
audio_tab = tabcontent["Audio"]
53345334
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
53355335
whisper_model_var.trace_add("write", gui_changed_modelfile)
5336-
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech Required):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
5336+
makefileentry(audio_tab, "TTS Model (Text-To-Speech):", "Select TTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a TTS GGUF model file on disk to be loaded for Narration.")
53375337
tts_model_var.trace_add("write", gui_changed_modelfile)
5338-
makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
5339-
makelabelentry(audio_tab, "OuteTTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
5338+
makelabelentry(audio_tab, "TTS Threads:" , tts_threads_var, 5, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
5339+
makelabelentry(audio_tab, "TTS Max Tokens:" , ttsmaxlen_var, 7, 50,padx=290,singleline=True,tooltip="Max allowed audiotokens to generate per TTS request.")
53405340
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
53415341
ttsgpu_var.trace_add("write", gui_changed_modelfile)
5342-
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech Required):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
5342+
makefileentry(audio_tab, "WavTokenizer Model (Required for OuteTTS):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 11, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
53435343
wavtokenizer_var.trace_add("write", gui_changed_modelfile)
53445344

53455345
admin_tab = tabcontent["Admin"]
@@ -7610,7 +7610,7 @@ def range_checker(arg: str):
76107610
whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
76117611

76127612
ttsparsergroup = parser.add_argument_group('TTS Narration Commands')
7613-
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
7613+
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the TTS Text-To-Speech GGUF model.", default="")
76147614
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
76157615
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
76167616
ttsparsergroup.add_argument("--ttsmaxlen", help="Limit number of audio tokens generated with TTS.", type=int, default=default_ttsmaxlen)

otherarch/tts_adapter.cpp

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,7 @@ static int nthreads = 4;
499499
static int tts_max_len = 4096;
500500

501501
//ttscpp specific
502+
static bool is_ttscpp_file = false;
502503
static generation_configuration * ttscpp_config = nullptr;
503504
static struct tts_runner * ttscpp_runner = nullptr;
504505

@@ -539,7 +540,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
539540
std::string modelfile_cts = inputs.cts_model_filename;
540541
std::string detectedarch = gguf_get_model_arch(modelfile_ttc);
541542

542-
bool is_ttscpp_file = false;
543+
is_ttscpp_file = false;
543544
if (detectedarch!="" && SUPPORTED_ARCHITECTURES.find(detectedarch) != SUPPORTED_ARCHITECTURES.end()) {
544545
is_ttscpp_file = true;
545546
printf("\nLoading TTS.CPP Model Arch: %s \n", detectedarch.c_str());
@@ -556,7 +557,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
556557

557558
// tts init
558559
if (is_ttscpp_file) {
559-
ttscpp_config = new generation_configuration("af_alloy", 50, 1.0, 1.0, true, "", 0, 1.0);
560+
ttscpp_config = new generation_configuration("am_adam", 50, 1.0, 1.0, true, "", 0, 1.0);
560561
ttscpp_runner = runner_from_file(modelfile_ttc, inputs.threads, ttscpp_config, true);
561562
if (ttscpp_runner == nullptr) {
562563
printf("\nTTS Load Error: Failed to initialize TTSCPP!\n");
@@ -640,7 +641,72 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
640641
return true;
641642
}
642643

643-
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
644+
// Generates audio for inputs.prompt using the loaded TTS.CPP runner.
//
// Voice selection: inputs.speaker_seed is mapped to a named voice
//   1 -> am_adam, 2 -> af_alloy, 3 -> af_jessica, 4 -> bm_george, 5 -> bf_isabella
// Any other value keeps the default voice "am_adam".
//
// Returns:
//   status == 1 and data pointing at the contents of last_generated_audio
//   (the result of save_wav16_base64 on the generated samples) on success;
//   status == 0 and data == "" if the runner/config are not initialized or
//   generate() reports a non-zero error code.
static tts_generation_outputs ttstype_generate_ttscpp(const tts_generation_inputs inputs)
{
    tts_generation_outputs output;
    if(ttscpp_runner==nullptr || ttscpp_config==nullptr)
    {
        printf("\nWarning: KCPP TTSCPP not initialized! Make sure TTS model is loaded successfully.\n");
        output.data = "";
        output.status = 0;
        return output;
    }

    const int speaker_seed = inputs.speaker_seed;
    std::string voiceused = "am_adam"; // default when the seed matches no known voice
    std::string prompt = inputs.prompt;
    timer_start();
    switch(speaker_seed)
    {
        case 1:
            voiceused = "am_adam";
            break;
        case 2:
            voiceused = "af_alloy";
            break;
        case 3:
            voiceused = "af_jessica";
            break;
        case 4:
            voiceused = "bm_george";
            break;
        case 5:
            voiceused = "bf_isabella";
            break;
    }
    if(ttsdebugmode==1 && !tts_is_quiet)
    {
        printf("\nUsing Speaker ID: %d, Voice: %s", speaker_seed, voiceused.c_str());
        printf("\nInput: %s\n", prompt.c_str());
    }
    // The voice is passed to the runner through the shared configuration object.
    ttscpp_config->voice = voiceused;

    tts_response response_data;
    const int errorres = generate(ttscpp_runner, prompt, &response_data, ttscpp_config);
    if(errorres!=0)
    {
        printf("\nError: TTSCPP generation failed\n");
        output.data = "";
        output.status = 0;
        return output;
    }

    const double ttstime = timer_check();
    // Only the elapsed time is reported here. The previous format string also
    // contained a "%d" with no matching argument, which is undefined behaviour.
    printf("\nTTS Generated audio in %.2fs.\n", ttstime);

    // Copy the generated samples so they can be handed to the WAV encoder.
    std::vector<float> wavdat(response_data.data, response_data.data + response_data.n_outputs);
    last_generated_audio = save_wav16_base64(wavdat, ttscpp_runner->sampling_rate);
    // output.data borrows the buffer of last_generated_audio, which is declared
    // outside this function, so the pointer remains usable after returning.
    output.data = last_generated_audio.c_str();
    output.status = 1;
    last_generation_settings_audio_seed = 0;
    last_generation_settings_speaker_seed = speaker_seed;
    last_generation_settings_prompt = std::string(prompt);
    total_tts_gens += 1;
    return output;
}
708+
709+
static tts_generation_outputs ttstype_generate_outetts(const tts_generation_inputs inputs)
644710
{
645711
tts_generation_outputs output;
646712

@@ -1051,3 +1117,12 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
10511117
return output;
10521118
}
10531119
}
1120+
1121+
// Public TTS entry point: forwards the request to the TTS.CPP backend when
// is_ttscpp_file is set, and to the OuteTTS backend otherwise.
tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
{
    return is_ttscpp_file
        ? ttstype_generate_ttscpp(inputs)
        : ttstype_generate_outetts(inputs);
}

otherarch/ttscpp/src/kokoro_model.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1389,7 +1389,8 @@ std::vector<std::vector<uint32_t>> kokoro_runner::tokenize_chunks(std::vector<st
13891389

13901390
int kokoro_runner::generate(std::string prompt, struct tts_response * response, std::string voice, std::string voice_code) {
13911391
if (model->voices.find(voice) == model->voices.end()) {
1392-
TTS_ABORT("Failed to find Kokoro voice '%s' aborting.\n", voice.c_str());
1392+
fprintf(stdout,"\nFailed to find Kokoro voice '%s' aborting.\n", voice.c_str());
1393+
return -1;
13931394
} else {
13941395
// if the language changed then we should change the phonemization voice
13951396
if (phmzr->mode == ESPEAK && kctx->voice[0] != voice[0]) {

otherarch/ttscpp/src/orpheus_model.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,8 @@ int orpheus_runner::generate(std::string sentence, struct tts_response * respons
409409
// it should be possible to update the max context window size, but currently it is extremely unlikely that a single prompt will
410410
// surpass the default size.
411411
if (batch.tokens.size() > model->max_context_length) {
412-
TTS_ABORT("The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
412+
fprintf(stdout,"The prompt was too large for the default context window. Try splitting up or shortenning the prompt.");
413+
return -1;
413414
}
414415
octx->reset();
415416
generation_sampler->reset();
@@ -427,7 +428,8 @@ void orpheus_runner::configure_generation(generation_configuration * config) {
427428
generation_sampler->top_k = config->top_k;
428429
generation_sampler->top_p = config->top_p;
429430
if (std::find(orpheus_voices.begin(), orpheus_voices.end(), config->voice) == orpheus_voices.end() && !config->voice.empty()) {
430-
TTS_ABORT("Voice '%s' is not a valid voice for Orpheus.", config->voice.c_str());
431+
fprintf(stdout,"Voice '%s' is not a valid voice for Orpheus. Defaulting to zoe.", config->voice.c_str());
432+
config->voice = "zoe";
431433
}
432434
octx->voice = config->voice;
433435
}

otherarch/ttscpp/src/ttscpp.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ struct tts_runner * runner_from_file(const std::string & fname, int n_threads, g
162162
}
163163
}
164164

165+
//returns 0 on success
165166
int generate(tts_runner * runner, std::string sentence, struct tts_response * response, generation_configuration * config) {
166167
switch(runner->arch) {
167168
case PARLER_TTS_ARCH:

0 commit comments

Comments
 (0)