Improved TTS default voice quality and sample rate

LostRuins · LostRuins · commit e8570de0e605 · 2025-01-17T18:45:16.000+08:00
diff --git a/expose.h b/expose.h
@@ -205,6 +205,7 @@ struct whisper_generation_outputs
 
 struct tts_load_model_inputs
 {
+    const int threads = 4;
     const char * ttc_model_filename = nullptr;
     const char * cts_model_filename = nullptr;
     const char * executable_path = nullptr;
diff --git a/kcpp_docs.embd b/kcpp_docs.embd
@@ -1463,7 +1463,7 @@
                                      },
                                      "voice": {
                                         "type": "string",
-                                        "description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,tutor,shouty,bored,record"
+                                        "description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,shouty,chatty"
                                      }
                                   },
                                   "type": "object"
diff --git a/klite.embd b/klite.embd
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->
 
 <script>
-	const LITEVER = 203;
+	const LITEVER = 204;
 	const urlParams = new URLSearchParams(window.location.search);
 	var localflag = true;
 	const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -12085,6 +12085,7 @@ initializeInstructUIFunctionality();
 					payload =
 					{
 						"input": text,
+						"nocache": true,
 						"voice": (document.getElementById("kcpp_tts_voice").value == "custom")?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
 					};
 					ttsheaders = get_kobold_header();
@@ -20266,10 +20267,8 @@ initializeInstructUIFunctionality();
 									<option value="kobo" selected>kobo</option>
 									<option value="cheery">cheery</option>
 									<option value="sleepy">sleepy</option>
-									<option value="tutor">tutor</option>
 									<option value="shouty">shouty</option>
-									<option value="bored">bored</option>
-									<option value="record">record</option>
+									<option value="chatty">chatty</option>
 									<option value="custom">custom</option>
 									</select></td>
 									<td><input class="settinglabel miniinput" type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; height:18px; width:44px; padding: 2px;"></td></tr>
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -283,7 +283,8 @@ class whisper_generation_outputs(ctypes.Structure):
                 ("data", ctypes.c_char_p)]
 
 class tts_load_model_inputs(ctypes.Structure):
-    _fields_ = [("ttc_model_filename", ctypes.c_char_p),
+    _fields_ = [("threads", ctypes.c_int),
+                ("ttc_model_filename", ctypes.c_char_p),
                 ("cts_model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
@@ -1346,6 +1347,12 @@ def tts_load_model(ttc_model_filename,cts_model_filename):
     inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
     inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
     inputs.gpulayers = (999 if args.ttsgpu else 0)
+    thds = args.threads
+    if args.ttsthreads and args.ttsthreads > 0:
+        ttst = int(args.ttsthreads)
+        if ttst > 0:
+            thds = ttst
+    inputs.threads = thds
     inputs = set_backend_props(inputs)
     ret = handle.tts_load_model(inputs)
     return ret
@@ -1357,7 +1364,7 @@ def tts_generate(genparams):
     prompt = prompt.strip()
     voice = 1
     voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
-    voice_mapping = ["kobo","cheery","sleepy","tutor","shouty","bored","record"]
+    voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
     normalized_voice = voicestr.strip().lower() if voicestr else ""
     if normalized_voice in voice_mapping:
         voice = voice_mapping.index(normalized_voice) + 1
@@ -2332,9 +2339,9 @@ def do_GET(self):
            response_body = (json.dumps([]).encode())
 
         elif self.path.endswith(('/speakers_list')): #xtts compatible
-            response_body = (json.dumps(["kobo","cheery","sleepy","tutor","shouty","bored","record"]).encode()) #some random voices for them to enjoy
+            response_body = (json.dumps(["kobo","cheery","sleepy","shouty","chatty"]).encode()) #some random voices for them to enjoy
         elif self.path.endswith(('/speakers')): #xtts compatible
-            response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"tutor","voice_id":"tutor","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"bored","voice_id":"bored","preview_url":""},{"name":"record","voice_id":"record","preview_url":""}]).encode()) #some random voices for them to enjoy
+            response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"chatty","voice_id":"chatty","preview_url":""}]).encode()) #some random voices for them to enjoy
         elif self.path.endswith(('/get_tts_settings')): #xtts compatible
             response_body = (json.dumps({"temperature":0.75,"speed":1,"length_penalty":1,"repetition_penalty":1,"top_p":1,"top_k":4,"enable_text_splitting":True,"stream_chunk_size":100}).encode()) #some random voices for them to enjoy
 
@@ -3158,6 +3165,7 @@ def hide_tooltip(event):
     tts_model_var = ctk.StringVar()
     wavtokenizer_var = ctk.StringVar()
     ttsgpu_var = ctk.IntVar(value=0)
+    tts_threads_var = ctk.StringVar(value=str(default_threads))
 
     def tabbuttonaction(name):
         for t in tabcontent:
@@ -3728,11 +3736,12 @@ def toggletaesd(a,b,c):
     audio_tab = tabcontent["Audio"]
     makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
     whisper_model_var.trace("w", gui_changed_modelfile)
-    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
+    makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 3, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
+    makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
     tts_model_var.trace("w", gui_changed_modelfile)
-    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
+    makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 7, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
     wavtokenizer_var.trace("w", gui_changed_modelfile)
-    makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
+    makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
     ttsgpu_var.trace("w", gui_changed_modelfile)
 
     def kcpp_export_template():
@@ -3760,6 +3769,7 @@ def kcpp_export_template():
         savdict["tensor_split"] = None
         savdict["draftgpusplit"] = None
         savdict["config"] = None
+        savdict["ttsthreads"] = 0
         filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
         if filename is None:
             return
@@ -3950,6 +3960,7 @@ def export_vars():
             args.whispermodel = whisper_model_var.get()
 
         if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
+            args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
             args.ttsmodel = tts_model_var.get()
             args.ttswavtokenizer = wavtokenizer_var.get()
             args.ttsgpu = (ttsgpu_var.get()==1)
@@ -4114,6 +4125,7 @@ def import_vars(dict):
 
         whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
 
+        tts_threads_var.set(str(dict["ttsthreads"]) if ("ttsthreads" in dict and dict["ttsthreads"]) else str(default_threads))
         tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
         wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
         ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
@@ -5527,6 +5539,7 @@ def range_checker(arg: str):
     ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
     ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
     ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
+    ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)
 
     deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
     deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp

Original file line number	Diff line number	Diff line change
`@@ -205,6 +205,7 @@ struct whisper_generation_outputs`
`205`	`205`
`206`	`206`	`struct tts_load_model_inputs`
`207`	`207`	`{`
	`208`	`+ const int threads = 4;`
`208`	`209`	`const char * ttc_model_filename = nullptr;`
`209`	`210`	`const char * cts_model_filename = nullptr;`
`210`	`211`	`const char * executable_path = nullptr;`
Original file line number	Diff line number	Diff line change
`@@ -1463,7 +1463,7 @@`
`1463`	`1463`	`},`
`1464`	`1464`	`"voice": {`
`1465`	`1465`	`"type": "string",`
`1466`		`- "description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,tutor,shouty,bored,record"`
	`1466`	`+ "description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,shouty,chatty"`
`1467`	`1467`	`}`
`1468`	`1468`	`},`
`1469`	`1469`	`"type": "object"`