Skip to content

Commit e8570de

Browse files
committed
improved tts default voices quality and sample rate
1 parent 8d961bb commit e8570de

File tree

5 files changed

+37
-28
lines changed

5 files changed

+37
-28
lines changed

expose.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ struct whisper_generation_outputs
205205

206206
struct tts_load_model_inputs
207207
{
208+
const int threads = 4;
208209
const char * ttc_model_filename = nullptr;
209210
const char * cts_model_filename = nullptr;
210211
const char * executable_path = nullptr;

kcpp_docs.embd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1463,7 +1463,7 @@
14631463
},
14641464
"voice": {
14651465
"type": "string",
1466-
"description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,tutor,shouty,bored,record"
1466+
"description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated. There are a few preset voices you can use: kobo,cheery,sleepy,shouty,chatty"
14671467
}
14681468
},
14691469
"type": "object"

klite.embd

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
1212
-->
1313

1414
<script>
15-
const LITEVER = 203;
15+
const LITEVER = 204;
1616
const urlParams = new URLSearchParams(window.location.search);
1717
var localflag = true;
1818
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -12085,6 +12085,7 @@ initializeInstructUIFunctionality();
1208512085
payload =
1208612086
{
1208712087
"input": text,
12088+
"nocache": true,
1208812089
"voice": (document.getElementById("kcpp_tts_voice").value == "custom")?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
1208912090
};
1209012091
ttsheaders = get_kobold_header();
@@ -20266,10 +20267,8 @@ initializeInstructUIFunctionality();
2026620267
<option value="kobo" selected>kobo</option>
2026720268
<option value="cheery">cheery</option>
2026820269
<option value="sleepy">sleepy</option>
20269-
<option value="tutor">tutor</option>
2027020270
<option value="shouty">shouty</option>
20271-
<option value="bored">bored</option>
20272-
<option value="record">record</option>
20271+
<option value="chatty">chatty</option>
2027320272
<option value="custom">custom</option>
2027420273
</select></td>
2027520274
<td><input class="settinglabel miniinput" type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; height:18px; width:44px; padding: 2px;"></td></tr>

koboldcpp.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,8 @@ class whisper_generation_outputs(ctypes.Structure):
283283
("data", ctypes.c_char_p)]
284284

285285
class tts_load_model_inputs(ctypes.Structure):
286-
_fields_ = [("ttc_model_filename", ctypes.c_char_p),
286+
_fields_ = [("threads", ctypes.c_int),
287+
("ttc_model_filename", ctypes.c_char_p),
287288
("cts_model_filename", ctypes.c_char_p),
288289
("executable_path", ctypes.c_char_p),
289290
("clblast_info", ctypes.c_int),
@@ -1346,6 +1347,12 @@ def tts_load_model(ttc_model_filename,cts_model_filename):
13461347
inputs.ttc_model_filename = ttc_model_filename.encode("UTF-8")
13471348
inputs.cts_model_filename = cts_model_filename.encode("UTF-8")
13481349
inputs.gpulayers = (999 if args.ttsgpu else 0)
1350+
thds = args.threads
1351+
if args.ttsthreads and args.ttsthreads > 0:
1352+
ttst = int(args.ttsthreads)
1353+
if ttst > 0:
1354+
thds = ttst
1355+
inputs.threads = thds
13491356
inputs = set_backend_props(inputs)
13501357
ret = handle.tts_load_model(inputs)
13511358
return ret
@@ -1357,7 +1364,7 @@ def tts_generate(genparams):
13571364
prompt = prompt.strip()
13581365
voice = 1
13591366
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
1360-
voice_mapping = ["kobo","cheery","sleepy","tutor","shouty","bored","record"]
1367+
voice_mapping = ["kobo","cheery","sleepy","shouty","chatty"]
13611368
normalized_voice = voicestr.strip().lower() if voicestr else ""
13621369
if normalized_voice in voice_mapping:
13631370
voice = voice_mapping.index(normalized_voice) + 1
@@ -2332,9 +2339,9 @@ def do_GET(self):
23322339
response_body = (json.dumps([]).encode())
23332340

23342341
elif self.path.endswith(('/speakers_list')): #xtts compatible
2335-
response_body = (json.dumps(["kobo","cheery","sleepy","tutor","shouty","bored","record"]).encode()) #some random voices for them to enjoy
2342+
response_body = (json.dumps(["kobo","cheery","sleepy","shouty","chatty"]).encode()) #some random voices for them to enjoy
23362343
elif self.path.endswith(('/speakers')): #xtts compatible
2337-
response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"tutor","voice_id":"tutor","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"bored","voice_id":"bored","preview_url":""},{"name":"record","voice_id":"record","preview_url":""}]).encode()) #some random voices for them to enjoy
2344+
response_body = (json.dumps([{"name":"kobo","voice_id":"kobo","preview_url":""},{"name":"cheery","voice_id":"cheery","preview_url":""},{"name":"sleepy","voice_id":"sleepy","preview_url":""},{"name":"shouty","voice_id":"shouty","preview_url":""},{"name":"chatty","voice_id":"chatty","preview_url":""}]).encode()) #some random voices for them to enjoy
23382345
elif self.path.endswith(('/get_tts_settings')): #xtts compatible
23392346
response_body = (json.dumps({"temperature":0.75,"speed":1,"length_penalty":1,"repetition_penalty":1,"top_p":1,"top_k":4,"enable_text_splitting":True,"stream_chunk_size":100}).encode()) #some random voices for them to enjoy
23402347

@@ -3158,6 +3165,7 @@ def hide_tooltip(event):
31583165
tts_model_var = ctk.StringVar()
31593166
wavtokenizer_var = ctk.StringVar()
31603167
ttsgpu_var = ctk.IntVar(value=0)
3168+
tts_threads_var = ctk.StringVar(value=str(default_threads))
31613169

31623170
def tabbuttonaction(name):
31633171
for t in tabcontent:
@@ -3728,11 +3736,12 @@ def toggletaesd(a,b,c):
37283736
audio_tab = tabcontent["Audio"]
37293737
makefileentry(audio_tab, "Whisper Model (Speech-To-Text):", "Select Whisper .bin Model File", whisper_model_var, 1, width=280, filetypes=[("*.bin","*.bin")], tooltiptxt="Select a Whisper .bin model file on disk to be loaded for Voice Recognition.")
37303738
whisper_model_var.trace("w", gui_changed_modelfile)
3731-
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 3, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
3739+
makelabelentry(audio_tab, "OuteTTS Threads:" , tts_threads_var, 3, 50,padx=290,singleline=True,tooltip="How many threads to use during TTS generation.\nIf left blank, uses same value as threads.")
3740+
makefileentry(audio_tab, "OuteTTS Model (Text-To-Speech):", "Select OuteTTS GGUF Model File", tts_model_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a OuteTTS GGUF model file on disk to be loaded for Narration.")
37323741
tts_model_var.trace("w", gui_changed_modelfile)
3733-
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 5, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
3742+
makefileentry(audio_tab, "WavTokenizer Model (Text-To-Speech):", "Select WavTokenizer GGUF Model File", wavtokenizer_var, 7, width=280, filetypes=[("*.gguf","*.gguf")], tooltiptxt="Select a WavTokenizer GGUF model file on disk to be loaded for Narration.")
37343743
wavtokenizer_var.trace("w", gui_changed_modelfile)
3735-
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 7, 0,tooltiptxt="Uses the GPU for TTS.")
3744+
makecheckbox(audio_tab, "TTS Use GPU", ttsgpu_var, 9, 0,tooltiptxt="Uses the GPU for TTS.")
37363745
ttsgpu_var.trace("w", gui_changed_modelfile)
37373746

37383747
def kcpp_export_template():
@@ -3760,6 +3769,7 @@ def kcpp_export_template():
37603769
savdict["tensor_split"] = None
37613770
savdict["draftgpusplit"] = None
37623771
savdict["config"] = None
3772+
savdict["ttsthreads"] = 0
37633773
filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
37643774
if filename is None:
37653775
return
@@ -3950,6 +3960,7 @@ def export_vars():
39503960
args.whispermodel = whisper_model_var.get()
39513961

39523962
if tts_model_var.get() != "" and wavtokenizer_var.get() != "":
3963+
args.ttsthreads = (0 if tts_threads_var.get()=="" else int(tts_threads_var.get()))
39533964
args.ttsmodel = tts_model_var.get()
39543965
args.ttswavtokenizer = wavtokenizer_var.get()
39553966
args.ttsgpu = (ttsgpu_var.get()==1)
@@ -4114,6 +4125,7 @@ def import_vars(dict):
41144125

41154126
whisper_model_var.set(dict["whispermodel"] if ("whispermodel" in dict and dict["whispermodel"]) else "")
41164127

4128+
tts_threads_var.set(str(dict["ttsthreads"]) if ("ttsthreads" in dict and dict["ttsthreads"]) else str(default_threads))
41174129
tts_model_var.set(dict["ttsmodel"] if ("ttsmodel" in dict and dict["ttsmodel"]) else "")
41184130
wavtokenizer_var.set(dict["ttswavtokenizer"] if ("ttswavtokenizer" in dict and dict["ttswavtokenizer"]) else "")
41194131
ttsgpu_var.set(dict["ttsgpu"] if ("ttsgpu" in dict) else 0)
@@ -5527,6 +5539,7 @@ def range_checker(arg: str):
55275539
ttsparsergroup.add_argument("--ttsmodel", metavar=('[filename]'), help="Specify the OuteTTS Text-To-Speech GGUF model.", default="")
55285540
ttsparsergroup.add_argument("--ttswavtokenizer", metavar=('[filename]'), help="Specify the WavTokenizer GGUF model.", default="")
55295541
ttsparsergroup.add_argument("--ttsgpu", help="Use the GPU for TTS.", action='store_true')
5542+
ttsparsergroup.add_argument("--ttsthreads", metavar=('[threads]'), help="Use a different number of threads for TTS if specified. Otherwise, has the same value as --threads.", type=int, default=0)
55305543

55315544
deprecatedgroup = parser.add_argument_group('Deprecated Commands, DO NOT USE!')
55325545
deprecatedgroup.add_argument("--hordeconfig", help=argparse.SUPPRESS, nargs='+')

0 commit comments

Comments
 (0)