Skip to content

Commit 636beac

Browse files
committed
added a nicer built in voice
1 parent 62e33d0 commit 636beac

File tree

7 files changed

+166
-36
lines changed

7 files changed

+166
-36
lines changed

kcpp_docs.embd

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,8 @@
616616
"vision": false,
617617
"transcribe":false,
618618
"multiplayer": false,
619+
"websearch":false,
620+
"tts":false,
619621
},
620622
"schema": {
621623
"$ref": "#/components/schemas/KcppVersion"
@@ -1443,6 +1445,52 @@
14431445
]
14441446
}
14451447
},
1448+
"/api/extra/tts": {
1449+
"post": {
1450+
"description": "Creates text-to-speech audio from input text.",
1451+
"requestBody": {
1452+
"content": {
1453+
"application/json": {
1454+
"example": {
1455+
"input": "hello world, how are you today?",
1456+
"voice": "fire",
1457+
},
1458+
"schema": {
1459+
"properties": {
1460+
"input": {
1461+
"type": "string",
1462+
"description": "The text to generate audio for. Try to keep it short."
1463+
},
1464+
"voice": {
1465+
"type": "string",
1466+
"description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated."
1467+
}
1468+
},
1469+
"type": "object"
1470+
}
1471+
}
1472+
},
1473+
"required": true
1474+
},
1475+
"responses": {
1476+
"200": {
1477+
"content": {
1478+
"audio/wav": {
1479+
"schema": {
1480+
"type": "string",
1481+
"format": "binary"
1482+
}
1483+
}
1484+
},
1485+
"description": "Successful request"
1486+
}
1487+
},
1488+
"summary": "Creates text-to-speech audio from input text.",
1489+
"tags": [
1490+
"api/extra"
1491+
]
1492+
}
1493+
},
14461494
"/props": {
14471495
"get": {
14481496
"summary": "Returns the Jinja template stored in the GGUF model, if found.",
@@ -1840,6 +1888,16 @@
18401888
"responses": {"default": {"description": ""}}
18411889
}
18421890
},
1891+
"/v1/audio/speech": {
1892+
"post": {
1893+
"summary": "Generates Text-To-Speech audio from input text. Please refer to OpenAI documentation",
1894+
"description": "Generates Text-To-Speech audio from input text.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/audio/createSpeech](https://platform.openai.com/docs/api-reference/audio/createSpeech)",
1895+
"tags": [
1896+
"v1"
1897+
],
1898+
"responses": {"default": {"description": ""}}
1899+
}
1900+
},
18431901
},
18441902
"servers": [
18451903
{

klite.embd

Lines changed: 70 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2030,6 +2030,15 @@ Current version indicated by LITEVER below.
20302030
.color_orangeurl:focus {
20312031
color: #ffedd3;
20322032
}
2033+
.color_grayurl {
2034+
color: #9e9e9e;
2035+
}
2036+
.color_grayurl:hover {
2037+
color: #9f9f9f;
2038+
}
2039+
.color_grayurl:focus {
2040+
color: #9e9e9e;
2041+
}
20332042

20342043
.color_orange {
20352044
color: #f7a223;
@@ -2793,7 +2802,8 @@ Current version indicated by LITEVER below.
27932802
const koboldcpp_transcribe_endpoint = "/api/extra/transcribe";
27942803
const koboldcpp_tokenize_endpoint = "/api/extra/tokencount";
27952804
const koboldcpp_perf_endpoint = "/api/extra/perf";
2796-
const koboldcpp_websearch_endpoint = "/api/extra/websearch"
2805+
const koboldcpp_websearch_endpoint = "/api/extra/websearch";
2806+
const koboldcpp_tts_endpoint = "/api/extra/tts";
27972807

27982808
const oai_models_endpoint = "/models";
27992809
const oai_submit_endpoint = "/completions";
@@ -2853,6 +2863,7 @@ Current version indicated by LITEVER below.
28532863
const XTTS_ID = 1000;
28542864
const ALLTALK_ID = 1001;
28552865
const OAI_TTS_ID = 1002;
2866+
const KCPP_TTS_ID = 1003;
28562867
const HD_RES_PX = 768;
28572868
const NO_HD_RES_PX = 512;
28582869
const AVATAR_PX = 384;
@@ -2965,6 +2976,7 @@ Current version indicated by LITEVER below.
29652976
var voice_is_processing = false; //currently processing voice?
29662977
let voiceprerecorder = null, voicerecorder = null, voice_is_speaking = false, voice_speaking_counter = 0;
29672978
let preaudiobuffers = [], preaudioblobs = []; //will store 2 preblobs at a time
2979+
var koboldcpp_has_tts = false;
29682980
var no_escape_html = false;
29692981
var timetaken_timestamp = performance.now();
29702982
var bg_silence = null;
@@ -3587,7 +3599,7 @@ Current version indicated by LITEVER below.
35873599
document.getElementById("lastreq1").innerHTML =
35883600
document.getElementById("lastreq2").innerHTML =
35893601
document.getElementById("lastreq3").innerHTML =
3590-
`KoboldAI Lite v${LITEVER} Web - Frontend for External API Services`;
3602+
`KoboldAI Lite v${LITEVER} Web - Frontend for <a href="#" class="color_grayurl" onclick="msgbox('KoboldAI Lite allows you to connect to various third-party AI services. We do not control or assume responsibility for the models or content generated by these services. The user is responsible for ensuring that their usage of this software is legal in their country, and complies with the terms of service of the service they are connected to. Use at your own discretion.','Disclaimer')">External API Services</a>`;
35913603

35923604
trigger_abort_controller(); //first trigger sets it up
35933605

@@ -5840,6 +5852,10 @@ initializeInstructUIFunctionality();
58405852
{
58415853
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.80") >= 0 && koboldcpp_has_websearch);
58425854
}
5855+
function is_using_kcpp_with_tts()
5856+
{
5857+
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.81") >= 0 && koboldcpp_has_tts);
5858+
}
58435859
function is_using_web_lite()
58445860
{
58455861
return (window.location.hostname.includes("koboldai.net") || window.location.hostname.includes("lostruins.github.io"));
@@ -9207,6 +9223,7 @@ initializeInstructUIFunctionality();
92079223
koboldcpp_has_whisper = (data.transcribe?true:false);
92089224
koboldcpp_has_multiplayer = (data.multiplayer?true:false);
92099225
koboldcpp_has_websearch = (data.websearch?true:false);
9226+
koboldcpp_has_tts = (data.tts?true:false);
92109227
let has_password = (data.protected?true:false);
92119228
let has_txt2img = (data.txt2img?true:false);
92129229
let no_txt_model = (mdlname=="inactive");
@@ -9315,7 +9332,7 @@ initializeInstructUIFunctionality();
93159332
},()=>{
93169333
});
93179334
}
9318-
else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper)
9335+
else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper && !koboldcpp_has_tts)
93199336
{
93209337
msgboxYesNo("This KoboldCpp instance has no models loaded. You can still use the WebUI to edit or view existing stories.<br><br>Would you like to connect to an external API service?","No Models Loaded",
93219338
()=>{
@@ -10311,6 +10328,8 @@ initializeInstructUIFunctionality();
1031110328
ttshtml += "<option value=\"1000\">XTTS API Server</option>";
1031210329
ttshtml += "<option value=\"1001\">AllTalk API Server</option>";
1031310330
ttshtml += "<option value=\"1002\">OpenAI-Compat. API Server</option>";
10331+
ttshtml += "<option value=\"1003\">KoboldCpp TTS API</option>";
10332+
1031410333
if ('speechSynthesis' in window) {
1031510334
let voices = window.speechSynthesis.getVoices();
1031610335
console.log("speech synth available: " + voices.length);
@@ -11894,6 +11913,7 @@ initializeInstructUIFunctionality();
1189411913
document.getElementById("xtts_container").classList.add("hidden");
1189511914
document.getElementById("oai_tts_container").classList.add("hidden");
1189611915
document.getElementById("alltalk_specific_controls").classList.add("hidden");
11916+
document.getElementById("kcpp_tts_container").classList.add("hidden");
1189711917

1189811918
const selectedTTS = document.getElementById("ttsselect").value;
1189911919

@@ -11910,6 +11930,15 @@ initializeInstructUIFunctionality();
1191011930
else if(selectedTTS == OAI_TTS_ID) {
1191111931
document.getElementById("oai_tts_container").classList.remove("hidden");
1191211932
}
11933+
else if(selectedTTS == KCPP_TTS_ID) {
11934+
document.getElementById("kcpp_tts_container").classList.remove("hidden");
11935+
if(is_using_kcpp_with_tts())
11936+
{
11937+
document.getElementById("nokcpptts").classList.add("hidden");
11938+
}else{
11939+
document.getElementById("nokcpptts").classList.remove("hidden");
11940+
}
11941+
}
1191311942
}
1191411943

1191511944
// Fetch RVC voices for AllTalk
@@ -12014,27 +12043,44 @@ initializeInstructUIFunctionality();
1201412043
}
1201512044
}
1201612045

12017-
if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID) //xtts api server
12046+
if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID || ssval==KCPP_TTS_ID) //xtts api server
1201812047
{
1201912048
let is_xtts = (ssval==XTTS_ID);
1202012049
let is_oai_tts = (ssval==OAI_TTS_ID);
12050+
let is_kcpp_tts = (ssval==KCPP_TTS_ID);
1202112051
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
1202212052

12023-
if(is_oai_tts)
12053+
if(is_oai_tts || is_kcpp_tts)
1202412054
{
12025-
let payload =
12055+
let payload = {};
12056+
let ttsheaders = {};
12057+
let sub_endpt = "";
12058+
if(is_oai_tts)
1202612059
{
12027-
"model": document.getElementById("oai_tts_model").value,
12028-
"input": text,
12029-
"voice": document.getElementById("oai_tts_voice").value
12030-
};
12031-
let oaiheaders = {
12032-
'Content-Type': 'application/json',
12033-
'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
12034-
};
12035-
fetch(localsettings.saved_oai_tts_url, {
12060+
sub_endpt = localsettings.saved_oai_tts_url;
12061+
payload =
12062+
{
12063+
"model": document.getElementById("oai_tts_model").value,
12064+
"input": text,
12065+
"voice": document.getElementById("oai_tts_voice").value
12066+
};
12067+
ttsheaders = {
12068+
'Content-Type': 'application/json',
12069+
'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
12070+
};
12071+
} else {
12072+
sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
12073+
payload =
12074+
{
12075+
"input": text,
12076+
"voice": document.getElementById("kcpp_tts_voice").value
12077+
};
12078+
ttsheaders = get_kobold_header();
12079+
}
12080+
12081+
fetch(sub_endpt, {
1203612082
method: 'POST',
12037-
headers: oaiheaders,
12083+
headers: ttsheaders,
1203812084
body: JSON.stringify(payload),
1203912085
})
1204012086
.then(response => response.arrayBuffer())
@@ -20199,6 +20245,14 @@ initializeInstructUIFunctionality();
2019920245
</tr><tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="alloy" id="oai_tts_voice" style="margin-left:3px; height:18px; width: 55px; padding: 2px;"></td></tr>
2020020246
</table>
2020120247
</div>
20248+
<div id="kcpp_tts_container" class="hidden">
20249+
<div class="color_red hidden" id="nokcpptts">KoboldCpp Not Connected</div>
20250+
<div class="settinglabel">
20251+
<table width="100%">
20252+
<tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="kobo" placeholder="(Anything)" id="kcpp_tts_voice" style="margin-left:3px; height:18px; width: 80px; padding: 2px;"></td></tr>
20253+
</table>
20254+
</div>
20255+
</div>
2020220256
</div>
2020320257
<div class="settinglabel">
2020420258
<div class="justifyleft settingsmall" title="If unchecked, only speak AI replies, not other text.">Narrate Both Sides </div>

koboldcpp.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
783783
FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
784784
FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
785785
except Exception:
786-
FetchedCUdevices = []
787786
FetchedCUdeviceMem = []
788787
FetchedCUfreeMem = []
789788
faileddetectvram = True
@@ -806,7 +805,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
806805
if getamdvram:
807806
FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
808807
except Exception:
809-
FetchedCUdevices = []
810808
FetchedCUdeviceMem = []
811809
FetchedCUfreeMem = []
812810
faileddetectvram = True
@@ -817,6 +815,8 @@ def fetch_gpu_properties(testCL,testCU,testVK):
817815
for idx in range(0,4):
818816
if(len(FetchedCUdevices)>idx):
819817
CUDevicesNames[idx] = FetchedCUdevices[idx]
818+
for idx in range(0,4):
819+
if(len(FetchedCUdevices)>idx):
820820
if len(FetchedCUdeviceMem)>idx:
821821
dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
822822
lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
@@ -1343,12 +1343,16 @@ def tts_generate(genparams):
13431343
is_quiet = True if (args.quiet or args.debugmode == -1) else False
13441344
prompt = genparams.get("input", genparams.get("text", ""))
13451345
prompt = prompt.strip()
1346+
voice = 1
13461347
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
1347-
voice = simple_lcg_hash(voicestr) if voicestr else 1
1348+
if voicestr and voicestr.strip().lower()=="kobo":
1349+
voice = 1
1350+
else:
1351+
voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
13481352
inputs = tts_generation_inputs()
13491353
inputs.prompt = prompt.encode("UTF-8")
13501354
inputs.speaker_seed = voice
1351-
inputs.audio_seed = 0
1355+
inputs.audio_seed = -1
13521356
inputs.quiet = is_quiet
13531357
ret = handle.tts_generate(inputs)
13541358
outstr = ""

otherarch/llama_v2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3028,4 +3028,4 @@ std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
30283028
res.resize(n);
30293029

30303030
return res;
3031-
}
3031+
}

otherarch/llama_v3.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4414,3 +4414,16 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
44144414
fputs(text, stderr);
44154415
fflush(stderr);
44164416
}
4417+
4418+
//// stuff this here since it's just some obsolete junk ////
4419+
static std::vector<uint8_t> kcpp_compute_buf;
4420+
void kcpp_graph_compute_helper(struct ggml_v3_cgraph *graph, int n_threads)
4421+
{
4422+
struct ggml_v3_cplan plan = ggml_v3_graph_plan(graph, n_threads);
4423+
if (plan.work_size > 0)
4424+
{
4425+
kcpp_compute_buf.resize(plan.work_size);
4426+
plan.work_data = kcpp_compute_buf.data();
4427+
}
4428+
ggml_v3_graph_compute(graph, &plan);
4429+
}

otherarch/tts_adapter.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,10 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
574574
}
575575
}
576576

577+
double ttstime = 0;
578+
timer_start();
579+
580+
577581
if(!inputs.quiet && ttsdebugmode==1)
578582
{
579583
printf("\nInput: %s\n", prompt_clean.c_str());
@@ -591,6 +595,14 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
591595
{
592596
printf("\nReuse speaker ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
593597
}
598+
} else if (speaker_seed==1){ //1 is a special seed
599+
std::string speaker = "but<|t_0.31|><|code_start|><|1023|><|1474|><|17|><|121|><|1362|><|744|><|438|><|1319|><|744|><|1419|><|1246|><|923|><|1338|><|406|><|939|><|975|><|1491|><|965|><|1212|><|248|><|794|><|464|><|830|><|code_end|>\nthat<|t_0.13|><|code_start|><|1578|><|1773|><|660|><|1074|><|221|><|1803|><|142|><|914|><|798|><|485|><|code_end|>\nis<|t_0.11|><|code_start|><|737|><|794|><|1288|><|182|><|895|><|1653|><|448|><|471|><|code_end|>\nwhat<|t_0.12|><|code_start|><|1734|><|1306|><|779|><|490|><|525|><|1028|><|37|><|1633|><|1353|><|code_end|>\nit<|t_0.09|><|code_start|><|1343|><|898|><|270|><|1035|><|94|><|1409|><|388|><|code_end|>\nis<|t_0.23|><|code_start|><|694|><|695|><|577|><|692|><|1047|><|388|><|28|><|905|><|1155|><|50|><|1629|><|1775|><|1711|><|1729|><|404|><|1027|><|344|><|code_end|>";
600+
last_speaker_codes = common_tokenize(model_ttc, speaker, false, true);
601+
last_speaker_seed = speaker_seed;
602+
if(!inputs.quiet && ttsdebugmode==1)
603+
{
604+
printf("\nSpecial ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
605+
}
594606
} else {
595607
//generate the voice texture of our new speaker
596608
last_speaker_codes.clear();
@@ -800,8 +812,8 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
800812

801813
const int n_sr = 24000; // sampling rate
802814

803-
// zero out first 0.25 seconds or 0.05 depending on whether its seeded
804-
const int cutout = (speaker_seed>0?(24000/4):(24000/20));
815+
// zero out first 0.2 seconds or 0.05 depending on whether it's seeded
816+
const int cutout = (speaker_seed>0?(24000/5):(24000/20));
805817
for (int i = 0; i < cutout; ++i) {
806818
audio[i] = 0.0f;
807819
}
@@ -811,10 +823,11 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
811823
}
812824

813825
last_generated_audio = save_wav16_base64(audio, n_sr);
826+
ttstime = timer_check();
814827

815828
if(!inputs.quiet)
816829
{
817-
printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
830+
printf("\nTTS Generated %d audio tokens in %.2fs.\n",(int) codes.size(),ttstime);
818831
}
819832

820833
output.data = last_generated_audio.c_str();

0 commit comments

Comments
 (0)