added a nicer built in voice

LostRuins · LostRuins · commit 636beac6d23c · 2025-01-13T23:26:54.000+08:00
diff --git a/kcpp_docs.embd b/kcpp_docs.embd
@@ -616,6 +616,8 @@
                                      "vision": false,
                                      "transcribe":false,
                                      "multiplayer": false,
+                                     "websearch":false,
+                                     "tts":false,
                                   },
                                   "schema": {
                                      "$ref": "#/components/schemas/KcppVersion"
@@ -1443,6 +1445,52 @@
                       ]
                    }
                 },
+                "/api/extra/tts": {
+                   "post": {
+                      "description": "Creates text-to-speech audio from input text.",
+                      "requestBody": {
+                         "content": {
+                            "application/json": {
+                               "example": {
+                                  "input": "hello world, how are you today?",
+                                  "voice": "fire",
+                               },
+                               "schema": {
+                                  "properties": {
+                                     "input": {
+                                        "type": "string",
+                                        "description": "The text to generate audio for. Try to keep it short."
+                                     },
+                                     "voice": {
+                                        "type": "string",
+                                        "description": "The voice to use when generating the audio. You can enter anything you like, a unique speaker will be generated."
+                                     }
+                                  },
+                                  "type": "object"
+                               }
+                            }
+                         },
+                         "required": true
+                      },
+                      "responses": {
+                         "200": {
+                            "content": {
+                                 "audio/wav": {
+                                    "schema": {
+                                       "type": "string",
+                                       "format": "binary"
+                                    }
+                                 }
+                           },
+                            "description": "Successful request"
+                         }
+                      },
+                      "summary": "Creates text-to-speech audio from input text.",
+                      "tags": [
+                         "api/extra"
+                      ]
+                   }
+                },
                 "/props": {
                    "get": {
                       "summary": "Returns the Jinja template stored in the GGUF model, if found.",
@@ -1840,6 +1888,16 @@
                       "responses": {"default": {"description": ""}}
                    }
                 },
+                "/v1/audio/speech": {
+                   "post": {
+                      "summary": "Generates Text-To-Speech audio from input text. Please refer to OpenAI documentation",
+                      "description": "Generates Text-To-Speech audio from input text.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/audio/createSpeech](https://platform.openai.com/docs/api-reference/audio/createSpeech)",
+                      "tags": [
+                         "v1"
+                      ],
+                      "responses": {"default": {"description": ""}}
+                   }
+                },
              },
              "servers": [
                 {
diff --git a/klite.embd b/klite.embd
@@ -2030,6 +2030,15 @@ Current version indicated by LITEVER below.
 	.color_orangeurl:focus {
 		color: #ffedd3;
 	}
+	.color_grayurl {
+		color: #9e9e9e;
+	}
+	.color_grayurl:hover {
+		color: #9f9f9f;
+	}
+	.color_grayurl:focus {
+		color: #9e9e9e;
+	}
 
 	.color_orange {
 		color: #f7a223;
@@ -2793,7 +2802,8 @@ Current version indicated by LITEVER below.
 	const koboldcpp_transcribe_endpoint = "/api/extra/transcribe";
 	const koboldcpp_tokenize_endpoint = "/api/extra/tokencount";
 	const koboldcpp_perf_endpoint = "/api/extra/perf";
-	const koboldcpp_websearch_endpoint = "/api/extra/websearch"
+	const koboldcpp_websearch_endpoint = "/api/extra/websearch";
+	const koboldcpp_tts_endpoint = "/api/extra/tts";
 
 	const oai_models_endpoint = "/models";
 	const oai_submit_endpoint = "/completions";
@@ -2853,6 +2863,7 @@ Current version indicated by LITEVER below.
 	const XTTS_ID = 1000;
 	const ALLTALK_ID = 1001;
 	const OAI_TTS_ID = 1002;
+	const KCPP_TTS_ID = 1003;
 	const HD_RES_PX = 768;
 	const NO_HD_RES_PX = 512;
 	const AVATAR_PX = 384;
@@ -2965,6 +2976,7 @@ Current version indicated by LITEVER below.
 	var voice_is_processing = false; //currently processing voice?
 	let voiceprerecorder = null, voicerecorder = null, voice_is_speaking = false, voice_speaking_counter = 0;
 	let preaudiobuffers = [], preaudioblobs = []; //will store 2 preblobs at a time
+	var koboldcpp_has_tts = false;
 	var no_escape_html = false;
 	var timetaken_timestamp = performance.now();
 	var bg_silence = null;
@@ -3587,7 +3599,7 @@ Current version indicated by LITEVER below.
 	document.getElementById("lastreq1").innerHTML =
 	document.getElementById("lastreq2").innerHTML =
 	document.getElementById("lastreq3").innerHTML =
-	`KoboldAI Lite v${LITEVER} Web - Frontend for External API Services`;
+	`KoboldAI Lite v${LITEVER} Web - Frontend for <a href="#" class="color_grayurl" onclick="msgbox('KoboldAI Lite allows you to connect to various third-party AI services. We do not control or assume responsibility for the models or content generated by these services. The user is responsible for ensuring that their usage of this software is legal in their country, and complies with the terms of service of the service they are connected to. Use at your own discretion.','Disclaimer')">External API Services</a>`;
 
 	trigger_abort_controller(); //first trigger sets it up
 
@@ -5840,6 +5852,10 @@ initializeInstructUIFunctionality();
 	{
 		return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.80") >= 0 && koboldcpp_has_websearch);
 	}
+	function is_using_kcpp_with_tts()
+	{
+		return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.81") >= 0 && koboldcpp_has_tts);
+	}
 	function is_using_web_lite()
 	{
 		return (window.location.hostname.includes("koboldai.net") || window.location.hostname.includes("lostruins.github.io"));
@@ -9207,6 +9223,7 @@ initializeInstructUIFunctionality();
 									koboldcpp_has_whisper = (data.transcribe?true:false);
 									koboldcpp_has_multiplayer = (data.multiplayer?true:false);
 									koboldcpp_has_websearch = (data.websearch?true:false);
+									koboldcpp_has_tts = (data.tts?true:false);
 									let has_password = (data.protected?true:false);
 									let has_txt2img = (data.txt2img?true:false);
 									let no_txt_model = (mdlname=="inactive");
@@ -9315,7 +9332,7 @@ initializeInstructUIFunctionality();
 										},()=>{
 										});
 									}
-									else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper)
+									else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper && !koboldcpp_has_tts)
 									{
 										msgboxYesNo("This KoboldCpp instance has no models loaded. You can still use the WebUI to edit or view existing stories.<br><br>Would you like to connect to an external API service?","No Models Loaded",
 										()=>{
@@ -10311,6 +10328,8 @@ initializeInstructUIFunctionality();
 		ttshtml += "<option value=\"1000\">XTTS API Server</option>";
 		ttshtml += "<option value=\"1001\">AllTalk API Server</option>";
 		ttshtml += "<option value=\"1002\">OpenAI-Compat. API Server</option>";
+		ttshtml += "<option value=\"1003\">KoboldCpp TTS API</option>";
+
 		if ('speechSynthesis' in window) {
 			let voices = window.speechSynthesis.getVoices();
 			console.log("speech synth available: " + voices.length);
@@ -11894,6 +11913,7 @@ initializeInstructUIFunctionality();
 		document.getElementById("xtts_container").classList.add("hidden");
 		document.getElementById("oai_tts_container").classList.add("hidden");
 		document.getElementById("alltalk_specific_controls").classList.add("hidden");
+		document.getElementById("kcpp_tts_container").classList.add("hidden");
 
 		const selectedTTS = document.getElementById("ttsselect").value;
 
@@ -11910,6 +11930,15 @@ initializeInstructUIFunctionality();
 		else if(selectedTTS == OAI_TTS_ID) {
 			document.getElementById("oai_tts_container").classList.remove("hidden");
 		}
+		else if(selectedTTS == KCPP_TTS_ID) {
+			document.getElementById("kcpp_tts_container").classList.remove("hidden");
+			if(is_using_kcpp_with_tts())
+			{
+				document.getElementById("nokcpptts").classList.add("hidden");
+			}else{
+				document.getElementById("nokcpptts").classList.remove("hidden");
+			}
+		}
 	}
 
 	// Fetch RVC voices for AllTalk
@@ -12014,27 +12043,44 @@ initializeInstructUIFunctionality();
 			}
 		}
 
-		if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID) //xtts api server
+		if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID || ssval==KCPP_TTS_ID) //xtts api server
 		{
 			let is_xtts = (ssval==XTTS_ID);
 			let is_oai_tts = (ssval==OAI_TTS_ID);
+			let is_kcpp_tts = (ssval==KCPP_TTS_ID);
 			const audioContext = new (window.AudioContext || window.webkitAudioContext)();
 
-			if(is_oai_tts)
+			if(is_oai_tts || is_kcpp_tts)
 			{
-				let payload =
+				let payload = {};
+				let ttsheaders = {};
+				let sub_endpt = "";
+				if(is_oai_tts)
 				{
-					"model": document.getElementById("oai_tts_model").value,
-					"input": text,
-					"voice": document.getElementById("oai_tts_voice").value
-				};
-				let oaiheaders = {
-					'Content-Type': 'application/json',
-					'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
-				};
-				fetch(localsettings.saved_oai_tts_url, {
+					sub_endpt = localsettings.saved_oai_tts_url;
+					payload =
+					{
+						"model": document.getElementById("oai_tts_model").value,
+						"input": text,
+						"voice": document.getElementById("oai_tts_voice").value
+					};
+					ttsheaders = {
+						'Content-Type': 'application/json',
+						'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
+					};
+				} else {
+					sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
+					payload =
+					{
+						"input": text,
+						"voice": document.getElementById("kcpp_tts_voice").value
+					};
+					ttsheaders = get_kobold_header();
+				}
+
+				fetch(sub_endpt, {
 					method: 'POST',
-					headers: oaiheaders,
+					headers: ttsheaders,
 					body: JSON.stringify(payload),
 				})
 				.then(response => response.arrayBuffer())
@@ -20199,6 +20245,14 @@ initializeInstructUIFunctionality();
 								</tr><tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="alloy" id="oai_tts_voice" style="margin-left:3px; height:18px; width: 55px; padding: 2px;"></td></tr>
 								</table>
 							</div>
+							<div id="kcpp_tts_container" class="hidden">
+								<div class="color_red hidden" id="nokcpptts">KoboldCpp Not Connected</div>
+								<div class="settinglabel">
+								<table width="100%">
+								<tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="kobo" placeholder="(Anything)" id="kcpp_tts_voice" style="margin-left:3px; height:18px; width: 80px; padding: 2px;"></td></tr>
+								</table>
+								</div>
+							</div>
 						</div>
 						<div class="settinglabel">
 							<div class="justifyleft settingsmall"  title="If unchecked, only speak AI replies, not other text.">Narrate Both Sides </div>
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -783,7 +783,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
             FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
             FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
         except Exception:
-            FetchedCUdevices = []
             FetchedCUdeviceMem = []
             FetchedCUfreeMem = []
             faileddetectvram = True
@@ -806,7 +805,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
                     if getamdvram:
                         FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
             except Exception:
-                FetchedCUdevices = []
                 FetchedCUdeviceMem = []
                 FetchedCUfreeMem = []
                 faileddetectvram = True
@@ -817,6 +815,8 @@ def fetch_gpu_properties(testCL,testCU,testVK):
             for idx in range(0,4):
                 if(len(FetchedCUdevices)>idx):
                     CUDevicesNames[idx] = FetchedCUdevices[idx]
+            for idx in range(0,4):
+                if(len(FetchedCUdevices)>idx):
                     if len(FetchedCUdeviceMem)>idx:
                         dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
                         lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
@@ -1343,12 +1343,16 @@ def tts_generate(genparams):
     is_quiet = True if (args.quiet or args.debugmode == -1) else False
     prompt = genparams.get("input", genparams.get("text", ""))
     prompt = prompt.strip()
+    voice = 1
     voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
-    voice = simple_lcg_hash(voicestr) if voicestr else 1
+    if voicestr and voicestr.strip().lower()=="kobo":
+        voice = 1
+    else:
+        voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
     inputs = tts_generation_inputs()
     inputs.prompt = prompt.encode("UTF-8")
     inputs.speaker_seed = voice
-    inputs.audio_seed = 0
+    inputs.audio_seed = -1
     inputs.quiet = is_quiet
     ret = handle.tts_generate(inputs)
     outstr = ""
diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp
@@ -3028,4 +3028,4 @@ std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
     res.resize(n);
 
     return res;
-}
+}
diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp
@@ -4414,3 +4414,16 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
     fputs(text, stderr);
     fflush(stderr);
 }
+
+//// stuff this here since it's just some obsolete junk ////
+static std::vector<uint8_t> kcpp_compute_buf;
+void kcpp_graph_compute_helper(struct ggml_v3_cgraph *graph, int n_threads)
+{
+    struct ggml_v3_cplan plan = ggml_v3_graph_plan(graph, n_threads);
+    if (plan.work_size > 0)
+    {
+        kcpp_compute_buf.resize(plan.work_size);
+        plan.work_data = kcpp_compute_buf.data();
+    }
+    ggml_v3_graph_compute(graph, &plan);
+}
diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp
@@ -574,6 +574,10 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
         }
     }
 
+    double ttstime = 0;
+    timer_start();
+
+
     if(!inputs.quiet && ttsdebugmode==1)
     {
         printf("\nInput: %s\n", prompt_clean.c_str());
@@ -591,6 +595,14 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
             {
                 printf("\nReuse speaker ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
             }
+        } else if (speaker_seed==1){ //1 is a special seed
+            std::string speaker = "but<|t_0.31|><|code_start|><|1023|><|1474|><|17|><|121|><|1362|><|744|><|438|><|1319|><|744|><|1419|><|1246|><|923|><|1338|><|406|><|939|><|975|><|1491|><|965|><|1212|><|248|><|794|><|464|><|830|><|code_end|>\nthat<|t_0.13|><|code_start|><|1578|><|1773|><|660|><|1074|><|221|><|1803|><|142|><|914|><|798|><|485|><|code_end|>\nis<|t_0.11|><|code_start|><|737|><|794|><|1288|><|182|><|895|><|1653|><|448|><|471|><|code_end|>\nwhat<|t_0.12|><|code_start|><|1734|><|1306|><|779|><|490|><|525|><|1028|><|37|><|1633|><|1353|><|code_end|>\nit<|t_0.09|><|code_start|><|1343|><|898|><|270|><|1035|><|94|><|1409|><|388|><|code_end|>\nis<|t_0.23|><|code_start|><|694|><|695|><|577|><|692|><|1047|><|388|><|28|><|905|><|1155|><|50|><|1629|><|1775|><|1711|><|1729|><|404|><|1027|><|344|><|code_end|>";
+            last_speaker_codes = common_tokenize(model_ttc, speaker, false, true);
+            last_speaker_seed = speaker_seed;
+            if(!inputs.quiet && ttsdebugmode==1)
+            {
+                printf("\nSpecial ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
+            }
         } else {
             //generate the voice texture of our new speaker
             last_speaker_codes.clear();
@@ -800,8 +812,8 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
 
         const int n_sr = 24000; // sampling rate
 
-        // zero out first 0.25 seconds or 0.05 depending on whether its seeded
-        const int cutout = (speaker_seed>0?(24000/4):(24000/20));
+        // zero out the first 0.2 seconds when a speaker seed is set (speaker_seed > 0), otherwise only the first 0.05 seconds
+        const int cutout = (speaker_seed>0?(24000/5):(24000/20));
         for (int i = 0; i < cutout; ++i) {
             audio[i] = 0.0f;
         }
@@ -811,10 +823,11 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
         }
 
         last_generated_audio = save_wav16_base64(audio, n_sr);
+        ttstime = timer_check();
 
         if(!inputs.quiet)
         {
-            printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());
+            printf("\nTTS Generated %d audio tokens in %.2fs.\n",(int) codes.size(),ttstime);
         }
 
         output.data = last_generated_audio.c_str();
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp

Original file line number	Diff line number	Diff line change
`@@ -574,6 +574,10 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)`
`574`	`574`	`}`
`575`	`575`	`}`
`576`	`576`
	`577`	`+ double ttstime = 0;`
	`578`	`+ timer_start();`
	`579`	`+`
	`580`	`+`
`577`	`581`	`if(!inputs.quiet && ttsdebugmode==1)`
`578`	`582`	`{`
`579`	`583`	`printf("\nInput: %s\n", prompt_clean.c_str());`
`@@ -591,6 +595,14 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)`
`591`	`595`	`{`
`592`	`596`	`printf("\nReuse speaker ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());`
`593`	`597`	`}`
	`598`	`+ } else if (speaker_seed==1){ //1 is a special seed`
	`599`	+ std::string speaker = "but<\|t_0.31\|><\|code_start\|><\|1023\|><\|1474\|><\|17\|><\|121\|><\|1362\|><\|744\|><\|438\|><\|1319\|><\|744\|><\|1419\|><\|1246\|><\|923\|><\|1338\|><\|406\|><\|939\|><\|975\|><\|1491\|><\|965\|><\|1212\|><\|248\|><\|794\|><\|464\|><\|830\|><\|code_end\|>\nthat<\|t_0.13\|><\|code_start\|><\|1578\|><\|1773\|><\|660\|><\|1074\|><\|221\|><\|1803\|><\|142\|><\|914\|><\|798\|><\|485\|><\|code_end\|>\nis<\|t_0.11\|><\|code_start\|><\|737\|><\|794\|><\|1288\|><\|182\|><\|895\|><\|1653\|><\|448\|><\|471\|><\|code_end\|>\nwhat<\|t_0.12\|><\|code_start\|><\|1734\|><\|1306\|><\|779\|><\|490\|><\|525\|><\|1028\|><\|37\|><\|1633\|><\|1353\|><\|code_end\|>\nit<\|t_0.09\|><\|code_start\|><\|1343\|><\|898\|><\|270\|><\|1035\|><\|94\|><\|1409\|><\|388\|><\|code_end\|>\nis<\|t_0.23\|><\|code_start\|><\|694\|><\|695\|><\|577\|><\|692\|><\|1047\|><\|388\|><\|28\|><\|905\|><\|1155\|><\|50\|><\|1629\|><\|1775\|><\|1711\|><\|1729\|><\|404\|><\|1027\|><\|344\|><\|code_end\|>";
	`600`	`+ last_speaker_codes = common_tokenize(model_ttc, speaker, false, true);`
	`601`	`+ last_speaker_seed = speaker_seed;`
	`602`	`+ if(!inputs.quiet && ttsdebugmode==1)`
	`603`	`+ {`
	`604`	`+ printf("\nSpecial ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());`
	`605`	`+ }`
`594`	`606`	`} else {`
`595`	`607`	`//generate the voice texture of our new speaker`
`596`	`608`	`last_speaker_codes.clear();`
`@@ -800,8 +812,8 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)`
`800`	`812`
`801`	`813`	`const int n_sr = 24000; // sampling rate`
`802`	`814`
`803`		`- // zero out first 0.25 seconds or 0.05 depending on whether its seeded`
`804`		`- const int cutout = (speaker_seed>0?(24000/4):(24000/20));`
	`815`	`+ // zero out first 0.2 seconds or 0.05 depending on whether its seeded`
	`816`	`+ const int cutout = (speaker_seed>0?(24000/5):(24000/20));`
`805`	`817`	`for (int i = 0; i < cutout; ++i) {`
`806`	`818`	`audio[i] = 0.0f;`
`807`	`819`	`}`
`@@ -811,10 +823,11 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)`
`811`	`823`	`}`
`812`	`824`
`813`	`825`	`last_generated_audio = save_wav16_base64(audio, n_sr);`
	`826`	`+ ttstime = timer_check();`
`814`	`827`
`815`	`828`	`if(!inputs.quiet)`
`816`	`829`	`{`
`817`		`- printf("\nTTS Generated %d audio tokens.\n",(int) codes.size());`
	`830`	`+ printf("\nTTS Generated %d audio tokens in %.2fs.\n",(int) codes.size(),ttstime);`
`818`	`831`	`}`
`819`	`832`
`820`	`833`	`output.data = last_generated_audio.c_str();`