Skip to content

Commit 7d269ab

Browse files
committed
Merge branch 'concedo_experimental' into croco_nex_0
2 parents 19a0f05 + 9643296 commit 7d269ab

File tree

11 files changed

+213
-43
lines changed

11 files changed

+213
-43
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,3 +411,4 @@ when you can't use the precompiled binary directly, we provide an automated buil
411411
- Image Generation: [Anything v3](https://huggingface.co/admruul/anything-v3.0/resolve/main/Anything-V3.0-pruned-fp16.safetensors) or [Deliberate V2](https://huggingface.co/Yntec/Deliberate2/resolve/main/Deliberate_v2.safetensors) or [Dreamshaper SDXL](https://huggingface.co/Lykon/dreamshaper-xl-v2-turbo/resolve/main/DreamShaperXL_Turbo_v2_1.safetensors)
412412
- Image Recognition MMproj: [Pick the correct one for your model architecture here](https://huggingface.co/koboldcpp/mmproj/tree/main)
413413
- Speech Recognition: [Whisper models for Speech-To-Text](https://huggingface.co/koboldcpp/whisper/tree/main)
414+
- Text-To-Speech: [TTS models for Narration](https://huggingface.co/koboldcpp/tts/tree/main)

colab.ipynb

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@
6767
"LoadSpeechModel = False #@param {type:\"boolean\"}\n",
6868
"SpeechModel = \"https://huggingface.co/koboldcpp/whisper/resolve/main/whisper-base.en-q5_1.bin\" #@param [\"https://huggingface.co/koboldcpp/whisper/resolve/main/whisper-base.en-q5_1.bin\"]{allow-input: true}\n",
6969
"WCommand = \"\"\n",
70+
"#@markdown <hr>\n",
71+
"LoadTTSModel = False #@param {type:\"boolean\"}\n",
72+
"TTSModel = \"https://huggingface.co/koboldcpp/tts/resolve/main/OuteTTS-0.2-500M-Q4_0.gguf\" #@param [\"https://huggingface.co/koboldcpp/tts/resolve/main/OuteTTS-0.2-500M-Q4_0.gguf\"]{allow-input: true}\n",
73+
"WavTokModel = \"https://huggingface.co/koboldcpp/tts/resolve/main/WavTokenizer-Large-75-Q4_0.gguf\" #@param [\"https://huggingface.co/koboldcpp/tts/resolve/main/WavTokenizer-Large-75-Q4_0.gguf\"]{allow-input: true}\n",
74+
"TTSCommand = \"\"\n",
7075
"\n",
7176
"import os\n",
7277
"if not os.path.isfile(\"/opt/bin/nvidia-smi\"):\n",
@@ -85,6 +90,10 @@
8590
" WCommand = \"--whispermodel wmodel.bin\"\n",
8691
"else:\n",
8792
" WCommand = \"\"\n",
93+
"if TTSModel and WavTokModel and LoadTTSModel:\n",
94+
" TTSCommand = \"--ttsmodel ttsmodel.bin --ttswavtokenizer ttswavtok.bin --ttsgpu\"\n",
95+
"else:\n",
96+
" TTSCommand = \"\"\n",
8897
"if FlashAttention:\n",
8998
" FACommand = \"--flashattention\"\n",
9099
"else:\n",
@@ -110,7 +119,10 @@
110119
" !aria2c -x 10 -o imodel.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $ImgModel\n",
111120
"if WCommand:\n",
112121
" !aria2c -x 10 -o wmodel.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $SpeechModel\n",
113-
"!./koboldcpp_linux model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand\n"
122+
"if TTSCommand:\n",
123+
" !aria2c -x 10 -o ttsmodel.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $TTSModel\n",
124+
" !aria2c -x 10 -o ttswavtok.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $WavTokModel\n",
125+
"!./koboldcpp_linux model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand\n"
114126
]
115127
}
116128
],

kcpp_docs.embd

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,8 @@
616616
"vision": false,
617617
"transcribe":false,
618618
"multiplayer": false,
619+
"websearch":false,
620+
"tts":false
619621
},
620622
"schema": {
621623
"$ref": "#/components/schemas/KcppVersion"
@@ -1443,6 +1445,52 @@
14431445
]
14441446
}
14451447
},
1448+
"/api/extra/tts": {
1449+
"post": {
1450+
"description": "Creates text-to-speech audio from input text.",
1451+
"requestBody": {
1452+
"content": {
1453+
"application/json": {
1454+
"example": {
1455+
"input": "hello world, how are you today?",
1456+
"voice": "fire",
1457+
},
1458+
"schema": {
1459+
"properties": {
1460+
"input": {
1461+
"type": "string",
1462+
"description": "The text to generate audio for. Try to keep it short."
1463+
},
1464+
"voice": {
1465+
"type": "string",
1466+
"description": "The voice to use when generating the audio. You can enter anything you like; a unique speaker will be generated."
1467+
}
1468+
},
1469+
"type": "object"
1470+
}
1471+
}
1472+
},
1473+
"required": true
1474+
},
1475+
"responses": {
1476+
"200": {
1477+
"content": {
1478+
"audio/wav": {
1479+
"schema": {
1480+
"type": "string",
1481+
"format": "binary"
1482+
}
1483+
}
1484+
},
1485+
"description": "Successful request"
1486+
}
1487+
},
1488+
"summary": "Creates text-to-speech audio from input text.",
1489+
"tags": [
1490+
"api/extra"
1491+
]
1492+
}
1493+
},
14461494
"/props": {
14471495
"get": {
14481496
"summary": "Returns the Jinja template stored in the GGUF model, if found.",
@@ -1840,6 +1888,16 @@
18401888
"responses": {"default": {"description": ""}}
18411889
}
18421890
},
1891+
"/v1/audio/speech": {
1892+
"post": {
1893+
"summary": "Generates Text-To-Speech audio from input text. Please refer to OpenAI documentation",
1894+
"description": "Generates Text-To-Speech audio from input text.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/audio/createSpeech](https://platform.openai.com/docs/api-reference/audio/createSpeech)",
1895+
"tags": [
1896+
"v1"
1897+
],
1898+
"responses": {"default": {"description": ""}}
1899+
}
1900+
},
18431901
},
18441902
"servers": [
18451903
{

klite.embd

Lines changed: 70 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2028,6 +2028,15 @@ Current version indicated by LITEVER below.
20282028
.color_orangeurl:focus {
20292029
color: #ffedd3;
20302030
}
2031+
.color_grayurl {
2032+
color: #9e9e9e;
2033+
}
2034+
.color_grayurl:hover {
2035+
color: #9f9f9f;
2036+
}
2037+
.color_grayurl:focus {
2038+
color: #9e9e9e;
2039+
}
20312040

20322041
.color_orange {
20332042
color: #f7a223;
@@ -2791,7 +2800,8 @@ Current version indicated by LITEVER below.
27912800
const koboldcpp_transcribe_endpoint = "/api/extra/transcribe";
27922801
const koboldcpp_tokenize_endpoint = "/api/extra/tokencount";
27932802
const koboldcpp_perf_endpoint = "/api/extra/perf";
2794-
const koboldcpp_websearch_endpoint = "/api/extra/websearch"
2803+
const koboldcpp_websearch_endpoint = "/api/extra/websearch";
2804+
const koboldcpp_tts_endpoint = "/api/extra/tts";
27952805

27962806
const oai_models_endpoint = "/models";
27972807
const oai_submit_endpoint = "/completions";
@@ -2851,6 +2861,7 @@ Current version indicated by LITEVER below.
28512861
const XTTS_ID = 1000;
28522862
const ALLTALK_ID = 1001;
28532863
const OAI_TTS_ID = 1002;
2864+
const KCPP_TTS_ID = 1003;
28542865

28552866
const HD_RES_PX = 768;
28562867
const NO_HD_RES_PX = 512;
@@ -2966,6 +2977,7 @@ Current version indicated by LITEVER below.
29662977
var voice_is_processing = false; //currently processing voice?
29672978
let voiceprerecorder = null, voicerecorder = null, voice_is_speaking = false, voice_speaking_counter = 0;
29682979
let preaudiobuffers = [], preaudioblobs = []; //will store 2 preblobs at a time
2980+
var koboldcpp_has_tts = false;
29692981
var no_escape_html = false;
29702982
var timetaken_timestamp = performance.now();
29712983
var bg_silence = null;
@@ -3588,7 +3600,7 @@ Current version indicated by LITEVER below.
35883600
document.getElementById("lastreq1").innerHTML =
35893601
document.getElementById("lastreq2").innerHTML =
35903602
document.getElementById("lastreq3").innerHTML =
3591-
`KoboldAI Lite v${LITEVER} Web - Frontend for External API Services`;
3603+
`KoboldAI Lite v${LITEVER} Web - Frontend for <a href="#" class="color_grayurl" onclick="msgbox('KoboldAI Lite allows you to connect to various third-party AI services. We do not control or assume responsibility for the models or content generated by these services. The user is responsible for ensuring that their usage of this software is legal in their country, and complies with the terms of service of the service they are connected to. Use at your own discretion.','Disclaimer')">External API Services</a>`;
35923604

35933605
trigger_abort_controller(); //first trigger sets it up
35943606

@@ -5841,6 +5853,10 @@ initializeInstructUIFunctionality();
58415853
{
58425854
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.80") >= 0 && koboldcpp_has_websearch);
58435855
}
5856+
function is_using_kcpp_with_tts()
5857+
{
5858+
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.81") >= 0 && koboldcpp_has_tts);
5859+
}
58445860
function is_using_web_lite()
58455861
{
58465862
return (window.location.hostname.includes("koboldai.net") || window.location.hostname.includes("lostruins.github.io"));
@@ -9208,6 +9224,7 @@ initializeInstructUIFunctionality();
92089224
koboldcpp_has_whisper = (data.transcribe?true:false);
92099225
koboldcpp_has_multiplayer = (data.multiplayer?true:false);
92109226
koboldcpp_has_websearch = (data.websearch?true:false);
9227+
koboldcpp_has_tts = (data.tts?true:false);
92119228
let has_password = (data.protected?true:false);
92129229
let has_txt2img = (data.txt2img?true:false);
92139230
let no_txt_model = (mdlname=="inactive");
@@ -9316,7 +9333,7 @@ initializeInstructUIFunctionality();
93169333
},()=>{
93179334
});
93189335
}
9319-
else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper)
9336+
else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper && !koboldcpp_has_tts)
93209337
{
93219338
msgboxYesNo("This KoboldCpp instance has no models loaded. You can still use the WebUI to edit or view existing stories.<br><br>Would you like to connect to an external API service?","No Models Loaded",
93229339
()=>{
@@ -10312,6 +10329,8 @@ initializeInstructUIFunctionality();
1031210329
ttshtml += "<option value=\"1000\">XTTS API Server</option>";
1031310330
ttshtml += "<option value=\"1001\">AllTalk API Server</option>";
1031410331
ttshtml += "<option value=\"1002\">OpenAI-Compat. API Server</option>";
10332+
ttshtml += "<option value=\"1003\">KoboldCpp TTS API</option>";
10333+
1031510334
if ('speechSynthesis' in window) {
1031610335
let voices = window.speechSynthesis.getVoices();
1031710336
console.log("speech synth available: " + voices.length);
@@ -11895,6 +11914,7 @@ initializeInstructUIFunctionality();
1189511914
document.getElementById("xtts_container").classList.add("hidden");
1189611915
document.getElementById("oai_tts_container").classList.add("hidden");
1189711916
document.getElementById("alltalk_specific_controls").classList.add("hidden");
11917+
document.getElementById("kcpp_tts_container").classList.add("hidden");
1189811918

1189911919
const selectedTTS = document.getElementById("ttsselect").value;
1190011920

@@ -11911,6 +11931,15 @@ initializeInstructUIFunctionality();
1191111931
else if(selectedTTS == OAI_TTS_ID) {
1191211932
document.getElementById("oai_tts_container").classList.remove("hidden");
1191311933
}
11934+
else if(selectedTTS == KCPP_TTS_ID) {
11935+
document.getElementById("kcpp_tts_container").classList.remove("hidden");
11936+
if(is_using_kcpp_with_tts())
11937+
{
11938+
document.getElementById("nokcpptts").classList.add("hidden");
11939+
}else{
11940+
document.getElementById("nokcpptts").classList.remove("hidden");
11941+
}
11942+
}
1191411943
}
1191511944

1191611945
// Fetch RVC voices for AllTalk
@@ -12015,27 +12044,44 @@ initializeInstructUIFunctionality();
1201512044
}
1201612045
}
1201712046

12018-
if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID) //xtts api server
12047+
if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID || ssval==KCPP_TTS_ID) //xtts api server
1201912048
{
1202012049
let is_xtts = (ssval==XTTS_ID);
1202112050
let is_oai_tts = (ssval==OAI_TTS_ID);
12051+
let is_kcpp_tts = (ssval==KCPP_TTS_ID);
1202212052
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
1202312053

12024-
if(is_oai_tts)
12054+
if(is_oai_tts || is_kcpp_tts)
1202512055
{
12026-
let payload =
12056+
let payload = {};
12057+
let ttsheaders = {};
12058+
let sub_endpt = "";
12059+
if(is_oai_tts)
1202712060
{
12028-
"model": document.getElementById("oai_tts_model").value,
12029-
"input": text,
12030-
"voice": document.getElementById("oai_tts_voice").value
12031-
};
12032-
let oaiheaders = {
12033-
'Content-Type': 'application/json',
12034-
'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
12035-
};
12036-
fetch(localsettings.saved_oai_tts_url, {
12061+
sub_endpt = localsettings.saved_oai_tts_url;
12062+
payload =
12063+
{
12064+
"model": document.getElementById("oai_tts_model").value,
12065+
"input": text,
12066+
"voice": document.getElementById("oai_tts_voice").value
12067+
};
12068+
ttsheaders = {
12069+
'Content-Type': 'application/json',
12070+
'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
12071+
};
12072+
} else {
12073+
sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
12074+
payload =
12075+
{
12076+
"input": text,
12077+
"voice": document.getElementById("kcpp_tts_voice").value
12078+
};
12079+
ttsheaders = get_kobold_header();
12080+
}
12081+
12082+
fetch(sub_endpt, {
1203712083
method: 'POST',
12038-
headers: oaiheaders,
12084+
headers: ttsheaders,
1203912085
body: JSON.stringify(payload),
1204012086
})
1204112087
.then(response => response.arrayBuffer())
@@ -20200,6 +20246,14 @@ initializeInstructUIFunctionality();
2020020246
</tr><tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="alloy" id="oai_tts_voice" style="margin-left:3px; height:18px; width: 55px; padding: 2px;"></td></tr>
2020120247
</table>
2020220248
</div>
20249+
<div id="kcpp_tts_container" class="hidden">
20250+
<div class="color_red hidden" id="nokcpptts">KoboldCpp Not Connected</div>
20251+
<div class="settinglabel">
20252+
<table width="100%">
20253+
<tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="kobo" placeholder="(Anything)" id="kcpp_tts_voice" style="margin-left:3px; height:18px; width: 80px; padding: 2px;"></td></tr>
20254+
</table>
20255+
</div>
20256+
</div>
2020320257
</div>
2020420258
<div class="settinglabel">
2020520259
<div class="justifyleft settingsmall" title="If unchecked, only speak AI replies, not other text.">Narrate Both Sides </div>

koboldcpp.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@
6767
modelbusy = threading.Lock()
6868
requestsinqueue = 0
6969
defaultport = 5001
70-
KcppVersion = "1.82004"
70+
KcppVersion = "1.82008"
7171
LcppVersion = "b4458"
7272
CudaSpecifics = "Cu124_Ar6175_SMC2_DmmvX32Y1"
73-
ReleaseDate = "2025/01/10"
73+
ReleaseDate = "2025/01/13"
7474
showdebug = True
7575
guimode = False
7676
showsamplerwarning = True
@@ -1295,7 +1295,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
12951295
FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
12961296
FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
12971297
except Exception:
1298-
FetchedCUdevices = []
12991298
FetchedCUdeviceMem = []
13001299
FetchedCUfreeMem = []
13011300
faileddetectvram = True
@@ -1318,7 +1317,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
13181317
if getamdvram:
13191318
FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
13201319
except Exception:
1321-
FetchedCUdevices = []
13221320
FetchedCUdeviceMem = []
13231321
FetchedCUfreeMem = []
13241322
faileddetectvram = True
@@ -1375,6 +1373,27 @@ def fetch_gpu_properties(testCL,testCU,testVK):
13751373
# if faileddetectvram:
13761374
# print("Unable to detect VRAM, please set layers manually.")
13771375

1376+
# or then :
1377+
1378+
# lowestcumem = 0
1379+
# lowestfreecumem = 0
1380+
# try:
1381+
# for idx in range(0,4):
1382+
# if(len(FetchedCUdevices)>idx):
1383+
# CUDevicesNames[idx] = FetchedCUdevices[idx]
1384+
# for idx in range(0,4):
1385+
# if(len(FetchedCUdevices)>idx):
1386+
# if len(FetchedCUdeviceMem)>idx:
1387+
# dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
1388+
# lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
1389+
# if len(FetchedCUfreeMem)>idx:
1390+
# dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
1391+
# lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
1392+
# except Exception:
1393+
# lowestcumem = 0
1394+
# lowestfreecumem = 0
1395+
# faileddetectvram = True
1396+
13781397
if testVK:
13791398
try: # Get Vulkan names
13801399
output = subprocess.run(['vulkaninfo','--summary'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
@@ -1910,12 +1929,16 @@ def tts_generate(genparams):
19101929
is_quiet = True if (args.quiet or args.debugmode == -1) else False
19111930
prompt = genparams.get("input", genparams.get("text", ""))
19121931
prompt = prompt.strip()
1932+
voice = 1
19131933
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
1914-
voice = simple_lcg_hash(voicestr) if voicestr else 1
1934+
if voicestr and voicestr.strip().lower()=="kobo":
1935+
voice = 1
1936+
else:
1937+
voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
19151938
inputs = tts_generation_inputs()
19161939
inputs.prompt = prompt.encode("UTF-8")
19171940
inputs.speaker_seed = voice
1918-
inputs.audio_seed = 0
1941+
inputs.audio_seed = -1
19191942
inputs.quiet = is_quiet
19201943
ret = handle.tts_generate(inputs)
19211944
outstr = ""

otherarch/llama_v2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3028,4 +3028,4 @@ std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
30283028
res.resize(n);
30293029

30303030
return res;
3031-
}
3031+
}

0 commit comments

Comments
 (0)