Skip to content

Commit 7d269ab

Browse files
committed
Merge branch 'concedo_experimental' into croco_nex_0
2 parents 19a0f05 + 9643296 commit 7d269ab

File tree

11 files changed

+213
-43
lines changed

11 files changed

+213
-43
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,3 +411,4 @@ when you can't use the precompiled binary directly, we provide an automated buil
411411
- Image Generation: [Anything v3](https://huggingface.co/admruul/anything-v3.0/resolve/main/Anything-V3.0-pruned-fp16.safetensors) or [Deliberate V2](https://huggingface.co/Yntec/Deliberate2/resolve/main/Deliberate_v2.safetensors) or [Dreamshaper SDXL](https://huggingface.co/Lykon/dreamshaper-xl-v2-turbo/resolve/main/DreamShaperXL_Turbo_v2_1.safetensors)
412412
- Image Recognition MMproj: [Pick the correct one for your model architecture here](https://huggingface.co/koboldcpp/mmproj/tree/main)
413413
- Speech Recognition: [Whisper models for Speech-To-Text](https://huggingface.co/koboldcpp/whisper/tree/main)
414+
- Text-To-Speech: [TTS models for Narration](https://huggingface.co/koboldcpp/tts/tree/main)

colab.ipynb

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@
6767
"LoadSpeechModel = False #@param {type:\"boolean\"}\n",
6868
"SpeechModel = \"https://huggingface.co/koboldcpp/whisper/resolve/main/whisper-base.en-q5_1.bin\" #@param [\"https://huggingface.co/koboldcpp/whisper/resolve/main/whisper-base.en-q5_1.bin\"]{allow-input: true}\n",
6969
"WCommand = \"\"\n",
70+
"#@markdown <hr>\n",
71+
"LoadTTSModel = False #@param {type:\"boolean\"}\n",
72+
"TTSModel = \"https://huggingface.co/koboldcpp/tts/resolve/main/OuteTTS-0.2-500M-Q4_0.gguf\" #@param [\"https://huggingface.co/koboldcpp/tts/resolve/main/OuteTTS-0.2-500M-Q4_0.gguf\"]{allow-input: true}\n",
73+
"WavTokModel = \"https://huggingface.co/koboldcpp/tts/resolve/main/WavTokenizer-Large-75-Q4_0.gguf\" #@param [\"https://huggingface.co/koboldcpp/tts/resolve/main/WavTokenizer-Large-75-Q4_0.gguf\"]{allow-input: true}\n",
74+
"TTSCommand = \"\"\n",
7075
"\n",
7176
"import os\n",
7277
"if not os.path.isfile(\"/opt/bin/nvidia-smi\"):\n",
@@ -85,6 +90,10 @@
8590
" WCommand = \"--whispermodel wmodel.bin\"\n",
8691
"else:\n",
8792
" WCommand = \"\"\n",
93+
"if TTSModel and WavTokModel and LoadTTSModel:\n",
94+
" TTSCommand = \"--ttsmodel ttsmodel.bin --ttswavtokenizer ttswavtok.bin --ttsgpu\"\n",
95+
"else:\n",
96+
" TTSCommand = \"\"\n",
8897
"if FlashAttention:\n",
8998
" FACommand = \"--flashattention\"\n",
9099
"else:\n",
@@ -110,7 +119,10 @@
110119
" !aria2c -x 10 -o imodel.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $ImgModel\n",
111120
"if WCommand:\n",
112121
" !aria2c -x 10 -o wmodel.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $SpeechModel\n",
113-
"!./koboldcpp_linux model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand\n"
122+
"if TTSCommand:\n",
123+
" !aria2c -x 10 -o ttsmodel.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $TTSModel\n",
124+
" !aria2c -x 10 -o ttswavtok.bin --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $WavTokModel\n",
125+
"!./koboldcpp_linux model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --websearch --quiet --remotetunnel $FACommand $MPCommand $VCommand $SCommand $WCommand $TTSCommand\n"
114126
]
115127
}
116128
],

kcpp_docs.embd

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,8 @@
616616
"vision": false,
617617
"transcribe":false,
618618
"multiplayer": false,
619+
"websearch":false,
620+
"tts":false
619621
},
620622
"schema": {
621623
"$ref": "#/components/schemas/KcppVersion"
@@ -1443,6 +1445,52 @@
14431445
]
14441446
}
14451447
},
1448+
"/api/extra/tts": {
1449+
"post": {
1450+
"description": "Creates text-to-speech audio from input text.",
1451+
"requestBody": {
1452+
"content": {
1453+
"application/json": {
1454+
"example": {
1455+
"input": "hello world, how are you today?",
1456+
"voice": "fire",
1457+
},
1458+
"schema": {
1459+
"properties": {
1460+
"input": {
1461+
"type": "string",
1462+
"description": "The text to generate audio for. Try to keep it short."
1463+
},
1464+
"voice": {
1465+
"type": "string",
1466+
"description": "The voice to use when generating the audio. You can enter anything you like; a unique speaker will be generated."
1467+
}
1468+
},
1469+
"type": "object"
1470+
}
1471+
}
1472+
},
1473+
"required": true
1474+
},
1475+
"responses": {
1476+
"200": {
1477+
"content": {
1478+
"audio/wav": {
1479+
"schema": {
1480+
"type": "string",
1481+
"format": "binary"
1482+
}
1483+
}
1484+
},
1485+
"description": "Successful request"
1486+
}
1487+
},
1488+
"summary": "Creates text-to-speech audio from input text.",
1489+
"tags": [
1490+
"api/extra"
1491+
]
1492+
}
1493+
},
14461494
"/props": {
14471495
"get": {
14481496
"summary": "Returns the Jinja template stored in the GGUF model, if found.",
@@ -1840,6 +1888,16 @@
18401888
"responses": {"default": {"description": ""}}
18411889
}
18421890
},
1891+
"/v1/audio/speech": {
1892+
"post": {
1893+
"summary": "Generates Text-To-Speech audio from input text. Please refer to OpenAI documentation",
1894+
"description": "Generates Text-To-Speech audio from input text.\n\n This is an OpenAI compatibility endpoint.\n\n Please refer to OpenAI documentation at [https://platform.openai.com/docs/api-reference/audio/createSpeech](https://platform.openai.com/docs/api-reference/audio/createSpeech)",
1895+
"tags": [
1896+
"v1"
1897+
],
1898+
"responses": {"default": {"description": ""}}
1899+
}
1900+
},
18431901
},
18441902
"servers": [
18451903
{

klite.embd

Lines changed: 70 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2028,6 +2028,15 @@ Current version indicated by LITEVER below.
20282028
.color_orangeurl:focus {
20292029
color: #ffedd3;
20302030
}
2031+
.color_grayurl {
2032+
color: #9e9e9e;
2033+
}
2034+
.color_grayurl:hover {
2035+
color: #9f9f9f;
2036+
}
2037+
.color_grayurl:focus {
2038+
color: #9e9e9e;
2039+
}
20312040

20322041
.color_orange {
20332042
color: #f7a223;
@@ -2791,7 +2800,8 @@ Current version indicated by LITEVER below.
27912800
const koboldcpp_transcribe_endpoint = "/api/extra/transcribe";
27922801
const koboldcpp_tokenize_endpoint = "/api/extra/tokencount";
27932802
const koboldcpp_perf_endpoint = "/api/extra/perf";
2794-
const koboldcpp_websearch_endpoint = "/api/extra/websearch"
2803+
const koboldcpp_websearch_endpoint = "/api/extra/websearch";
2804+
const koboldcpp_tts_endpoint = "/api/extra/tts";
27952805

27962806
const oai_models_endpoint = "/models";
27972807
const oai_submit_endpoint = "/completions";
@@ -2851,6 +2861,7 @@ Current version indicated by LITEVER below.
28512861
const XTTS_ID = 1000;
28522862
const ALLTALK_ID = 1001;
28532863
const OAI_TTS_ID = 1002;
2864+
const KCPP_TTS_ID = 1003;
28542865

28552866
const HD_RES_PX = 768;
28562867
const NO_HD_RES_PX = 512;
@@ -2966,6 +2977,7 @@ Current version indicated by LITEVER below.
29662977
var voice_is_processing = false; //currently processing voice?
29672978
let voiceprerecorder = null, voicerecorder = null, voice_is_speaking = false, voice_speaking_counter = 0;
29682979
let preaudiobuffers = [], preaudioblobs = []; //will store 2 preblobs at a time
2980+
var koboldcpp_has_tts = false;
29692981
var no_escape_html = false;
29702982
var timetaken_timestamp = performance.now();
29712983
var bg_silence = null;
@@ -3588,7 +3600,7 @@ Current version indicated by LITEVER below.
35883600
document.getElementById("lastreq1").innerHTML =
35893601
document.getElementById("lastreq2").innerHTML =
35903602
document.getElementById("lastreq3").innerHTML =
3591-
`KoboldAI Lite v${LITEVER} Web - Frontend for External API Services`;
3603+
`KoboldAI Lite v${LITEVER} Web - Frontend for <a href="#" class="color_grayurl" onclick="msgbox('KoboldAI Lite allows you to connect to various third-party AI services. We do not control or assume responsibility for the models or content generated by these services. The user is responsible for ensuring that their usage of this software is legal in their country, and complies with the terms of service of the service they are connected to. Use at your own discretion.','Disclaimer')">External API Services</a>`;
35923604

35933605
trigger_abort_controller(); //first trigger sets it up
35943606

@@ -5841,6 +5853,10 @@ initializeInstructUIFunctionality();
58415853
{
58425854
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.80") >= 0 && koboldcpp_has_websearch);
58435855
}
5856+
function is_using_kcpp_with_tts()
5857+
{
5858+
return (custom_kobold_endpoint!="" && koboldcpp_version && koboldcpp_version!="" && compare_version_str(koboldcpp_version, "1.81") >= 0 && koboldcpp_has_tts);
5859+
}
58445860
function is_using_web_lite()
58455861
{
58465862
return (window.location.hostname.includes("koboldai.net") || window.location.hostname.includes("lostruins.github.io"));
@@ -9208,6 +9224,7 @@ initializeInstructUIFunctionality();
92089224
koboldcpp_has_whisper = (data.transcribe?true:false);
92099225
koboldcpp_has_multiplayer = (data.multiplayer?true:false);
92109226
koboldcpp_has_websearch = (data.websearch?true:false);
9227+
koboldcpp_has_tts = (data.tts?true:false);
92119228
let has_password = (data.protected?true:false);
92129229
let has_txt2img = (data.txt2img?true:false);
92139230
let no_txt_model = (mdlname=="inactive");
@@ -9316,7 +9333,7 @@ initializeInstructUIFunctionality();
93169333
},()=>{
93179334
});
93189335
}
9319-
else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper)
9336+
else if(localflag && no_txt_model && !has_txt2img && !koboldcpp_has_vision && !koboldcpp_has_whisper && !koboldcpp_has_tts)
93209337
{
93219338
msgboxYesNo("This KoboldCpp instance has no models loaded. You can still use the WebUI to edit or view existing stories.<br><br>Would you like to connect to an external API service?","No Models Loaded",
93229339
()=>{
@@ -10312,6 +10329,8 @@ initializeInstructUIFunctionality();
1031210329
ttshtml += "<option value=\"1000\">XTTS API Server</option>";
1031310330
ttshtml += "<option value=\"1001\">AllTalk API Server</option>";
1031410331
ttshtml += "<option value=\"1002\">OpenAI-Compat. API Server</option>";
10332+
ttshtml += "<option value=\"1003\">KoboldCpp TTS API</option>";
10333+
1031510334
if ('speechSynthesis' in window) {
1031610335
let voices = window.speechSynthesis.getVoices();
1031710336
console.log("speech synth available: " + voices.length);
@@ -11895,6 +11914,7 @@ initializeInstructUIFunctionality();
1189511914
document.getElementById("xtts_container").classList.add("hidden");
1189611915
document.getElementById("oai_tts_container").classList.add("hidden");
1189711916
document.getElementById("alltalk_specific_controls").classList.add("hidden");
11917+
document.getElementById("kcpp_tts_container").classList.add("hidden");
1189811918

1189911919
const selectedTTS = document.getElementById("ttsselect").value;
1190011920

@@ -11911,6 +11931,15 @@ initializeInstructUIFunctionality();
1191111931
else if(selectedTTS == OAI_TTS_ID) {
1191211932
document.getElementById("oai_tts_container").classList.remove("hidden");
1191311933
}
11934+
else if(selectedTTS == KCPP_TTS_ID) {
11935+
document.getElementById("kcpp_tts_container").classList.remove("hidden");
11936+
if(is_using_kcpp_with_tts())
11937+
{
11938+
document.getElementById("nokcpptts").classList.add("hidden");
11939+
}else{
11940+
document.getElementById("nokcpptts").classList.remove("hidden");
11941+
}
11942+
}
1191411943
}
1191511944

1191611945
// Fetch RVC voices for AllTalk
@@ -12015,27 +12044,44 @@ initializeInstructUIFunctionality();
1201512044
}
1201612045
}
1201712046

12018-
if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID) //xtts api server
12047+
if(ssval==XTTS_ID || ssval==ALLTALK_ID || ssval==OAI_TTS_ID || ssval==KCPP_TTS_ID) //xtts api server
1201912048
{
1202012049
let is_xtts = (ssval==XTTS_ID);
1202112050
let is_oai_tts = (ssval==OAI_TTS_ID);
12051+
let is_kcpp_tts = (ssval==KCPP_TTS_ID);
1202212052
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
1202312053

12024-
if(is_oai_tts)
12054+
if(is_oai_tts || is_kcpp_tts)
1202512055
{
12026-
let payload =
12056+
let payload = {};
12057+
let ttsheaders = {};
12058+
let sub_endpt = "";
12059+
if(is_oai_tts)
1202712060
{
12028-
"model": document.getElementById("oai_tts_model").value,
12029-
"input": text,
12030-
"voice": document.getElementById("oai_tts_voice").value
12031-
};
12032-
let oaiheaders = {
12033-
'Content-Type': 'application/json',
12034-
'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
12035-
};
12036-
fetch(localsettings.saved_oai_tts_url, {
12061+
sub_endpt = localsettings.saved_oai_tts_url;
12062+
payload =
12063+
{
12064+
"model": document.getElementById("oai_tts_model").value,
12065+
"input": text,
12066+
"voice": document.getElementById("oai_tts_voice").value
12067+
};
12068+
ttsheaders = {
12069+
'Content-Type': 'application/json',
12070+
'Authorization': 'Bearer ' + localsettings.saved_oai_tts_key
12071+
};
12072+
} else {
12073+
sub_endpt = apply_proxy_url(custom_kobold_endpoint + koboldcpp_tts_endpoint);
12074+
payload =
12075+
{
12076+
"input": text,
12077+
"voice": document.getElementById("kcpp_tts_voice").value
12078+
};
12079+
ttsheaders = get_kobold_header();
12080+
}
12081+
12082+
fetch(sub_endpt, {
1203712083
method: 'POST',
12038-
headers: oaiheaders,
12084+
headers: ttsheaders,
1203912085
body: JSON.stringify(payload),
1204012086
})
1204112087
.then(response => response.arrayBuffer())
@@ -20200,6 +20246,14 @@ initializeInstructUIFunctionality();
2020020246
</tr><tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="alloy" id="oai_tts_voice" style="margin-left:3px; height:18px; width: 55px; padding: 2px;"></td></tr>
2020120247
</table>
2020220248
</div>
20249+
<div id="kcpp_tts_container" class="hidden">
20250+
<div class="color_red hidden" id="nokcpptts">KoboldCpp Not Connected</div>
20251+
<div class="settinglabel">
20252+
<table width="100%">
20253+
<tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="kobo" placeholder="(Anything)" id="kcpp_tts_voice" style="margin-left:3px; height:18px; width: 80px; padding: 2px;"></td></tr>
20254+
</table>
20255+
</div>
20256+
</div>
2020320257
</div>
2020420258
<div class="settinglabel">
2020520259
<div class="justifyleft settingsmall" title="If unchecked, only speak AI replies, not other text.">Narrate Both Sides </div>

koboldcpp.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@
6767
modelbusy = threading.Lock()
6868
requestsinqueue = 0
6969
defaultport = 5001
70-
KcppVersion = "1.82004"
70+
KcppVersion = "1.82008"
7171
LcppVersion = "b4458"
7272
CudaSpecifics = "Cu124_Ar6175_SMC2_DmmvX32Y1"
73-
ReleaseDate = "2025/01/10"
73+
ReleaseDate = "2025/01/13"
7474
showdebug = True
7575
guimode = False
7676
showsamplerwarning = True
@@ -1295,7 +1295,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
12951295
FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
12961296
FetchedCUfreeMem = [line.split(",")[2].strip().split(" ")[0].strip() for line in output.splitlines()]
12971297
except Exception:
1298-
FetchedCUdevices = []
12991298
FetchedCUdeviceMem = []
13001299
FetchedCUfreeMem = []
13011300
faileddetectvram = True
@@ -1318,7 +1317,6 @@ def fetch_gpu_properties(testCL,testCU,testVK):
13181317
if getamdvram:
13191318
FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
13201319
except Exception:
1321-
FetchedCUdevices = []
13221320
FetchedCUdeviceMem = []
13231321
FetchedCUfreeMem = []
13241322
faileddetectvram = True
@@ -1375,6 +1373,27 @@ def fetch_gpu_properties(testCL,testCU,testVK):
13751373
# if faileddetectvram:
13761374
# print("Unable to detect VRAM, please set layers manually.")
13771375

1376+
# or then :
1377+
1378+
# lowestcumem = 0
1379+
# lowestfreecumem = 0
1380+
# try:
1381+
# for idx in range(0,4):
1382+
# if(len(FetchedCUdevices)>idx):
1383+
# CUDevicesNames[idx] = FetchedCUdevices[idx]
1384+
# for idx in range(0,4):
1385+
# if(len(FetchedCUdevices)>idx):
1386+
# if len(FetchedCUdeviceMem)>idx:
1387+
# dmem = int(FetchedCUdeviceMem[idx]) if AMDgpu else (int(FetchedCUdeviceMem[idx])*1024*1024)
1388+
# lowestcumem = dmem if lowestcumem==0 else (dmem if dmem<lowestcumem else lowestcumem)
1389+
# if len(FetchedCUfreeMem)>idx:
1390+
# dmem = (int(FetchedCUfreeMem[idx])*1024*1024)
1391+
# lowestfreecumem = dmem if lowestfreecumem==0 else (dmem if dmem<lowestfreecumem else lowestfreecumem)
1392+
# except Exception:
1393+
# lowestcumem = 0
1394+
# lowestfreecumem = 0
1395+
# faileddetectvram = True
1396+
13781397
if testVK:
13791398
try: # Get Vulkan names
13801399
output = subprocess.run(['vulkaninfo','--summary'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
@@ -1910,12 +1929,16 @@ def tts_generate(genparams):
19101929
is_quiet = True if (args.quiet or args.debugmode == -1) else False
19111930
prompt = genparams.get("input", genparams.get("text", ""))
19121931
prompt = prompt.strip()
1932+
voice = 1
19131933
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
1914-
voice = simple_lcg_hash(voicestr) if voicestr else 1
1934+
if voicestr and voicestr.strip().lower()=="kobo":
1935+
voice = 1
1936+
else:
1937+
voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
19151938
inputs = tts_generation_inputs()
19161939
inputs.prompt = prompt.encode("UTF-8")
19171940
inputs.speaker_seed = voice
1918-
inputs.audio_seed = 0
1941+
inputs.audio_seed = -1
19191942
inputs.quiet = is_quiet
19201943
ret = handle.tts_generate(inputs)
19211944
outstr = ""

otherarch/llama_v2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3028,4 +3028,4 @@ std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const
30283028
res.resize(n);
30293029

30303030
return res;
3031-
}
3031+
}

0 commit comments

Comments
 (0)