Skip to content

Commit ed9f7a3

Browse files
committed
add some built in voices
1 parent 0a6ccda commit ed9f7a3

File tree

6 files changed

+94
-43
lines changed

6 files changed

+94
-43
lines changed

klite.embd

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
1212
-->
1313

1414
<script>
15-
const LITEVER = 202;
15+
const LITEVER = 203;
1616
const urlParams = new URLSearchParams(window.location.search);
1717
var localflag = true;
1818
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -11938,6 +11938,7 @@ initializeInstructUIFunctionality();
1193811938
}else{
1193911939
document.getElementById("nokcpptts").classList.remove("hidden");
1194011940
}
11941+
adjust_kcpptts_controls();
1194111942
}
1194211943
}
1194311944

@@ -11981,6 +11982,15 @@ initializeInstructUIFunctionality();
1198111982
rvcPitch.disabled = streamingMode;
1198211983
}
1198311984

11985+
// Keeps the custom-voice text box in sync with the KoboldCpp TTS voice
// dropdown: the box is visible only while the "custom" entry is selected
// and is hidden for every other choice.
function adjust_kcpptts_controls() {
    const voiceSelect = document.getElementById("kcpp_tts_voice");
    const usingCustomVoice = (voiceSelect.value == "custom");
    const customVoiceBox = document.getElementById("kcpp_tts_voice_custom");
    // toggle(name, force): adds the class when force is true, removes it otherwise
    customVoiceBox.classList.toggle("hidden", !usingCustomVoice);
}
11993+
1198411994
// Update set_xtts_url to use the new fetch_rvc_voices function
1198511995
function set_xtts_url() {
1198611996
let is_xtts = (document.getElementById("ttsselect").value==XTTS_ID);
@@ -12073,7 +12083,7 @@ initializeInstructUIFunctionality();
1207312083
payload =
1207412084
{
1207512085
"input": text,
12076-
"voice": document.getElementById("kcpp_tts_voice").value
12086+
"voice": (document.getElementById("kcpp_tts_voice").value == "custom")?document.getElementById("kcpp_tts_voice_custom").value:document.getElementById("kcpp_tts_voice").value
1207712087
};
1207812088
ttsheaders = get_kobold_header();
1207912089
}
@@ -14496,7 +14506,7 @@ initializeInstructUIFunctionality();
1449614506
gentxt = trim_extra_stop_seqs(gentxt,false);
1449714507

1449814508
//fix alpaca leakage
14499-
if(localsettings.fix_alpaca_leak && get_instruct_starttag(true).toLowerCase().includes("### instruction"))
14509+
if(localsettings.fix_alpaca_leak && (localsettings.opmode == 2 || localsettings.opmode == 3 || localsettings.opmode == 4) && get_instruct_starttag(true).toLowerCase().includes("### instruction"))
1450014510
{
1450114511
let matches = gentxt.match(/\n### (.+?):/g);
1450214512
for(let m in matches)
@@ -20249,7 +20259,18 @@ initializeInstructUIFunctionality();
2024920259
<div class="color_red hidden" id="nokcpptts">KoboldCpp Not Connected</div>
2025020260
<div class="settinglabel">
2025120261
<table width="100%">
20252-
<tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td><input class="settinglabel miniinput" type="text" value="kobo" placeholder="(Anything)" id="kcpp_tts_voice" style="margin-left:3px; height:18px; width: 80px; padding: 2px;"></td></tr>
20262+
<tr style="font-size:12px;padding:2px;margin:0px 0 0;"><td>TTS Voice </td><td>
20263+
<select onchange="adjust_kcpptts_controls();" class="form-control" id="kcpp_tts_voice" style="font-size:12px;height:20px;padding:0;margin:0px 0 0;">
20264+
<option value="kobo" selected>kobo</option>
20265+
<option value="cheery">cheery</option>
20266+
<option value="sleepy">sleepy</option>
20267+
<option value="tutor">tutor</option>
20268+
<option value="shouty">shouty</option>
20269+
<option value="bored">bored</option>
20270+
<option value="record">record</option>
20271+
<option value="custom">custom</option>
20272+
</select></td>
20273+
<td><input class="settinglabel miniinput" type="text" value="" placeholder="(Name)" id="kcpp_tts_voice_custom" style="margin-left:3px; height:18px; width:44px; padding: 2px;"></td></tr>
2025320274
</table>
2025420275
</div>
2025520276
</div>

koboldcpp.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,8 +1356,10 @@ def tts_generate(genparams):
13561356
prompt = prompt.strip()
13571357
voice = 1
13581358
voicestr = genparams.get("voice", genparams.get("speaker_wav", ""))
1359-
if voicestr and voicestr.strip().lower()=="kobo":
1360-
voice = 1
1359+
voice_mapping = ["kobo","cheery","sleepy","tutor","shouty","bored","record"]
1360+
normalized_voice = voicestr.strip().lower() if voicestr else ""
1361+
if normalized_voice in voice_mapping:
1362+
voice = voice_mapping.index(normalized_voice) + 1
13611363
else:
13621364
voice = simple_lcg_hash(voicestr.strip()) if voicestr else 1
13631365
inputs = tts_generation_inputs()
@@ -2320,7 +2322,7 @@ def do_GET(self):
23202322
response_body = (json.dumps([]).encode())
23212323

23222324
elif self.path.endswith(('/speakers_list')): #xtts compatible
2323-
response_body = (json.dumps(["kobo","bean","corn","spicy","lime","fire","metal","potato"]).encode()) #some random voices for them to enjoy
2325+
response_body = (json.dumps(["kobo","cheery","sleepy","tutor","shouty","bored","record"]).encode()) #some random voices for them to enjoy
23242326

23252327
elif self.path.endswith(('/api/tags')): #ollama compatible
23262328
response_body = (json.dumps({"models":[{"name":"koboldcpp","model":friendlymodelname,"modified_at":"2024-07-19T15:26:55.6122841+08:00","size":394998579,"digest":"b5dc5e784f2a3ee1582373093acf69a2f4e2ac1710b253a001712b86a61f88bb","details":{"parent_model":"","format":"gguf","family":"koboldcpp","families":["koboldcpp"],"parameter_size":"128M","quantization_level":"Q4_0"}}]}).encode())

otherarch/tts_adapter.cpp

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -599,8 +599,32 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
599599
{
600600
printf("\nReuse speaker ID=%d (%d tokens)...", last_speaker_seed, last_speaker_codes.size());
601601
}
602-
} else if (speaker_seed==1){ //1 is a special seed
603-
std::string speaker = "but<|t_0.31|><|code_start|><|1023|><|1474|><|17|><|121|><|1362|><|744|><|438|><|1319|><|744|><|1419|><|1246|><|923|><|1338|><|406|><|939|><|975|><|1491|><|965|><|1212|><|248|><|794|><|464|><|830|><|code_end|>\nthat<|t_0.13|><|code_start|><|1578|><|1773|><|660|><|1074|><|221|><|1803|><|142|><|914|><|798|><|485|><|code_end|>\nis<|t_0.11|><|code_start|><|737|><|794|><|1288|><|182|><|895|><|1653|><|448|><|471|><|code_end|>\nwhat<|t_0.12|><|code_start|><|1734|><|1306|><|779|><|490|><|525|><|1028|><|37|><|1633|><|1353|><|code_end|>\nit<|t_0.09|><|code_start|><|1343|><|898|><|270|><|1035|><|94|><|1409|><|388|><|code_end|>\nis<|t_0.23|><|code_start|><|694|><|695|><|577|><|692|><|1047|><|388|><|28|><|905|><|1155|><|50|><|1629|><|1775|><|1711|><|1729|><|404|><|1027|><|344|><|code_end|>";
602+
} else if (speaker_seed>=1 && speaker_seed<=7){ //special seeds
603+
std::string speaker = "";
604+
switch(speaker_seed)
605+
{
606+
case 1:
607+
speaker = "but<|t_0.31|><|code_start|><|1023|><|1474|><|17|><|121|><|1362|><|744|><|438|><|1319|><|744|><|1419|><|1246|><|923|><|1338|><|406|><|939|><|975|><|1491|><|965|><|1212|><|248|><|794|><|464|><|830|><|code_end|>\nthat<|t_0.13|><|code_start|><|1578|><|1773|><|660|><|1074|><|221|><|1803|><|142|><|914|><|798|><|485|><|code_end|>\nis<|t_0.11|><|code_start|><|737|><|794|><|1288|><|182|><|895|><|1653|><|448|><|471|><|code_end|>\nwhat<|t_0.12|><|code_start|><|1734|><|1306|><|779|><|490|><|525|><|1028|><|37|><|1633|><|1353|><|code_end|>\nit<|t_0.09|><|code_start|><|1343|><|898|><|270|><|1035|><|94|><|1409|><|388|><|code_end|>\nis<|t_0.23|><|code_start|><|694|><|695|><|577|><|692|><|1047|><|388|><|28|><|905|><|1155|><|50|><|1629|><|1775|><|1711|><|1729|><|404|><|1027|><|344|><|code_end|>";
608+
break;
609+
case 2:
610+
speaker = "but<|t_0.23|><|code_start|><|762|><|612|><|316|><|1128|><|171|><|250|><|1765|><|60|><|1075|><|81|><|1159|><|140|><|81|><|1158|><|678|><|1639|><|970|><|code_end|>\nthat<|t_0.21|><|code_start|><|1254|><|460|><|378|><|1621|><|1477|><|210|><|270|><|571|><|179|><|324|><|408|><|81|><|642|><|408|><|794|><|1506|><|code_end|>\nis<|t_0.16|><|code_start|><|36|><|57|><|1132|><|881|><|844|><|260|><|79|><|1794|><|1195|><|333|><|1808|><|1375|><|code_end|>\nwhat<|t_0.23|><|code_start|><|485|><|1583|><|1091|><|736|><|668|><|1703|><|670|><|832|><|959|><|853|><|983|><|969|><|576|><|697|><|721|><|1032|><|990|><|code_end|>\nit<|t_0.16|><|code_start|><|772|><|741|><|794|><|1015|><|110|><|965|><|1060|><|62|><|1305|><|470|><|284|><|259|><|code_end|>\nis<|t_0.35|><|code_start|><|516|><|1099|><|405|><|1831|><|1051|><|1471|><|26|><|1207|><|809|><|0|><|1303|><|1329|><|1196|><|798|><|679|><|992|><|1358|><|930|><|1065|><|942|><|1573|><|823|><|823|><|1527|><|1617|><|865|><|code_end|>";
611+
break;
612+
case 3:
613+
speaker = "but<|t_0.32|><|code_start|><|862|><|899|><|1601|><|1749|><|121|><|1176|><|1601|><|1007|><|1722|><|121|><|1142|><|1465|><|696|><|1284|><|1698|><|1275|><|860|><|113|><|590|><|1356|><|577|><|1346|><|1433|><|1779|><|code_end|>\nthat<|t_0.40|><|code_start|><|1248|><|1181|><|1792|><|735|><|1289|><|1346|><|975|><|1751|><|1587|><|1042|><|221|><|29|><|991|><|797|><|1184|><|1171|><|152|><|352|><|1119|><|1282|><|110|><|73|><|524|><|1424|><|1276|><|996|><|777|><|1119|><|1166|><|859|><|code_end|>\nis<|t_0.61|><|code_start|><|1666|><|1819|><|566|><|1333|><|1658|><|981|><|1705|><|1185|><|939|><|1813|><|899|><|1465|><|1176|><|712|><|1390|><|1578|><|1275|><|92|><|1729|><|1200|><|1615|><|1484|><|1200|><|1574|><|1307|><|1221|><|1606|><|1307|><|428|><|1759|><|1127|><|1574|><|1581|><|127|><|1507|><|1060|><|1769|><|34|><|1583|><|1579|><|1828|><|1580|><|652|><|1688|><|1527|><|1547|><|code_end|>\nwhat<|t_0.93|><|code_start|><|1691|><|731|><|1592|><|1573|><|1547|><|1617|><|1528|><|1547|><|1664|><|867|><|1571|><|1637|><|273|><|1354|><|1573|><|34|><|1724|><|1669|><|1538|><|1293|><|1623|><|1536|><|1233|><|1176|><|1348|><|1011|><|1722|><|899|><|1176|><|1419|><|899|><|1763|><|1293|><|1601|><|1543|><|939|><|1543|><|1419|><|799|><|1722|><|1233|><|1011|><|1543|><|1007|><|1176|><|1628|><|1114|><|1763|><|862|><|957|><|1693|><|274|><|1176|><|1719|><|805|><|1706|><|1472|><|1249|><|1365|><|877|><|269|><|197|><|1068|><|969|><|1591|><|1192|><|996|><|1764|><|1455|><|1643|><|code_end|>\nit<|t_0.15|><|code_start|><|804|><|1141|><|1566|><|1013|><|529|><|1650|><|1149|><|1744|><|763|><|1640|><|1692|><|code_end|>\nis<|t_0.40|><|code_start|><|1218|><|774|><|1576|><|1192|><|286|><|1831|><|1407|><|92|><|803|><|1311|><|26|><|546|><|1124|><|978|><|319|><|1062|><|1675|><|1608|><|1158|><|1456|><|1572|><|1199|><|1603|><|1592|><|1664|><|1586|><|1571|><|1354|><|34|><|1627|><|code_end|>";
614+
break;
615+
case 4:
616+
speaker = "but<|t_0.24|><|code_start|><|710|><|505|><|555|><|1255|><|1474|><|1315|><|1740|><|530|><|1446|><|1651|><|991|><|186|><|1310|><|816|><|175|><|935|><|776|><|672|><|code_end|>\nthat<|t_0.40|><|code_start|><|1440|><|807|><|712|><|1525|><|177|><|584|><|1006|><|1288|><|1664|><|1732|><|951|><|79|><|797|><|790|><|172|><|1111|><|106|><|1222|><|186|><|186|><|1122|><|1153|><|81|><|1055|><|1355|><|1757|><|861|><|1067|><|971|><|563|><|code_end|>\nis<|t_0.36|><|code_start|><|915|><|396|><|869|><|1779|><|805|><|1489|><|1157|><|1142|><|1011|><|555|><|686|><|1578|><|1428|><|1624|><|1252|><|949|><|175|><|239|><|154|><|1280|><|716|><|1729|><|1445|><|1791|><|1679|><|1769|><|884|><|code_end|>\nwhat<|t_0.36|><|code_start|><|1710|><|1734|><|1364|><|1789|><|1805|><|1628|><|1025|><|859|><|1595|><|987|><|136|><|1584|><|635|><|1006|><|1789|><|552|><|871|><|1505|><|1206|><|474|><|705|><|803|><|1305|><|1595|><|627|><|1137|><|486|><|code_end|>\nit<|t_0.47|><|code_start|><|676|><|1746|><|1672|><|1465|><|1346|><|673|><|957|><|1293|><|1348|><|1628|><|710|><|1233|><|1628|><|727|><|1338|><|1536|><|673|><|686|><|1273|><|1114|><|1523|><|1338|><|1510|><|273|><|1487|><|1656|><|1573|><|1786|><|813|><|1284|><|1442|><|17|><|325|><|975|><|555|><|code_end|>\nis<|t_0.47|><|code_start|><|1747|><|1419|><|1465|><|1538|><|17|><|862|><|1419|><|986|><|1628|><|1157|><|933|><|1176|><|939|><|899|><|625|><|939|><|1085|><|101|><|1224|><|1744|><|1777|><|1462|><|176|><|1618|><|972|><|1623|><|1580|><|1252|><|1479|><|1702|><|1802|><|895|><|1673|><|1510|><|1513|><|code_end|>";
617+
break;
618+
case 5:
619+
speaker = "but<|t_0.20|><|code_start|><|686|><|1288|><|1251|><|1428|><|481|><|702|><|1812|><|829|><|81|><|756|><|76|><|104|><|952|><|1723|><|1632|><|code_end|>\nthat<|t_0.20|><|code_start|><|1006|><|1067|><|1614|><|1810|><|887|><|43|><|1192|><|106|><|400|><|43|><|730|><|660|><|186|><|87|><|467|><|code_end|>\nis<|t_0.27|><|code_start|><|648|><|1625|><|9|><|685|><|243|><|106|><|996|><|990|><|228|><|809|><|1009|><|2|><|806|><|1325|><|1332|><|1766|><|202|><|725|><|416|><|822|><|code_end|>\nwhat<|t_0.36|><|code_start|><|1287|><|328|><|1241|><|1661|><|1651|><|1708|><|1740|><|1685|><|1715|><|1787|><|1381|><|197|><|1769|><|525|><|1000|><|234|><|364|><|115|><|212|><|632|><|1153|><|228|><|73|><|1002|><|1800|><|1277|><|1117|><|code_end|>\nit<|t_0.40|><|code_start|><|1830|><|1199|><|1282|><|1163|><|1195|><|1752|><|1092|><|1481|><|1003|><|513|><|1639|><|1805|><|1485|><|1645|><|195|><|1464|><|181|><|195|><|123|><|87|><|433|><|878|><|170|><|1265|><|375|><|1708|><|1739|><|1519|><|1185|><|1099|><|code_end|>\nis<|t_0.76|><|code_start|><|1748|><|1422|><|276|><|1337|><|1322|><|1519|><|1779|><|1067|><|1724|><|891|><|1205|><|1419|><|1144|><|1667|><|591|><|1003|><|1543|><|566|><|1390|><|426|><|1824|><|182|><|1138|><|52|><|129|><|1056|><|155|><|1056|><|1298|><|919|><|155|><|125|><|500|><|1022|><|571|><|315|><|400|><|100|><|617|><|295|><|757|><|324|><|592|><|1298|><|1310|><|57|><|876|><|1175|><|1353|><|1770|><|1649|><|1828|><|1637|><|362|><|1744|><|884|><|1027|><|code_end|>";
620+
break;
621+
case 6:
622+
speaker = "but<|t_0.39|><|code_start|><|1338|><|1319|><|805|><|1176|><|799|><|591|><|325|><|1023|><|274|><|1348|><|1246|><|1176|><|591|><|555|><|758|><|591|><|438|><|710|><|727|><|1419|><|1157|><|1157|><|1293|><|633|><|1003|><|832|><|871|><|1399|><|1315|><|code_end|>\nthat<|t_0.20|><|code_start|><|1352|><|668|><|859|><|1793|><|1455|><|260|><|1117|><|260|><|186|><|1209|><|106|><|1098|><|260|><|1088|><|752|><|code_end|>\nis<|t_0.17|><|code_start|><|949|><|869|><|352|><|821|><|475|><|788|><|1150|><|1286|><|1079|><|1726|><|328|><|1624|><|1641|><|code_end|>\nwhat<|t_0.47|><|code_start|><|1175|><|1710|><|640|><|231|><|1781|><|884|><|1649|><|930|><|1270|><|1824|><|1383|><|1748|><|1011|><|1176|><|1023|><|986|><|1419|><|1425|><|686|><|899|><|627|><|1419|><|1023|><|799|><|1338|><|1163|><|1464|><|627|><|840|><|361|><|693|><|159|><|1041|><|562|><|1444|><|code_end|>\nit<|t_0.12|><|code_start|><|1078|><|685|><|982|><|277|><|1494|><|793|><|229|><|853|><|308|><|code_end|>\nis<|t_0.23|><|code_start|><|1291|><|1308|><|902|><|531|><|1022|><|231|><|992|><|1671|><|967|><|992|><|1646|><|1654|><|1791|><|701|><|1624|><|1565|><|1532|><|code_end|>";
623+
break;
624+
case 7:
625+
speaker = "but<|t_0.31|><|code_start|><|174|><|544|><|68|><|391|><|131|><|187|><|559|><|534|><|223|><|1185|><|612|><|301|><|387|><|94|><|1224|><|1159|><|162|><|236|><|1133|><|774|><|888|><|144|><|1038|><|code_end|>\nthat<|t_0.20|><|code_start|><|223|><|77|><|1517|><|446|><|1207|><|140|><|873|><|147|><|1051|><|210|><|1216|><|147|><|1148|><|678|><|501|><|code_end|>\nis<|t_0.13|><|code_start|><|912|><|822|><|622|><|519|><|1017|><|546|><|1740|><|1823|><|1561|><|273|><|code_end|>\nwhat<|t_0.16|><|code_start|><|1571|><|1597|><|486|><|1417|><|130|><|747|><|1088|><|1045|><|580|><|239|><|431|><|40|><|code_end|>\nit<|t_0.12|><|code_start|><|1736|><|878|><|1159|><|1004|><|1168|><|594|><|544|><|77|><|1032|><|code_end|>\nis<|t_0.28|><|code_start|><|1088|><|873|><|1726|><|1099|><|1095|><|1412|><|1106|><|1317|><|1292|><|149|><|1429|><|967|><|873|><|1754|><|229|><|1046|><|1595|><|1003|><|1603|><|1529|><|101|><|code_end|>";
626+
break;
627+
}
604628
last_speaker_codes = common_tokenize(model_ttc, speaker, false, true);
605629
last_speaker_seed = speaker_seed;
606630
if(!inputs.quiet && ttsdebugmode==1)
@@ -818,19 +842,22 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
818842
const float * embd = llama_get_embeddings(cts_ctx);
819843
std::vector<float> audio = embd_to_audio(embd, n_codes, n_embd, 4);
820844

821-
const int n_sr = 24000; // sampling rate
845+
const int n_sr = 24000; // original sampling rate
846+
const int t_sr = 16000; //final target sampling rate
822847

823-
// zero out first 0.2 seconds or 0.05 depending on whether its seeded
824-
const int cutout = (speaker_seed>0?(24000/5):(24000/20));
848+
// zero out the first 0.1 seconds when a speaker seed is set, or the first 0.05 seconds otherwise
849+
const int cutout = (speaker_seed>0?(n_sr/10):(n_sr/20));
825850
for (int i = 0; i < cutout; ++i) {
826851
audio[i] = 0.0f;
827852
}
828853
//add some silence at the end
829-
for (int i = 0; i < 24000/10; ++i) {
854+
for (int i = 0; i < n_sr/10; ++i) {
830855
audio.push_back(0.0f);
831856
}
832857

833-
last_generated_audio = save_wav16_base64(audio, n_sr);
858+
audio = resample_wav(audio,n_sr,t_sr); //resample to 16k
859+
860+
last_generated_audio = save_wav16_base64(audio, t_sr);
834861
ttstime = timer_check();
835862

836863
if(!inputs.quiet)

otherarch/utils.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,29 @@ std::string get_timestamp_str()
345345
return timestamp;
346346
}
347347

348+
// Resamples a buffer of float samples from input_rate to output_rate using
// simple linear interpolation between neighbouring source samples. No
// filtering is applied before interpolation.
//
// input       - source samples
// input_rate  - sample rate of `input`
// output_rate - sample rate of the returned buffer
//
// Returns a new buffer of floor(input.size() * output_rate / input_rate)
// samples. The result is empty when `input` is empty or output_rate is 0.
// An input_rate of 0 also yields an empty buffer: without this guard the
// ratio would be a division by zero and converting the resulting inf/NaN
// length to size_t is undefined behaviour.
std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate) {

    if (input_rate == 0) {
        return std::vector<float>();
    }

    const size_t input_size = input.size();

    const double ratio = static_cast<double>(output_rate) / input_rate;
    const size_t newLength = static_cast<size_t>(input_size * ratio);
    std::vector<float> output(newLength);

    // Perform simple linear interpolation resampling
    for (size_t i = 0; i < newLength; ++i) {
        const double srcIndex = i / ratio;
        const size_t srcIndexInt = static_cast<size_t>(srcIndex);
        if (srcIndexInt + 1 < input_size) {
            // Blend the two source samples that surround the fractional position.
            const double frac = srcIndex - srcIndexInt;
            output[i] = static_cast<float>(input[srcIndexInt] * (1 - frac) + input[srcIndexInt + 1] * frac);
        } else {
            // No right-hand neighbour: hold the last source sample. Indexing the
            // final element directly also keeps the read in bounds should
            // floating-point rounding ever push srcIndexInt past the end.
            output[i] = input[input_size - 1];
        }
    }

    return output;
}
370+
348371
//a very rudimentary all in one sampling function which has no dependencies
349372
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng)
350373
{

otherarch/utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_leng
6161
std::string kcpp_base64_encode(const std::string &data);
6262

6363
std::string get_timestamp_str();
64+
std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate);
65+
6466
int32_t kcpp_quick_sample(float * logits, const int n_logits, int top_k, float temp, std::mt19937 & rng);
6567

6668
struct kcpp_embd_batch { //duplcated from llava_embd_batch

otherarch/whispercpp/whisper_adapter.cpp

Lines changed: 5 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -41,35 +41,6 @@ static bool is_wav_buffer(const std::string buf) {
4141
return true;
4242
}
4343

44-
static std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate) {
45-
46-
size_t input_size = input.size();
47-
48-
double ratio = static_cast<double>(output_rate) / input_rate;
49-
size_t newLength = static_cast<size_t>(input.size() * ratio);
50-
std::vector<float> output(newLength);
51-
52-
if(whisperdebugmode==1)
53-
{
54-
printf("\nResample wav from %" PRIu32 " to %" PRIu32 " (in size: %zu, out size: %zu)",
55-
input_rate, output_rate, input_size, static_cast<std::size_t>(output.size()));
56-
}
57-
58-
// Perform simple linear interpolation resampling
59-
for (size_t i = 0; i < newLength; ++i) {
60-
double srcIndex = i / ratio;
61-
size_t srcIndexInt = static_cast<size_t>(srcIndex);
62-
double frac = srcIndex - srcIndexInt;
63-
if (srcIndexInt + 1 < input_size) {
64-
output[i] = static_cast<float>(input[srcIndexInt] * (1 - frac) + input[srcIndexInt + 1] * frac);
65-
} else {
66-
output[i] = input[srcIndexInt];
67-
}
68-
}
69-
70-
return output;
71-
}
72-
7344
static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo)
7445
{
7546
drwav wav;
@@ -119,6 +90,11 @@ static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, st
11990
}
12091

12192
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
93+
if(whisperdebugmode==1)
94+
{
95+
printf("\nResample wav from %" PRIu32 " to %" PRIu32 " (in size: %zu)",
96+
wav.sampleRate, COMMON_SAMPLE_RATE, raw_pcm.size());
97+
}
12298
raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE);
12399
}
124100

0 commit comments

Comments
 (0)