Skip to content

Commit 3a1de0b

Browse files
authored
Add JavaScript (WebAssembly) API for Kokoro TTS models. (#1726)
1 parent e8d499d commit 3a1de0b

File tree

5 files changed

+154
-6
lines changed

5 files changed

+154
-6
lines changed

.github/scripts/test-nodejs-npm.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,15 @@ ls -lh
1010
ls -lh node_modules
1111

1212
# offline tts
13-
#
13+
14+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
15+
tar xf kokoro-en-v0_19.tar.bz2
16+
rm kokoro-en-v0_19.tar.bz2
17+
18+
node ./test-offline-tts-kokoro-en.js
19+
20+
ls -lh
21+
1422
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
1523
tar xvf matcha-icefall-zh-baker.tar.bz2
1624
rm matcha-icefall-zh-baker.tar.bz2

nodejs-examples/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js
4242

4343
In the following, we demonstrate how to run text-to-speech.
4444

45+
## ./test-offline-tts-kokoro-en.js
46+
47+
[./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use
48+
[kokoro-en-v0_19](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2)
49+
for text-to-speech.
50+
51+
You can use the following command to run it:
52+
53+
```bash
54+
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
55+
tar xf kokoro-en-v0_19.tar.bz2
56+
rm kokoro-en-v0_19.tar.bz2
57+
58+
node ./test-offline-tts-kokoro-en.js
59+
```
60+
4561
## ./test-offline-tts-matcha-zh.js
4662

4763
[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
2+
3+
const sherpa_onnx = require('sherpa-onnx');
4+
5+
// Builds the offline TTS engine for this example from the files of the
// extracted ./kokoro-en-v0_19 model directory.
function createOfflineTts() {
  const kokoroDir = './kokoro-en-v0_19';

  // Only the Kokoro section of the model config is supplied here.
  return sherpa_onnx.createOfflineTts({
    offlineTtsModelConfig: {
      offlineTtsKokoroModelConfig: {
        model: kokoroDir + '/model.onnx',
        voices: kokoroDir + '/voices.bin',
        tokens: kokoroDir + '/tokens.txt',
        dataDir: kokoroDir + '/espeak-ng-data',
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  });
}
27+
28+
// Synthesize one English passage with speaker 0 at normal speed and write
// the result to ./test-kokoro-en.wav.
const engine = createOfflineTts();

const request = {
  text:
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.',
  sid: 0,
  speed: 1.0,
};

const waveform = engine.generate(request);
engine.save('./test-kokoro-en.wav', waveform);
console.log('Saved to test-kokoro-en.wav successfully.');

// Done with the engine; hand it back explicitly.
engine.free();

wasm/tts/sherpa-onnx-tts.js

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,12 @@ function freeConfig(config, Module) {
88
freeConfig(config.config, Module)
99
}
1010

11-
if ('config2' in config) {
12-
freeConfig(config.config2, Module)
11+
if ('matcha' in config) {
12+
freeConfig(config.matcha, Module)
13+
}
14+
15+
if ('kokoro' in config) {
16+
freeConfig(config.kokoro, Module)
1317
}
1418

1519
Module._free(config.ptr);
@@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
132136
}
133137
}
134138

139+
/**
 * Writes a Kokoro TTS model config into the WebAssembly heap.
 *
 * Two allocations are made with `Module._malloc`:
 *   - `buffer`: the four NUL-terminated UTF-8 strings (model, voices, tokens,
 *     dataDir) stored back to back;
 *   - `ptr`: a 5 * 4 byte record holding four pointers into `buffer` at
 *     offsets 0, 4, 8 and 12, followed by `lengthScale` as a float at
 *     offset 16.
 *
 * Missing or empty string fields are written as empty strings. A missing or
 * zero `lengthScale` is written as 1.0.
 *
 * @param {Object} config - Fields read: `model`, `voices`, `tokens`,
 *     `dataDir` (strings) and `lengthScale` (number).
 * @param {Object} Module - Emscripten module; uses `lengthBytesUTF8`,
 *     `stringToUTF8`, `setValue` and `_malloc`.
 * @returns {{buffer: number, ptr: number, len: number}} Heap addresses of the
 *     string storage and the record, plus the record size in bytes. Both
 *     allocations are returned so the caller can release them later.
 */
function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
  // Normalise once so the sizes computed below always match what is written.
  // Previously model/voices were measured without the '' fallback, so an
  // absent field reached lengthBytesUTF8 as undefined.
  const fields = [
    config.model || '',
    config.voices || '',
    config.tokens || '',
    config.dataDir || '',
  ];

  // +1 reserves room for each string's terminating NUL byte.
  const sizes = fields.map((s) => Module.lengthBytesUTF8(s) + 1);
  const n = sizes.reduce((total, size) => total + size, 0);

  const buffer = Module._malloc(n);

  const len = 5 * 4;
  const ptr = Module._malloc(len);

  let offset = 0;
  for (let i = 0; i < fields.length; ++i) {
    // Copy the string, then store its address in the i-th pointer slot.
    Module.stringToUTF8(fields[i], buffer + offset, sizes[i]);
    Module.setValue(ptr + 4 * i, buffer + offset, 'i8*');
    offset += sizes[i];
  }

  Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
184+
135185
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
136186
if (!('offlineTtsVitsModelConfig' in config)) {
137187
config.offlineTtsVitsModelConfig = {
@@ -159,14 +209,29 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
159209
};
160210
}
161211

212+
if (!('offlineTtsKokoroModelConfig' in config)) {
213+
config.offlineTtsKokoroModelConfig = {
214+
model: '',
215+
voices: '',
216+
tokens: '',
217+
lengthScale: 1.0,
218+
dataDir: '',
219+
};
220+
}
221+
162222

163223
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
164224
config.offlineTtsVitsModelConfig, Module);
165225

166226
const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
167227
config.offlineTtsMatchaModelConfig, Module);
168228

169-
const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4;
229+
const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig(
230+
config.offlineTtsKokoroModelConfig, Module);
231+
232+
const len = vitsModelConfig.len + matchaModelConfig.len +
233+
kokoroModelConfig.len + 3 * 4;
234+
170235
const ptr = Module._malloc(len);
171236

172237
let offset = 0;
@@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
188253
Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
189254
offset += matchaModelConfig.len;
190255

256+
Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset);
257+
offset += kokoroModelConfig.len;
258+
191259
return {
192260
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
193-
config2: matchaModelConfig
261+
matcha: matchaModelConfig, kokoro: kokoroModelConfig,
194262
}
195263
}
196264

@@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) {
308376
lengthScale: 1.0,
309377
};
310378

379+
const offlineTtsKokoroModelConfig = {
380+
model: '',
381+
voices: '',
382+
tokens: '',
383+
dataDir: '',
384+
lengthScale: 1.0,
385+
};
386+
311387
const offlineTtsModelConfig = {
312388
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
313389
offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
390+
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
314391
numThreads: 1,
315392
debug: 1,
316393
provider: 'cpu',

wasm/tts/sherpa-onnx-wasm-main-tts.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ extern "C" {
1515

1616
static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
1717
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
18+
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 5 * 4, "");
1819
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
1920
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
20-
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4,
21+
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
22+
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
2123
"");
2224
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
2325
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
@@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
2729
auto tts_model_config = &tts_config->model;
2830
auto vits_model_config = &tts_model_config->vits;
2931
auto matcha_model_config = &tts_model_config->matcha;
32+
auto kokoro = &tts_model_config->kokoro;
3033
fprintf(stdout, "----------vits model config----------\n");
3134
fprintf(stdout, "model: %s\n", vits_model_config->model);
3235
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
@@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
4750
fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
4851
fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);
4952

53+
fprintf(stdout, "----------kokoro model config----------\n");
54+
fprintf(stdout, "model: %s\n", kokoro->model);
55+
fprintf(stdout, "voices: %s\n", kokoro->voices);
56+
fprintf(stdout, "tokens: %s\n", kokoro->tokens);
57+
fprintf(stdout, "data_dir: %s\n", kokoro->data_dir);
58+
fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale);
59+
5060
fprintf(stdout, "----------tts model config----------\n");
5161
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
5262
fprintf(stdout, "debug: %d\n", tts_model_config->debug);

0 commit comments

Comments
 (0)