Skip to content

Commit 639ad17

Browse files
authored
Add Javascript (WebAssembly) API for Dolphin CTC models (#2093)
1 parent 74f402e commit 639ad17

File tree

9 files changed

+172
-50
lines changed

9 files changed

+172
-50
lines changed

.github/scripts/test-nodejs-npm.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,13 @@ git status
99
ls -lh
1010
ls -lh node_modules
1111

12+
# asr with offline dolphin ctc
13+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
14+
tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
15+
rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
16+
node ./test-offline-dolphin-ctc.js
17+
rm -rf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02
18+
1219
# speech enhancement
1320
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/gtcrn_simple.onnx
1421
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speech-enhancement-models/inp_16k.wav
@@ -56,7 +63,7 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/m
5663
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
5764
rm matcha-icefall-en_US-ljspeech.tar.bz2
5865

59-
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
66+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/vocos-22khz-univ.onnx
6067

6168
node ./test-offline-tts-matcha-en.js
6269

.github/workflows/wasm-simd-hf-space-vad-asr.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ jobs:
2121
fail-fast: false
2222
matrix:
2323
os: [ubuntu-latest]
24-
total: ["8"]
25-
index: ["0", "1", "2", "3", "4", "5", "6", "7"]
24+
total: ["11"]
25+
index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
2626

2727
steps:
2828
- uses: actions/checkout@v4

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ We also have spaces built using WebAssembly. They are listed below:
119119
|VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]|
120120
|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]|
121121
|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]|
122+
|VAD + speech recognition (多语种及多种中文方言) with [Dolphin][Dolphin]-base |[Click me][wasm-hf-vad-asr-multi-lang-dolphin-base]| [地址][wasm-ms-vad-asr-multi-lang-dolphin-base]|
122123
|Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]|
123124
|Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]|
124125
|Speaker diarization |[Click me][wasm-hf-speaker-diarization]|[地址][wasm-ms-speaker-diarization]|
@@ -390,6 +391,10 @@ It uses TTS from sherpa-onnx. See also [✨ Speak command that uses the new glob
390391
[wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer
391392
[wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
392393
[wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small
394+
[Dolphin]: https://github.com/DataoceanAI/Dolphin
395+
[wasm-ms-vad-asr-multi-lang-dolphin-base]: https://modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
396+
[wasm-hf-vad-asr-multi-lang-dolphin-base]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc
397+
393398
[wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en
394399
[wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en
395400
[wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de

nodejs-examples/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,20 @@ node ./test-offline-tts-vits-zh.js
140140
In the following, we demonstrate how to decode files and how to perform
141141
speech recognition with a microphone with `nodejs`.
142142

143+
## ./test-offline-dolphin-ctc.js
144+
145+
[./test-offline-dolphin-ctc.js](./test-offline-dolphin-ctc.js) demonstrates
146+
how to decode a file with a [Dolphin](https://github.com/DataoceanAI/Dolphin) CTC model.
147+
148+
You can use the following commands to run it:
149+
150+
```bash
151+
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
152+
tar xvf sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
153+
rm sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02.tar.bz2
154+
node ./test-offline-dolphin-ctc.js
155+
```
156+
143157
## ./test-offline-nemo-ctc.js
144158

145159
[./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
2+
//
3+
const fs = require('fs');
4+
const {Readable} = require('stream');
5+
const wav = require('wav');
6+
7+
const sherpa_onnx = require('sherpa-onnx');
8+
9+
// Creates an offline recognizer that uses the Dolphin CTC int8 model.
//
// The model file and tokens.txt are looked up in the directory
// ./sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02, so the model
// archive must have been extracted there before this function is called.
//
// Returns whatever sherpa_onnx.createOfflineRecognizer() returns for the
// config built below.
function createOfflineRecognizer() {
  // Single place to change when switching to another Dolphin model directory.
  const modelDir = './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02';

  const config = {
    modelConfig: {
      dolphin: {
        model: `${modelDir}/model.int8.onnx`,
      },
      tokens: `${modelDir}/tokens.txt`,
    },
  };

  return sherpa_onnx.createOfflineRecognizer(config);
}
23+
24+
const recognizer = createOfflineRecognizer();
const stream = recognizer.createStream();

// Use try/finally so that stream.free() and recognizer.free() still run when
// reading the wave file or decoding it throws.
try {
  const waveFilename =
      './sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02/test_wavs/0.wav';
  const wave = sherpa_onnx.readWave(waveFilename);
  stream.acceptWaveform(wave.sampleRate, wave.samples);

  recognizer.decode(stream);
  const text = recognizer.getResult(stream).text;
  console.log(text);
} finally {
  stream.free();
  recognizer.free();
}

scripts/wasm/generate-vad-asr.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,21 @@ def get_models():
197197
git diff
198198
""",
199199
),
200+
Model(
201+
model_name="sherpa-onnx-dolphin-base-ctc-multi-lang-int8-2025-04-02",
202+
hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc",
203+
ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-multi-lang-dophin-ctc",
204+
short_name="vad-asr-multi_lang-dolphin_ctc",
205+
cmd="""
206+
pushd $model_name
207+
mv model.int8.onnx ../dolphin.onnx
208+
mv tokens.txt ../
209+
popd
210+
rm -rf $model_name
211+
sed -i.bak 's%Zipformer%<a href="https://github.com/DataoceanAI/Dolphin">Dolphin</a> (多种中文方言及非常多种语言)%g' ../index.html
212+
git diff
213+
""",
214+
),
200215
]
201216
return models
202217

wasm/asr/sherpa-onnx-asr.js

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ function freeConfig(config, Module) {
3939
freeConfig(config.fireRedAsr, Module)
4040
}
4141

42+
if ('dolphin' in config) {
43+
freeConfig(config.dolphin, Module)
44+
}
45+
4246
if ('moonshine' in config) {
4347
freeConfig(config.moonshine, Module)
4448
}
@@ -562,6 +566,23 @@ function initSherpaOnnxOfflineNemoEncDecCtcModelConfig(config, Module) {
562566
}
563567
}
564568

569+
// Copies the Dolphin model config into memory allocated with Module._malloc.
//
// The resulting struct consists of a single 4-byte pointer that refers to a
// UTF-8 copy of config.model. A missing or empty config.model is stored as
// the empty string.
//
// Args:
//   config: object with an optional string field `model`.
//   Module: object providing lengthBytesUTF8, _malloc, stringToUTF8 and
//           setValue.
//
// Returns an object with
//   buffer: address of the copied model string,
//   ptr:    address of the struct holding the pointer to `buffer`,
//   len:    size of the struct in bytes (4).
// Both `buffer` and `ptr` come from Module._malloc; this function does not
// release them.
function initSherpaOnnxOfflineDolphinModelConfig(config, Module) {
  const model = config.model || '';

  // One extra byte on top of the UTF-8 length of the string.
  const n = Module.lengthBytesUTF8(model) + 1;
  const buffer = Module._malloc(n);
  Module.stringToUTF8(model, buffer, n);

  const len = 1 * 4;  // 1 pointer
  const ptr = Module._malloc(len);
  Module.setValue(ptr, buffer, 'i8*');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
585+
565586
function initSherpaOnnxOfflineWhisperModelConfig(config, Module) {
566587
const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1;
567588
const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1;
@@ -769,6 +790,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
769790
};
770791
}
771792

793+
if (!('dolphin' in config)) {
794+
config.dolphin = {
795+
model: '',
796+
};
797+
}
798+
772799
if (!('whisper' in config)) {
773800
config.whisper = {
774801
encoder: '',
@@ -832,8 +859,12 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
832859
const fireRedAsr =
833860
initSherpaOnnxOfflineFireRedAsrModelConfig(config.fireRedAsr, Module);
834861

862+
const dolphin =
863+
initSherpaOnnxOfflineDolphinModelConfig(config.dolphin, Module);
864+
835865
const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len +
836-
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len;
866+
tdnn.len + 8 * 4 + senseVoice.len + moonshine.len + fireRedAsr.len +
867+
dolphin.len;
837868

838869
const ptr = Module._malloc(len);
839870

@@ -932,10 +963,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) {
932963
Module._CopyHeap(fireRedAsr.ptr, fireRedAsr.len, ptr + offset);
933964
offset += fireRedAsr.len;
934965

966+
Module._CopyHeap(dolphin.ptr, dolphin.len, ptr + offset);
967+
offset += dolphin.len;
968+
935969
return {
936970
buffer: buffer, ptr: ptr, len: len, transducer: transducer,
937971
paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn,
938-
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr
972+
senseVoice: senseVoice, moonshine: moonshine, fireRedAsr: fireRedAsr,
973+
dolphin: dolphin
939974
}
940975
}
941976

wasm/nodejs/sherpa-onnx-wasm-nodejs.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ extern "C" {
1313
static_assert(sizeof(SherpaOnnxOfflineTransducerModelConfig) == 3 * 4, "");
1414
static_assert(sizeof(SherpaOnnxOfflineParaformerModelConfig) == 4, "");
1515

16+
static_assert(sizeof(SherpaOnnxOfflineDolphinModelConfig) == 4, "");
1617
static_assert(sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) == 4, "");
1718
static_assert(sizeof(SherpaOnnxOfflineWhisperModelConfig) == 5 * 4, "");
1819
static_assert(sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) == 2 * 4, "");
@@ -29,7 +30,8 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) ==
2930
sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4 +
3031
sizeof(SherpaOnnxOfflineSenseVoiceModelConfig) +
3132
sizeof(SherpaOnnxOfflineMoonshineModelConfig) +
32-
sizeof(SherpaOnnxOfflineFireRedAsrModelConfig),
33+
sizeof(SherpaOnnxOfflineFireRedAsrModelConfig) +
34+
sizeof(SherpaOnnxOfflineDolphinModelConfig),
3335

3436
"");
3537
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, "");
@@ -73,6 +75,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
7375
auto sense_voice = &model_config->sense_voice;
7476
auto moonshine = &model_config->moonshine;
7577
auto fire_red_asr = &model_config->fire_red_asr;
78+
auto dolphin = &model_config->dolphin;
7679

7780
fprintf(stdout, "----------offline transducer model config----------\n");
7881
fprintf(stdout, "encoder: %s\n", transducer->encoder);
@@ -110,6 +113,9 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) {
110113
fprintf(stdout, "encoder: %s\n", fire_red_asr->encoder);
111114
fprintf(stdout, "decoder: %s\n", fire_red_asr->decoder);
112115

116+
fprintf(stdout, "----------offline Dolphin model config----------\n");
117+
fprintf(stdout, "model: %s\n", dolphin->model);
118+
113119
fprintf(stdout, "tokens: %s\n", model_config->tokens);
114120
fprintf(stdout, "num_threads: %d\n", model_config->num_threads);
115121
fprintf(stdout, "provider: %s\n", model_config->provider);

0 commit comments

Comments
 (0)