Skip to content

Commit 4cf297b

Browse files
authored
Add JavaScript API for SenseVoice (k2-fsa#1157)
1 parent 5f2126c commit 4cf297b

File tree

6 files changed

+210
-1
lines changed

6 files changed

+210
-1
lines changed

.github/scripts/test-nodejs-addon-npm.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@ if [[ $arch != "ia32" && $platform != "win32" ]]; then
2020
node ./test_asr_non_streaming_nemo_ctc.js
2121
rm -rf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k
2222

23+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
24+
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
25+
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
26+
27+
node ./test_asr_non_streaming_sense_voice.js
28+
rm -rf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
29+
2330
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
2431
tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
2532
rm sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2

nodejs-addon-examples/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ The following tables list the examples in this folder.
9595
|[./test_asr_non_streaming_whisper.js](./test_asr_non_streaming_whisper.js)| Non-streaming speech recognition from a file using [Whisper](https://github.com/openai/whisper)|
9696
|[./test_asr_non_streaming_nemo_ctc.js](./test_asr_non_streaming_nemo_ctc.js)|Non-streaming speech recognition from a file using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
9797
|[./test_asr_non_streaming_paraformer.js](./test_asr_non_streaming_paraformer.js)|Non-streaming speech recognition from a file using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
98+
|[./test_asr_non_streaming_sense_voice.js](./test_asr_non_streaming_sense_voice.js)|Non-streaming speech recognition from a file using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
9899

99100
## Non-Streaming speech-to-text from a microphone with VAD
100101

@@ -104,6 +105,7 @@ The following tables list the examples in this folder.
104105
|[./test_vad_asr_non_streaming_whisper_microphone.js](./test_vad_asr_non_streaming_whisper_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Whisper](https://github.com/openai/whisper)|
105106
|[./test_vad_asr_non_streaming_nemo_ctc_microphone.js](./test_vad_asr_non_streaming_nemo_ctc_microphone.js)|VAD + Non-streaming speech recognition from a microphone using a [NeMo](https://github.com/NVIDIA/NeMo) CTC model with greedy search|
106107
|[./test_vad_asr_non_streaming_paraformer_microphone.js](./test_vad_asr_non_streaming_paraformer_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [Paraformer](https://github.com/alibaba-damo-academy/FunASR)|
108+
|[./test_vad_asr_non_streaming_sense_voice_microphone.js](./test_vad_asr_non_streaming_sense_voice_microphone.js)|VAD + Non-streaming speech recognition from a microphone using [SenseVoice](https://github.com/FunAudioLLM/SenseVoice)|
107109

108110
## Text-to-speech
109111

@@ -252,6 +254,20 @@ npm install naudiodon2
252254
node ./test_vad_asr_non_streaming_paraformer_microphone.js
253255
```
254256

257+
### Non-streaming speech recognition with SenseVoice
258+
259+
```bash
260+
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
261+
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
262+
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
263+
264+
node ./test_asr_non_streaming_sense_voice.js
265+
266+
# To run VAD + non-streaming ASR with SenseVoice using a microphone
267+
npm install naudiodon2
268+
node ./test_vad_asr_non_streaming_sense_voice_microphone.js
269+
```
270+
255271
### Text-to-speech with piper VITS models (TTS)
256272

257273
```bash

nodejs-addon-examples/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"dependencies": {
3-
"sherpa-onnx-node": "^1.0.30"
3+
"sherpa-onnx-node": "^1.10.17"
44
}
55
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Copyright (c) 2024 Xiaomi Corporation

// Non-streaming speech recognition from a wave file using a SenseVoice model.
//
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
const sherpa_onnx = require('sherpa-onnx-node');

const config = {
  'featConfig': {
    'sampleRate': 16000,
    'featureDim': 80,
  },
  'modelConfig': {
    'senseVoice': {
      'model':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
      // 1 enables inverse text normalization in the recognition output.
      'useInverseTextNormalization': 1,
    },
    'tokens': './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
    'numThreads': 2,
    'provider': 'cpu',
    'debug': 1,
  }
};

const waveFilename =
    './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav';

const recognizer = new sherpa_onnx.OfflineRecognizer(config);
console.log('Started');

const start = Date.now();
const stream = recognizer.createStream();
const wave = sherpa_onnx.readWave(waveFilename);
stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples});

recognizer.decode(stream);
// Fix: declare with `const` — the original assignment created an implicit
// global `result`.
const result = recognizer.getResult(stream);
const stop = Date.now();
console.log('Done');

const elapsed_seconds = (stop - start) / 1000;
const duration = wave.samples.length / wave.sampleRate;
const real_time_factor = elapsed_seconds / duration;
// Fix: 'secodns' -> 'seconds' typo in the two log messages below.
console.log('Wave duration', duration.toFixed(3), 'seconds');
console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds');
console.log(
    `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`,
    real_time_factor.toFixed(3));
console.log(waveFilename);
console.log('result\n', result);
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang)
//
// VAD + non-streaming speech recognition from a microphone using a SenseVoice
// model. Each detected speech segment is recognized, printed, and saved to a
// wave file.
const portAudio = require('naudiodon2');
// console.log(portAudio.getDevices());

const sherpa_onnx = require('sherpa-onnx-node');

// Creates an offline (non-streaming) recognizer using a SenseVoice model.
//
// Please download test files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
function createRecognizer() {
  const config = {
    'featConfig': {
      'sampleRate': 16000,
      'featureDim': 80,
    },
    'modelConfig': {
      'senseVoice': {
        'model':
            './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx',
        // 1 enables inverse text normalization in the recognition output.
        'useInverseTextNormalization': 1,
      },
      'tokens':
          './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt',
      'numThreads': 2,
      'provider': 'cpu',
      'debug': 1,
    }
  };

  return new sherpa_onnx.OfflineRecognizer(config);
}

// Creates a silero-vad based voice activity detector.
//
// please download silero_vad.onnx from
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
function createVad() {
  const config = {
    sileroVad: {
      model: './silero_vad.onnx',
      threshold: 0.5,
      minSpeechDuration: 0.25,
      minSilenceDuration: 0.5,
      windowSize: 512,
    },
    sampleRate: 16000,
    debug: true,
    numThreads: 1,
  };

  const bufferSizeInSeconds = 60;

  return new sherpa_onnx.Vad(config, bufferSizeInSeconds);
}

const recognizer = createRecognizer();
const vad = createVad();

// Circular buffer holding raw microphone samples until the VAD consumes them
// in windowSize-sized chunks.
const bufferSizeInSeconds = 30;
const buffer =
    new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate);

const ai = new portAudio.AudioIO({
  inOptions: {
    channelCount: 1,
    closeOnError: true,  // Close the stream if an audio error is detected, if
                         // set false then just log the error
    deviceId: -1,  // Use -1 or omit the deviceId to select the default device
    sampleFormat: portAudio.SampleFormatFloat32,
    sampleRate: vad.config.sampleRate
  }
});

// Fix: removed unused `let printed = false;` left over from another example.
let index = 0;  // sequence number used in log lines and output filenames
ai.on('data', data => {
  const windowSize = vad.config.sileroVad.windowSize;
  buffer.push(new Float32Array(data.buffer));
  // Feed the VAD fixed-size windows from the circular buffer.
  while (buffer.size() > windowSize) {
    const samples = buffer.get(buffer.head(), windowSize);
    buffer.pop(windowSize);
    vad.acceptWaveform(samples);
  }

  // Recognize every speech segment the VAD has finished detecting.
  while (!vad.isEmpty()) {
    const segment = vad.front();
    vad.pop();
    const stream = recognizer.createStream();
    stream.acceptWaveform({
      samples: segment.samples,
      sampleRate: recognizer.config.featConfig.sampleRate
    });
    recognizer.decode(stream);
    const r = recognizer.getResult(stream);
    if (r.text.length > 0) {
      const text = r.text.toLowerCase().trim();
      console.log(`${index}: ${text}`);

      const filename = `${index}-${text}-${
          new Date()
              .toLocaleTimeString('en-US', {hour12: false})
              .split(' ')[0]}.wav`;
      sherpa_onnx.writeWave(
          filename,
          {samples: segment.samples, sampleRate: vad.config.sampleRate});

      index += 1;
    }
  }
});

ai.start();
console.log('Started! Please speak');

scripts/node-addon-api/src/non-streaming-asr.cc

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,24 @@ static SherpaOnnxOfflineTdnnModelConfig GetOfflineTdnnModelConfig(
9696
return c;
9797
}
9898

99+
// Builds a SenseVoice model config from the optional "senseVoice" sub-object
// of the JS model config. Returns a zeroed struct when the key is absent or
// not an object.
static SherpaOnnxOfflineSenseVoiceModelConfig GetOfflineSenseVoiceModelConfig(
    Napi::Object obj) {
  SherpaOnnxOfflineSenseVoiceModelConfig c;
  memset(&c, 0, sizeof(c));

  if (obj.Has("senseVoice") && obj.Get("senseVoice").IsObject()) {
    // NOTE: the SHERPA_ONNX_ASSIGN_ATTR_* macros implicitly read from `o`
    // and write into `c`, so both names must stay as-is.
    Napi::Object o = obj.Get("senseVoice").As<Napi::Object>();

    SHERPA_ONNX_ASSIGN_ATTR_STR(model, model);
    SHERPA_ONNX_ASSIGN_ATTR_STR(language, language);
    SHERPA_ONNX_ASSIGN_ATTR_INT32(use_itn, useInverseTextNormalization);
  }

  return c;
}
116+
99117
static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
100118
SherpaOnnxOfflineModelConfig c;
101119
memset(&c, 0, sizeof(c));
@@ -111,6 +129,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) {
111129
c.nemo_ctc = GetOfflineNeMoCtcModelConfig(o);
112130
c.whisper = GetOfflineWhisperModelConfig(o);
113131
c.tdnn = GetOfflineTdnnModelConfig(o);
132+
c.sense_voice = GetOfflineSenseVoiceModelConfig(o);
114133

115134
SHERPA_ONNX_ASSIGN_ATTR_STR(tokens, tokens);
116135
SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads);
@@ -225,6 +244,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) {
225244
delete[] c.model_config.tdnn.model;
226245
}
227246

247+
if (c.model_config.sense_voice.model) {
248+
delete[] c.model_config.sense_voice.model;
249+
}
250+
251+
if (c.model_config.sense_voice.language) {
252+
delete[] c.model_config.sense_voice.language;
253+
}
254+
228255
if (c.model_config.tokens) {
229256
delete[] c.model_config.tokens;
230257
}

0 commit comments

Comments
 (0)