Skip to content

Commit 7f6342a

Browse files
authored
Add C++ runtime for SenseVoice models (k2-fsa#1148)
1 parent 1436e68 commit 7f6342a

34 files changed, with 1160 additions and 39 deletions.

.github/scripts/test-offline-ctc.sh

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,30 @@ echo "PATH: $PATH"
1515

1616
which $EXE
1717

18-
if false; then
18+
log "------------------------------------------------------------"
19+
log "Run SenseVoice models"
20+
log "------------------------------------------------------------"
21+
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
22+
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
23+
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
24+
repo=sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
25+
26+
for m in model.onnx model.int8.onnx; do
27+
for w in zh en yue ja ko; do
28+
for use_itn in 0 1; do
29+
echo "$m $w $use_itn"
30+
time $EXE \
31+
--tokens=$repo/tokens.txt \
32+
--sense-voice-model=$repo/$m \
33+
--sense-voice-use-itn=$use_itn \
34+
$repo/test_wavs/$w.wav
35+
done
36+
done
37+
done
38+
39+
rm -rf $repo
40+
41+
if true; then
1942
# It has problems with onnxruntime 1.18
2043
log "------------------------------------------------------------"
2144
log "Run Wenet models"

.github/scripts/test-python.sh

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,18 @@ log() {
1010

1111
export GIT_CLONE_PROTECTION_ACTIVE=false
1212

13+
log "test offline SenseVoice CTC"
14+
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
15+
name=$(basename $url)
16+
repo=$(basename -s .tar.bz2 $name)
17+
18+
curl -SL -O $url
19+
tar xvf $name
20+
rm $name
21+
ls -lh $repo
22+
python3 ./python-api-examples/offline-sense-voice-ctc-decode-files.py
23+
rm -rf $repo
24+
1325
log "test offline TeleSpeech CTC"
1426
url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2
1527
name=$(basename $url)

.github/workflows/export-sense-voice-to-onnx.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ jobs:
7373
echo "pwd: $PWD"
7474
ls -lh ../scripts/sense-voice
7575
76-
rm -rf ./
76+
rm -rf ./*
7777
7878
cp -v ../scripts/sense-voice/*.onnx .
7979
cp -v ../scripts/sense-voice/tokens.txt .

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -111,3 +111,4 @@ sherpa-onnx-telespeech-ctc-*
111111
*.fst
112112
.ccache
113113
lib*.a
114+
sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,7 @@
1+
## 1.10.17
2+
3+
* Support SenseVoice CTC models.
4+
15
## 1.10.16
26

37
* Support zh-en TTS model from MeloTTS.

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ project(sherpa-onnx)
1111
# ./nodejs-addon-examples
1212
# ./dart-api-examples/
1313
# ./CHANGELOG.md
14-
set(SHERPA_ONNX_VERSION "1.10.16")
14+
set(SHERPA_ONNX_VERSION "1.10.17")
1515

1616
# Disable warning about
1717
#
python-api-examples/offline-sense-voice-ctc-decode-files.py

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,67 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
This file shows how to use a non-streaming SenseVoice CTC model from
5+
https://github.com/FunAudioLLM/SenseVoice
6+
to decode files.
7+
8+
Please download model files from
9+
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
10+
11+
For instance,
12+
13+
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
14+
tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
15+
rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2
16+
"""
17+
18+
from pathlib import Path
19+
20+
import sherpa_onnx
21+
import soundfile as sf
22+
23+
24+
def create_recognizer():
25+
model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx"
26+
tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt"
27+
test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav"
28+
# test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/en.wav"
29+
# test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/ja.wav"
30+
# test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/ko.wav"
31+
# test_wav = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/yue.wav"
32+
33+
if not Path(model).is_file() or not Path(test_wav).is_file():
34+
raise ValueError(
35+
"""Please download model files from
36+
https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
37+
"""
38+
)
39+
return (
40+
sherpa_onnx.OfflineRecognizer.from_sense_voice(
41+
model=model,
42+
tokens=tokens,
43+
use_itn=True,
44+
debug=True,
45+
),
46+
test_wav,
47+
)
48+
49+
50+
def main():
51+
recognizer, wave_filename = create_recognizer()
52+
53+
audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True)
54+
audio = audio[:, 0] # only use the first channel
55+
56+
# audio is a 1-D float32 numpy array normalized to the range [-1, 1]
57+
# sample_rate does not need to be 16000 Hz
58+
59+
stream = recognizer.create_stream()
60+
stream.accept_waveform(sample_rate, audio)
61+
recognizer.decode_stream(stream)
62+
print(wave_filename)
63+
print(stream.result)
64+
65+
66+
if __name__ == "__main__":
67+
main()

scripts/sense-voice/export-onnx.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -162,7 +162,9 @@ def main():
162162
"neg_mean": neg_mean,
163163
"inv_stddev": inv_stddev,
164164
"model_type": "sense_voice_ctc",
165-
"version": "1",
165+
# version 1: Use QInt8
166+
# version 2: Use QUInt8
167+
"version": "2",
166168
"model_author": "iic",
167169
"maintainer": "k2-fsa",
168170
"vocab_size": vocab_size,
@@ -185,7 +187,10 @@ def main():
185187
model_input=filename,
186188
model_output=filename_int8,
187189
op_types_to_quantize=["MatMul"],
188-
weight_type=QuantType.QInt8,
190+
# Note that we have to use QUInt8 here.
191+
#
192+
# When QInt8 is used, C++ onnxruntime produces incorrect results
193+
weight_type=QuantType.QUInt8,
189194
)
190195

191196

sherpa-onnx/c-api/c-api.cc

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -310,6 +310,7 @@ struct SherpaOnnxOfflineStream {
310310

311311
static sherpa_onnx::OfflineRecognizerConfig convertConfig(
312312
const SherpaOnnxOfflineRecognizerConfig *config);
313+
313314
SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
314315
const SherpaOnnxOfflineRecognizerConfig *config) {
315316
sherpa_onnx::OfflineRecognizerConfig recognizer_config =
@@ -391,6 +392,15 @@ sherpa_onnx::OfflineRecognizerConfig convertConfig(
391392
recognizer_config.model_config.telespeech_ctc =
392393
SHERPA_ONNX_OR(config->model_config.telespeech_ctc, "");
393394

395+
recognizer_config.model_config.sense_voice.model =
396+
SHERPA_ONNX_OR(config->model_config.sense_voice.model, "");
397+
398+
recognizer_config.model_config.sense_voice.language =
399+
SHERPA_ONNX_OR(config->model_config.sense_voice.language, "");
400+
401+
recognizer_config.model_config.sense_voice.use_itn =
402+
config->model_config.sense_voice.use_itn;
403+
394404
recognizer_config.lm_config.model =
395405
SHERPA_ONNX_OR(config->lm_config.model, "");
396406
recognizer_config.lm_config.scale =

sherpa-onnx/c-api/c-api.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -379,6 +379,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineLMConfig {
379379
float scale;
380380
} SherpaOnnxOfflineLMConfig;
381381

382+
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineSenseVoiceModelConfig {
383+
const char *model;
384+
const char *language;
385+
int32_t use_itn;
386+
} SherpaOnnxOfflineSenseVoiceModelConfig;
387+
382388
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
383389
SherpaOnnxOfflineTransducerModelConfig transducer;
384390
SherpaOnnxOfflineParaformerModelConfig paraformer;
@@ -398,6 +404,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
398404
const char *modeling_unit;
399405
const char *bpe_vocab;
400406
const char *telespeech_ctc;
407+
SherpaOnnxOfflineSenseVoiceModelConfig sense_voice;
401408
} SherpaOnnxOfflineModelConfig;
402409

403410
SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {

0 commit comments

Comments
 (0)