Skip to content

Commit dd300b1

Browse files
authored
Add Java and Kotlin API for sense voice (#1164)
1 parent ac8223b commit dd300b1

16 files changed

+601
-2
lines changed

.github/workflows/run-java-test.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,16 @@ jobs:
114114
./run-kws-from-file.sh
115115
rm -rf sherpa-onnx-*
116116
117+
# CI step: runs the VAD + non-streaming SenseVoice example script from
# ./java-api-examples, then lists the *.wav files and removes the *.onnx files,
# the *.wav files and every sherpa-onnx-* entry left in that directory.
- name: Run java test (VAD + Non-streaming SenseVoice)
118+
shell: bash
119+
run: |
120+
cd ./java-api-examples
121+
./run-vad-non-streaming-sense-voice.sh
122+
rm *.onnx
123+
ls -lh *.wav
124+
rm *.wav
125+
rm -rf sherpa-onnx-*
126+
117127
- name: Run java test (VAD + Non-streaming Paraformer)
118128
shell: bash
119129
run: |
@@ -193,6 +203,10 @@ jobs:
193203
shell: bash
194204
run: |
195205
cd ./java-api-examples
206+
207+
./run-non-streaming-decode-file-sense-voice.sh
208+
rm -rf sherpa-onnx-sense-voice-*
209+
196210
./run-inverse-text-normalization-paraformer.sh
197211
198212
./run-non-streaming-decode-file-paraformer.sh
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright 2024 Xiaomi Corporation
2+
3+
// This file shows how to use an offline SenseVoice model,
4+
// i.e., non-streaming SenseVoice model,
5+
// to decode files.
6+
import com.k2fsa.sherpa.onnx.*;
7+
8+
public class NonStreamingDecodeFileSenseVoice {
9+
public static void main(String[] args) {
10+
// please refer to
11+
// https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
12+
// to download model files
13+
String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
14+
String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
15+
16+
String waveFilename = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/test_wavs/zh.wav";
17+
18+
WaveReader reader = new WaveReader(waveFilename);
19+
20+
OfflineSenseVoiceModelConfig senseVoice =
21+
OfflineSenseVoiceModelConfig.builder().setModel(model).build();
22+
23+
OfflineModelConfig modelConfig =
24+
OfflineModelConfig.builder()
25+
.setSenseVoice(senseVoice)
26+
.setTokens(tokens)
27+
.setNumThreads(1)
28+
.setDebug(true)
29+
.build();
30+
31+
OfflineRecognizerConfig config =
32+
OfflineRecognizerConfig.builder()
33+
.setOfflineModelConfig(modelConfig)
34+
.setDecodingMethod("greedy_search")
35+
.build();
36+
37+
OfflineRecognizer recognizer = new OfflineRecognizer(config);
38+
OfflineStream stream = recognizer.createStream();
39+
stream.acceptWaveform(reader.getSamples(), reader.getSampleRate());
40+
41+
recognizer.decode(stream);
42+
43+
String text = recognizer.getResult(stream).getText();
44+
45+
System.out.printf("filename:%s\nresult:%s\n", waveFilename, text);
46+
47+
stream.release();
48+
recognizer.release();
49+
}
50+
}

java-api-examples/README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ This directory contains examples for the JAVA API of sherpa-onnx.
1818

1919
```bash
2020
./run-non-streaming-decode-file-paraformer.sh
21+
./run-non-streaming-decode-file-sense-voice.sh
2122
./run-non-streaming-decode-file-transducer.sh
2223
./run-non-streaming-decode-file-whisper.sh
2324
./run-non-streaming-decode-file-nemo.sh
@@ -64,6 +65,12 @@ The punctuation model supports both English and Chinese.
6465
./run-vad-from-mic.sh
6566
```
6667

68+
## VAD with a microphone + Non-streaming SenseVoice for speech recognition
69+
70+
```bash
71+
./run-vad-from-mic-non-streaming-sense-voice.sh
72+
```
73+
6774
## VAD with a microphone + Non-streaming Paraformer for speech recognition
6875

6976
```bash
@@ -82,6 +89,12 @@ The punctuation model supports both English and Chinese.
8289
./run-vad-remove-slience.sh
8390
```
8491

92+
## VAD + Non-streaming SenseVoice for speech recognition
93+
94+
```bash
95+
./run-vad-non-streaming-sense-voice.sh
96+
```
97+
8598
## VAD + Non-streaming Paraformer for speech recognition
8699

87100
```bash
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Copyright 2024 Xiaomi Corporation
2+
3+
// This file shows how to use a silero_vad model with a non-streaming
4+
// SenseVoice model to recognize speech captured from a microphone.
5+
6+
import com.k2fsa.sherpa.onnx.*;
7+
import javax.sound.sampled.*;
8+
9+
public class VadFromMicWithNonStreamingSenseVoice {
10+
private static final int sampleRate = 16000;
11+
private static final int windowSize = 512;
12+
13+
public static Vad createVad() {
14+
// please download ./silero_vad.onnx from
15+
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
16+
String model = "./silero_vad.onnx";
17+
SileroVadModelConfig sileroVad =
18+
SileroVadModelConfig.builder()
19+
.setModel(model)
20+
.setThreshold(0.5f)
21+
.setMinSilenceDuration(0.25f)
22+
.setMinSpeechDuration(0.5f)
23+
.setWindowSize(windowSize)
24+
.build();
25+
26+
VadModelConfig config =
27+
VadModelConfig.builder()
28+
.setSileroVadModelConfig(sileroVad)
29+
.setSampleRate(sampleRate)
30+
.setNumThreads(1)
31+
.setDebug(true)
32+
.setProvider("cpu")
33+
.build();
34+
35+
return new Vad(config);
36+
}
37+
38+
public static OfflineRecognizer createOfflineRecognizer() {
39+
// please refer to
40+
// https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
41+
// to download model files
42+
String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
43+
String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
44+
45+
OfflineSenseVoiceModelConfig senseVoice =
46+
OfflineSenseVoiceModelConfig.builder().setModel(model).build();
47+
48+
OfflineModelConfig modelConfig =
49+
OfflineModelConfig.builder()
50+
.setSenseVoice(senseVoice)
51+
.setTokens(tokens)
52+
.setNumThreads(1)
53+
.setDebug(true)
54+
.build();
55+
56+
OfflineRecognizerConfig config =
57+
OfflineRecognizerConfig.builder()
58+
.setOfflineModelConfig(modelConfig)
59+
.setDecodingMethod("greedy_search")
60+
.build();
61+
62+
return new OfflineRecognizer(config);
63+
}
64+
65+
public static void main(String[] args) {
66+
Vad vad = createVad();
67+
OfflineRecognizer recognizer = createOfflineRecognizer();
68+
69+
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/AudioFormat.html
70+
// Linear PCM, 16000Hz, 16-bit, 1 channel, signed, little endian
71+
AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
72+
73+
// https://docs.oracle.com/javase/8/docs/api/javax/sound/sampled/DataLine.Info.html#Info-java.lang.Class-javax.sound.sampled.AudioFormat-int-
74+
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
75+
TargetDataLine targetDataLine;
76+
try {
77+
targetDataLine = (TargetDataLine) AudioSystem.getLine(info);
78+
targetDataLine.open(format);
79+
targetDataLine.start();
80+
} catch (LineUnavailableException e) {
81+
System.out.println("Failed to open target data line: " + e.getMessage());
82+
vad.release();
83+
recognizer.release();
84+
return;
85+
}
86+
87+
boolean printed = false;
88+
byte[] buffer = new byte[windowSize * 2];
89+
float[] samples = new float[windowSize];
90+
91+
System.out.println("Started. Please speak");
92+
boolean running = true;
93+
while (targetDataLine.isOpen() && running) {
94+
int n = targetDataLine.read(buffer, 0, buffer.length);
95+
if (n <= 0) {
96+
System.out.printf("Got %d bytes. Expected %d bytes.\n", n, buffer.length);
97+
continue;
98+
}
99+
for (int i = 0; i != windowSize; ++i) {
100+
short low = buffer[2 * i];
101+
short high = buffer[2 * i + 1];
102+
int s = (high << 8) + low;
103+
samples[i] = (float) s / 32768;
104+
}
105+
106+
vad.acceptWaveform(samples);
107+
if (vad.isSpeechDetected() && !printed) {
108+
System.out.println("Detected speech");
109+
printed = true;
110+
}
111+
112+
if (!vad.isSpeechDetected()) {
113+
printed = false;
114+
}
115+
116+
while (!vad.empty()) {
117+
SpeechSegment segment = vad.front();
118+
float startTime = segment.getStart() / (float) sampleRate;
119+
float duration = segment.getSamples().length / (float) sampleRate;
120+
121+
OfflineStream stream = recognizer.createStream();
122+
stream.acceptWaveform(segment.getSamples(), sampleRate);
123+
recognizer.decode(stream);
124+
String text = recognizer.getResult(stream).getText();
125+
stream.release();
126+
127+
if (!text.isEmpty()) {
128+
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
129+
}
130+
131+
if (text.contains("退出程序")) {
132+
running = false;
133+
}
134+
135+
vad.pop();
136+
}
137+
}
138+
139+
vad.release();
140+
recognizer.release();
141+
}
142+
}
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
// Copyright 2024 Xiaomi Corporation
2+
3+
// This file shows how to use a silero_vad model with a non-streaming SenseVoice model
4+
// for speech recognition.
5+
6+
import com.k2fsa.sherpa.onnx.*;
7+
import java.util.Arrays;
8+
9+
public class VadNonStreamingSenseVoice {
10+
public static Vad createVad() {
11+
// please download ./silero_vad.onnx from
12+
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
13+
String model = "./silero_vad.onnx";
14+
SileroVadModelConfig sileroVad =
15+
SileroVadModelConfig.builder()
16+
.setModel(model)
17+
.setThreshold(0.5f)
18+
.setMinSilenceDuration(0.25f)
19+
.setMinSpeechDuration(0.5f)
20+
.setWindowSize(512)
21+
.build();
22+
23+
VadModelConfig config =
24+
VadModelConfig.builder()
25+
.setSileroVadModelConfig(sileroVad)
26+
.setSampleRate(16000)
27+
.setNumThreads(1)
28+
.setDebug(true)
29+
.setProvider("cpu")
30+
.build();
31+
32+
return new Vad(config);
33+
}
34+
35+
public static OfflineRecognizer createOfflineRecognizer() {
36+
// please refer to
37+
// https://k2-fsa.github.io/sherpa/onnx/sense-voice/index.html
38+
// to download model files
39+
String model = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
40+
String tokens = "./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
41+
42+
OfflineSenseVoiceModelConfig senseVoice =
43+
OfflineSenseVoiceModelConfig.builder().setModel(model).build();
44+
45+
OfflineModelConfig modelConfig =
46+
OfflineModelConfig.builder()
47+
.setSenseVoice(senseVoice)
48+
.setTokens(tokens)
49+
.setNumThreads(1)
50+
.setDebug(true)
51+
.build();
52+
53+
OfflineRecognizerConfig config =
54+
OfflineRecognizerConfig.builder()
55+
.setOfflineModelConfig(modelConfig)
56+
.setDecodingMethod("greedy_search")
57+
.build();
58+
59+
return new OfflineRecognizer(config);
60+
}
61+
62+
public static void main(String[] args) {
63+
64+
Vad vad = createVad();
65+
OfflineRecognizer recognizer = createOfflineRecognizer();
66+
67+
// You can download the test file from
68+
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
69+
String testWaveFilename = "./lei-jun-test.wav";
70+
WaveReader reader = new WaveReader(testWaveFilename);
71+
72+
int numSamples = reader.getSamples().length;
73+
int numIter = numSamples / 512;
74+
75+
for (int i = 0; i != numIter; ++i) {
76+
int start = i * 512;
77+
int end = start + 512;
78+
float[] samples = Arrays.copyOfRange(reader.getSamples(), start, end);
79+
vad.acceptWaveform(samples);
80+
if (vad.isSpeechDetected()) {
81+
while (!vad.empty()) {
82+
SpeechSegment segment = vad.front();
83+
float startTime = segment.getStart() / 16000.0f;
84+
float duration = segment.getSamples().length / 16000.0f;
85+
86+
OfflineStream stream = recognizer.createStream();
87+
stream.acceptWaveform(segment.getSamples(), 16000);
88+
recognizer.decode(stream);
89+
String text = recognizer.getResult(stream).getText();
90+
stream.release();
91+
92+
if (!text.isEmpty()) {
93+
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
94+
}
95+
96+
vad.pop();
97+
}
98+
}
99+
}
100+
101+
vad.flush();
102+
while (!vad.empty()) {
103+
SpeechSegment segment = vad.front();
104+
float startTime = segment.getStart() / 16000.0f;
105+
float duration = segment.getSamples().length / 16000.0f;
106+
107+
OfflineStream stream = recognizer.createStream();
108+
stream.acceptWaveform(segment.getSamples(), 16000);
109+
recognizer.decode(stream);
110+
String text = recognizer.getResult(stream).getText();
111+
stream.release();
112+
113+
if (!text.isEmpty()) {
114+
System.out.printf("%.3f--%.3f: %s\n", startTime, startTime + duration, text);
115+
}
116+
117+
vad.pop();
118+
}
119+
120+
vad.release();
121+
recognizer.release();
122+
}
123+
}

0 commit comments

Comments
 (0)