Skip to content

Commit 0703bc1

Browse files
authored
Add CXX API for VAD (#2077)
1 parent 6ef9aeb commit 0703bc1

File tree

10 files changed

+461
-36
lines changed

10 files changed

+461
-36
lines changed

.github/workflows/cxx-api.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,45 @@ jobs:
8181
otool -L ./install/lib/libsherpa-onnx-cxx-api.dylib
8282
fi
8383
84+
- name: Test VAD
  shell: bash
  run: |
    # Build the VAD example against the installed C++/C API libraries.
    name=vad-cxx-api
    g++ -std=c++17 -o "$name" "./cxx-api-examples/$name.cc" \
      -I ./build/install/include \
      -L ./build/install/lib/ \
      -l sherpa-onnx-cxx-api \
      -l sherpa-onnx-c-api \
      -l onnxruntime

    ls -lh "$name"

    export LD_LIBRARY_PATH=$PWD/build/install/lib:$LD_LIBRARY_PATH
    export DYLD_LIBRARY_PATH=$PWD/build/install/lib:$DYLD_LIBRARY_PATH

    if [[ ${{ matrix.os }} == ubuntu-latest || ${{ matrix.os }} == ubuntu-22.04-arm ]]; then
      ldd "./$name"
      echo "----"
      readelf -d "./$name"
    fi

    # -f makes curl exit non-zero on an HTTP error instead of saving the
    # error page under the model/wave file name.
    curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
    curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx

    "./$name"

    # -p keeps the step re-runnable when the directory already exists.
    mkdir -p vad-test
    cp -v lei-jun-test*.wav vad-test

    ls -lh vad-test

    rm "$name"
117+
118+
# Publish the wave files collected under ./vad-test as a per-OS artifact.
- uses: actions/upload-artifact@v4
  with:
    name: vad-test-wavs-cxx-${{ matrix.os }}
    path: ./vad-test/*.wav
122+
84123
- name: Test Speech Enhancement (GTCRN)
85124
shell: bash
86125
run: |

cmake/cmake_extension.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def get_binaries():
5353
"sherpa-onnx-microphone-offline-speaker-identification",
5454
"sherpa-onnx-offline",
5555
"sherpa-onnx-offline-audio-tagging",
56+
"sherpa-onnx-offline-denoiser",
5657
"sherpa-onnx-offline-language-identification",
5758
"sherpa-onnx-offline-punctuation",
5859
"sherpa-onnx-offline-speaker-diarization",
@@ -62,6 +63,7 @@ def get_binaries():
6263
"sherpa-onnx-online-punctuation",
6364
"sherpa-onnx-online-websocket-client",
6465
"sherpa-onnx-online-websocket-server",
66+
"sherpa-onnx-vad",
6567
"sherpa-onnx-vad-microphone",
6668
"sherpa-onnx-vad-microphone-offline-asr",
6769
"sherpa-onnx-vad-with-offline-asr",

cxx-api-examples/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ target_link_libraries(moonshine-cxx-api sherpa-onnx-cxx-api)
2424
add_executable(sense-voice-cxx-api ./sense-voice-cxx-api.cc)
2525
target_link_libraries(sense-voice-cxx-api sherpa-onnx-cxx-api)
2626

27+
add_executable(vad-cxx-api ./vad-cxx-api.cc)
28+
target_link_libraries(vad-cxx-api sherpa-onnx-cxx-api)
29+
2730
if(SHERPA_ONNX_ENABLE_TTS)
2831
add_executable(matcha-tts-zh-cxx-api ./matcha-tts-zh-cxx-api.cc)
2932
target_link_libraries(matcha-tts-zh-cxx-api sherpa-onnx-cxx-api)

cxx-api-examples/vad-cxx-api.cc

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// cxx-api-examples/vad-cxx-api.cc
2+
//
3+
// Copyright (c) 2025 Xiaomi Corporation
4+
5+
//
6+
// This file demonstrates how to use VAD to remove silences from a file
7+
// clang-format off
8+
//
9+
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
10+
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav
11+
//
12+
// clang-format on
13+
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>

#include "sherpa-onnx/c-api/cxx-api.h"
17+
18+
int32_t main() {
19+
using namespace sherpa_onnx::cxx; // NOLINT
20+
21+
std::string wave_filename = "./lei-jun-test.wav";
22+
std::string vad_filename = "./silero_vad.onnx";
23+
24+
VadModelConfig config;
25+
config.silero_vad.model = vad_filename;
26+
config.silero_vad.threshold = 0.1;
27+
config.silero_vad.min_silence_duration = 0.5;
28+
config.silero_vad.min_speech_duration = 0.25;
29+
config.silero_vad.max_speech_duration = 20;
30+
config.sample_rate = 16000;
31+
config.debug = true;
32+
33+
VoiceActivityDetector vad = VoiceActivityDetector::Create(config, 20);
34+
if (!vad.Get()) {
35+
std::cerr << "Failed to create VAD. Please check your config\n";
36+
return -1;
37+
}
38+
39+
Wave wave = ReadWave(wave_filename);
40+
if (wave.samples.empty()) {
41+
std::cerr << "Failed to read: '" << wave_filename << "'\n";
42+
return -1;
43+
}
44+
bool is_eof = false;
45+
int32_t i = 0;
46+
int32_t window_size = config.silero_vad.window_size;
47+
48+
int32_t sample_rate = config.sample_rate;
49+
50+
std::vector<float> samples_without_silence;
51+
52+
while (!is_eof) {
53+
if (i + window_size < wave.samples.size()) {
54+
vad.AcceptWaveform(wave.samples.data() + i, window_size);
55+
i += window_size;
56+
} else {
57+
is_eof = true;
58+
vad.Flush();
59+
}
60+
61+
while (!vad.IsEmpty()) {
62+
auto segment = vad.Front();
63+
float start_time = segment.start / static_cast<float>(sample_rate);
64+
float end_time =
65+
start_time + segment.samples.size() / static_cast<float>(sample_rate);
66+
printf("%.3f -- %.3f\n", start_time, end_time);
67+
68+
samples_without_silence.insert(samples_without_silence.end(),
69+
segment.samples.begin(),
70+
segment.samples.end());
71+
72+
vad.Pop();
73+
}
74+
}
75+
76+
bool ok = WriteWave("./lei-jun-test-no-silence.wav",
77+
{samples_without_silence, sample_rate});
78+
if (ok) {
79+
std::cout << "Saved to ./lei-jun-test-no-silence.wav\n";
80+
} else {
81+
std::cerr << "Failed to write ./lei-jun-test-no-silence.wav\n";
82+
}
83+
84+
return 0;
85+
}

sherpa-onnx/c-api/c-api.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
785785
// in seconds
786786
float min_speech_duration;
787787

788-
int window_size;
788+
int32_t window_size;
789789

790790
// If a speech segment is longer than this value, then we increase
791791
// the threshold to 0.9. After finishing detecting the segment,

sherpa-onnx/c-api/cxx-api.cc

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,4 +558,114 @@ int32_t OfflineSpeechDenoiser::GetSampleRate() const {
558558
return SherpaOnnxOfflineSpeechDenoiserGetSampleRate(p_);
559559
}
560560

561+
// Wraps the handle returned by SherpaOnnxCreateCircularBuffer(capacity).
// The handle is not checked for null here.
CircularBuffer CircularBuffer::Create(int32_t capacity) {
  return CircularBuffer(SherpaOnnxCreateCircularBuffer(capacity));
}
565+
566+
// Stores the C handle `p` in the MoveOnly base.
CircularBuffer::CircularBuffer(const SherpaOnnxCircularBuffer *p)
    : MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer>(p) {}
568+
569+
// Releases the C handle `p` through SherpaOnnxDestroyCircularBuffer.
// NOTE(review): presumably invoked by the MoveOnly base on destruction --
// confirm against the MoveOnly definition.
void CircularBuffer::Destroy(const SherpaOnnxCircularBuffer *p) const {
  SherpaOnnxDestroyCircularBuffer(p);
}
572+
573+
// Forwards `n` floats starting at `samples` to SherpaOnnxCircularBufferPush.
void CircularBuffer::Push(const float *samples, int32_t n) const {
  SherpaOnnxCircularBufferPush(p_, samples, n);
}
576+
577+
std::vector<float> CircularBuffer::Get(int32_t start_index, int32_t n) const {
578+
const float *samples = SherpaOnnxCircularBufferGet(p_, start_index, n);
579+
std::vector<float> ans(n);
580+
std::copy(samples, samples + n, ans.begin());
581+
582+
SherpaOnnxCircularBufferFree(samples);
583+
return ans;
584+
}
585+
586+
// Forwards to SherpaOnnxCircularBufferPop with the count `n`.
void CircularBuffer::Pop(int32_t n) const {
  SherpaOnnxCircularBufferPop(p_, n);
}
589+
590+
int32_t CircularBuffer::Size() const {
591+
return SherpaOnnxCircularBufferSize(p_);
592+
}
593+
594+
int32_t CircularBuffer::Head() const {
595+
return SherpaOnnxCircularBufferHead(p_);
596+
}
597+
598+
// Forwards to SherpaOnnxCircularBufferReset.
void CircularBuffer::Reset() const {
  SherpaOnnxCircularBufferReset(p_);
}
599+
600+
// Builds a VoiceActivityDetector from the C++ configuration.
//
// Every field of `config` is copied into a zero-filled
// SherpaOnnxVadModelConfig, which is passed together with
// `buffer_size_in_seconds` to SherpaOnnxCreateVoiceActivityDetector. The
// string fields of the C struct point into `config`, which stays alive for
// the whole call. The returned handle is wrapped without a null check.
VoiceActivityDetector VoiceActivityDetector::Create(
    const VadModelConfig &config, float buffer_size_in_seconds) {
  struct SherpaOnnxVadModelConfig c;
  memset(&c, 0, sizeof(c));

  // Top-level options.
  c.sample_rate = config.sample_rate;
  c.num_threads = config.num_threads;
  c.provider = config.provider.c_str();
  c.debug = config.debug;

  // Silero VAD specific options.
  const SileroVadModelConfig &silero = config.silero_vad;
  c.silero_vad.model = silero.model.c_str();
  c.silero_vad.threshold = silero.threshold;
  c.silero_vad.min_silence_duration = silero.min_silence_duration;
  c.silero_vad.min_speech_duration = silero.min_speech_duration;
  c.silero_vad.window_size = silero.window_size;
  c.silero_vad.max_speech_duration = silero.max_speech_duration;

  return VoiceActivityDetector(
      SherpaOnnxCreateVoiceActivityDetector(&c, buffer_size_in_seconds));
}
620+
621+
// Stores the C handle `p` in the MoveOnly base.
VoiceActivityDetector::VoiceActivityDetector(
    const SherpaOnnxVoiceActivityDetector *p)
    : MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector>(p) {}
624+
625+
void VoiceActivityDetector::Destroy(
626+
const SherpaOnnxVoiceActivityDetector *p) const {
627+
SherpaOnnxDestroyVoiceActivityDetector(p);
628+
}
629+
630+
void VoiceActivityDetector::AcceptWaveform(const float *samples,
631+
int32_t n) const {
632+
SherpaOnnxVoiceActivityDetectorAcceptWaveform(p_, samples, n);
633+
}
634+
635+
bool VoiceActivityDetector::IsEmpty() const {
636+
return SherpaOnnxVoiceActivityDetectorEmpty(p_);
637+
}
638+
639+
bool VoiceActivityDetector ::IsDetected() const {
640+
return SherpaOnnxVoiceActivityDetectorDetected(p_);
641+
}
642+
643+
// Forwards to SherpaOnnxVoiceActivityDetectorPop.
void VoiceActivityDetector::Pop() const {
  SherpaOnnxVoiceActivityDetectorPop(p_);
}
646+
647+
// Forwards to SherpaOnnxVoiceActivityDetectorClear.
void VoiceActivityDetector::Clear() const {
  SherpaOnnxVoiceActivityDetectorClear(p_);
}
650+
651+
// Returns a copy of the segment reported by
// SherpaOnnxVoiceActivityDetectorFront.
//
// The C segment's `start` and its `n` samples are copied into the result and
// the C object is released with SherpaOnnxDestroySpeechSegment. If the C API
// returns a null pointer, a segment with start == 0 and no samples is
// returned instead of dereferencing it.
// NOTE(review): callers are presumably expected to check IsEmpty() first --
// confirm against the C API contract.
SpeechSegment VoiceActivityDetector::Front() const {
  SpeechSegment segment;
  segment.start = 0;  // well-defined value even on the early-return path

  const auto *f = SherpaOnnxVoiceActivityDetectorFront(p_);
  if (f == nullptr) {
    return segment;
  }

  segment.start = f->start;
  if (f->samples != nullptr && f->n > 0) {
    segment.samples.assign(f->samples, f->samples + f->n);
  }

  SherpaOnnxDestroySpeechSegment(f);

  return segment;
}
662+
663+
// Forwards to SherpaOnnxVoiceActivityDetectorReset.
void VoiceActivityDetector::Reset() const {
  SherpaOnnxVoiceActivityDetectorReset(p_);
}
666+
667+
// Forwards to SherpaOnnxVoiceActivityDetectorFlush.
void VoiceActivityDetector::Flush() const {
  SherpaOnnxVoiceActivityDetectorFlush(p_);
}
670+
561671
} // namespace sherpa_onnx::cxx

sherpa-onnx/c-api/cxx-api.h

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,84 @@ class SHERPA_ONNX_API OfflineSpeechDenoiser
500500
explicit OfflineSpeechDenoiser(const SherpaOnnxOfflineSpeechDenoiser *p);
501501
};
502502

503+
// ==============================
504+
// VAD
505+
// ==============================
506+
507+
// Options for the Silero VAD model. VoiceActivityDetector::Create copies each
// field into the matching field of the C struct SherpaOnnxSileroVadModelConfig.
struct SileroVadModelConfig {
  // Path to the model file (the example passes "./silero_vad.onnx").
  std::string model;

  // NOTE(review): presumably the speech-probability cut-off -- confirm.
  float threshold = 0.5f;

  // NOTE(review): presumably in seconds, like min_speech_duration -- confirm.
  float min_silence_duration = 0.5f;

  // In seconds.
  float min_speech_duration = 0.25f;

  // The example feeds the detector this many samples per AcceptWaveform call.
  int32_t window_size = 512;

  // NOTE(review): presumably in seconds -- confirm against the C API header.
  float max_speech_duration = 20.0f;
};
515+
516+
struct VadModelConfig {
517+
SileroVadModelConfig silero_vad;
518+
519+
int32_t sample_rate = 16000;
520+
int32_t num_threads = 1;
521+
std::string provider = "cpu";
522+
bool debug = false;
523+
};
524+
525+
// One segment returned by VoiceActivityDetector::Front().
struct SpeechSegment {
  // Offset of the segment's first sample; the example divides it by the
  // sample rate to get seconds. Defaults to 0 so that a default-constructed
  // segment never carries an indeterminate value.
  int32_t start = 0;

  // Samples belonging to the segment.
  std::vector<float> samples;
};
529+
530+
class SHERPA_ONNX_API CircularBuffer
531+
: public MoveOnly<CircularBuffer, SherpaOnnxCircularBuffer> {
532+
public:
533+
static CircularBuffer Create(int32_t capacity);
534+
535+
void Destroy(const SherpaOnnxCircularBuffer *p) const;
536+
537+
void Push(const float *p, int32_t n) const;
538+
539+
std::vector<float> Get(int32_t start_index, int32_t n) const;
540+
541+
void Pop(int32_t n) const;
542+
543+
int32_t Size() const;
544+
545+
int32_t Head() const;
546+
547+
void Reset() const;
548+
549+
private:
550+
explicit CircularBuffer(const SherpaOnnxCircularBuffer *p);
551+
};
552+
553+
class SHERPA_ONNX_API VoiceActivityDetector
554+
: public MoveOnly<VoiceActivityDetector, SherpaOnnxVoiceActivityDetector> {
555+
public:
556+
static VoiceActivityDetector Create(const VadModelConfig &config,
557+
float buffer_size_in_seconds);
558+
559+
void Destroy(const SherpaOnnxVoiceActivityDetector *p) const;
560+
561+
void AcceptWaveform(const float *samples, int32_t n) const;
562+
563+
bool IsEmpty() const;
564+
565+
bool IsDetected() const;
566+
567+
void Pop() const;
568+
569+
void Clear() const;
570+
571+
SpeechSegment Front() const;
572+
573+
void Reset() const;
574+
575+
void Flush() const;
576+
577+
private:
578+
explicit VoiceActivityDetector(const SherpaOnnxVoiceActivityDetector *p);
579+
};
580+
503581
} // namespace sherpa_onnx::cxx
504582

505583
#endif // SHERPA_ONNX_C_API_CXX_API_H_

sherpa-onnx/csrc/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,11 +317,12 @@ if(SHERPA_ONNX_ENABLE_BINARY)
317317
add_executable(sherpa-onnx-keyword-spotter sherpa-onnx-keyword-spotter.cc)
318318
add_executable(sherpa-onnx-offline sherpa-onnx-offline.cc)
319319
add_executable(sherpa-onnx-offline-audio-tagging sherpa-onnx-offline-audio-tagging.cc)
320+
add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc)
320321
add_executable(sherpa-onnx-offline-language-identification sherpa-onnx-offline-language-identification.cc)
321322
add_executable(sherpa-onnx-offline-parallel sherpa-onnx-offline-parallel.cc)
322323
add_executable(sherpa-onnx-offline-punctuation sherpa-onnx-offline-punctuation.cc)
323324
add_executable(sherpa-onnx-online-punctuation sherpa-onnx-online-punctuation.cc)
324-
add_executable(sherpa-onnx-offline-denoiser sherpa-onnx-offline-denoiser.cc)
325+
add_executable(sherpa-onnx-vad sherpa-onnx-vad.cc)
325326

326327
if(SHERPA_ONNX_ENABLE_TTS)
327328
add_executable(sherpa-onnx-offline-tts sherpa-onnx-offline-tts.cc)
@@ -336,11 +337,12 @@ if(SHERPA_ONNX_ENABLE_BINARY)
336337
sherpa-onnx-keyword-spotter
337338
sherpa-onnx-offline
338339
sherpa-onnx-offline-audio-tagging
340+
sherpa-onnx-offline-denoiser
339341
sherpa-onnx-offline-language-identification
340342
sherpa-onnx-offline-parallel
341343
sherpa-onnx-offline-punctuation
342-
sherpa-onnx-offline-denoiser
343344
sherpa-onnx-online-punctuation
345+
sherpa-onnx-vad
344346
)
345347
if(SHERPA_ONNX_ENABLE_TTS)
346348
list(APPEND main_exes

0 commit comments

Comments
 (0)