
Commit 0a4d85c

server : add Voice Activity Detection (VAD) support (#3246)

* server : add Voice Activity Detection (VAD) support

  This commit adds support for Voice Activity Detection (VAD) in the server
  example. The motivation for this is to enable VAD processing when using
  whisper-server.

  Resolves: #3089

* server : add VAD parameters to usage in README.md [no ci]

  This commit also adds a few missing parameters.

* server : fix conflicting short options [no ci]

1 parent 9df8d54 commit 0a4d85c

File tree

2 files changed: +91, -0 lines changed


examples/server/README.md

Lines changed: 20 additions & 0 deletions
@@ -23,6 +23,7 @@ options:
  -sow, --split-on-word [false ] split on word rather than on token
  -bo N, --best-of N [2 ] number of best candidates to keep
  -bs N, --beam-size N [-1 ] beam size for beam search
+ -ac N, --audio-ctx N [0 ] audio context size (0 - all)
  -wt N, --word-thold N [0.01 ] word timestamp probability threshold
  -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
  -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
@@ -41,9 +42,28 @@ options:
  --prompt PROMPT [ ] initial prompt
  -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
  -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
+ -dtw MODEL --dtw MODEL [ ] compute token-level timestamps
  --host HOST, [127.0.0.1] Hostname/ip-adress for the server
  --port PORT, [8080 ] Port number for the server
+ --public PATH, [examples/server/public] Path to the public folder
+ --request-path PATH, [ ] Request path for all requests
+ --inference-path PATH, [/inference] Inference path for all requests
  --convert, [false ] Convert audio to WAV, requires ffmpeg on the server
+ -sns, --suppress-nst [false ] suppress non-speech tokens
+ -nth N, --no-speech-thold N [0.60 ] no speech threshold
+ -nc, --no-context [false ] do not use previous audio context
+ -ng, --no-gpu [false ] do not use gpu
+ -fa, --flash-attn [false ] flash attention
+
+ Voice Activity Detection (VAD) options:
+ --vad [false ] enable Voice Activity Detection (VAD)
+ -vm FNAME, --vad-model FNAME [ ] VAD model path
+ -vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
+ -vspd N, --vad-min-speech-duration-ms N [250 ] VAD min speech duration (ms)
+ -vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
+ -vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
+ -vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
+ -vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
  ```

  > [!WARNING]
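The VAD options above can also be supplied per request: the inference handler reads multipart form fields with the same names in underscore form (vad, vad_threshold, vad_min_speech_duration_ms, ...), as handled by get_req_parameters() in server.cpp below. As a rough illustration only (not part of this commit), here is a minimal client sketch using the same cpp-httplib header that server.cpp already includes. It assumes a server running with the defaults shown above (127.0.0.1:8080, /inference path), started with --vad and a VAD model, that the audio upload field is named "file", and that samples/jfk.wav is a placeholder for a valid WAV input.

```cpp
// Client sketch (not part of the commit): POST a WAV file to /inference with
// per-request VAD fields. Host, port, path, the "file" field name, and the
// input path are assumptions based on the defaults listed in the README.
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include "httplib.h" // cpp-httplib, the same header server.cpp uses

int main() {
    // read the audio file into memory for the multipart upload
    std::ifstream fin("samples/jfk.wav", std::ios::binary);
    std::stringstream buf;
    buf << fin.rdbuf();
    const std::string audio = buf.str();

    // multipart fields: {name, content, filename, content_type};
    // the parameter names match what get_req_parameters() looks for
    httplib::MultipartFormDataItems items = {
        { "file",                       audio,  "jfk.wav", "audio/wav" },
        { "vad",                        "true", "",        "" },
        { "vad_threshold",              "0.5",  "",        "" },
        { "vad_min_speech_duration_ms", "250",  "",        "" },
    };

    httplib::Client cli("127.0.0.1", 8080);
    auto res = cli.Post("/inference", items);
    if (res && res->status == 200) {
        std::cout << res->body << std::endl; // transcription result
    } else {
        std::cerr << "request failed" << std::endl;
    }
    return 0;
}
```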

examples/server/server.cpp

Lines changed: 71 additions & 0 deletions
@@ -5,6 +5,7 @@
  #include "httplib.h"
  #include "json.hpp"

+ #include <cfloat>
  #include <chrono>
  #include <cmath>
  #include <cstdio>
@@ -90,6 +91,16 @@ struct whisper_params {
      std::string openvino_encode_device = "CPU";

      std::string dtw = "";
+
+     // Voice Activity Detection (VAD) parameters
+     bool vad = false;
+     std::string vad_model = "";
+     float vad_threshold = 0.5f;
+     int vad_min_speech_duration_ms = 250;
+     int vad_min_silence_duration_ms = 100;
+     float vad_max_speech_duration_s = FLT_MAX;
+     int vad_speech_pad_ms = 30;
+     float vad_samples_overlap = 0.1f;
  };

  void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
@@ -140,6 +151,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
      fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
      fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+     // Voice Activity Detection (VAD) parameters
+     fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+     fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
+     fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
+     fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
+     fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
+     fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
+         std::string("FLT_MAX").c_str() :
+         std::to_string(params.vad_max_speech_duration_s).c_str());
+     fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
+     fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
      fprintf(stderr, "\n");
  }

@@ -195,6 +218,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
      else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
      else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
      else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
+
+     // Voice Activity Detection (VAD)
+     else if ( arg == "--vad") { params.vad = true; }
+     else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
+     else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
+     else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+     else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
+     else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
+     else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
      else {
          fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
          whisper_print_usage(argc, argv, params, sparams);
@@ -511,6 +544,34 @@ void get_req_parameters(const Request & req, whisper_params & params)
      {
          params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
      }
+     if (req.has_file("vad"))
+     {
+         params.vad = parse_str_to_bool(req.get_file_value("vad").content);
+     }
+     if (req.has_file("vad_threshold"))
+     {
+         params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
+     }
+     if (req.has_file("vad_min_speech_duration_ms"))
+     {
+         params.vad_min_speech_duration_ms = std::stof(req.get_file_value("vad_min_speech_duration_ms").content);
+     }
+     if (req.has_file("vad_min_silence_duration_ms"))
+     {
+         params.vad_min_silence_duration_ms = std::stof(req.get_file_value("vad_min_silence_duration_ms").content);
+     }
+     if (req.has_file("vad_max_speech_duration_s"))
+     {
+         params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
+     }
+     if (req.has_file("vad_speech_pad_ms"))
+     {
+         params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
+     }
+     if (req.has_file("vad_samples_overlap"))
+     {
+         params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
+     }
  }

  } // namespace
@@ -829,6 +890,16 @@ int main(int argc, char ** argv) {

          wparams.suppress_nst = params.suppress_nst;

+         wparams.vad = params.vad;
+         wparams.vad_model_path = params.vad_model.c_str();
+
+         wparams.vad_params.threshold = params.vad_threshold;
+         wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
+         wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+         wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
+         wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
+         wparams.vad_params.samples_overlap = params.vad_samples_overlap;
+
          whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

          // this callback is called on each new segment
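The block above is where the server hands the request-level VAD settings to whisper.cpp. For reference, the same fields drive VAD outside the server as well; the following is a rough standalone sketch (not part of this commit) that assumes a Whisper model at models/ggml-base.en.bin, a caller-supplied VAD model path, and 16 kHz mono float samples already loaded into pcmf32. The function name and parameter values are illustrative only.

```cpp
// Standalone sketch (not part of the commit): the same VAD fields applied
// directly through the whisper.cpp API. Model paths and the pcmf32 buffer
// are placeholders the caller must provide.
#include <cfloat>
#include <cstdio>
#include <vector>

#include "whisper.h"

int run_with_vad(const std::vector<float> & pcmf32, const char * vad_model_path) {
    struct whisper_context_params cparams = whisper_context_default_params();
    struct whisper_context * ctx = whisper_init_from_file_with_params("models/ggml-base.en.bin", cparams);
    if (ctx == nullptr) {
        return 1;
    }

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // same wiring as in the server's main() above
    wparams.vad            = true;
    wparams.vad_model_path = vad_model_path;

    wparams.vad_params.threshold               = 0.5f;
    wparams.vad_params.min_speech_duration_ms  = 250;
    wparams.vad_params.min_silence_duration_ms = 100;
    wparams.vad_params.max_speech_duration_s   = FLT_MAX;
    wparams.vad_params.speech_pad_ms           = 30;
    wparams.vad_params.samples_overlap         = 0.1f;

    if (whisper_full(ctx, wparams, pcmf32.data(), (int) pcmf32.size()) != 0) {
        whisper_free(ctx);
        return 1;
    }

    // print the transcribed segments
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        printf("%s\n", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}
```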
