|
5 | 5 | #include "httplib.h"
|
6 | 6 | #include "json.hpp"
|
7 | 7 |
|
| 8 | +#include <cfloat> |
8 | 9 | #include <chrono>
|
9 | 10 | #include <cmath>
|
10 | 11 | #include <cstdio>
|
@@ -90,6 +91,16 @@ struct whisper_params {
|
90 | 91 | std::string openvino_encode_device = "CPU";
|
91 | 92 |
|
92 | 93 | std::string dtw = "";
|
| 94 | + |
| 95 | + // Voice Activity Detection (VAD) parameters |
| 96 | + bool vad = false; |
| 97 | + std::string vad_model = ""; |
| 98 | + float vad_threshold = 0.5f; |
| 99 | + int vad_min_speech_duration_ms = 250; |
| 100 | + int vad_min_silence_duration_ms = 100; |
| 101 | + float vad_max_speech_duration_s = FLT_MAX; |
| 102 | + int vad_speech_pad_ms = 30; |
| 103 | + float vad_samples_overlap = 0.1f; |
93 | 104 | };
|
94 | 105 |
|
95 | 106 | void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
|
@@ -140,6 +151,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
140 | 151 | fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
|
141 | 152 | fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
|
142 | 153 | fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
|
| 154 | + // Voice Activity Detection (VAD) parameters |
| 155 | + fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n"); |
| 156 | + fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false"); |
| 157 | + fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str()); |
| 158 | + fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold); |
| 159 | + fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration in ms\n", params.vad_min_speech_duration_ms); |
| 160 | + fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms); |
| 161 | + fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ? |
| 162 | + std::string("FLT_MAX").c_str() : |
| 163 | + std::to_string(params.vad_max_speech_duration_s).c_str()); |
| 164 | + fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms); |
| 165 | + fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap); |
143 | 166 | fprintf(stderr, "\n");
|
144 | 167 | }
|
145 | 168 |
|
@@ -195,6 +218,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
|
195 | 218 | else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
|
196 | 219 | else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
|
197 | 220 | else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
|
| 221 | + |
| 222 | + // Voice Activity Detection (VAD) |
| 223 | + else if ( arg == "--vad") { params.vad = true; } |
| 224 | + else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; } |
| 225 | + else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); } |
| 226 | + else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); } |
| 227 | + else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); } |
| 228 | + else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); } |
| 229 | + else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); } |
| 230 | + else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); } |
198 | 231 | else {
|
199 | 232 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
200 | 233 | whisper_print_usage(argc, argv, params, sparams);
|
@@ -511,6 +544,34 @@ void get_req_parameters(const Request & req, whisper_params & params)
|
511 | 544 | {
|
512 | 545 | params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
|
513 | 546 | }
|
| 547 | + if (req.has_file("vad")) |
| 548 | + { |
| 549 | + params.vad = parse_str_to_bool(req.get_file_value("vad").content); |
| 550 | + } |
| 551 | + if (req.has_file("vad_threshold")) |
| 552 | + { |
| 553 | + params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content); |
| 554 | + } |
| 555 | + if (req.has_file("vad_min_speech_duration_ms")) |
| 556 | + { |
| 557 | + params.vad_min_speech_duration_ms = std::stoi(req.get_file_value("vad_min_speech_duration_ms").content); |
| 558 | + } |
| 559 | + if (req.has_file("vad_min_silence_duration_ms")) |
| 560 | + { |
| 561 | + params.vad_min_silence_duration_ms = std::stoi(req.get_file_value("vad_min_silence_duration_ms").content); |
| 562 | + } |
| 563 | + if (req.has_file("vad_max_speech_duration_s")) |
| 564 | + { |
| 565 | + params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content); |
| 566 | + } |
| 567 | + if (req.has_file("vad_speech_pad_ms")) |
| 568 | + { |
| 569 | + params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content); |
| 570 | + } |
| 571 | + if (req.has_file("vad_samples_overlap")) |
| 572 | + { |
| 573 | + params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content); |
| 574 | + } |
514 | 575 | }
|
515 | 576 |
|
516 | 577 | } // namespace
|
@@ -829,6 +890,16 @@ int main(int argc, char ** argv) {
|
829 | 890 |
|
830 | 891 | wparams.suppress_nst = params.suppress_nst;
|
831 | 892 |
|
| 893 | + wparams.vad = params.vad; |
| 894 | + wparams.vad_model_path = params.vad_model.c_str(); |
| 895 | + |
| 896 | + wparams.vad_params.threshold = params.vad_threshold; |
| 897 | + wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms; |
| 898 | + wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms; |
| 899 | + wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s; |
| 900 | + wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms; |
| 901 | + wparams.vad_params.samples_overlap = params.vad_samples_overlap; |
| 902 | + |
832 | 903 | whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
|
833 | 904 |
|
834 | 905 | // this callback is called on each new segment
|
|
0 commit comments