|
11 | 11 | #include <thread> |
12 | 12 | #include <vector> |
13 | 13 | #include <cstring> |
| 14 | +#include <cfloat> |
14 | 15 |
|
15 | 16 | #if defined(_WIN32) |
16 | 17 | #ifndef NOMINMAX |
@@ -101,6 +102,17 @@ struct whisper_params { |
101 | 102 | std::vector<std::string> fname_out = {}; |
102 | 103 |
|
103 | 104 | grammar_parser::parse_state grammar_parsed; |
| 105 | + |
| 106 | + // Voice Activity Detection (VAD) parameters |
| 107 | + bool vad = false; |
| 108 | + std::string vad_model = ""; |
| 109 | + float vad_threshold = 0.5f; |
| 110 | + int vad_min_speech_duration_ms = 250; |
| 111 | + int vad_min_silence_duration_ms = 100; |
| 112 | + float vad_max_speech_duration_s = FLT_MAX; |
| 113 | + int vad_speech_pad_ms = 30; |
| 114 | + int vad_window_size_samples = 512; |
| 115 | + float vad_samples_overlap = 0.1f; |
104 | 116 | }; |
105 | 117 |
|
106 | 118 | static void whisper_print_usage(int argc, char ** argv, const whisper_params & params); |
@@ -189,6 +201,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params |
189 | 201 | else if ( arg == "--grammar") { params.grammar = ARGV_NEXT; } |
190 | 202 | else if ( arg == "--grammar-rule") { params.grammar_rule = ARGV_NEXT; } |
191 | 203 | else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); } |
| 204 | + // Voice Activity Detection (VAD): short flags must stay in sync with whisper_print_usage (-vspd = min speech, -vsd = min silence) |
| 205 | + else if (arg == "-v" || arg == "--vad") { params.vad = true; } |
| 206 | + else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; } |
| 207 | + else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); } |
| 208 | + else if (arg == "-vspd" || arg == "--vad_min_speech_duration_ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); } |
| 209 | + else if (arg == "-vsd" || arg == "--vad_min_silence_duration_ms") { params.vad_min_silence_duration_ms = std::stoi(ARGV_NEXT); } |
| 210 | + else if (arg == "-vmsd" || arg == "--vad_max_speech_duration_s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); } |
| 211 | + else if (arg == "-vp" || arg == "--vad_speech_pad_ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); } |
| 212 | + else if (arg == "-vs" || arg == "--vad_window_size_samples") { params.vad_window_size_samples = std::stoi(ARGV_NEXT); } |
| 213 | + else if (arg == "-vo" || arg == "--vad_samples_overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); } |
192 | 214 | else { |
193 | 215 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); |
194 | 216 | whisper_print_usage(argc, argv, params); |
@@ -258,6 +280,19 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params |
258 | 280 | fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str()); |
259 | 281 | fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str()); |
260 | 282 | fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty); |
| 283 | + // Voice Activity Detection (VAD) parameters |
| 284 | + fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n"); |
| 285 | + fprintf(stderr, " -v, --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false"); |
| 286 | + fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str()); |
| 287 | + fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold); |
| 288 | + fprintf(stderr, " -vs N, --vad_window_size_samples N [%-7d] VAD window size\n", params.vad_window_size_samples); |
| 289 | + fprintf(stderr, " -vspd N, --vad_min_speech_duration_ms N [%-7d] VAD min speech duration\n", params.vad_min_speech_duration_ms); |
| 290 | + fprintf(stderr, " -vsd N, --vad_min_silence_duration_ms N [%-7d] VAD min silence duration\n", params.vad_min_silence_duration_ms); |
| 291 | + fprintf(stderr, " -vmsd N, --vad_max_speech_duration_s N [%-7s] VAD max speech duration\n", params.vad_max_speech_duration_s == FLT_MAX ? |
| 292 | + std::string("FLT_MAX").c_str() : |
| 293 | + std::to_string(params.vad_max_speech_duration_s).c_str()); |
| 294 | + fprintf(stderr, " -vp N, --vad_speech_pad_ms N [%-7d] VAD speech padding\n", params.vad_speech_pad_ms); |
| 295 | + fprintf(stderr, " -vo N, --vad_samples_overlap N [%-7.2f] VAD samples overlap size\n", params.vad_samples_overlap); |
261 | 296 | fprintf(stderr, "\n"); |
262 | 297 | } |
263 | 298 |
|
@@ -1154,8 +1189,20 @@ int main(int argc, char ** argv) { |
1154 | 1189 |
|
1155 | 1190 | wparams.suppress_nst = params.suppress_nst; |
1156 | 1191 |
|
| 1192 | + wparams.vad = params.vad; |
| 1193 | + wparams.vad_model_path = params.vad_model.c_str(); |
| 1194 | + wparams.vad_threshold = params.vad_threshold; |
| 1195 | + wparams.vad_min_speech_duration_ms = params.vad_min_speech_duration_ms; |
| 1196 | + wparams.vad_min_silence_duration_ms = params.vad_min_silence_duration_ms; |
| 1197 | + wparams.vad_max_speech_duration_s = params.vad_max_speech_duration_s; |
| 1198 | + wparams.vad_speech_pad_ms = params.vad_speech_pad_ms; |
| 1199 | + wparams.vad_window_size_samples = params.vad_window_size_samples; |
| 1200 | + wparams.vad_samples_overlap = params.vad_samples_overlap; |
| 1201 | + |
1157 | 1202 | whisper_print_user_data user_data = { &params, &pcmf32s, 0 }; |
1158 | 1203 |
|
| 1204 | + |
| 1205 | + |
1159 | 1206 | const auto & grammar_parsed = params.grammar_parsed; |
1160 | 1207 | auto grammar_rules = grammar_parsed.c_rules(); |
1161 | 1208 |
|
|
0 commit comments