Skip to content

Commit a8f07d7

Browse files
committed
examples : add VAD parameters to CLI [no ci]
Example of format:
```console
$ ./build/bin/whisper-cli --help

usage: ./build/bin/whisper-cli [options] file0 file1 ...
supported audio formats: flac, mp3, ogg, wav

options:
  -h, --help [default] show this help message and exit
  ...

Voice Activity Detection (VAD) options:
  -v, --vad [false ] enable Voice Activity Detection (VAD)
  -vm FNAME, --vad-model FNAME [ ] VAD model path
  -vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
  -vs N, --vad_window_size_samples N [512 ] VAD window size
  -vspd N, --vad_min_speech_duration_ms N [250 ] VAD min speech duration
  -vsd N, --vad_min_silence_duration_ms N [100 ] VAD min silence duration
  -vmsd N, --vad_max_speech_duration_s N [FLT_MAX] VAD max speech duration
  -vp N, --vad_speech_pad_ms N [30 ] VAD speech padding
  -vo N, --vad_samples_overlap N [0.10 ] VAD samples overlap size
```
The main reason for the separate VAD options section is that the VAD options are longer and made the rest of the help output look a little ugly.
1 parent 0dc626e commit a8f07d7

File tree

2 files changed

+50
-3
lines changed

2 files changed

+50
-3
lines changed

examples/cli/cli.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <thread>
1212
#include <vector>
1313
#include <cstring>
14+
#include <cfloat>
1415

1516
#if defined(_WIN32)
1617
#ifndef NOMINMAX
@@ -101,6 +102,17 @@ struct whisper_params {
101102
std::vector<std::string> fname_out = {};
102103

103104
grammar_parser::parse_state grammar_parsed;
105+
106+
// Voice Activity Detection (VAD) parameters
107+
bool vad = false;
108+
std::string vad_model = "";
109+
float vad_threshold = 0.5f;
110+
int vad_min_speech_duration_ms = 250;
111+
int vad_min_silence_duration_ms = 100;
112+
float vad_max_speech_duration_s = FLT_MAX;
113+
int vad_speech_pad_ms = 30;
114+
int vad_window_size_samples = 512;
115+
float vad_samples_overlap = 0.1f;
104116
};
105117

106118
static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -189,6 +201,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
189201
else if ( arg == "--grammar") { params.grammar = ARGV_NEXT; }
190202
else if ( arg == "--grammar-rule") { params.grammar_rule = ARGV_NEXT; }
191203
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); }
204+
// Voice Activity Detection (VAD)
205+
else if (arg == "-v" || arg == "--vad") { params.vad = true; }
206+
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
207+
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
208+
// Min speech duration: the short flag is -vspd, as printed by whisper_print_usage;
// -vsd belongs to the min *silence* duration option.
else if (arg == "-vspd" || arg == "--vad_min_speech_duration_ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
209+
// Min silence duration: must be stored in the silence field, not the speech one.
else if (arg == "-vsd" || arg == "--vad_min_silence_duration_ms") { params.vad_min_silence_duration_ms = std::stoi(ARGV_NEXT); }
210+
else if (arg == "-vmsd" || arg == "--vad_max_speech_duration_s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
211+
else if (arg == "-vp" || arg == "--vad_speech_pad_ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
212+
else if (arg == "-vs" || arg == "--vad_window_size_samples") { params.vad_window_size_samples = std::stoi(ARGV_NEXT); }
213+
else if (arg == "-vo" || arg == "--vad_samples_overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); }
192214
else {
193215
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
194216
whisper_print_usage(argc, argv, params);
@@ -258,6 +280,19 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
258280
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
259281
fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str());
260282
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
283+
// Voice Activity Detection (VAD) parameters
284+
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
285+
fprintf(stderr, " -v, --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
286+
fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
287+
fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
288+
fprintf(stderr, " -vs N, --vad_window_size_samples N [%-7d] VAD window size\n", params.vad_window_size_samples);
289+
fprintf(stderr, " -vspd N, --vad_min_speech_duration_ms N [%-7d] VAD min speech duration\n", params.vad_min_speech_duration_ms);
290+
fprintf(stderr, " -vsd N, --vad_min_silence_duration_ms N [%-7d] VAD min silence duration\n", params.vad_min_silence_duration_ms);
291+
fprintf(stderr, " -vmsd N, --vad_max_speech_duration_s N [%-7s] VAD max speech duration\n", params.vad_max_speech_duration_s == FLT_MAX ?
292+
std::string("FLT_MAX").c_str() :
293+
std::to_string(params.vad_max_speech_duration_s).c_str());
294+
fprintf(stderr, " -vp N, --vad_speech_pad_ms N [%-7d] VAD speech padding\n", params.vad_speech_pad_ms);
295+
fprintf(stderr, " -vo N, --vad_samples_overlap N [%-7.2f] VAD samples overlap size\n", params.vad_samples_overlap);
261296
fprintf(stderr, "\n");
262297
}
263298

@@ -1154,8 +1189,20 @@ int main(int argc, char ** argv) {
11541189

11551190
wparams.suppress_nst = params.suppress_nst;
11561191

1192+
wparams.vad = params.vad;
1193+
wparams.vad_model_path = params.vad_model.c_str();
1194+
wparams.vad_threshold = params.vad_threshold;
1195+
wparams.vad_min_speech_duration_ms = params.vad_min_speech_duration_ms;
1196+
wparams.vad_min_silence_duration_ms = params.vad_min_silence_duration_ms;
1197+
wparams.vad_max_speech_duration_s = params.vad_max_speech_duration_s;
1198+
wparams.vad_speech_pad_ms = params.vad_speech_pad_ms;
1199+
wparams.vad_window_size_samples = params.vad_window_size_samples;
1200+
wparams.vad_samples_overlap = params.vad_samples_overlap;
1201+
11571202
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
11581203

1204+
1205+
11591206
const auto & grammar_parsed = params.grammar_parsed;
11601207
auto grammar_rules = grammar_parsed.c_rules();
11611208

src/whisper.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4524,7 +4524,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const whisper_vad_hparams &
45244524
return nullptr;
45254525
}
45264526

4527-
static ggml_tensor * whisper_vad_build_stft_layer(ggml_context* ctx0,
4527+
static ggml_tensor * whisper_vad_build_stft_layer(ggml_context * ctx0,
45284528
const whisper_vad_model & model, ggml_tensor * cur) {
45294529
// Apply reflective padding to the input tensor
45304530
ggml_tensor * padded = ggml_pad_reflect_1d(ctx0, cur, 64, 64);
@@ -4552,7 +4552,7 @@ static ggml_tensor * whisper_vad_build_stft_layer(ggml_context* ctx0,
45524552
return magnitude;
45534553
}
45544554

4555-
static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context* ctx0,
4555+
static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context * ctx0,
45564556
const whisper_vad_model & model, ggml_tensor * cur) {
45574557
// First Conv1D: expands to 128 channels.
45584558
cur = ggml_conv_1d(ctx0, model.encoder_0_weight, cur, 1, 1, 1);
@@ -4577,7 +4577,7 @@ static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context* ctx0,
45774577
return cur;
45784578
}
45794579

4580-
static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context* ctx0,
4580+
static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
45814581
const whisper_vad_context & vctx, ggml_tensor * cur, ggml_cgraph * gf) {
45824582
const whisper_vad_model & model = vctx.model;
45834583
const int hdim = model.hparams.lstm_hidden_size;

0 commit comments

Comments
 (0)