examples : add VAD parameters to CLI [no ci]

danbev · danbev · commit a8f07d770e01 · 2025-05-02T15:47:16.000+02:00
Example of the resulting help output:
```console

$ ./build/bin/whisper-cli --help

usage: ./build/bin/whisper-cli [options] file0 file1 ...
supported audio formats: flac, mp3, ogg, wav

options:
  -h,        --help              [default] show this help message and exit
  ...

Voice Activity Detection (VAD) options:
  -v,        --vad                           [false  ] enable Voice Activity Detection (VAD)
  -vm FNAME, --vad-model FNAME               [       ] VAD model path
  -vt N,     --vad-threshold N               [0.50   ] VAD threshold for speech recognition
  -vs N,     --vad_window_size_samples     N [512    ] VAD window size
  -vspd N,   --vad_min_speech_duration_ms  N [250    ] VAD min speech duration
  -vsd N,    --vad_min_silence_duration_ms N [100    ] VAD min silence duration
  -vmsd N,   --vad_max_speech_duration_s   N [FLT_MAX] VAD max speech duration
  -vp N,     --vad_speech_pad_ms           N [30     ] VAD speech padding
  -vo N,     --vad_samples_overlap         N [0.10   ] VAD samples overlap size
```
The VAD options get their own section because their names are longer;
aligning them with the other options made the rest of the help output look a little ugly.
diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
@@ -11,6 +11,7 @@
 #include <thread>
 #include <vector>
 #include <cstring>
+#include <cfloat>
 
 #if defined(_WIN32)
 #ifndef NOMINMAX
@@ -101,6 +102,17 @@ struct whisper_params {
     std::vector<std::string> fname_out = {};
 
     grammar_parser::parse_state grammar_parsed;
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad           = false;
+    std::string vad_model     = "";
+    float       vad_threshold = 0.5f;
+    int         vad_min_speech_duration_ms = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s = FLT_MAX;
+    int         vad_speech_pad_ms = 30;
+    int         vad_window_size_samples = 512;
+    float       vad_samples_overlap = 0.1f;
 };
 
 static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -189,6 +201,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (                  arg == "--grammar")         { params.grammar         = ARGV_NEXT; }
         else if (                  arg == "--grammar-rule")    { params.grammar_rule    = ARGV_NEXT; }
         else if (                  arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); }
+        // Voice Activity Detection (VAD): each flag must match the help text in whisper_print_usage and set its own field
+        else if (arg == "-v"    || arg == "--vad")                         { params.vad                         = true; }
+        else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = ARGV_NEXT; }
+        else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold               = std::stof(ARGV_NEXT); }
+        else if (arg == "-vspd" || arg == "--vad_min_speech_duration_ms")  { params.vad_min_speech_duration_ms  = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vsd"  || arg == "--vad_min_silence_duration_ms") { params.vad_min_silence_duration_ms = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vmsd" || arg == "--vad_max_speech_duration_s")   { params.vad_max_speech_duration_s   = std::stof(ARGV_NEXT); }
+        else if (arg == "-vp"   || arg == "--vad_speech_pad_ms")           { params.vad_speech_pad_ms           = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vs"   || arg == "--vad_window_size_samples")     { params.vad_window_size_samples     = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vo"   || arg == "--vad_samples_overlap")         { params.vad_samples_overlap         = std::stof(ARGV_NEXT); }
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             whisper_print_usage(argc, argv, params);
@@ -258,6 +280,19 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
     fprintf(stderr, "  --grammar GRAMMAR              [%-7s] GBNF grammar to guide decoding\n",                 params.grammar.c_str());
     fprintf(stderr, "  --grammar-rule RULE            [%-7s] top-level GBNF grammar rule name\n",               params.grammar_rule.c_str());
     fprintf(stderr, "  --grammar-penalty N            [%-7.1f] scales down logits of nongrammar tokens\n",      params.grammar_penalty);
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "  -v,        --vad                           [%-7s] enable Voice Activity Detection (VAD)\n",  params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n",                         params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
+    fprintf(stderr, "  -vs N,     --vad_window_size_samples     N [%-7d] VAD window size\n",                        params.vad_window_size_samples);
+    fprintf(stderr, "  -vspd N,   --vad_min_speech_duration_ms  N [%-7d] VAD min speech duration\n",                params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad_min_silence_duration_ms N [%-7d] VAD min silence duration\n",               params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad_max_speech_duration_s   N [%-7s] VAD max speech duration\n",                params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                                                                    std::string("FLT_MAX").c_str() :
+                                                                                                                    std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad_speech_pad_ms           N [%-7d] VAD speech padding\n",                     params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad_samples_overlap         N [%-7.2f] VAD samples overlap size\n",             params.vad_samples_overlap);
     fprintf(stderr, "\n");
 }
 
@@ -1154,8 +1189,20 @@ int main(int argc, char ** argv) {
 
             wparams.suppress_nst     = params.suppress_nst;
 
+            wparams.vad                         = params.vad;
+            wparams.vad_model_path              = params.vad_model.c_str();
+            wparams.vad_threshold               = params.vad_threshold;
+            wparams.vad_min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+            wparams.vad_min_silence_duration_ms = params.vad_min_silence_duration_ms;
+            wparams.vad_max_speech_duration_s   = params.vad_max_speech_duration_s;
+            wparams.vad_speech_pad_ms           = params.vad_speech_pad_ms;
+            wparams.vad_window_size_samples     = params.vad_window_size_samples;
+            wparams.vad_samples_overlap         = params.vad_samples_overlap;
+
             whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
 
+
+
             const auto & grammar_parsed = params.grammar_parsed;
             auto grammar_rules = grammar_parsed.c_rules();
 
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4524,7 +4524,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const whisper_vad_hparams &
     return nullptr;
 }
 
-static ggml_tensor * whisper_vad_build_stft_layer(ggml_context* ctx0,
+static ggml_tensor * whisper_vad_build_stft_layer(ggml_context * ctx0,
         const whisper_vad_model & model, ggml_tensor * cur) {
     // Apply reflective padding to the input tensor
     ggml_tensor * padded = ggml_pad_reflect_1d(ctx0, cur, 64, 64);
@@ -4552,7 +4552,7 @@ static ggml_tensor * whisper_vad_build_stft_layer(ggml_context* ctx0,
     return magnitude;
 }
 
-static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context* ctx0,
+static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context * ctx0,
         const whisper_vad_model & model, ggml_tensor * cur) {
     // First Conv1D: expands to 128 channels.
     cur = ggml_conv_1d(ctx0, model.encoder_0_weight, cur, 1, 1, 1);
@@ -4577,7 +4577,7 @@ static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context* ctx0,
     return cur;
 }
 
-static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context* ctx0,
+static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context * ctx0,
         const whisper_vad_context & vctx, ggml_tensor * cur, ggml_cgraph * gf) {
     const whisper_vad_model & model = vctx.model;
     const int hdim = model.hparams.lstm_hidden_size;