@@ -25,9 +25,6 @@ struct whisper_params {
25
25
int32_t audio_ctx = 0 ;
26
26
int32_t beam_size = -1 ;
27
27
28
- float vad_thold = 0 .6f ;
29
- float freq_thold = 100 .0f ;
30
-
31
28
bool translate = false ;
32
29
bool no_fallback = false ;
33
30
bool print_special = false ;
@@ -37,10 +34,21 @@ struct whisper_params {
37
34
bool save_audio = false ; // save audio to wav file
38
35
bool use_gpu = true ;
39
36
bool flash_attn = false ;
37
+ bool no_prints = false ;
40
38
41
39
std::string language = " en" ;
42
40
std::string model = " models/ggml-base.en.bin" ;
43
41
std::string fname_out;
42
+
43
+ // Voice Activity Detection (VAD) parameters
44
+ bool vad = false ;
45
+ std::string vad_model = " models/for-tests-silero-v5.1.2-ggml.bin" ;
46
+ float vad_threshold = 0 .5f ;
47
+ int vad_min_speech_duration_ms = 250 ;
48
+ int vad_min_silence_duration_ms = 100 ;
49
+ float vad_max_speech_duration_s = FLT_MAX;
50
+ int vad_speech_pad_ms = 30 ;
51
+ float vad_samples_overlap = 0 .1f ;
44
52
};
45
53
46
54
void whisper_print_usage (int argc, char ** argv, const whisper_params & params);
@@ -61,8 +69,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
61
69
else if (arg == " -mt" || arg == " --max-tokens" ) { params.max_tokens = std::stoi (argv[++i]); }
62
70
else if (arg == " -ac" || arg == " --audio-ctx" ) { params.audio_ctx = std::stoi (argv[++i]); }
63
71
else if (arg == " -bs" || arg == " --beam-size" ) { params.beam_size = std::stoi (argv[++i]); }
64
- else if (arg == " -vth" || arg == " --vad-thold" ) { params.vad_thold = std::stof (argv[++i]); }
65
- else if (arg == " -fth" || arg == " --freq-thold" ) { params.freq_thold = std::stof (argv[++i]); }
66
72
else if (arg == " -tr" || arg == " --translate" ) { params.translate = true ; }
67
73
else if (arg == " -nf" || arg == " --no-fallback" ) { params.no_fallback = true ; }
68
74
else if (arg == " -ps" || arg == " --print-special" ) { params.print_special = true ; }
@@ -74,7 +80,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
74
80
else if (arg == " -sa" || arg == " --save-audio" ) { params.save_audio = true ; }
75
81
else if (arg == " -ng" || arg == " --no-gpu" ) { params.use_gpu = false ; }
76
82
else if (arg == " -fa" || arg == " --flash-attn" ) { params.flash_attn = true ; }
77
-
83
+ else if (arg == " -np" || arg == " --no-prints" ) { params.no_prints = true ; }
84
+ // Voice Activity Detection (VAD)
85
+ else if ( arg == " --vad" ) { params.vad = true ; }
86
+ else if (arg == " -vm" || arg == " --vad-model" ) { params.vad_model = argv[++i]; }
87
+ else if (arg == " -vt" || arg == " --vad-threshold" ) { params.vad_threshold = std::stof (argv[++i]); }
88
+ else if (arg == " -vspd" || arg == " --vad-min-speech-duration-ms" ) { params.vad_min_speech_duration_ms = std::stoi (argv[++i]); } // short flag matches the usage text (-vspd)
89
+ else if (arg == " -vsd" || arg == " --vad-min-silence-duration-ms" ) { params.vad_min_silence_duration_ms = std::stoi (argv[++i]); } // was writing the min-speech field by mistake
90
+ else if (arg == " -vmsd" || arg == " --vad-max-speech-duration-s" ) { params.vad_max_speech_duration_s = std::stof (argv[++i]); }
91
+ else if (arg == " -vp" || arg == " --vad-speech-pad-ms" ) { params.vad_speech_pad_ms = std::stoi (argv[++i]); }
92
+ else if (arg == " -vo" || arg == " --vad-samples-overlap" ) { params.vad_samples_overlap = std::stof (argv[++i]); }
78
93
else {
79
94
fprintf (stderr, " error: unknown argument: %s\n " , arg.c_str ());
80
95
whisper_print_usage (argc, argv, params);
@@ -99,8 +114,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
99
114
fprintf (stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n " , params.max_tokens );
100
115
fprintf (stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n " , params.audio_ctx );
101
116
fprintf (stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n " , params.beam_size );
102
- fprintf (stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n " , params.vad_thold );
103
- fprintf (stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n " , params.freq_thold );
104
117
fprintf (stderr, " -tr, --translate [%-7s] translate from source language to english\n " , params.translate ? " true" : " false" );
105
118
fprintf (stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n " , params.no_fallback ? " true" : " false" );
106
119
fprintf (stderr, " -ps, --print-special [%-7s] print special tokens\n " , params.print_special ? " true" : " false" );
@@ -112,30 +125,45 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112
125
fprintf (stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n " , params.save_audio ? " true" : " false" );
113
126
fprintf (stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n " , params.use_gpu ? " false" : " true" );
114
127
fprintf (stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n " , params.flash_attn ? " true" : " false" );
128
+ fprintf (stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n " , params.no_prints ? " true" : " false" );
129
+ // Voice Activity Detection (VAD) parameters
130
+ fprintf (stderr, " \n Voice Activity Detection (VAD) options:\n " );
131
+ fprintf (stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n " , params.vad ? " true" : " false" );
132
+ fprintf (stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n " , params.vad_model .c_str ());
133
+ fprintf (stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n " , params.vad_threshold );
134
+ fprintf (stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (in milliseconds)\n " , params.vad_min_speech_duration_ms ); // integer ms value, not a 0.0-1.0 ratio
135
+ fprintf (stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n " , params.vad_min_silence_duration_ms );
136
+ fprintf (stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n " , params.vad_max_speech_duration_s == FLT_MAX ?
137
+ std::string (" FLT_MAX" ).c_str () :
138
+ std::to_string (params.vad_max_speech_duration_s ).c_str ());
139
+ fprintf (stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n " , params.vad_speech_pad_ms );
140
+ fprintf (stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n " , params.vad_samples_overlap );
115
141
fprintf (stderr, " \n " );
116
142
}
117
143
144
+ static void cb_log_disable (enum ggml_log_level /*level*/, const char * /*text*/, void * /*user_data*/) { /* intentionally empty: every log message handed to this callback is dropped */ }
145
+
118
146
int main (int argc, char ** argv) {
119
147
whisper_params params;
120
148
121
149
if (whisper_params_parse (argc, argv, params) == false ) {
122
150
return 1 ;
123
151
}
124
152
153
+ if (params.no_prints ) {
154
+ whisper_log_set (cb_log_disable, NULL );
155
+ }
156
+
125
157
params.keep_ms = std::min (params.keep_ms , params.step_ms );
126
158
params.length_ms = std::max (params.length_ms , params.step_ms );
127
159
160
+
128
161
const int n_samples_step = (1e-3 *params.step_ms )*WHISPER_SAMPLE_RATE;
129
162
const int n_samples_len = (1e-3 *params.length_ms )*WHISPER_SAMPLE_RATE;
130
163
const int n_samples_keep = (1e-3 *params.keep_ms )*WHISPER_SAMPLE_RATE;
131
164
const int n_samples_30s = (1e-3 *30000.0 )*WHISPER_SAMPLE_RATE;
132
165
133
- const bool use_vad = n_samples_step <= 0 ; // sliding window mode uses VAD
134
-
135
- const int n_new_line = !use_vad ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line
136
-
137
- params.no_timestamps = !use_vad;
138
- params.no_context |= use_vad;
166
+ const int n_new_line = params.step_ms > 0 ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line; falls back to 1 when step_ms <= 0 so the division can never be by zero
139
167
params.max_tokens = 0 ;
140
168
141
169
// init audio
@@ -189,11 +217,7 @@ int main(int argc, char ** argv) {
189
217
params.translate ? " translate" : " transcribe" ,
190
218
params.no_timestamps ? 0 : 1 );
191
219
192
- if (!use_vad) {
193
- fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
194
- } else {
195
- fprintf (stderr, " %s: using VAD, will transcribe on speech activity\n " , __func__);
196
- }
220
+ fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
197
221
198
222
fprintf (stderr, " \n " );
199
223
}
@@ -242,67 +266,44 @@ int main(int argc, char ** argv) {
242
266
243
267
// process new audio
244
268
245
- if (!use_vad) {
246
- while (true ) {
247
- // handle Ctrl + C
248
- is_running = sdl_poll_events ();
249
- if (!is_running) {
250
- break ;
251
- }
252
- audio.get (params.step_ms , pcmf32_new);
253
-
254
- if ((int ) pcmf32_new.size () > 2 *n_samples_step) {
255
- fprintf (stderr, " \n\n %s: WARNING: cannot process audio fast enough, dropping audio ...\n\n " , __func__);
256
- audio.clear ();
257
- continue ;
258
- }
259
-
260
- if ((int ) pcmf32_new.size () >= n_samples_step) {
261
- audio.clear ();
262
- break ;
263
- }
264
-
265
- std::this_thread::sleep_for (std::chrono::milliseconds (1 ));
269
+ while (true ) {
270
+ // handle Ctrl + C
271
+ is_running = sdl_poll_events ();
272
+ if (!is_running) {
273
+ break ;
266
274
}
275
+ audio.get (params.step_ms , pcmf32_new);
267
276
268
- const int n_samples_new = pcmf32_new.size ();
269
-
270
- // take up to params.length_ms audio from previous iteration
271
- const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
272
-
273
- // printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
274
-
275
- pcmf32.resize (n_samples_new + n_samples_take);
277
+ if ((int ) pcmf32_new.size () > 2 *n_samples_step) {
278
+ fprintf (stderr, " \n\n %s: WARNING: cannot process audio fast enough, dropping audio ...\n\n " , __func__);
279
+ audio.clear ();
280
+ continue ;
281
+ }
276
282
277
- for (int i = 0 ; i < n_samples_take; i++) {
278
- pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
283
+ if ((int ) pcmf32_new.size () >= n_samples_step) {
284
+ audio.clear ();
285
+ break ;
279
286
}
280
287
281
- memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new*sizeof (float ));
288
+ std::this_thread::sleep_for (std::chrono::milliseconds (1 ));
289
+ }
282
290
283
- pcmf32_old = pcmf32;
284
- } else {
285
- const auto t_now = std::chrono::high_resolution_clock::now ();
286
- const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count ();
291
+ const int n_samples_new = pcmf32_new.size ();
287
292
288
- if (t_diff < 2000 ) {
289
- std::this_thread::sleep_for ( std::chrono::milliseconds ( 100 ));
293
+ // take up to params.length_ms audio from previous iteration
294
+ const int n_samples_take = std::min (( int ) pcmf32_old. size (), std::max ( 0 , n_samples_keep + n_samples_len - n_samples_new ));
290
295
291
- continue ;
292
- }
296
+ // fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
293
297
294
- audio. get ( 2000 , pcmf32_new );
298
+ pcmf32. resize (n_samples_new + n_samples_take );
295
299
296
- if (::vad_simple (pcmf32_new, WHISPER_SAMPLE_RATE, 1000 , params.vad_thold , params.freq_thold , false )) {
297
- audio.get (params.length_ms , pcmf32);
298
- } else {
299
- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
300
+ for (int i = 0 ; i < n_samples_take; i++) {
301
+ pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
302
+ }
300
303
301
- continue ;
302
- }
304
+ memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new*sizeof (float ));
303
305
304
- t_last = t_now;
305
- }
306
+ pcmf32_old = pcmf32;
306
307
307
308
// run the inference
308
309
{
@@ -313,7 +314,6 @@ int main(int argc, char ** argv) {
313
314
wparams.print_realtime = false ;
314
315
wparams.print_timestamps = !params.no_timestamps ;
315
316
wparams.translate = params.translate ;
316
- wparams.single_segment = !use_vad;
317
317
wparams.max_tokens = params.max_tokens ;
318
318
wparams.language = params.language .c_str ();
319
319
wparams.n_threads = params.n_threads ;
@@ -330,28 +330,29 @@ int main(int argc, char ** argv) {
330
330
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data ();
331
331
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size ();
332
332
333
+ wparams.vad = params.vad ;
334
+ wparams.vad_model_path = params.vad_model .c_str ();
335
+
336
+ wparams.vad_params .threshold = params.vad_threshold ;
337
+ wparams.vad_params .min_speech_duration_ms = params.vad_min_speech_duration_ms ;
338
+ wparams.vad_params .min_silence_duration_ms = params.vad_min_silence_duration_ms ;
339
+ wparams.vad_params .max_speech_duration_s = params.vad_max_speech_duration_s ;
340
+ wparams.vad_params .speech_pad_ms = params.vad_speech_pad_ms ;
341
+ wparams.vad_params .samples_overlap = params.vad_samples_overlap ;
342
+
333
343
if (whisper_full (ctx, wparams, pcmf32.data (), pcmf32.size ()) != 0 ) {
334
344
fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
335
345
return 6 ;
336
346
}
337
347
338
348
// print result;
339
349
{
340
- if (!use_vad) {
341
- printf (" \33 [2K\r " );
342
-
343
- // print long empty line to clear the previous line
344
- printf (" %s" , std::string (100 , ' ' ).c_str ());
350
+ printf (" \33 [2K\r " );
345
351
346
- printf (" \33 [2K\r " );
347
- } else {
348
- const int64_t t1 = (t_last - t_start).count ()/1000000 ;
349
- const int64_t t0 = std::max (0.0 , t1 - pcmf32.size ()*1000.0 /WHISPER_SAMPLE_RATE);
352
+ // print long empty line to clear the previous line
353
+ printf (" %s" , std::string (100 , ' ' ).c_str ());
350
354
351
- printf (" \n " );
352
- printf (" ### Transcription %d START | t0 = %d ms | t1 = %d ms\n " , n_iter, (int ) t0, (int ) t1);
353
- printf (" \n " );
354
- }
355
+ printf (" \33 [2K\r " );
355
356
356
357
const int n_segments = whisper_full_n_segments (ctx);
357
358
for (int i = 0 ; i < n_segments; ++i) {
@@ -389,15 +390,11 @@ int main(int argc, char ** argv) {
389
390
fout << std::endl;
390
391
}
391
392
392
- if (use_vad) {
393
- printf (" \n " );
394
- printf (" ### Transcription %d END\n " , n_iter);
395
- }
396
393
}
397
394
398
395
++n_iter;
399
396
400
- if (!use_vad && (n_iter % n_new_line) == 0 ) {
397
+ if ((n_iter % n_new_line) == 0 ) {
401
398
printf (" \n " );
402
399
403
400
// keep part of the audio for next iteration to try to mitigate word boundary issues
0 commit comments