@@ -161,9 +161,9 @@ int main(int argc, char ** argv) {
161
161
const int n_samples_keep = (1e-3 *params.keep_ms )*WHISPER_SAMPLE_RATE;
162
162
const int n_samples_30s = (1e-3 *30000.0 )*WHISPER_SAMPLE_RATE;
163
163
164
- const int n_new_line = !params. vad ? std::max (1 , params.length_ms / params.step_ms - 1 ) : 1 ; // number of steps to print new line
165
- params.no_timestamps = !params.vad ;
166
- params.no_context |= params.vad ;
164
+ const int n_new_line = std::max (1 , params.length_ms / params.step_ms - 1 ); // number of steps to print new line
165
+ // params.no_timestamps = !params.vad;
166
+ // params.no_context |= params.vad;
167
167
params.max_tokens = 0 ;
168
168
169
169
// init audio
@@ -217,12 +217,7 @@ int main(int argc, char ** argv) {
217
217
params.translate ? " translate" : " transcribe" ,
218
218
params.no_timestamps ? 0 : 1 );
219
219
220
- if (!params.vad ) {
221
- fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
222
- } else {
223
- fprintf (stderr, " %s: using VAD, will transcribe on speech activity\n " , __func__);
224
- }
225
-
220
+ fprintf (stderr, " %s: n_new_line = %d, no_context = %d\n " , __func__, n_new_line, params.no_context );
226
221
fprintf (stderr, " \n " );
227
222
}
228
223
@@ -270,75 +265,45 @@ int main(int argc, char ** argv) {
270
265
271
266
// process new audio
272
267
273
- if (!params.vad ) {
274
- while (true ) {
275
- // handle Ctrl + C
276
- is_running = sdl_poll_events ();
277
- if (!is_running) {
278
- break ;
279
- }
280
- audio.get (params.step_ms , pcmf32_new);
281
-
282
- if ((int ) pcmf32_new.size () > 2 *n_samples_step) {
283
- fprintf (stderr, " \n\n %s: WARNING: cannot process audio fast enough, dropping audio ...\n\n " , __func__);
284
- audio.clear ();
285
- continue ;
286
- }
287
-
288
- if ((int ) pcmf32_new.size () >= n_samples_step) {
289
- audio.clear ();
290
- break ;
291
- }
292
-
293
- std::this_thread::sleep_for (std::chrono::milliseconds (1 ));
268
+ while (true ) {
269
+ // handle Ctrl + C
270
+ is_running = sdl_poll_events ();
271
+ if (!is_running) {
272
+ break ;
294
273
}
274
+ audio.get (params.step_ms , pcmf32_new);
295
275
296
- const int n_samples_new = pcmf32_new.size ();
297
-
298
- // take up to params.length_ms audio from previous iteration
299
- const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
300
-
301
- // fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
302
-
303
- pcmf32.resize (n_samples_new + n_samples_take);
304
-
305
- for (int i = 0 ; i < n_samples_take; i++) {
306
- pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
276
+ if ((int ) pcmf32_new.size () > 2 *n_samples_step) {
277
+ fprintf (stderr, " \n\n %s: WARNING: cannot process audio fast enough, dropping audio ...\n\n " , __func__);
278
+ audio.clear ();
279
+ continue ;
307
280
}
308
281
309
- memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new*sizeof (float ));
310
-
311
- pcmf32_old = pcmf32;
312
- } else {
313
- const auto t_now = std::chrono::high_resolution_clock::now ();
314
- const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count ();
315
-
316
- if (t_diff < params.step_ms ) {
317
- std::this_thread::sleep_for (std::chrono::milliseconds (params.step_ms ));
318
- continue ;
282
+ if ((int ) pcmf32_new.size () >= n_samples_step) {
283
+ audio.clear ();
284
+ break ;
319
285
}
320
286
321
- // Get new audio for this step
322
- audio. get (params. step_ms , pcmf32_new);
287
+ std::this_thread::sleep_for ( std::chrono::milliseconds ( 1 ));
288
+ }
323
289
324
- // Calculate how much old audio to keep
325
- const int n_samples_new = pcmf32_new.size ();
326
- const int n_samples_take = std::min ((int ) pcmf32_old.size (), std::max (0 , n_samples_keep + n_samples_len - n_samples_new));
290
+ const int n_samples_new = pcmf32_new.size ();
327
291
328
- // Combine old + new audio with overlap
329
- pcmf32. resize (n_samples_new + n_samples_take );
292
+ // take up to params.length_ms audio from previous iteration
293
+ const int n_samples_take = std::min (( int ) pcmf32_old. size (), std::max ( 0 , n_samples_keep + n_samples_len - n_samples_new) );
330
294
331
- // Copy kept portion from previous iteration
332
- for (int i = 0 ; i < n_samples_take; i++) {
333
- pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
334
- }
295
+ // fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
335
296
336
- // Append new audio
337
- memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new * sizeof (float ));
297
+ pcmf32.resize (n_samples_new + n_samples_take);
338
298
339
- t_last = t_now;
299
+ for (int i = 0 ; i < n_samples_take; i++) {
300
+ pcmf32[i] = pcmf32_old[pcmf32_old.size () - n_samples_take + i];
340
301
}
341
302
303
+ memcpy (pcmf32.data () + n_samples_take, pcmf32_new.data (), n_samples_new*sizeof (float ));
304
+
305
+ pcmf32_old = pcmf32;
306
+
342
307
// run the inference
343
308
{
344
309
whisper_full_params wparams = whisper_full_default_params (params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
@@ -381,14 +346,12 @@ int main(int argc, char ** argv) {
381
346
382
347
// print result;
383
348
{
384
- if (!params.vad ) {
385
- printf (" \33 [2K\r " );
349
+ printf (" \33 [2K\r " );
386
350
387
- // print long empty line to clear the previous line
388
- printf (" %s" , std::string (100 , ' ' ).c_str ());
351
+ // print long empty line to clear the previous line
352
+ printf (" %s" , std::string (100 , ' ' ).c_str ());
389
353
390
- printf (" \33 [2K\r " );
391
- }
354
+ printf (" \33 [2K\r " );
392
355
393
356
const int n_segments = whisper_full_n_segments (ctx);
394
357
for (int i = 0 ; i < n_segments; ++i) {
@@ -430,7 +393,7 @@ int main(int argc, char ** argv) {
430
393
431
394
++n_iter;
432
395
433
- if (!params. vad && (n_iter % n_new_line) == 0 ) {
396
+ if ((n_iter % n_new_line) == 0 ) {
434
397
printf (" \n " );
435
398
436
399
// keep part of the audio for next iteration to try to mitigate word boundary issues
0 commit comments