Skip to content

Commit 8ac388f

Browse files
committed
stream : remove simple_vad specific code from stream.cpp [no ci]
wip
1 parent b924706 commit 8ac388f

File tree

1 file changed

+35
-72
lines changed

1 file changed

+35
-72
lines changed

examples/stream/stream.cpp

Lines changed: 35 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -161,9 +161,9 @@ int main(int argc, char ** argv) {
161161
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
162162
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
163163

164-
const int n_new_line = !params.vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
165-
params.no_timestamps = !params.vad;
166-
params.no_context |= params.vad;
164+
const int n_new_line = std::max(1, params.length_ms / params.step_ms - 1); // number of steps to print new line
165+
//params.no_timestamps = !params.vad;
166+
//params.no_context |= params.vad;
167167
params.max_tokens = 0;
168168

169169
// init audio
@@ -217,12 +217,7 @@ int main(int argc, char ** argv) {
217217
params.translate ? "translate" : "transcribe",
218218
params.no_timestamps ? 0 : 1);
219219

220-
if (!params.vad) {
221-
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
222-
} else {
223-
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
224-
}
225-
220+
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
226221
fprintf(stderr, "\n");
227222
}
228223

@@ -270,75 +265,45 @@ int main(int argc, char ** argv) {
270265

271266
// process new audio
272267

273-
if (!params.vad) {
274-
while (true) {
275-
// handle Ctrl + C
276-
is_running = sdl_poll_events();
277-
if (!is_running) {
278-
break;
279-
}
280-
audio.get(params.step_ms, pcmf32_new);
281-
282-
if ((int) pcmf32_new.size() > 2*n_samples_step) {
283-
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
284-
audio.clear();
285-
continue;
286-
}
287-
288-
if ((int) pcmf32_new.size() >= n_samples_step) {
289-
audio.clear();
290-
break;
291-
}
292-
293-
std::this_thread::sleep_for(std::chrono::milliseconds(1));
268+
while (true) {
269+
// handle Ctrl + C
270+
is_running = sdl_poll_events();
271+
if (!is_running) {
272+
break;
294273
}
274+
audio.get(params.step_ms, pcmf32_new);
295275

296-
const int n_samples_new = pcmf32_new.size();
297-
298-
// take up to params.length_ms audio from previous iteration
299-
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
300-
301-
//fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
302-
303-
pcmf32.resize(n_samples_new + n_samples_take);
304-
305-
for (int i = 0; i < n_samples_take; i++) {
306-
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
276+
if ((int) pcmf32_new.size() > 2*n_samples_step) {
277+
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
278+
audio.clear();
279+
continue;
307280
}
308281

309-
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
310-
311-
pcmf32_old = pcmf32;
312-
} else {
313-
const auto t_now = std::chrono::high_resolution_clock::now();
314-
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
315-
316-
if (t_diff < params.step_ms) {
317-
std::this_thread::sleep_for(std::chrono::milliseconds(params.step_ms));
318-
continue;
282+
if ((int) pcmf32_new.size() >= n_samples_step) {
283+
audio.clear();
284+
break;
319285
}
320286

321-
// Get new audio for this step
322-
audio.get(params.step_ms, pcmf32_new);
287+
std::this_thread::sleep_for(std::chrono::milliseconds(1));
288+
}
323289

324-
// Calculate how much old audio to keep
325-
const int n_samples_new = pcmf32_new.size();
326-
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
290+
const int n_samples_new = pcmf32_new.size();
327291

328-
// Combine old + new audio with overlap
329-
pcmf32.resize(n_samples_new + n_samples_take);
292+
// take up to params.length_ms audio from previous iteration
293+
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
330294

331-
// Copy kept portion from previous iteration
332-
for (int i = 0; i < n_samples_take; i++) {
333-
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
334-
}
295+
//fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
335296

336-
// Append new audio
337-
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new * sizeof(float));
297+
pcmf32.resize(n_samples_new + n_samples_take);
338298

339-
t_last = t_now;
299+
for (int i = 0; i < n_samples_take; i++) {
300+
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
340301
}
341302

303+
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
304+
305+
pcmf32_old = pcmf32;
306+
342307
// run the inference
343308
{
344309
whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
@@ -381,14 +346,12 @@ int main(int argc, char ** argv) {
381346

382347
// print result;
383348
{
384-
if (!params.vad) {
385-
printf("\33[2K\r");
349+
printf("\33[2K\r");
386350

387-
// print long empty line to clear the previous line
388-
printf("%s", std::string(100, ' ').c_str());
351+
// print long empty line to clear the previous line
352+
printf("%s", std::string(100, ' ').c_str());
389353

390-
printf("\33[2K\r");
391-
}
354+
printf("\33[2K\r");
392355

393356
const int n_segments = whisper_full_n_segments(ctx);
394357
for (int i = 0; i < n_segments; ++i) {
@@ -430,7 +393,7 @@ int main(int argc, char ** argv) {
430393

431394
++n_iter;
432395

433-
if (!params.vad && (n_iter % n_new_line) == 0) {
396+
if ((n_iter % n_new_line) == 0) {
434397
printf("\n");
435398

436399
// keep part of the audio for next iteration to try to mitigate word boundary issues

0 commit comments

Comments
 (0)