Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ else()
add_subdirectory(bench)
add_subdirectory(server)
add_subdirectory(quantize)
add_subdirectory(vad-speech-segments)
if (WHISPER_SDL2)
add_subdirectory(stream)
add_subdirectory(command)
Expand Down
8 changes: 8 additions & 0 deletions examples/vad-speech-segments/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build the vad-speech-segments example: a small CLI that runs a VAD
# (Voice Activity Detection) model over an audio file and prints the
# detected speech segments.
set(TARGET vad-speech-segments)
add_executable(${TARGET} speech.cpp)

# Shared compiler/linker options used by all whisper.cpp examples.
include(DefaultTargetOptions)

# FFMPEG_LIBRARIES is empty unless FFmpeg support was enabled at configure
# time; CMAKE_THREAD_LIBS_INIT supplies the platform thread library.
target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

install(TARGETS ${TARGET} RUNTIME)
52 changes: 52 additions & 0 deletions examples/vad-speech-segments/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# whisper.cpp/examples/vad-speech-segments

This example demonstrates how to use a VAD (Voice Activity Detection) model to
segment an audio file into speech segments.

### Building the example
The example can be built using the following command:
```console
cmake -S . -B build
cmake --build build -j8 --target vad-speech-segments
```

### Running the example
The example can be run using the following command, which uses a model
that we use internally for testing:
```console
./build/bin/vad-speech-segments \
-vad-model models/for-tests-silero-v5.1.2-ggml.bin \
--file samples/jfk.wav \
--no-prints

Detected 5 speech segments:
Speech segment 0: start = 0.29, end = 2.21
Speech segment 1: start = 3.30, end = 3.77
Speech segment 2: start = 4.00, end = 4.35
Speech segment 3: start = 5.38, end = 7.65
Speech segment 4: start = 8.16, end = 10.59
```
To see more output from whisper.cpp remove the `--no-prints` argument.


### Command line options
```console
./build/bin/vad-speech-segments --help

usage: ./build/bin/vad-speech-segments [options] file
supported audio formats: flac, mp3, ogg, wav

options:
-h, --help [default] show this help message and exit
-f FNAME, --file FNAME [ ] input audio file path
-t N, --threads N [4 ] number of threads to use during computation
-ug,        --use-gpu                      [false  ] use GPU
-vm FNAME, --vad-model FNAME [ ] VAD model path
-vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
-vspd N,    --vad-min-speech-duration-ms N [250    ] VAD min speech duration in ms
-vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
-vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
-vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
-vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
-np, --no-prints [false ] do not print anything other than the results
```
143 changes: 143 additions & 0 deletions examples/vad-speech-segments/speech.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#include "common.h"
#include "common-whisper.h"

#include "whisper.h"

#include <cstdio>
#include <cfloat>
#include <string>

// command-line parameters
// command-line parameters
struct cli_params {
    // Worker threads for VAD inference: at most 4, fewer if the hardware
    // reports less concurrency.
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    // Path to the VAD model file (required; no default model is bundled).
    std::string vad_model = "";
    // Speech probability threshold in [0.0, 1.0]; frames above it count as speech.
    float vad_threshold = 0.5f;
    // Segments shorter than this (in ms) are discarded.
    int vad_min_speech_duration_ms = 250;
    // Minimum silence gap (in ms) required to split two segments.
    int vad_min_silence_duration_ms = 100;
    // Segments longer than this (in seconds) are split; FLT_MAX = no limit.
    float vad_max_speech_duration_s = FLT_MAX;
    // Padding (in ms) added to the start/end of each detected segment.
    int vad_speech_pad_ms = 30;
    // Overlap (in seconds) between consecutive segments.
    float vad_samples_overlap = 0.1f;
    // GPU is opt-in via -ug/--use-gpu.
    bool use_gpu = false;
    // Input audio file path (required).
    std::string fname_inp = {};
    // Suppress whisper.cpp logging, printing only the results.
    bool no_prints = false;
};

// Print the CLI usage/help text to stderr, showing the current (default)
// value of every option carried by `params`.
static void vad_print_usage(int /*argc*/, char ** argv, const cli_params & params) {
    // Format the max-speech-duration default once, up front: FLT_MAX is the
    // sentinel for "no limit", so print its name instead of a huge float.
    // (Avoids the previous pattern of calling .c_str() on temporaries built
    // inside the fprintf argument list.)
    const std::string max_speech_s = params.vad_max_speech_duration_s == FLT_MAX
            ? "FLT_MAX"
            : std::to_string(params.vad_max_speech_duration_s);

    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options] file\n", argv[0]);
    fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h,         --help                         [default] show this help message and exit\n");
    fprintf(stderr, "  -f FNAME,   --file FNAME                   [%-7s] input audio file path\n", "");
    fprintf(stderr, "  -t N,       --threads N                    [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -ug,        --use-gpu                      [%-7s] use GPU\n", params.use_gpu ? "true" : "false");
    fprintf(stderr, "  -vm FNAME,  --vad-model FNAME              [%-7s] VAD model path\n", params.vad_model.c_str());
    fprintf(stderr, "  -vt N,      --vad-threshold N              [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
    // Fixed help text: this option is a duration in milliseconds, not a
    // 0.0-1.0 ratio (that range belongs to --vad-threshold).
    fprintf(stderr, "  -vspd N,    --vad-min-speech-duration-ms N [%-7d] VAD min speech duration in ms\n", params.vad_min_speech_duration_ms);
    fprintf(stderr, "  -vsd N,     --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
    fprintf(stderr, "  -vmsd N,    --vad-max-speech-duration-s N  [%-7s] VAD max speech duration (auto-split longer)\n", max_speech_s.c_str());
    fprintf(stderr, "  -vp N,      --vad-speech-pad-ms N          [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
    fprintf(stderr, "  -vo N,      --vad-samples-overlap N        [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
    fprintf(stderr, "  -np,        --no-prints                    [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
    fprintf(stderr, "\n");
}

// Report that option `arg` was given without its required value, then
// terminate the process.
//
// Declared with a `char *` return type only so it can appear as the
// else-branch of the ARGV_NEXT ternary in vad_params_parse; it never
// actually returns, hence [[noreturn]].
[[noreturn]] static char * requires_value_error(const std::string & arg) {
    fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
    // Exit with a non-zero status so shells/scripts can detect the failure
    // (previously this exited with 0, signalling success on an error).
    exit(1);
}

static bool vad_params_parse(int argc, char ** argv, cli_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];

if (arg == "-h" || arg == "--help") {
vad_print_usage(argc, argv, params);
exit(0);
}
#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
else if (arg == "-f" || arg == "--file") { params.fname_inp = ARGV_NEXT; }
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); }
else if (arg == "-ug" || arg == "--use-gpu") { params.use_gpu = true; }
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
else if (arg == "-vsd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); }
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
vad_print_usage(argc, argv, params);
exit(0);
}
}

return true;
}

static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }

int main(int argc, char ** argv) {
cli_params cli_params;

if (!vad_params_parse(argc, argv, cli_params)) {
vad_print_usage(argc, argv, cli_params);
return 1;
}

if (cli_params.no_prints) {
whisper_log_set(cb_log_disable, NULL);
}

// Load the input sample audio file.
std::vector<float> pcmf32;
std::vector<std::vector<float>> pcmf32s;
if (!read_audio_data(cli_params.fname_inp.c_str(), pcmf32, pcmf32s, false)) {
fprintf(stderr, "error: failed to read audio data from %s\n", cli_params.fname_inp.c_str());
return 2;
}

// Initialize the context which loads the VAD model.
struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
ctx_params.n_threads = cli_params.n_threads;
ctx_params.use_gpu = cli_params.use_gpu;
struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
cli_params.vad_model.c_str(),
ctx_params);

// Detect speech in the input audio file.
if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) {
fprintf(stderr, "error: failed to detect speech\n");
return 3;
}

// Get the the vad segements using the probabilities that have been computed
// previously and stored in the whisper_vad_context.
struct whisper_vad_params params = whisper_vad_default_params();
params.threshold = cli_params.vad_threshold;
params.min_speech_duration_ms = cli_params.vad_min_speech_duration_ms;
params.min_silence_duration_ms = cli_params.vad_min_silence_duration_ms;
params.max_speech_duration_s = cli_params.vad_max_speech_duration_s;
params.speech_pad_ms = cli_params.vad_speech_pad_ms;
params.samples_overlap = cli_params.vad_samples_overlap;
struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, params);

printf("\n");
printf("Detected %d speech segments:\n", whisper_vad_segments_n_segments(segments));
for (int i = 0; i < whisper_vad_segments_n_segments(segments); ++i) {
printf("Speech segment %d: start = %.2f, end = %.2f\n", i,
whisper_vad_segments_get_segment_t0(segments, i),
whisper_vad_segments_get_segment_t1(segments, i));
}
printf("\n");

whisper_vad_free_segments(segments);
whisper_vad_free(vctx);

return 0;
}
Loading