#include "common.h"
#include "common-whisper.h"

#include "whisper.h"

#include <algorithm>
#include <cfloat>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>

// command-line parameters
struct cli_params {
    int32_t     n_threads                   = std::min(4, (int32_t) std::thread::hardware_concurrency());
    std::string vad_model                   = "";
    float       vad_threshold               = 0.5f;
    int         vad_min_speech_duration_ms  = 250;
    int         vad_min_silence_duration_ms = 100;
    float       vad_max_speech_duration_s   = FLT_MAX;
    int         vad_speech_pad_ms           = 30;
    float       vad_samples_overlap         = 0.1f;
    bool        use_gpu                     = false;
    std::string fname_inp                   = {};
    bool        no_prints                   = false;
};
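
// Note: vad_model is empty by default and has no fallback, so a VAD model file
// must be supplied via -vm/--vad-model for the program to do anything useful.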

static void vad_print_usage(int /*argc*/, char ** argv, const cli_params & params) {
    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options] file\n", argv[0]);
    fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                                [default] show this help message and exit\n");
    fprintf(stderr, "  -f FNAME, --file FNAME                    [%-7s] input audio file path\n", "");
    fprintf(stderr, "  -t N, --threads N                         [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -ug, --use-gpu                            [%-7s] use GPU\n", params.use_gpu ? "true" : "false");
    fprintf(stderr, "  -vm FNAME, --vad-model FNAME              [%-7s] VAD model path\n", params.vad_model.c_str());
    fprintf(stderr, "  -vt N, --vad-threshold N                  [%-7.2f] VAD threshold for speech detection (0.0 - 1.0)\n", params.vad_threshold);
    fprintf(stderr, "  -vspd N, --vad-min-speech-duration-ms N   [%-7d] VAD min speech duration in ms\n", params.vad_min_speech_duration_ms);
    fprintf(stderr, "  -vsd N, --vad-min-silence-duration-ms N   [%-7d] VAD min silence duration in ms (to split segments)\n", params.vad_min_silence_duration_ms);
    fprintf(stderr, "  -vmsd N, --vad-max-speech-duration-s N    [%-7s] VAD max speech duration in s (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
                                                                       std::string("FLT_MAX").c_str() :
                                                                       std::to_string(params.vad_max_speech_duration_s).c_str());
    fprintf(stderr, "  -vp N, --vad-speech-pad-ms N              [%-7d] VAD speech padding in ms (extend segments)\n", params.vad_speech_pad_ms);
    fprintf(stderr, "  -vo N, --vad-samples-overlap N            [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
    fprintf(stderr, "  -np, --no-prints                          [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
    fprintf(stderr, "\n");
}

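// Declared to return char * only so that it can appear as the second arm of the
// ARGV_NEXT ternary below; it never actually returns because it calls exit().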
static char * requires_value_error(const std::string & arg) {
    fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
    exit(1);
}

static bool vad_params_parse(int argc, char ** argv, cli_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            vad_print_usage(argc, argv, params);
            exit(0);
        }
        // ARGV_NEXT consumes the next argv entry as the value of the current flag,
        // or exits with an error message when no value is left.
        #define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
        else if (arg == "-f"    || arg == "--file")                        { params.fname_inp                   = ARGV_NEXT; }
        else if (arg == "-t"    || arg == "--threads")                     { params.n_threads                   = std::stoi(ARGV_NEXT); }
        else if (arg == "-ug"   || arg == "--use-gpu")                     { params.use_gpu                     = true; }
        else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = ARGV_NEXT; }
        else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold               = std::stof(ARGV_NEXT); }
        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")  { params.vad_min_speech_duration_ms  = std::stoi(ARGV_NEXT); }
        else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(ARGV_NEXT); }
        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")   { params.vad_max_speech_duration_s   = std::stof(ARGV_NEXT); }
        else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")           { params.vad_speech_pad_ms           = std::stoi(ARGV_NEXT); }
        else if (arg == "-vo"   || arg == "--vad-samples-overlap")         { params.vad_samples_overlap         = std::stof(ARGV_NEXT); }
        else if (arg == "-np"   || arg == "--no-prints")                   { params.no_prints                   = true; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            vad_print_usage(argc, argv, params);
            exit(1);
        }
    }

    return true;
}

static void cb_log_disable(enum ggml_log_level, const char *, void *) { }

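// Example invocation (binary name and file paths are placeholders, not files
// shipped with the project):
//
//   ./vad-cli -vm /path/to/vad-model.bin -f /path/to/audio.wav -vt 0.5
//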
int main(int argc, char ** argv) {
    cli_params cli_params;

    if (!vad_params_parse(argc, argv, cli_params)) {
        vad_print_usage(argc, argv, cli_params);
        return 1;
    }

    if (cli_params.no_prints) {
        whisper_log_set(cb_log_disable, NULL);
    }

    // Load the input sample audio file.
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    if (!read_audio_data(cli_params.fname_inp.c_str(), pcmf32, pcmf32s, false)) {
        fprintf(stderr, "error: failed to read audio data from %s\n", cli_params.fname_inp.c_str());
        return 2;
    }
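
    // At this point pcmf32 holds the decoded audio as mono float samples at the
    // 16 kHz sample rate that whisper (and its VAD) expects; pcmf32s is only
    // filled when stereo data is requested, which it is not here.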

    // Initialize the context which loads the VAD model.
    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
    ctx_params.n_threads = cli_params.n_threads;
    ctx_params.use_gpu   = cli_params.use_gpu;
    struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
            cli_params.vad_model.c_str(),
            ctx_params);
    if (vctx == nullptr) {
        fprintf(stderr, "error: failed to initialize the VAD context from %s\n", cli_params.vad_model.c_str());
        return 3;
    }

    // Detect speech in the input audio file.
    if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) {
        fprintf(stderr, "error: failed to detect speech\n");
        return 4;
    }

    // Get the VAD segments using the speech probabilities that were computed
    // previously and stored in the whisper_vad_context.
    struct whisper_vad_params params = whisper_vad_default_params();
    params.threshold               = cli_params.vad_threshold;
    params.min_speech_duration_ms  = cli_params.vad_min_speech_duration_ms;
    params.min_silence_duration_ms = cli_params.vad_min_silence_duration_ms;
    params.max_speech_duration_s   = cli_params.vad_max_speech_duration_s;
    params.speech_pad_ms           = cli_params.vad_speech_pad_ms;
    params.samples_overlap         = cli_params.vad_samples_overlap;
    struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, params);

    printf("\n");
    printf("Detected %d speech segments:\n", whisper_vad_segments_n_segments(segments));
    for (int i = 0; i < whisper_vad_segments_n_segments(segments); ++i) {
        printf("Speech segment %d: start = %.2f, end = %.2f\n", i,
                whisper_vad_segments_get_segment_t0(segments, i),
                whisper_vad_segments_get_segment_t1(segments, i));
    }
    printf("\n");

    whisper_vad_free_segments(segments);
    whisper_vad_free(vctx);

    return 0;
}