|
| 1 | +// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- |
| 2 | +// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi |
| 3 | +// |
| 4 | +// Copyright 2024 Mozilla Foundation |
| 5 | +// |
| 6 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | +// you may not use this file except in compliance with the License. |
| 8 | +// You may obtain a copy of the License at |
| 9 | +// |
| 10 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +// |
| 12 | +// Unless required by applicable law or agreed to in writing, software |
| 13 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +// See the License for the specific language governing permissions and |
| 16 | +// limitations under the License. |
| 17 | + |
| 18 | +#include "color.h" |
| 19 | +#include "whisper.h" |
| 20 | + |
| 21 | +#include <math.h> |
| 22 | +#include <cosmo.h> |
| 23 | +#include <stdio.h> |
| 24 | +#include <stdlib.h> |
| 25 | +#include <assert.h> |
| 26 | +#include <signal.h> |
| 27 | +#include <unistd.h> |
| 28 | +#include <pthread.h> |
| 29 | +#include <ctl/min.h> |
| 30 | +#include <ctl/max.h> |
| 31 | +#include <sys/stat.h> |
| 32 | +#include <ctl/vector.h> |
| 33 | +#include <cosmoaudio.h> |
| 34 | + |
// Number of audio chunks requested from the microphone per second of
// recording; each pass of the capture loop in main() asks for one chunk.
#define FRAMES_PER_SECOND 30
// Number of sample frames in one chunk, i.e. 1/FRAMES_PER_SECOND seconds
// of audio at WHISPER_SAMPLE_RATE.
#define CHUNK_FRAMES (WHISPER_SAMPLE_RATE / FRAMES_PER_SECOND)
| 37 | + |
// Path of the model file; assigned from argv[1] in main() and read by
// load_model().
const char *g_model;
// Set to 1 by the SIGINT handler; main() polls it to end the recording loop.
volatile sig_atomic_t g_done;
// Whisper context created by load_model() on the loader thread and used by
// main() for transcription after the thread has been joined.
struct whisper_context *g_ctx;
// Context parameters; main() stores the defaults here before it starts the
// loader thread, which passes them to whisper.
struct whisper_context_params g_cparams;
| 42 | + |
| 43 | +static void onsig(int sig) { |
| 44 | + g_done = 1; |
| 45 | +} |
| 46 | + |
| 47 | +static void *load_model(void *arg) { |
| 48 | + g_ctx = whisper_init_from_file_with_params(g_model, g_cparams); |
| 49 | + if (!g_ctx) { |
| 50 | + fprintf(stderr, "error: failed to initialize whisper context\n"); |
| 51 | + exit(2); |
| 52 | + } |
| 53 | + return 0; |
| 54 | +} |
| 55 | + |
| 56 | +int main(int argc, char *argv[]) { |
| 57 | + FLAG_gpu = LLAMAFILE_GPU_DISABLE; |
| 58 | + FLAG_log_disable = true; |
| 59 | + llamafile_check_cpu(); |
| 60 | + ShowCrashReports(); |
| 61 | + |
| 62 | + // get argument |
| 63 | + if (argc != 2) { |
| 64 | + fprintf(stderr, "usage: %s MODEL\n", argv[0]); |
| 65 | + return 1; |
| 66 | + } |
| 67 | + struct stat st; |
| 68 | + g_model = argv[1]; |
| 69 | + if (stat(g_model, &st)) { |
| 70 | + perror(g_model); |
| 71 | + return 1; |
| 72 | + } |
| 73 | + |
| 74 | + // detect teletypewriters |
| 75 | + bool should_print_color = isatty(1) && isatty(2); |
| 76 | + |
| 77 | + // connect to microphone |
| 78 | + int status; |
| 79 | + struct CosmoAudio *mic; |
| 80 | + struct CosmoAudioOpenOptions cao = {}; |
| 81 | + cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions); |
| 82 | + cao.deviceType = kCosmoAudioDeviceTypeCapture; |
| 83 | + cao.sampleRate = WHISPER_SAMPLE_RATE; |
| 84 | + cao.bufferFrames = CHUNK_FRAMES * 2; |
| 85 | + cao.channels = 1; |
| 86 | + if ((status = cosmoaudio_open(&mic, &cao)) != COSMOAUDIO_SUCCESS) { |
| 87 | + fprintf(stderr, "error: failed to open microphone: %d\n", status); |
| 88 | + return 1; |
| 89 | + } |
| 90 | + |
| 91 | + // load model |
| 92 | + pthread_t model_loader; |
| 93 | + g_cparams = whisper_context_default_params(); |
| 94 | + unassert(!pthread_create(&model_loader, 0, load_model, 0)); |
| 95 | + |
| 96 | + // setup signals |
| 97 | + struct sigaction sa; |
| 98 | + sa.sa_flags = 0; |
| 99 | + sa.sa_handler = onsig; |
| 100 | + sigemptyset(&sa.sa_mask); |
| 101 | + sigaction(SIGINT, &sa, 0); |
| 102 | + |
| 103 | + // record audio until ctrl-c is pressed |
| 104 | + ctl::vector<float> samples; |
| 105 | + while (!g_done) { |
| 106 | + size_t n = samples.size(); |
| 107 | + samples.resize(n + CHUNK_FRAMES); |
| 108 | + cosmoaudio_poll(mic, (int[]){CHUNK_FRAMES}, 0); |
| 109 | + cosmoaudio_read(mic, &samples[n], CHUNK_FRAMES); |
| 110 | + fprintf(stderr, "\rcaptured %f seconds of audio... (press ctrl-c when done)", |
| 111 | + (double)samples.size() / WHISPER_SAMPLE_RATE); |
| 112 | + fflush(stderr); |
| 113 | + } |
| 114 | + fprintf(stderr, "\n"); |
| 115 | + cosmoaudio_close(mic); |
| 116 | + |
| 117 | + // transcribe audio |
| 118 | + unassert(!pthread_join(model_loader, 0)); |
| 119 | + whisper_full_params wparams = |
| 120 | + whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH); |
| 121 | + wparams.no_timestamps = true; |
| 122 | + wparams.suppress_non_speech_tokens = true; |
| 123 | + wparams.greedy.best_of = 8; |
| 124 | + wparams.beam_search.beam_size = 8; |
| 125 | + wparams.initial_prompt = nullptr; |
| 126 | + if ((status = whisper_full(g_ctx, wparams, samples.data(), samples.size()))) { |
| 127 | + fprintf(stderr, "error: whisper failed with %d\n", status); |
| 128 | + return 3; |
| 129 | + } |
| 130 | + int n_segments = whisper_full_n_segments(g_ctx); |
| 131 | + for (int i = 0; i < n_segments; ++i) { |
| 132 | + int n_tokens = whisper_full_n_tokens(g_ctx, i); |
| 133 | + for (int j = 0; j < n_tokens; ++j) { |
| 134 | + const whisper_token id = whisper_full_get_token_id(g_ctx, i, j); |
| 135 | + const char *text = whisper_full_get_token_text(g_ctx, i, j); |
| 136 | + float p = whisper_full_get_token_p(g_ctx, i, j); |
| 137 | + if (should_print_color) { |
| 138 | + int colorcount = kRedToGreenXterm256.size(); |
| 139 | + int colorindex = powf(p, 2.5) * colorcount; |
| 140 | + colorindex = ctl::max(0, ctl::min(colorcount - 1, colorindex)); |
| 141 | + fprintf(stderr, "%s", kRedToGreenXterm256[colorindex].c_str()); |
| 142 | + fflush(stderr); |
| 143 | + } |
| 144 | + printf("%12f %8d %-`'20s\n", id, text, p); |
| 145 | + fflush(stdout); |
| 146 | + } |
| 147 | + } |
| 148 | + if (should_print_color) |
| 149 | + fprintf(stderr, "\033[0m"); |
| 150 | + whisper_free(g_ctx); |
| 151 | +} |
0 commit comments