Check in some experimental whisper programs

jart · jart · commit 70e3dcd60931 · 2024-09-18T03:31:36.000-07:00
diff --git a/whisper.cpp/BUILD.mk b/whisper.cpp/BUILD.mk
@@ -26,7 +26,6 @@ $(WHISPER_CPP_OBJS): private				\
 $(WHISPER_CPP_OBJS): private				\
 		CXXFLAGS +=				\
 			-frtti				\
-			-Wno-alloc-size-larger-than	\
 			-Wno-deprecated-declarations
 
 o/$(MODE)/whisper.cpp/main:				\
@@ -36,10 +35,26 @@ o/$(MODE)/whisper.cpp/main:				\
 		o/$(MODE)/llama.cpp/llama.cpp.a		\
 		o/$(MODE)/stb/stb.a			\
 
+o/$(MODE)/whisper.cpp/stream:				\
+		o/$(MODE)/whisper.cpp/whisper.cpp.a	\
+		o/$(MODE)/llama.cpp/llama.cpp.a		\
+		o/$(MODE)/stb/stb.a			\
+
+o/$(MODE)/whisper.cpp/mic2txt:				\
+		o/$(MODE)/whisper.cpp/whisper.cpp.a	\
+		o/$(MODE)/llama.cpp/llama.cpp.a		\
+
+o/$(MODE)/whisper.cpp/mic2raw:				\
+		o/$(MODE)/whisper.cpp/whisper.cpp.a	\
+		o/$(MODE)/llama.cpp/llama.cpp.a		\
+
 o/$(MODE)/whisper.cpp/miniaudio.o: private COPTS += -O3
 
 $(WHISPER_CPP_OBJS): whisper.cpp/BUILD.mk
 
 .PHONY: o/$(MODE)/whisper.cpp
 o/$(MODE)/whisper.cpp:					\
 		o/$(MODE)/whisper.cpp/main		\
+		o/$(MODE)/whisper.cpp/stream		\
+		o/$(MODE)/whisper.cpp/mic2txt		\
+		o/$(MODE)/whisper.cpp/mic2raw		\
diff --git a/whisper.cpp/color.cpp b/whisper.cpp/color.cpp
@@ -0,0 +1,57 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "color.h"
+
+#include <ctl/ostringstream.h>
+
+#define SQR(X) ((X) * (X))
+#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
+
+// quantizes 24-bit rgb to xterm256 code range [16,256)
+static int rgb2xterm256(int r, int g, int b) {
+    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
+    int av, ir, ig, ib, il, qr, qg, qb, ql;
+    av = r * .299 + g * .587 + b * .114 + .5;
+    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
+    qr = cube[(ir = UNCUBE(r))];
+    qg = cube[(ig = UNCUBE(g))];
+    qb = cube[(ib = UNCUBE(b))];
+    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
+        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
+        return ir * 36 + ig * 6 + ib + 020;
+    return il + 0350;
+}
+
+static ctl::string set_xterm256_foreground(int r, int g, int b) {
+    int x = rgb2xterm256(r, g, b);
+    ctl::ostringstream oss;
+    oss << "\033[38;5;" << x << "m";
+    return oss.str();
+}
+
+// Lowest is red, middle is yellow, highest is green. Color scheme from
+// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
+const ctl::vector<ctl::string> kRedToGreenXterm256 = {
+    set_xterm256_foreground(220,   5,  12),
+    set_xterm256_foreground(232,  96,  28),
+    set_xterm256_foreground(241, 147,  45),
+    set_xterm256_foreground(246, 193,  65),
+    set_xterm256_foreground(247, 240,  86),
+    set_xterm256_foreground(144, 201, 135),
+    set_xterm256_foreground( 78, 178, 101),
+};
diff --git a/whisper.cpp/color.h b/whisper.cpp/color.h
@@ -0,0 +1,5 @@
+#pragma once
+#include <ctl/vector.h>
+#include <ctl/string.h>
+
+extern const ctl::vector<ctl::string> kRedToGreenXterm256;
diff --git a/whisper.cpp/mic2raw.cpp b/whisper.cpp/mic2raw.cpp
@@ -0,0 +1,151 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
+// Copyright 2024 Mozilla Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "color.h"
+#include "whisper.h"
+
+#include <math.h>
+#include <cosmo.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <signal.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <ctl/min.h>
+#include <ctl/max.h>
+#include <sys/stat.h>
+#include <ctl/vector.h>
+#include <cosmoaudio.h>
+
+#define FRAMES_PER_SECOND 30
+#define CHUNK_FRAMES (WHISPER_SAMPLE_RATE / FRAMES_PER_SECOND)
+
+const char *g_model;
+volatile sig_atomic_t g_done;
+struct whisper_context *g_ctx;
+struct whisper_context_params g_cparams;
+
+static void onsig(int sig) {
+    g_done = 1;
+}
+
+static void *load_model(void *arg) {
+    g_ctx = whisper_init_from_file_with_params(g_model, g_cparams);
+    if (!g_ctx) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        exit(2);
+    }
+    return 0;
+}
+
+int main(int argc, char *argv[]) {
+    FLAG_gpu = LLAMAFILE_GPU_DISABLE;
+    FLAG_log_disable = true;
+    llamafile_check_cpu();
+    ShowCrashReports();
+
+    // get argument
+    if (argc != 2) {
+        fprintf(stderr, "usage: %s MODEL\n", argv[0]);
+        return 1;
+    }
+    struct stat st;
+    g_model = argv[1];
+    if (stat(g_model, &st)) {
+        perror(g_model);
+        return 1;
+    }
+
+    // detect teletypewriters
+    bool should_print_color = isatty(1) && isatty(2);
+
+    // connect to microphone
+    int status;
+    struct CosmoAudio *mic;
+    struct CosmoAudioOpenOptions cao = {};
+    cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
+    cao.deviceType = kCosmoAudioDeviceTypeCapture;
+    cao.sampleRate = WHISPER_SAMPLE_RATE;
+    cao.bufferFrames = CHUNK_FRAMES * 2;
+    cao.channels = 1;
+    if ((status = cosmoaudio_open(&mic, &cao)) != COSMOAUDIO_SUCCESS) {
+        fprintf(stderr, "error: failed to open microphone: %d\n", status);
+        return 1;
+    }
+
+    // load model
+    pthread_t model_loader;
+    g_cparams = whisper_context_default_params();
+    unassert(!pthread_create(&model_loader, 0, load_model, 0));
+
+    // setup signals
+    struct sigaction sa;
+    sa.sa_flags = 0;
+    sa.sa_handler = onsig;
+    sigemptyset(&sa.sa_mask);
+    sigaction(SIGINT, &sa, 0);
+
+    // record audio until ctrl-c is pressed
+    ctl::vector<float> samples;
+    while (!g_done) {
+        size_t n = samples.size();
+        samples.resize(n + CHUNK_FRAMES);
+        cosmoaudio_poll(mic, (int[]){CHUNK_FRAMES}, 0);
+        cosmoaudio_read(mic, &samples[n], CHUNK_FRAMES);
+        fprintf(stderr, "\rcaptured %f seconds of audio... (press ctrl-c when done)",
+                (double)samples.size() / WHISPER_SAMPLE_RATE);
+        fflush(stderr);
+    }
+    fprintf(stderr, "\n");
+    cosmoaudio_close(mic);
+
+    // transcribe audio
+    unassert(!pthread_join(model_loader, 0));
+    whisper_full_params wparams =
+            whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
+    wparams.no_timestamps = true;
+    wparams.suppress_non_speech_tokens = true;
+    wparams.greedy.best_of = 8;
+    wparams.beam_search.beam_size = 8;
+    wparams.initial_prompt = nullptr;
+    if ((status = whisper_full(g_ctx, wparams, samples.data(), samples.size()))) {
+        fprintf(stderr, "error: whisper failed with %d\n", status);
+        return 3;
+    }
+    int n_segments = whisper_full_n_segments(g_ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        int n_tokens = whisper_full_n_tokens(g_ctx, i);
+        for (int j = 0; j < n_tokens; ++j) {
+            const whisper_token id = whisper_full_get_token_id(g_ctx, i, j);
+            const char *text = whisper_full_get_token_text(g_ctx, i, j);
+            float p = whisper_full_get_token_p(g_ctx, i, j);
+            if (should_print_color) {
+                int colorcount = kRedToGreenXterm256.size();
+                int colorindex = powf(p, 2.5) * colorcount;
+                colorindex = ctl::max(0, ctl::min(colorcount - 1, colorindex));
+                fprintf(stderr, "%s", kRedToGreenXterm256[colorindex].c_str());
+                fflush(stderr);
+            }
+            printf("%12f %8d %-`'20s\n", id, text, p);
+            fflush(stdout);
+        }
+    }
+    if (should_print_color)
+        fprintf(stderr, "\033[0m");
+    whisper_free(g_ctx);
+}
diff --git a/whisper.cpp/mic2txt.cpp b/whisper.cpp/mic2txt.cpp
diff --git a/whisper.cpp/stream.cpp b/whisper.cpp/stream.cpp