Skip to content

Commit 70e3dcd

Browse files
committed
Check in some experimental whisper programs
1 parent 484a1b2 commit 70e3dcd

File tree

6 files changed

+745
-1
lines changed

6 files changed

+745
-1
lines changed

whisper.cpp/BUILD.mk

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ $(WHISPER_CPP_OBJS): private \
2626
$(WHISPER_CPP_OBJS): private \
2727
CXXFLAGS += \
2828
-frtti \
29-
-Wno-alloc-size-larger-than \
3029
-Wno-deprecated-declarations
3130

3231
o/$(MODE)/whisper.cpp/main: \
@@ -36,10 +35,26 @@ o/$(MODE)/whisper.cpp/main: \
3635
o/$(MODE)/llama.cpp/llama.cpp.a \
3736
o/$(MODE)/stb/stb.a \
3837

38+
o/$(MODE)/whisper.cpp/stream: \
39+
o/$(MODE)/whisper.cpp/whisper.cpp.a \
40+
o/$(MODE)/llama.cpp/llama.cpp.a \
41+
o/$(MODE)/stb/stb.a \
42+
43+
o/$(MODE)/whisper.cpp/mic2txt: \
44+
o/$(MODE)/whisper.cpp/whisper.cpp.a \
45+
o/$(MODE)/llama.cpp/llama.cpp.a \
46+
47+
o/$(MODE)/whisper.cpp/mic2raw: \
48+
o/$(MODE)/whisper.cpp/whisper.cpp.a \
49+
o/$(MODE)/llama.cpp/llama.cpp.a \
50+
3951
o/$(MODE)/whisper.cpp/miniaudio.o: private COPTS += -O3
4052

4153
$(WHISPER_CPP_OBJS): whisper.cpp/BUILD.mk
4254

4355
.PHONY: o/$(MODE)/whisper.cpp
4456
o/$(MODE)/whisper.cpp: \
4557
o/$(MODE)/whisper.cpp/main \
58+
o/$(MODE)/whisper.cpp/stream \
59+
o/$(MODE)/whisper.cpp/mic2txt \
60+
o/$(MODE)/whisper.cpp/mic2raw \

whisper.cpp/color.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
2+
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
3+
//
4+
// Copyright 2024 Mozilla Foundation
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
18+
#include "color.h"
19+
20+
#include <ctl/ostringstream.h>
21+
22+
// Squares an expression. NOTE: evaluates X twice; pass side-effect-free args.
#define SQR(X) ((X) * (X))

// Maps an 8-bit channel value to its index in the 6-level xterm color cube.
// Argument and expansion are fully parenthesized so the macro is safe inside
// any surrounding expression.
#define UNCUBE(x) ((x) < 48 ? 0 : (x) < 115 ? 1 : ((x) - 35) / 40)

// Quantizes 24-bit rgb to an xterm256 code in the range [16,256).
//
// Two candidates are computed: the nearest entry of the 6x6x6 color cube
// (codes 16..231) and the nearest entry of the 24-step gray ramp (codes
// 232..255, gray level = index * 10 + 8). Whichever has the smaller squared
// RGB distance to the input wins; ties go to the color cube.
//
// Assumes r, g and b are each within 0..255; larger values would index past
// the end of the cube table.
static int rgb2xterm256(int r, int g, int b) {
    // channel intensities of the six cube levels: 0, 95, 135, 175, 215, 255
    static const unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
    int av, ir, ig, ib, il, qr, qg, qb, ql;

    // luma-weighted average, rounded to nearest integer
    av = r * .299 + g * .587 + b * .114 + .5;

    // nearest gray ramp index (il) and its gray level (ql)
    il = av > 238 ? 23 : (av - 3) / 10;
    ql = il * 10 + 8;

    // nearest cube indices (ir, ig, ib) and their intensities (qr, qg, qb)
    ir = UNCUBE(r);
    ig = UNCUBE(g);
    ib = UNCUBE(b);
    qr = cube[ir];
    qg = cube[ig];
    qb = cube[ib];

    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
        return ir * 36 + ig * 6 + ib + 020;  // cube codes start at 16
    return il + 0350;  // gray ramp codes start at 232
}
39+
40+
// Returns the escape sequence "\033[38;5;<N>m", where <N> is the xterm256
// code that rgb2xterm256() selects for the given 24-bit color.
static ctl::string set_xterm256_foreground(int r, int g, int b) {
    ctl::ostringstream escape;
    escape << "\033[38;5;" << rgb2xterm256(r, g, b) << "m";
    return escape.str();
}
46+
47+
// Lowest is red, middle is yellow, highest is green. Color scheme from
48+
// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
49+
const ctl::vector<ctl::string> kRedToGreenXterm256 = {
50+
set_xterm256_foreground(220, 5, 12),
51+
set_xterm256_foreground(232, 96, 28),
52+
set_xterm256_foreground(241, 147, 45),
53+
set_xterm256_foreground(246, 193, 65),
54+
set_xterm256_foreground(247, 240, 86),
55+
set_xterm256_foreground(144, 201, 135),
56+
set_xterm256_foreground( 78, 178, 101),
57+
};

whisper.cpp/color.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#pragma once
2+
#include <ctl/vector.h>
3+
#include <ctl/string.h>
4+
5+
// Terminal foreground-color escape sequences ordered from red (lowest index)
// through yellow to green (highest index). Defined in color.cpp.
extern const ctl::vector<ctl::string> kRedToGreenXterm256;

whisper.cpp/mic2raw.cpp

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
2+
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
3+
//
4+
// Copyright 2024 Mozilla Foundation
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
18+
#include "color.h"
19+
#include "whisper.h"
20+
21+
#include <math.h>
22+
#include <cosmo.h>
23+
#include <stdio.h>
24+
#include <stdlib.h>
25+
#include <assert.h>
26+
#include <signal.h>
27+
#include <unistd.h>
28+
#include <pthread.h>
29+
#include <ctl/min.h>
30+
#include <ctl/max.h>
31+
#include <sys/stat.h>
32+
#include <ctl/vector.h>
33+
#include <cosmoaudio.h>
34+
35+
#define FRAMES_PER_SECOND 30
36+
#define CHUNK_FRAMES (WHISPER_SAMPLE_RATE / FRAMES_PER_SECOND)
37+
38+
const char *g_model;
39+
volatile sig_atomic_t g_done;
40+
struct whisper_context *g_ctx;
41+
struct whisper_context_params g_cparams;
42+
43+
// Signal handler: only raises the g_done flag so the recording loop can exit.
static void onsig(int sig) {
    (void)sig;  // signal number is not needed
    g_done = 1;
}
46+
47+
// Thread entry point that loads the whisper model named by g_model using
// g_cparams and stores the resulting context in g_ctx.
//
// The argument is unused. Returns a null pointer on success; if the context
// cannot be created, reports the error on stderr and terminates the whole
// process with exit status 2.
static void *load_model(void *arg) {
    (void)arg;
    g_ctx = whisper_init_from_file_with_params(g_model, g_cparams);
    if (g_ctx)
        return nullptr;
    fprintf(stderr, "error: failed to initialize whisper context\n");
    exit(2);
}
55+
56+
int main(int argc, char *argv[]) {
57+
FLAG_gpu = LLAMAFILE_GPU_DISABLE;
58+
FLAG_log_disable = true;
59+
llamafile_check_cpu();
60+
ShowCrashReports();
61+
62+
// get argument
63+
if (argc != 2) {
64+
fprintf(stderr, "usage: %s MODEL\n", argv[0]);
65+
return 1;
66+
}
67+
struct stat st;
68+
g_model = argv[1];
69+
if (stat(g_model, &st)) {
70+
perror(g_model);
71+
return 1;
72+
}
73+
74+
// detect teletypewriters
75+
bool should_print_color = isatty(1) && isatty(2);
76+
77+
// connect to microphone
78+
int status;
79+
struct CosmoAudio *mic;
80+
struct CosmoAudioOpenOptions cao = {};
81+
cao.sizeofThis = sizeof(struct CosmoAudioOpenOptions);
82+
cao.deviceType = kCosmoAudioDeviceTypeCapture;
83+
cao.sampleRate = WHISPER_SAMPLE_RATE;
84+
cao.bufferFrames = CHUNK_FRAMES * 2;
85+
cao.channels = 1;
86+
if ((status = cosmoaudio_open(&mic, &cao)) != COSMOAUDIO_SUCCESS) {
87+
fprintf(stderr, "error: failed to open microphone: %d\n", status);
88+
return 1;
89+
}
90+
91+
// load model
92+
pthread_t model_loader;
93+
g_cparams = whisper_context_default_params();
94+
unassert(!pthread_create(&model_loader, 0, load_model, 0));
95+
96+
// setup signals
97+
struct sigaction sa;
98+
sa.sa_flags = 0;
99+
sa.sa_handler = onsig;
100+
sigemptyset(&sa.sa_mask);
101+
sigaction(SIGINT, &sa, 0);
102+
103+
// record audio until ctrl-c is pressed
104+
ctl::vector<float> samples;
105+
while (!g_done) {
106+
size_t n = samples.size();
107+
samples.resize(n + CHUNK_FRAMES);
108+
cosmoaudio_poll(mic, (int[]){CHUNK_FRAMES}, 0);
109+
cosmoaudio_read(mic, &samples[n], CHUNK_FRAMES);
110+
fprintf(stderr, "\rcaptured %f seconds of audio... (press ctrl-c when done)",
111+
(double)samples.size() / WHISPER_SAMPLE_RATE);
112+
fflush(stderr);
113+
}
114+
fprintf(stderr, "\n");
115+
cosmoaudio_close(mic);
116+
117+
// transcribe audio
118+
unassert(!pthread_join(model_loader, 0));
119+
whisper_full_params wparams =
120+
whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
121+
wparams.no_timestamps = true;
122+
wparams.suppress_non_speech_tokens = true;
123+
wparams.greedy.best_of = 8;
124+
wparams.beam_search.beam_size = 8;
125+
wparams.initial_prompt = nullptr;
126+
if ((status = whisper_full(g_ctx, wparams, samples.data(), samples.size()))) {
127+
fprintf(stderr, "error: whisper failed with %d\n", status);
128+
return 3;
129+
}
130+
int n_segments = whisper_full_n_segments(g_ctx);
131+
for (int i = 0; i < n_segments; ++i) {
132+
int n_tokens = whisper_full_n_tokens(g_ctx, i);
133+
for (int j = 0; j < n_tokens; ++j) {
134+
const whisper_token id = whisper_full_get_token_id(g_ctx, i, j);
135+
const char *text = whisper_full_get_token_text(g_ctx, i, j);
136+
float p = whisper_full_get_token_p(g_ctx, i, j);
137+
if (should_print_color) {
138+
int colorcount = kRedToGreenXterm256.size();
139+
int colorindex = powf(p, 2.5) * colorcount;
140+
colorindex = ctl::max(0, ctl::min(colorcount - 1, colorindex));
141+
fprintf(stderr, "%s", kRedToGreenXterm256[colorindex].c_str());
142+
fflush(stderr);
143+
}
144+
printf("%12f %8d %-`'20s\n", id, text, p);
145+
fflush(stdout);
146+
}
147+
}
148+
if (should_print_color)
149+
fprintf(stderr, "\033[0m");
150+
whisper_free(g_ctx);
151+
}

0 commit comments

Comments
 (0)