Skip to content

Commit dc99002

Browse files
committed
Convert audio files (mp3/flac/ogg) to 16khz wav
It's no longer necessary to run sox or ffmpeg beforehand when using the whisperfile command. If your audio file isn't in the preferred format, it'll be converted for you automatically using the embedded audio tools.
1 parent 2043660 commit dc99002

File tree

7 files changed

+150
-17
lines changed

7 files changed

+150
-17
lines changed

whisper.cpp/BUILD.mk

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,12 @@ o/$(MODE)/whisper.cpp/main: \
3434
o/$(MODE)/whisper.cpp/main.1.asc.zip.o \
3535
o/$(MODE)/whisper.cpp/whisper.cpp.a \
3636
o/$(MODE)/llama.cpp/llama.cpp.a \
37+
o/$(MODE)/stb/stb.a \
38+
39+
o/$(MODE)/whisper.cpp/miniaudio.o: private COPTS += -O3
3740

3841
$(WHISPER_CPP_OBJS): whisper.cpp/BUILD.mk
3942

4043
.PHONY: o/$(MODE)/whisper.cpp
4144
o/$(MODE)/whisper.cpp: \
42-
o/$(MODE)/whisper.cpp/main
45+
o/$(MODE)/whisper.cpp/main \

whisper.cpp/README.llamafile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ ORIGIN
1515
LOCAL MODIFICATIONS
1616

1717
- Integrate with llamafile file loader
18+
- Automatically convert MP3/FLAC/OGG to WAV

whisper.cpp/common.cpp

Lines changed: 130 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
33
#define _USE_MATH_DEFINES // for M_PI
44

5+
#include "llamafile/log.h"
6+
#include "llamafile/llamafile.h"
57
#include "common.h"
68

79
// third-party utilities
810
// use your favorite implementations
9-
#define DR_WAV_IMPLEMENTATION
11+
// #define DR_WAV_IMPLEMENTATION // [jart] comment out
1012
#include "dr_wav.h"
1113

1214
#if defined(_MSC_VER)
@@ -18,6 +20,23 @@
1820
#include <io.h>
1921
#endif
2022

23+
#include <cosmo.h>
24+
#include <stdlib.h>
25+
#include <unistd.h>
26+
27+
#include "stb/stb_vorbis.h"
28+
#include "miniaudio.h"
29+
30+
#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096
31+
32+
static std::string delete_me;
33+
34+
static void on_exit(void) {
35+
if (!delete_me.empty()) {
36+
unlink(delete_me.c_str());
37+
}
38+
}
39+
2140
bool is_wav_buffer(const std::string buf) {
2241
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
2342
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
@@ -33,10 +52,103 @@ bool is_wav_buffer(const std::string buf) {
3352
return true;
3453
}
3554

36-
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
55+
static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) {
56+
ma_result rc = MA_SUCCESS;
57+
for (;;) {
58+
ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
59+
ma_uint64 framesReadThisIteration;
60+
ma_uint64 framesToReadThisIteration;
61+
framesToReadThisIteration = sizeof(pRawData) / ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
62+
rc = ma_decoder_read_pcm_frames(pDecoder, pRawData, framesToReadThisIteration, &framesReadThisIteration);
63+
if (rc != MA_SUCCESS) {
64+
break;
65+
}
66+
ma_encoder_write_pcm_frames(pEncoder, pRawData, framesReadThisIteration, NULL);
67+
if (framesReadThisIteration < framesToReadThisIteration) {
68+
break;
69+
}
70+
}
71+
return rc;
72+
}
73+
74+
// converts audio file to signed 16-bit 16000hz wav
75+
static std::string convert_audio_file(const std::string & fname, bool stereo) {
76+
77+
// create temporary filename
78+
std::string newpath;
79+
newpath = __get_tmpdir();
80+
newpath += "/whisperfile.";
81+
newpath += std::to_string(_rand64());
82+
newpath += ".wav";
83+
84+
// create decoder
85+
ma_decoder_config decoderConfig =
86+
ma_decoder_config_init(ma_format_s16, 1 + stereo, COMMON_SAMPLE_RATE);
87+
decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
88+
decoderConfig.resampling.linear.lpfOrder = 8;
89+
90+
// open input file
91+
ma_decoder decoder;
92+
ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder);
93+
if (rc != MA_SUCCESS) {
94+
fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
95+
fname.c_str(), ma_result_description(rc));
96+
return "";
97+
}
98+
99+
// create encoder
100+
ma_encoder encoder;
101+
ma_encoder_config encoderConfig = ma_encoder_config_init(
102+
ma_encoding_format_wav,
103+
decoder.outputFormat,
104+
decoder.outputChannels,
105+
decoder.outputSampleRate);
106+
rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder);
107+
if (rc != MA_SUCCESS) {
108+
ma_decoder_uninit(&decoder);
109+
fprintf(stderr, "%s: failed to open output file: %s\n",
110+
newpath.c_str(), ma_result_description(rc));
111+
return "";
112+
}
113+
114+
// perform the conversion
115+
rc = perform_audio_conversion(&decoder, &encoder);
116+
ma_encoder_uninit(&encoder);
117+
ma_decoder_uninit(&decoder);
118+
if (rc != MA_SUCCESS) {
119+
fprintf(stderr, "%s: failed to convert audio file: %s\n",
120+
fname.c_str(), ma_result_description(rc));
121+
return "";
122+
}
123+
124+
// return new path
125+
delete_me = newpath;
126+
atexit(on_exit);
127+
return newpath;
128+
}
129+
130+
#define TRY_CONVERSION \
131+
do { \
132+
if (did_conversion) { \
133+
fprintf(stderr, "error: failed to open audio file\n"); \
134+
return false; \
135+
} \
136+
std::string fname2 = convert_audio_file(fname, stereo); \
137+
if (fname2.empty()) { \
138+
return false; \
139+
} \
140+
fname = fname2; \
141+
did_conversion = true; \
142+
goto TryAgain; \
143+
} while (0)
144+
145+
bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
37146
drwav wav;
38147
std::vector<uint8_t> wav_data; // used for pipe input from stdin
148+
std::string fname = fname_;
149+
bool did_conversion = false;
39150

151+
TryAgain:
40152
if (fname == "-") {
41153
{
42154
#ifdef _WIN32
@@ -68,32 +180,38 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
68180
}
69181
}
70182
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
71-
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
183+
tinylogf("%s: converting to wav...\n", fname.c_str());
184+
TRY_CONVERSION;
185+
}
186+
187+
if (stereo && wav.channels < 2) {
188+
fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str());
189+
drwav_uninit(&wav);
72190
return false;
73191
}
74192

75193
if (wav.channels != 1 && wav.channels != 2) {
76-
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
194+
tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels);
77195
drwav_uninit(&wav);
78-
return false;
196+
TRY_CONVERSION;
79197
}
80198

81199
if (stereo && wav.channels != 2) {
82-
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
200+
tinylogf("%s: audio file has %d channels (we want diarization)\n", fname.c_str(), wav.channels);
83201
drwav_uninit(&wav);
84-
return false;
202+
TRY_CONVERSION;
85203
}
86204

87205
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
88-
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
206+
tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate);
89207
drwav_uninit(&wav);
90-
return false;
208+
TRY_CONVERSION;
91209
}
92210

93211
if (wav.bitsPerSample != 16) {
94-
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
212+
tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample);
95213
drwav_uninit(&wav);
96-
return false;
214+
TRY_CONVERSION;
97215
}
98216

99217
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
@@ -171,7 +289,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
171289
energy_last /= n_samples_last;
172290

173291
if (verbose) {
174-
fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
292+
tinylogf("%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
175293
}
176294

177295
if (energy_last > vad_thold*energy_all) {

whisper.cpp/dr_wav.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#define DR_WAV_IMPLEMENTATION
2+
#include "dr_wav.h"

whisper.cpp/main.1

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@ Puts program in HTTP server mode.
3232
Path of Whisper model weights. See
3333
https://huggingface.co/ggerganov/whisper.cpp
3434
.It Fl f Ar FNAME , Fl Fl file Ar FNAME
35-
Path of WAV file to transcribe.
35+
Path of audio file to transcribe. The preferred audio format is a 16khz
36+
16-bit signed linear WAV file, which can be stereo or mono. It's also
37+
permissible to pass an MP3, FLAC, or OGG file, in which case it'll be
38+
converted to a .wav file in your temp directory before transcribing.
3639
.It Fl tr , Fl Fl translate
3740
Translate audio into English text.
3841
.It Fl ot Ar N , Fl Fl offset-t Ar N

whisper.cpp/main.1.asc

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@
3030
face.co/ggerganov/whisper.cpp
3131
3232
-f FNAME, --file FNAME
33-
Path of WAV file to transcribe.
33+
Path of audio file to transcribe. The preferred audio format is
34+
a 16khz 16-bit signed linear WAV file, which can be stereo or
35+
mono. It's also permissible to pass an MP3, FLAC, or OGG file,
36+
in which case it'll be converted to .wav file in your temp di‐
37+
rectory before transcribing.
3438
3539
-tr, --translate
3640
Translate audio into English text.
@@ -47,8 +51,9 @@
4751
-pc, --print-colors
4852
Enables CLI printing of ANSI color codes.
4953
50-
Transcribed text will appear in the terminal on a spectrum of
51-
color ranging from green to red. Green means the model
54+
Transcribed text will appear in the terminal on a spectrum of
55+
color ranging from green to red. Green represents confidence
56+
whereas red represents uncertainty.
5257
5358
-t N, --threads N
5459
Overrides number of threads to use.

whisper.cpp/miniaudio.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "stb/stb_vorbis.h"
22

33
#define MA_NO_DEVICE_IO
4+
#define MA_NO_RUNTIME_LINKING
45
#define MINIAUDIO_IMPLEMENTATION
56
#pragma GCC diagnostic ignored "-Wstringop-overflow"
67
#include "miniaudio.h"

0 commit comments

Comments
 (0)