Skip to content

Commit 74dfd21

Browse files
committed
Rewrite audio file loader code
We now have a new function slurp_audio_file() which replaces read_wav(). This function has simpler code, and allows us to avoid a temporary file. See #568
1 parent 180a013 commit 74dfd21

File tree

6 files changed

+128
-287
lines changed

6 files changed

+128
-287
lines changed

whisper.cpp/common.cpp

Lines changed: 0 additions & 200 deletions
Original file line numberDiff line numberDiff line change
@@ -27,206 +27,6 @@
2727
#include "stb/stb_vorbis.h"
2828
#include "miniaudio.h"
2929

30-
// Size in bytes of the on-stack scratch buffer that perform_audio_conversion()
// uses to move decoded PCM frames from the decoder to the encoder.
#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096
31-
32-
// Path of the temporary WAV file written by convert_audio_file(). on_exit()
// unlinks this path when it is non-empty. Only one path is remembered.
static std::string delete_me;
33-
34-
// Exit handler (registered with atexit() by convert_audio_file) that removes
// the temporary WAV file recorded in delete_me, if one was recorded.
static void on_exit(void) {
35-
if (!delete_me.empty()) {
36-
unlink(delete_me.c_str());
37-
}
38-
}
}
39-
40-
// Pumps PCM frames from pDecoder to pEncoder until the decoder fails or
// returns fewer frames than requested.
//
// pDecoder: an initialized decoder; its outputFormat/outputChannels determine
//           how many frames fit in the scratch buffer.
// pEncoder: an initialized encoder that receives every frame that was read.
//
// Returns the result of the last ma_decoder_read_pcm_frames() call.
// NOTE(review): if the decoder reports end-of-stream through a result code
// other than MA_SUCCESS, that code is returned and the caller will treat it
// as a conversion failure -- confirm against the miniaudio version in use.
static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) {
41-
ma_result rc = MA_SUCCESS;
42-
for (;;) {
43-
// Fixed-size stack buffer; holds as many whole frames as fit in it.
ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
44-
ma_uint64 framesReadThisIteration;
45-
ma_uint64 framesToReadThisIteration;
46-
framesToReadThisIteration = sizeof(pRawData) / ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
47-
rc = ma_decoder_read_pcm_frames(pDecoder, pRawData, framesToReadThisIteration, &framesReadThisIteration);
48-
if (rc != MA_SUCCESS) {
49-
break;
50-
}
51-
// The encoder's return value and frames-written count are not checked,
// so a failed or short write goes unnoticed here.
ma_encoder_write_pcm_frames(pEncoder, pRawData, framesReadThisIteration, NULL);
52-
// A short read is taken to mean the input is exhausted.
if (framesReadThisIteration < framesToReadThisIteration) {
53-
break;
54-
}
55-
}
56-
return rc;
57-
}
58-
59-
// converts audio file to signed 16-bit 16000hz wav
//
// fname:  path of the input audio file, handed to ma_decoder_init_file().
// stereo: when true the decoder is configured for 2 output channels,
//         otherwise 1.
//
// Returns the path of the newly written temporary WAV file, or an empty
// string on failure (an error message is printed to stderr).
//
// On success the path is stored in delete_me and on_exit is registered with
// atexit(). delete_me holds a single path, so a later successful call
// replaces the earlier one and only the most recent file is unlinked at exit.
60-
static std::string convert_audio_file(const std::string & fname, bool stereo) {
61-
62-
// create temporary filename
63-
std::string newpath;
64-
newpath = __get_tmpdir();
65-
newpath += "/whisperfile.";
66-
newpath += std::to_string(_rand64());
67-
newpath += ".wav";
68-
69-
// create decoder
70-
// Output is s16 with (1 + stereo) channels at COMMON_SAMPLE_RATE, using the
// linear resampler with lpfOrder set to 8.
ma_decoder_config decoderConfig =
71-
ma_decoder_config_init(ma_format_s16, 1 + stereo, COMMON_SAMPLE_RATE);
72-
decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
73-
decoderConfig.resampling.linear.lpfOrder = 8;
74-
75-
// open input file
76-
ma_decoder decoder;
77-
ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder);
78-
if (rc != MA_SUCCESS) {
79-
fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
80-
fname.c_str(), ma_result_description(rc));
81-
return "";
82-
}
83-
84-
// create encoder
85-
// The encoder mirrors the decoder's output format, channels and sample rate.
ma_encoder encoder;
86-
ma_encoder_config encoderConfig = ma_encoder_config_init(
87-
ma_encoding_format_wav,
88-
decoder.outputFormat,
89-
decoder.outputChannels,
90-
decoder.outputSampleRate);
91-
rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder);
92-
if (rc != MA_SUCCESS) {
93-
ma_decoder_uninit(&decoder);
94-
fprintf(stderr, "%s: failed to open output file: %s\n",
95-
newpath.c_str(), ma_result_description(rc));
96-
return "";
97-
}
98-
99-
// perform the conversion
100-
rc = perform_audio_conversion(&decoder, &encoder);
101-
ma_encoder_uninit(&encoder);
102-
ma_decoder_uninit(&decoder);
103-
// On failure the output file at newpath is neither unlinked here nor
// recorded in delete_me, so nothing in this function removes it.
if (rc != MA_SUCCESS) {
104-
fprintf(stderr, "%s: failed to convert audio file: %s\n",
105-
fname.c_str(), ma_result_description(rc));
106-
return "";
107-
}
108-
109-
// return new path
110-
// atexit(on_exit) is called on every successful conversion.
delete_me = newpath;
111-
atexit(on_exit);
112-
return newpath;
113-
}
114-
115-
// Helper macro for read_wav(): falls back to converting the input with
// convert_audio_file() and retrying.
//
// It expands in a scope that provides the locals `did_conversion`, `fname`
// and `stereo`, a `TryAgain` label, and a bool return type:
//   - if a conversion was already attempted, prints an error and returns false;
//   - otherwise converts `fname`; returns false if that yields an empty path;
//   - otherwise replaces `fname` with the converted path, sets
//     `did_conversion`, and jumps back to `TryAgain`.
#define TRY_CONVERSION \
116-
do { \
117-
if (did_conversion) { \
118-
fprintf(stderr, "error: failed to open audio file\n"); \
119-
return false; \
120-
} \
121-
std::string fname2 = convert_audio_file(fname, stereo); \
122-
if (fname2.empty()) { \
123-
return false; \
124-
} \
125-
fname = fname2; \
126-
did_conversion = true; \
127-
goto TryAgain; \
128-
} while (0)
129-
130-
// Loads audio as float PCM.
//
// fname_:  path of the audio file, or "-" to read WAV data from stdin.
// pcmf32:  receives one float sample per frame (stereo input is averaged).
// pcmf32s: when `stereo` is true, receives two per-channel float vectors.
// stereo:  request per-channel output; the input must have at least 2 channels.
//
// Returns true on success, false on failure (a message goes to stderr).
//
// Input that dr_wav cannot open, or that is not 1-2 channel / 16-bit /
// COMMON_SAMPLE_RATE, is converted once through convert_audio_file() (see
// TRY_CONVERSION) and then re-read from the converted file.
bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
131-
drwav wav;
132-
std::vector<uint8_t> wav_data; // used for pipe input from stdin
133-
// Mutable copy: TRY_CONVERSION swaps this for the converted file's path.
std::string fname = fname_;
134-
bool did_conversion = false;
135-
136-
// Re-entry point used by TRY_CONVERSION after a successful conversion.
TryAgain:
137-
if (fname == "-") {
138-
{
139-
#ifdef _WIN32
140-
_setmode(_fileno(stdin), _O_BINARY);
141-
#endif
142-
143-
// Read stdin in 1 KiB chunks until fread returns 0.
uint8_t buf[1024];
144-
while (true)
145-
{
146-
const size_t n = fread(buf, 1, sizeof(buf), stdin);
147-
if (n == 0) {
148-
break;
149-
}
150-
wav_data.insert(wav_data.end(), buf, buf + n);
151-
}
152-
}
153-
154-
// No conversion fallback here: stdin data that dr_wav rejects is an error.
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
155-
fprintf(stderr, "error: failed to open WAV file from stdin\n");
156-
return false;
157-
}
158-
159-
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
160-
}
161-
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
162-
tinylogf("%s: converting to wav...\n", fname.c_str());
163-
TRY_CONVERSION;
164-
}
165-
166-
// NOTE(review): the checks below can invoke TRY_CONVERSION while fname is
// still "-"; convert_audio_file() would then be given "-" as a file path.
// Mono input with `stereo` requested is rejected outright, not converted.
if (stereo && wav.channels < 2) {
167-
fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str());
168-
drwav_uninit(&wav);
169-
return false;
170-
}
171-
172-
if (wav.channels != 1 && wav.channels != 2) {
173-
tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels);
174-
drwav_uninit(&wav);
175-
TRY_CONVERSION;
176-
}
177-
178-
// Given the two checks above (stereo implies channels >= 2, and channels is
// 1 or 2), this condition cannot be true when execution reaches it.
if (stereo && wav.channels != 2) {
179-
tinylogf("%s: audio file has %d channels (we want diarization)\n", fname.c_str(), wav.channels);
180-
drwav_uninit(&wav);
181-
TRY_CONVERSION;
182-
}
183-
184-
if (wav.sampleRate != COMMON_SAMPLE_RATE) {
185-
tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate);
186-
drwav_uninit(&wav);
187-
TRY_CONVERSION;
188-
}
189-
190-
if (wav.bitsPerSample != 16) {
191-
tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample);
192-
drwav_uninit(&wav);
193-
TRY_CONVERSION;
194-
}
195-
196-
// Frame count: taken from the WAV header for files; for stdin it is derived
// from the size of the whole buffer divided by the bytes per frame.
// NOTE(review): the stdin buffer presumably includes the WAV header bytes,
// so this may overestimate; the count returned by
// drwav_read_pcm_frames_s16() below is not checked.
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
197-
198-
std::vector<int16_t> pcm16;
199-
pcm16.resize(n*wav.channels);
200-
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
201-
drwav_uninit(&wav);
202-
203-
// convert to mono, float
204-
// wav.channels is read after drwav_uninit(); this relies on the field
// remaining intact in the struct -- TODO confirm.
pcmf32.resize(n);
205-
if (wav.channels == 1) {
206-
for (uint64_t i = 0; i < n; i++) {
207-
pcmf32[i] = float(pcm16[i])/32768.0f;
208-
}
209-
} else {
210-
// Two channels: sum left and right, divide by 65536 (average, scaled by 1/32768).
for (uint64_t i = 0; i < n; i++) {
211-
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
212-
}
213-
}
214-
215-
if (stereo) {
216-
// convert to stereo, float
217-
pcmf32s.resize(2);
218-
219-
pcmf32s[0].resize(n);
220-
pcmf32s[1].resize(n);
221-
// De-interleave: even samples go to channel 0, odd samples to channel 1.
for (uint64_t i = 0; i < n; i++) {
222-
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
223-
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
224-
}
225-
}
226-
227-
return true;
228-
}
229-
23030
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
23131
const float rc = 1.0f / (2.0f * M_PI * cutoff);
23232
const float dt = 1.0f / sample_rate;

whisper.cpp/common.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,6 @@
1919
// Check if a buffer is a WAV audio file
2020
bool is_wav_buffer(const std::string buf);
2121

22-
// Read an audio file and store the mono float PCM data into pcmf32
23-
// fname is a file path, or "-" to read WAV data from stdin
24-
// Audio that is not 16-bit, 1-2 channel WAV at COMMON_SAMPLE_RATE is converted first
25-
// If stereo flag is set, the audio must have 2 channels and pcmf32s will contain 2 channel PCM
26-
// Returns false on failure
bool read_wav(
27-
const std::string & fname,
28-
std::vector<float> & pcmf32,
29-
std::vector<std::vector<float>> & pcmf32s,
30-
bool stereo);
31-
3222
// Write PCM data into WAV audio file
3323
class wav_writer {
3424
private:

whisper.cpp/main.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "llamafile/llamafile.h"
66
#include "llama.cpp/cores.h"
77
#include "common.h"
8+
#include "slurp.h"
89

910
#include "whisper.h"
1011
#include "grammar-parser.h"
@@ -1108,8 +1109,8 @@ int main(int argc, char ** argv) {
11081109
std::vector<float> pcmf32; // mono-channel F32 PCM
11091110
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
11101111

1111-
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
1112-
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
1112+
if (!slurp_audio_file(fname_inp.c_str(), pcmf32, pcmf32s, params.diarize)) {
1113+
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
11131114
continue;
11141115
}
11151116

whisper.cpp/server.cpp

Lines changed: 8 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
33
#include "llamafile/debug.h"
44
#include "common.h"
5+
#include "slurp.h"
56

67
#include "whisper.h"
78
#include "httplib.h"
@@ -44,8 +45,6 @@ struct server_params
4445
int32_t port = 8080;
4546
int32_t read_timeout = 600;
4647
int32_t write_timeout = 600;
47-
48-
bool ffmpeg_converter = false;
4948
};
5049

5150
struct whisper_params {
@@ -138,7 +137,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
138137
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
139138
fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
140139
fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
141-
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server\n", sparams.ffmpeg_converter ? "true" : "false");
142140
fprintf(stderr, " --recompile [%-7s] Force GPU support to be recompiled at runtime if possible.\n", FLAG_recompile ? "true" : "false");
143141
fprintf(stderr, " --nocompile [%-7s] Never compile GPU support at runtime.", FLAG_nocompile ? "true" : "false");
144142
fprintf(stderr, "\n");
@@ -224,7 +222,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
224222
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
225223
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
226224
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
227-
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
228225
else if ( arg == "--recompile") { FLAG_recompile = true; }
229226
else if ( arg == "--nocompile") { FLAG_nocompile = true; }
230227
else if ( arg == "--tinyblas") { FLAG_tinyblas = true; }
@@ -262,45 +259,6 @@ struct whisper_print_user_data {
262259
int progress_prev;
263260
};
264261

265-
// Checks that an `ffmpeg` executable can be run by invoking `ffmpeg -version`
// through system(). Prints a confirmation on success; otherwise prints
// installation guidance and terminates the process.
void check_ffmpeg_availibility() {
266-
int result = system("ffmpeg -version");
267-
268-
if (result == 0) {
269-
std::cout << "ffmpeg is available." << std::endl;
270-
} else {
271-
// ffmpeg is not available
272-
std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed ";
273-
std::cout << "and that its executable is included in your system's PATH. ";
274-
// The process exits with status 0 even though this is the failure path.
exit(0);
275-
}
276-
}
277-
278-
// Converts the file at `temp_filename` in place to 16 kHz, mono, pcm_s16le
// WAV by shelling out to ffmpeg.
//
// temp_filename: path of the file to convert; the converted output is first
//                written to "<temp_filename>_temp.wav" and then renamed over
//                the original.
// error_resp:    set to a JSON error string when a step fails.
//
// Returns true on success, false if ffmpeg, remove() or rename() fails.
//
// NOTE(review): temp_filename is placed inside double quotes in a command
// line executed by std::system(); a name containing quotes or other shell
// metacharacters would alter the command. Verify how callers build the name.
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
279-
std::ostringstream cmd_stream;
280-
std::string converted_filename_temp = temp_filename + "_temp.wav";
281-
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
282-
std::string cmd = cmd_stream.str();
283-
284-
// A non-zero status is treated as failure; the converted file is not
// removed on this path.
int status = std::system(cmd.c_str());
285-
if (status != 0) {
286-
error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
287-
return false;
288-
}
289-
290-
// Remove the original file
291-
if (remove(temp_filename.c_str()) != 0) {
292-
error_resp = "{\"error\":\"Failed to remove the original file.\"}";
293-
return false;
294-
}
295-
296-
// Rename the temporary file to match the original filename
297-
if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
298-
error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
299-
return false;
300-
}
301-
return true;
302-
}
303-
304262
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
305263
std::string speaker = "";
306264
const int64_t n_samples = pcmf32s[0].size();
@@ -558,9 +516,6 @@ int whisper_server_main(int argc, char ** argv) {
558516
exit(0);
559517
}
560518

561-
if (sparams.ffmpeg_converter) {
562-
check_ffmpeg_availibility();
563-
}
564519
// whisper init
565520
struct whisper_context_params cparams = whisper_context_default_params();
566521

@@ -741,36 +696,14 @@ int whisper_server_main(int argc, char ** argv) {
741696
temp_file << audio_file.content;
742697
temp_file.close();
743698

744-
if (sparams.ffmpeg_converter) {
745-
746-
std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
747-
const bool is_converted = convert_to_wav(temp_filename, error_resp);
748-
if (!is_converted) {
749-
res.set_content(error_resp, "application/json");
750-
return;
751-
}
752-
753-
// read wav content into pcmf32
754-
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
755-
{
756-
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
757-
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
758-
res.set_content(error_resp, "application/json");
759-
std::remove(temp_filename.c_str());
760-
return;
761-
}
762-
} else {
763-
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
764-
{
765-
fprintf(stderr, "error: failed to read WAV file\n");
766-
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
767-
res.set_content(error_resp, "application/json");
768-
return;
769-
}
699+
bool ok = slurp_audio_file(temp_filename.c_str(), pcmf32, pcmf32s, params.diarize);
700+
unlink(temp_filename.c_str());
701+
if (!ok) {
702+
fprintf(stderr, "error: failed to read audio file\n");
703+
const std::string error_resp = "{\"error\":\"failed to read audio file\"}";
704+
res.set_content(error_resp, "application/json");
705+
return;
770706
}
771-
// remove temp file
772-
std::remove(temp_filename.c_str());
773-
774707

775708
printf("Successfully loaded %s\n", filename.c_str());
776709

0 commit comments

Comments
 (0)