Rewrite audio file loader code

jart · jart · commit 74dfd21eacf2 · 2024-09-27T22:57:09.000-07:00
We now have a new function, slurp_audio_file(), which replaces read_wav(). The new function has simpler code and allows us to avoid a temporary file. See #568
diff --git a/whisper.cpp/common.cpp b/whisper.cpp/common.cpp
@@ -27,206 +27,6 @@
 #include "stb/stb_vorbis.h"
 #include "miniaudio.h"
 
-#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096
-
-static std::string delete_me;
-
-static void on_exit(void) {
-    if (!delete_me.empty()) {
-        unlink(delete_me.c_str());
-    }
-}
-
-static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) {
-    ma_result rc = MA_SUCCESS;
-    for (;;) {
-        ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
-        ma_uint64 framesReadThisIteration;
-        ma_uint64 framesToReadThisIteration;
-        framesToReadThisIteration = sizeof(pRawData) / ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels);
-        rc = ma_decoder_read_pcm_frames(pDecoder, pRawData, framesToReadThisIteration, &framesReadThisIteration);
-        if (rc != MA_SUCCESS) {
-            break;
-        }
-        ma_encoder_write_pcm_frames(pEncoder, pRawData, framesReadThisIteration, NULL);
-        if (framesReadThisIteration < framesToReadThisIteration) {
-            break;
-        }
-    }
-    return rc;
-}
-
-// converts audio file to signed 16-bit 16000hz wav
-static std::string convert_audio_file(const std::string & fname, bool stereo) {
-
-    // create temporary filename
-    std::string newpath;
-    newpath = __get_tmpdir();
-    newpath += "/whisperfile.";
-    newpath += std::to_string(_rand64());
-    newpath += ".wav";
-
-    // create decoder
-    ma_decoder_config decoderConfig =
-            ma_decoder_config_init(ma_format_s16, 1 + stereo, COMMON_SAMPLE_RATE);
-    decoderConfig.resampling.algorithm = ma_resample_algorithm_linear;
-    decoderConfig.resampling.linear.lpfOrder = 8;
-
-    // open input file
-    ma_decoder decoder;
-    ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder);
-    if (rc != MA_SUCCESS) {
-        fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n",
-                fname.c_str(), ma_result_description(rc));
-        return "";
-    }
-
-    // create encoder
-    ma_encoder encoder;
-    ma_encoder_config encoderConfig = ma_encoder_config_init(
-        ma_encoding_format_wav,
-        decoder.outputFormat,
-        decoder.outputChannels,
-        decoder.outputSampleRate);
-    rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder);
-    if (rc != MA_SUCCESS) {
-        ma_decoder_uninit(&decoder);
-        fprintf(stderr, "%s: failed to open output file: %s\n",
-                newpath.c_str(), ma_result_description(rc));
-        return "";
-    }
-
-    // perform the conversion
-    rc = perform_audio_conversion(&decoder, &encoder);
-    ma_encoder_uninit(&encoder);
-    ma_decoder_uninit(&decoder);
-    if (rc != MA_SUCCESS) {
-        fprintf(stderr, "%s: failed to convert audio file: %s\n",
-                fname.c_str(), ma_result_description(rc));
-        return "";
-    }
-
-    // return new path
-    delete_me = newpath;
-    atexit(on_exit);
-    return newpath;
-}
-
-#define TRY_CONVERSION                                                  \
-    do {                                                                \
-        if (did_conversion) {                                           \
-            fprintf(stderr, "error: failed to open audio file\n");      \
-            return false;                                               \
-        }                                                               \
-        std::string fname2 = convert_audio_file(fname, stereo);         \
-        if (fname2.empty()) {                                           \
-            return false;                                               \
-        }                                                               \
-        fname = fname2;                                                 \
-        did_conversion = true;                                          \
-        goto TryAgain;                                                  \
-    } while (0)
-
-bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
-    drwav wav;
-    std::vector<uint8_t> wav_data; // used for pipe input from stdin
-    std::string fname = fname_;
-    bool did_conversion = false;
-
-TryAgain:
-    if (fname == "-") {
-        {
-            #ifdef _WIN32
-            _setmode(_fileno(stdin), _O_BINARY);
-            #endif
-
-            uint8_t buf[1024];
-            while (true)
-            {
-                const size_t n = fread(buf, 1, sizeof(buf), stdin);
-                if (n == 0) {
-                    break;
-                }
-                wav_data.insert(wav_data.end(), buf, buf + n);
-            }
-        }
-
-        if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from stdin\n");
-            return false;
-        }
-
-        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
-    }
-    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
-        tinylogf("%s: converting to wav...\n", fname.c_str());
-        TRY_CONVERSION;
-    }
-
-    if (stereo && wav.channels < 2) {
-        fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str());
-        drwav_uninit(&wav);
-        return false;
-    }
-
-    if (wav.channels != 1 && wav.channels != 2) {
-        tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    if (stereo && wav.channels != 2) {
-        tinylogf("%s: audio file has %d channels (we want diarization)\n", fname.c_str(), wav.channels);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    if (wav.sampleRate != COMMON_SAMPLE_RATE) {
-        tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    if (wav.bitsPerSample != 16) {
-        tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample);
-        drwav_uninit(&wav);
-        TRY_CONVERSION;
-    }
-
-    const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
-
-    std::vector<int16_t> pcm16;
-    pcm16.resize(n*wav.channels);
-    drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
-    drwav_uninit(&wav);
-
-    // convert to mono, float
-    pcmf32.resize(n);
-    if (wav.channels == 1) {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[i])/32768.0f;
-        }
-    } else {
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
-        }
-    }
-
-    if (stereo) {
-        // convert to stereo, float
-        pcmf32s.resize(2);
-
-        pcmf32s[0].resize(n);
-        pcmf32s[1].resize(n);
-        for (uint64_t i = 0; i < n; i++) {
-            pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
-            pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
-        }
-    }
-
-    return true;
-}
-
 void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
     const float rc = 1.0f / (2.0f * M_PI * cutoff);
     const float dt = 1.0f / sample_rate;
diff --git a/whisper.cpp/common.h b/whisper.cpp/common.h
@@ -19,16 +19,6 @@
 // Check if a buffer is a WAV audio file
 bool is_wav_buffer(const std::string buf);
 
-// Read WAV audio file and store the PCM data into pcmf32
-// fname can be a buffer of WAV data instead of a filename
-// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
-// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
-bool read_wav(
-        const std::string & fname,
-        std::vector<float> & pcmf32,
-        std::vector<std::vector<float>> & pcmf32s,
-        bool stereo);
-
 // Write PCM data into WAV audio file
 class wav_writer {
 private:
diff --git a/whisper.cpp/main.cpp b/whisper.cpp/main.cpp
@@ -5,6 +5,7 @@
 #include "llamafile/llamafile.h"
 #include "llama.cpp/cores.h"
 #include "common.h"
+#include "slurp.h"
 
 #include "whisper.h"
 #include "grammar-parser.h"
@@ -1108,8 +1109,8 @@ int main(int argc, char ** argv) {
         std::vector<float> pcmf32;               // mono-channel F32 PCM
         std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
 
-        if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
+        if (!slurp_audio_file(fname_inp.c_str(), pcmf32, pcmf32s, params.diarize)) {
+            fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
             continue;
         }
 
diff --git a/whisper.cpp/server.cpp b/whisper.cpp/server.cpp
@@ -2,6 +2,7 @@
 // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 #include "llamafile/debug.h"
 #include "common.h"
+#include "slurp.h"
 
 #include "whisper.h"
 #include "httplib.h"
@@ -44,8 +45,6 @@ struct server_params
     int32_t port          = 8080;
     int32_t read_timeout  = 600;
     int32_t write_timeout = 600;
-
-    bool ffmpeg_converter = false;
 };
 
 struct whisper_params {
@@ -138,7 +137,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
     fprintf(stderr, "  --request-path PATH,           [%-7s] Request path for all requests\n", sparams.request_path.c_str());
     fprintf(stderr, "  --inference-path PATH,         [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
-    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server\n", sparams.ffmpeg_converter ? "true" : "false");
     fprintf(stderr, "  --recompile                    [%-7s] Force GPU support to be recompiled at runtime if possible.\n", FLAG_recompile ? "true" : "false");
     fprintf(stderr, "  --nocompile                    [%-7s] Never compile GPU support at runtime.", FLAG_nocompile ? "true" : "false");
     fprintf(stderr, "\n");
@@ -224,7 +222,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
         else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
         else if (                  arg == "--request-path")    { sparams.request_path = argv[++i]; }
-        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
         else if (                  arg == "--recompile")       { FLAG_recompile = true; }
         else if (                  arg == "--nocompile")       { FLAG_nocompile = true; }
         else if (                  arg == "--tinyblas")        { FLAG_tinyblas = true; }
@@ -262,45 +259,6 @@ struct whisper_print_user_data {
     int progress_prev;
 };
 
-void check_ffmpeg_availibility() {
-    int result = system("ffmpeg -version");
-
-    if (result == 0) {
-        std::cout << "ffmpeg is available." << std::endl;
-    } else {
-        // ffmpeg is not available
-        std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed ";
-        std::cout << "and that its executable is included in your system's PATH. ";
-        exit(0);
-    }
-}
-
-bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
-    std::ostringstream cmd_stream;
-    std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
-    std::string cmd = cmd_stream.str();
-
-    int status = std::system(cmd.c_str());
-    if (status != 0) {
-        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
-        return false;
-    }
-
-    // Remove the original file
-    if (remove(temp_filename.c_str()) != 0) {
-        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
-        return false;
-    }
-
-    // Rename the temporary file to match the original filename
-    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
-        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
-        return false;
-    }
-    return true;
-}
-
 std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
     std::string speaker = "";
     const int64_t n_samples = pcmf32s[0].size();
@@ -558,9 +516,6 @@ int whisper_server_main(int argc, char ** argv) {
         exit(0);
     }
 
-    if (sparams.ffmpeg_converter) {
-        check_ffmpeg_availibility();
-    }
     // whisper init
     struct whisper_context_params cparams = whisper_context_default_params();
 
@@ -741,36 +696,14 @@ int whisper_server_main(int argc, char ** argv) {
         temp_file << audio_file.content;
         temp_file.close();
 
-        if (sparams.ffmpeg_converter) {
-
-            std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
-            const bool is_converted = convert_to_wav(temp_filename, error_resp);
-            if (!is_converted) {
-                res.set_content(error_resp, "application/json");
-                return;
-            }
-
-            // read wav content into pcmf32
-            if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
-            {
-                fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
-                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-                res.set_content(error_resp, "application/json");
-                std::remove(temp_filename.c_str());
-                return;
-            }
-        } else {
-            if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
-            {
-                fprintf(stderr, "error: failed to read WAV file\n");
-                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-                res.set_content(error_resp, "application/json");
-                return;
-            }
+        bool ok = slurp_audio_file(temp_filename.c_str(), pcmf32, pcmf32s, params.diarize);
+        unlink(temp_filename.c_str());
+        if (!ok) {
+            fprintf(stderr, "error: failed to read audio file\n");
+            const std::string error_resp = "{\"error\":\"failed to read audio file\"}";
+            res.set_content(error_resp, "application/json");
+            return;
         }
-        // remove temp file
-        std::remove(temp_filename.c_str());
-
 
         printf("Successfully loaded %s\n", filename.c_str());
 
diff --git a/whisper.cpp/slurp.cpp b/whisper.cpp/slurp.cpp
diff --git a/whisper.cpp/slurp.h b/whisper.cpp/slurp.h