22// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
33#define _USE_MATH_DEFINES // for M_PI
44
5+ #include " llamafile/log.h"
6+ #include " llamafile/llamafile.h"
57#include " common.h"
68
79// third-party utilities
810// use your favorite implementations
9- #define DR_WAV_IMPLEMENTATION
11+ // #define DR_WAV_IMPLEMENTATION // [jart] comment out
1012#include " dr_wav.h"
1113
1214#if defined(_MSC_VER)
1820#include < io.h>
1921#endif
2022
23+ #include < cosmo.h>
24+ #include < stdlib.h>
25+ #include < unistd.h>
26+
27+ #include " stb/stb_vorbis.h"
28+ #include " miniaudio.h"
29+
30+ #define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096
31+
// Path of the file to remove when the process exits; empty means nothing to remove.
static std::string delete_me;

// Exit handler (registered through atexit): unlinks the file named by delete_me.
static void on_exit(void) {
    if (delete_me.empty()) {
        return;
    }
    unlink(delete_me.c_str());
}
39+
2140bool is_wav_buffer (const std::string buf) {
2241 // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
2342 // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
@@ -33,10 +52,103 @@ bool is_wav_buffer(const std::string buf) {
3352 return true ;
3453}
3554
36- bool read_wav (const std::string & fname, std::vector<float >& pcmf32, std::vector<std::vector<float >>& pcmf32s, bool stereo) {
55+ static ma_result perform_audio_conversion (ma_decoder* pDecoder, ma_encoder* pEncoder) {
56+ ma_result rc = MA_SUCCESS;
57+ for (;;) {
58+ ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE];
59+ ma_uint64 framesReadThisIteration;
60+ ma_uint64 framesToReadThisIteration;
61+ framesToReadThisIteration = sizeof (pRawData) / ma_get_bytes_per_frame (pDecoder->outputFormat , pDecoder->outputChannels );
62+ rc = ma_decoder_read_pcm_frames (pDecoder, pRawData, framesToReadThisIteration, &framesReadThisIteration);
63+ if (rc != MA_SUCCESS) {
64+ break ;
65+ }
66+ ma_encoder_write_pcm_frames (pEncoder, pRawData, framesReadThisIteration, NULL );
67+ if (framesReadThisIteration < framesToReadThisIteration) {
68+ break ;
69+ }
70+ }
71+ return rc;
72+ }
73+
74+ // converts audio file to signed 16-bit 16000hz wav
75+ static std::string convert_audio_file (const std::string & fname, bool stereo) {
76+
77+ // create temporary filename
78+ std::string newpath;
79+ newpath = __get_tmpdir ();
80+ newpath += " /whisperfile." ;
81+ newpath += std::to_string (_rand64 ());
82+ newpath += " .wav" ;
83+
84+ // create decoder
85+ ma_decoder_config decoderConfig =
86+ ma_decoder_config_init (ma_format_s16, 1 + stereo, COMMON_SAMPLE_RATE);
87+ decoderConfig.resampling .algorithm = ma_resample_algorithm_linear;
88+ decoderConfig.resampling .linear .lpfOrder = 8 ;
89+
90+ // open input file
91+ ma_decoder decoder;
92+ ma_result rc = ma_decoder_init_file (fname.c_str (), &decoderConfig, &decoder);
93+ if (rc != MA_SUCCESS) {
94+ fprintf (stderr, " %s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n " ,
95+ fname.c_str (), ma_result_description (rc));
96+ return " " ;
97+ }
98+
99+ // create encoder
100+ ma_encoder encoder;
101+ ma_encoder_config encoderConfig = ma_encoder_config_init (
102+ ma_encoding_format_wav,
103+ decoder.outputFormat ,
104+ decoder.outputChannels ,
105+ decoder.outputSampleRate );
106+ rc = ma_encoder_init_file (newpath.c_str (), &encoderConfig, &encoder);
107+ if (rc != MA_SUCCESS) {
108+ ma_decoder_uninit (&decoder);
109+ fprintf (stderr, " %s: failed to open output file: %s\n " ,
110+ newpath.c_str (), ma_result_description (rc));
111+ return " " ;
112+ }
113+
114+ // perform the conversion
115+ rc = perform_audio_conversion (&decoder, &encoder);
116+ ma_encoder_uninit (&encoder);
117+ ma_decoder_uninit (&decoder);
118+ if (rc != MA_SUCCESS) {
119+ fprintf (stderr, " %s: failed to convert audio file: %s\n " ,
120+ fname.c_str (), ma_result_description (rc));
121+ return " " ;
122+ }
123+
124+ // return new path
125+ delete_me = newpath;
126+ atexit (on_exit);
127+ return newpath;
128+ }
129+
// Fallback used inside read_wav(): transcodes the input with
// convert_audio_file() and jumps back to the TryAgain label to reopen the
// converted file.
//
// Expects the enclosing function to provide `fname` (std::string, replaced by
// the converted path), `stereo`, `did_conversion` (bool) and a `TryAgain:`
// label. The enclosing function returns false when a conversion has already
// been attempted or when the conversion itself fails, so at most one
// conversion happens per call.
#define TRY_CONVERSION \
    do { \
        if (did_conversion) { \
            /* name the file, like every other diagnostic in this file */ \
            fprintf(stderr, "%s: failed to open audio file\n", fname.c_str()); \
            return false; \
        } \
        std::string converted_path = convert_audio_file(fname, stereo); \
        if (converted_path.empty()) { \
            return false; \
        } \
        fname = converted_path; \
        did_conversion = true; \
        goto TryAgain; \
    } while (0)
144+
145+ bool read_wav (const std::string & fname_, std::vector<float >& pcmf32, std::vector<std::vector<float >>& pcmf32s, bool stereo) {
37146 drwav wav;
38147 std::vector<uint8_t > wav_data; // used for pipe input from stdin
148+ std::string fname = fname_;
149+ bool did_conversion = false ;
39150
151+ TryAgain:
40152 if (fname == " -" ) {
41153 {
42154 #ifdef _WIN32
@@ -68,32 +180,38 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
68180 }
69181 }
70182 else if (drwav_init_file (&wav, fname.c_str (), nullptr ) == false ) {
71- fprintf (stderr, " error: failed to open '%s' as WAV file\n " , fname.c_str ());
183+ tinylogf (" %s: converting to wav...\n " , fname.c_str ());
184+ TRY_CONVERSION;
185+ }
186+
187+ if (stereo && wav.channels < 2 ) {
188+ fprintf (stderr, " %s: audio file must be stereo for diarization\n " , fname.c_str ());
189+ drwav_uninit (&wav);
72190 return false ;
73191 }
74192
75193 if (wav.channels != 1 && wav.channels != 2 ) {
76- fprintf (stderr, " %s: WAV file '%s' must be mono or stereo \n " , __func__, fname.c_str ());
194+ tinylogf ( " %s: audio file has %d channels \n " , fname.c_str (), wav. channels );
77195 drwav_uninit (&wav);
78- return false ;
196+ TRY_CONVERSION ;
79197 }
80198
81199 if (stereo && wav.channels != 2 ) {
82- fprintf (stderr, " %s: WAV file '%s' must be stereo for diarization\n " , __func__, fname.c_str ());
200+ tinylogf ( " %s: audio file has %d channels (we want diarization) \n " , fname.c_str (), wav. channels );
83201 drwav_uninit (&wav);
84- return false ;
202+ TRY_CONVERSION ;
85203 }
86204
87205 if (wav.sampleRate != COMMON_SAMPLE_RATE) {
88- fprintf (stderr, " %s: WAV file '%s' must be %i kHz \n " , __func__, fname.c_str (), COMMON_SAMPLE_RATE/ 1000 );
206+ tinylogf ( " %s: audio file has %d sample rate \n " , fname.c_str (), wav. sampleRate );
89207 drwav_uninit (&wav);
90- return false ;
208+ TRY_CONVERSION ;
91209 }
92210
93211 if (wav.bitsPerSample != 16 ) {
94- fprintf (stderr, " %s: WAV file '%s' must be 16-bit \n " , __func__, fname.c_str ());
212+ tinylogf ( " %s: audio file has %d bits per sample \n " , fname.c_str (), wav. bitsPerSample );
95213 drwav_uninit (&wav);
96- return false ;
214+ TRY_CONVERSION ;
97215 }
98216
99217 const uint64_t n = wav_data.empty () ? wav.totalPCMFrameCount : wav_data.size ()/(wav.channels *wav.bitsPerSample /8 );
@@ -171,7 +289,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
171289 energy_last /= n_samples_last;
172290
173291 if (verbose) {
174- fprintf (stderr, " %s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n " , __func__, energy_all, energy_last, vad_thold, freq_thold);
292+ tinylogf ( " %s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n " , __func__, energy_all, energy_last, vad_thold, freq_thold);
175293 }
176294
177295 if (energy_last > vad_thold*energy_all) {
0 commit comments