44
55// third-party utilities
66// use your favorite implementations
7- #define DR_WAV_IMPLEMENTATION
8- #include " dr_wav.h"
7+ #define STB_VORBIS_HEADER_ONLY
8+ #include " stb_vorbis.c" /* Enables Vorbis decoding. */
9+
10+ #define MA_NO_DEVICE_IO
11+ #define MA_NO_THREADING
12+ #define MA_NO_ENCODING
13+ #define MA_NO_GENERATION
14+ #define MA_NO_RESOURCE_MANAGER
15+ #define MA_NO_NODE_GRAPH
16+ #define MINIAUDIO_IMPLEMENTATION
17+ #include " miniaudio.h"
918
1019#include < cmath>
1120#include < cstring>
@@ -639,9 +648,14 @@ bool is_wav_buffer(const std::string buf) {
639648 return true ;
640649}
641650
642- bool read_wav (const std::string & fname, std::vector<float >& pcmf32, std::vector<std::vector<float >>& pcmf32s, bool stereo) {
643- drwav wav;
644- std::vector<uint8_t > wav_data; // used for pipe input from stdin or ffmpeg decoding output
651+ bool read_audio_data (const std::string & fname, std::vector<float >& pcmf32, std::vector<std::vector<float >>& pcmf32s, bool stereo) {
652+ std::vector<uint8_t > audio_data; // used for pipe input from stdin or ffmpeg decoding output
653+ // Decodes the input selected by fname into float PCM; on failure prints a message to stderr and returns false.
654+ ma_result result;
655+ ma_decoder_config decoder_config;
656+ ma_decoder decoder;
657+ // Request f32 samples, two channels when stereo is set (one otherwise), at COMMON_SAMPLE_RATE.
658+ decoder_config = ma_decoder_config_init (ma_format_f32, stereo ? 2 : 1 , COMMON_SAMPLE_RATE);
645659
646660 if (fname == " -" ) {
647661 {
@@ -656,94 +670,78 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
656670 if (n == 0 ) {
657671 break ;
658672 }
659- wav_data .insert (wav_data .end (), buf, buf + n);
673+ audio_data .insert (audio_data .end (), buf, buf + n);
660674 }
661675 }
662676
663- if (drwav_init_memory (&wav, wav_data.data (), wav_data.size (), nullptr ) == false ) {
664- fprintf (stderr, " error: failed to open WAV file from stdin\n " );
665- return false ;
666- }
677+ if ((result = ma_decoder_init_memory (audio_data.data (), audio_data.size (), &decoder_config, &decoder)) != MA_SUCCESS) {
678+ // The bytes accumulated in audio_data could not be opened by the decoder.
679+ fprintf (stderr, " Error: failed to open audio data from stdin (%s)\n " , ma_result_description (result));
667680
668- fprintf (stderr, " %s: read %zu bytes from stdin\n " , __func__, wav_data.size ());
681+ return false ;
682+ }
683+
684+ fprintf (stderr, " %s: read %zu bytes from stdin\n " , __func__, audio_data.size ());
669685 }
670686 else if (is_wav_buffer (fname)) {
671- if (drwav_init_memory (&wav, fname.c_str (), fname.size (), nullptr ) == false ) {
672- fprintf (stderr, " error: failed to open WAV file from fname buffer\n " );
673- return false ;
674- }
687+ // In this branch fname is the encoded audio itself, not a path; audio_data is still empty, so decode straight from fname.
688+ if ((result = ma_decoder_init_memory (fname.c_str (), fname.size (), &decoder_config, &decoder)) != MA_SUCCESS) {
689+ fprintf (stderr, " Error: failed to open audio data from fname buffer (%s)\n " , ma_result_description (result));
690+
691+ return false ;
692+ }
675693 }
676- else if (drwav_init_file (&wav, fname.c_str (), nullptr ) == false ) {
694+ else if ((result = ma_decoder_init_file ( fname.c_str (), &decoder_config, &decoder)) != MA_SUCCESS ) {
677695#if defined(WHISPER_FFMPEG)
678- if (ffmpeg_decode_audio (fname, wav_data) != 0 ) {
679- fprintf (stderr, " error: failed to ffmpeg decode '%s' \n " , fname.c_str ());
680- return false ;
681- }
682- if (drwav_init_memory (&wav, wav_data.data (), wav_data.size (), nullptr ) == false ) {
683- fprintf (stderr, " error: failed to read wav data as wav \n " );
684- return false ;
685- }
696+ if (ffmpeg_decode_audio (fname, audio_data) != 0 ) {
697+ // The decoder could not open the file directly and the ffmpeg fallback failed as well.
698+ fprintf (stderr, " error: failed to ffmpeg decode '%s'\n " , fname.c_str ());
699+
700+ return false ;
701+ }
702+ if ((result = ma_decoder_init_memory (audio_data.data (), audio_data.size (), &decoder_config, &decoder)) != MA_SUCCESS) {
703+ // ffmpeg succeeded, but the decoder rejected the bytes it left in audio_data.
704+ fprintf (stderr, " error: failed to read audio data as wav (%s)\n " , ma_result_description (result));
705+
706+ return false ;
707+ }
686708#else
687- fprintf (stderr, " error: failed to open '%s' as WAV file\n " , fname.c_str ());
688- return false ;
709+ fprintf (stderr, " error: failed to open '%s' file (%s)\n " , fname.c_str (), ma_result_description (result));
710+ // Built without WHISPER_FFMPEG: there is no fallback, so a file the decoder cannot open is an error.
711+ return false ;
689712#endif
690713 }
691714
692- if (wav.channels != 1 && wav.channels != 2 ) {
693- fprintf (stderr, " %s: WAV file '%s' must be mono or stereo\n " , __func__, fname.c_str ());
694- drwav_uninit (&wav);
695- return false ;
696- }
715+ ma_uint64 frame_count; // length of the input, in PCM frames, as reported by the decoder
716+ ma_uint64 frames_read; // number of frames the read call below actually delivered
697717
698- if (stereo && wav.channels != 2 ) {
699- fprintf (stderr, " %s: WAV file '%s' must be stereo for diarization\n " , __func__, fname.c_str ());
700- drwav_uninit (&wav);
701- return false ;
702- }
718+ if ((result = ma_decoder_get_length_in_pcm_frames (&decoder, &frame_count)) != MA_SUCCESS) {
703719
704- if (wav.sampleRate != COMMON_SAMPLE_RATE) {
705- fprintf (stderr, " %s: WAV file '%s' must be %i kHz\n " , __func__, fname.c_str (), COMMON_SAMPLE_RATE/1000 );
706- drwav_uninit (&wav);
707- return false ;
708- }
720+ fprintf (stderr, " error: failed to retrieve the length of the audio data (%s)\n " , ma_result_description (result));
709-
721+ ma_decoder_uninit (&decoder); // the decoder was initialised successfully above, so release it before bailing out
710- if (wav.bitsPerSample != 16 ) {
711- fprintf (stderr, " %s: WAV file '%s' must be 16-bit\n " , __func__, fname.c_str ());
712- drwav_uninit (&wav);
713- return false ;
722+ return false ;
714723 }
715724
716- const uint64_t n = wav_data. empty () ? wav. totalPCMFrameCount : wav_data. size ()/(wav. channels *wav. bitsPerSample / 8 );
725+ pcmf32. resize (stereo ? frame_count* 2 : frame_count ); // interleaved output: two floats per frame for stereo, one for mono
717726
718- std::vector<int16_t > pcm16;
719- pcm16.resize (n*wav.channels );
720- drwav_read_pcm_frames_s16 (&wav, n, pcm16.data ());
721- drwav_uninit (&wav);
727+ if ((result = ma_decoder_read_pcm_frames (&decoder, pcmf32.data (), frame_count, &frames_read)) != MA_SUCCESS) {
722728
723- // convert to mono, float
724- pcmf32.resize (n);
725- if (wav.channels == 1 ) {
726- for (uint64_t i = 0 ; i < n; i++) {
727- pcmf32[i] = float (pcm16[i])/32768 .0f ;
728- }
729- } else {
730- for (uint64_t i = 0 ; i < n; i++) {
731- pcmf32[i] = float (pcm16[2 *i] + pcm16[2 *i + 1 ])/65536 .0f ;
732- }
729+ fprintf (stderr, " error: failed to read the frames of the audio data (%s)\n " , ma_result_description (result));
730+ ma_decoder_uninit (&decoder); // same cleanup as the success path; without it this return leaks the decoder
731+ return false ;
733732 }
734733
735734 if (stereo) {
736- // convert to stereo, float
737- pcmf32s.resize (2 );
738-
739- pcmf32s[0 ].resize (n);
740- pcmf32s[1 ].resize (n);
741- for (uint64_t i = 0 ; i < n; i++) {
742- pcmf32s[0 ][i] = float (pcm16[2 *i])/32768 .0f ;
743- pcmf32s[1 ][i] = float (pcm16[2 *i + 1 ])/32768 .0f ;
744- }
735+ pcmf32s.assign (2 , std::vector<float >(frame_count)); // size the outer vector first: callers may pass it in empty
736+ for (uint64_t i = 0 ; i < frame_count; i++) {
737+ pcmf32s[0 ][i] = pcmf32[2 *i];
738+ pcmf32s[1 ][i] = pcmf32[2 *i + 1 ];
739+ pcmf32[i] = 0.5f * (pcmf32s[0 ][i] + pcmf32s[1 ][i]); // mono mix in place; slot i is never read again because later reads start at 2*(i+1)
740+ }
741+ pcmf32.resize (frame_count); // drop the interleaved tail so pcmf32 is mono again, as it was before this change
745742 }
743+ ma_decoder_uninit (&decoder);
746744
747745 return true ;
748746}
749747
@@ -909,3 +907,6 @@ bool speak_with_file(const std::string & command, const std::string & text, cons
909907 }
910908 return true ;
911909}
910+
911+ #undef STB_VORBIS_HEADER_ONLY
912+ #include " stb_vorbis.c"
0 commit comments