|
4 | 4 |
|
5 | 5 | // third-party utilities |
6 | 6 | // use your favorite implementations |
7 | | -#define DR_WAV_IMPLEMENTATION |
8 | | -#include "dr_wav.h" |
| 7 | +#define STB_VORBIS_HEADER_ONLY |
| 8 | +#include "stb_vorbis.c" /* Enables Vorbis decoding. */ |
| 9 | + |
| 10 | +#define MA_NO_DEVICE_IO |
| 11 | +#define MA_NO_THREADING |
| 12 | +#define MA_NO_ENCODING |
| 13 | +#define MA_NO_GENERATION |
| 14 | +#define MA_NO_RESOURCE_MANAGER |
| 15 | +#define MA_NO_NODE_GRAPH |
| 16 | +#define MINIAUDIO_IMPLEMENTATION |
| 17 | +#include "miniaudio.h" |
9 | 18 |
|
10 | 19 | #include <cmath> |
11 | 20 | #include <cstring> |
@@ -639,111 +648,95 @@ bool is_wav_buffer(const std::string buf) { |
639 | 648 | return true; |
640 | 649 | } |
641 | 650 |
|
642 | | -bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { |
643 | | - drwav wav; |
644 | | - std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output |
| 651 | +bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { |
| 652 | + std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output |
| 653 | + |
| 654 | + ma_result result; |
| 655 | + ma_decoder_config decoder_config; |
| 656 | + ma_decoder decoder; |
| 657 | + |
| 658 | + decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, COMMON_SAMPLE_RATE); |
645 | 659 |
|
646 | 660 | if (fname == "-") { |
647 | | - { |
648 | | - #ifdef _WIN32 |
649 | | - _setmode(_fileno(stdin), _O_BINARY); |
650 | | - #endif |
651 | | - |
652 | | - uint8_t buf[1024]; |
653 | | - while (true) |
654 | | - { |
655 | | - const size_t n = fread(buf, 1, sizeof(buf), stdin); |
656 | | - if (n == 0) { |
657 | | - break; |
658 | | - } |
659 | | - wav_data.insert(wav_data.end(), buf, buf + n); |
660 | | - } |
661 | | - } |
| 661 | + #ifdef _WIN32 |
| 662 | + _setmode(_fileno(stdin), _O_BINARY); |
| 663 | + #endif |
662 | 664 |
|
663 | | - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { |
664 | | - fprintf(stderr, "error: failed to open WAV file from stdin\n"); |
665 | | - return false; |
666 | | - } |
| 665 | + uint8_t buf[1024]; |
| 666 | + while (true) |
| 667 | + { |
| 668 | + const size_t n = fread(buf, 1, sizeof(buf), stdin); |
| 669 | + if (n == 0) { |
| 670 | + break; |
| 671 | + } |
| 672 | + audio_data.insert(audio_data.end(), buf, buf + n); |
| 673 | + } |
| 674 | + |
| 675 | + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { |
| 676 | + |
| 677 | + fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result)); |
667 | 678 |
|
668 | | - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); |
| 679 | + return false; |
| 680 | + } |
| 681 | + |
| 682 | + fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size()); |
669 | 683 | } |
670 | 684 | else if (is_wav_buffer(fname)) { |
671 | | - if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) { |
672 | | - fprintf(stderr, "error: failed to open WAV file from fname buffer\n"); |
673 | | - return false; |
674 | | - } |
| 685 | + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { |
| 686 | + fprintf(stderr, "Error: failed to open audio data from fname buffer (%s)\n", ma_result_description(result)); |
| 687 | + |
| 688 | + return false; |
| 689 | + } |
675 | 690 | } |
676 | | - else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { |
| 691 | + else if ((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS) { |
677 | 692 | #if defined(WHISPER_FFMPEG) |
678 | | - if (ffmpeg_decode_audio(fname, wav_data) != 0) { |
679 | | - fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str()); |
680 | | - return false; |
681 | | - } |
682 | | - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { |
683 | | - fprintf(stderr, "error: failed to read wav data as wav \n"); |
684 | | - return false; |
685 | | - } |
| 693 | + if (ffmpeg_decode_audio(fname, audio_data) != 0) { |
| 694 | + fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str()); |
| 695 | + |
| 696 | + return false; |
| 697 | + } |
| 698 | + |
| 699 | + if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) { |
| 700 | + fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result)); |
| 701 | + |
| 702 | + return false; |
| 703 | + } |
686 | 704 | #else |
687 | | - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); |
688 | | - return false; |
689 | | -#endif |
690 | | - } |
| 705 | + fprintf(stderr, "error: failed to open '%s' file (%s)\n", fname.c_str(), ma_result_description(result)); |
691 | 706 |
|
692 | | - if (wav.channels != 1 && wav.channels != 2) { |
693 | | - fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str()); |
694 | | - drwav_uninit(&wav); |
695 | | - return false; |
| 707 | + return false; |
| 708 | +#endif |
696 | 709 | } |
697 | 710 |
|
698 | | - if (stereo && wav.channels != 2) { |
699 | | - fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str()); |
700 | | - drwav_uninit(&wav); |
701 | | - return false; |
702 | | - } |
| 711 | + ma_uint64 frame_count; |
| 712 | + ma_uint64 frames_read; |
703 | 713 |
|
704 | | - if (wav.sampleRate != COMMON_SAMPLE_RATE) { |
705 | | - fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000); |
706 | | - drwav_uninit(&wav); |
707 | | - return false; |
708 | | - } |
| 714 | + if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) { |
| 715 | + fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result)); |
709 | 716 |
|
710 | | - if (wav.bitsPerSample != 16) { |
711 | | - fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str()); |
712 | | - drwav_uninit(&wav); |
713 | | - return false; |
| 717 | + return false; |
714 | 718 | } |
715 | 719 |
|
716 | | - const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); |
| 720 | + pcmf32.resize(stereo ? frame_count*2 : frame_count); |
717 | 721 |
|
718 | | - std::vector<int16_t> pcm16; |
719 | | - pcm16.resize(n*wav.channels); |
720 | | - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); |
721 | | - drwav_uninit(&wav); |
| 722 | + if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) { |
| 723 | + fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result)); |
722 | 724 |
|
723 | | - // convert to mono, float |
724 | | - pcmf32.resize(n); |
725 | | - if (wav.channels == 1) { |
726 | | - for (uint64_t i = 0; i < n; i++) { |
727 | | - pcmf32[i] = float(pcm16[i])/32768.0f; |
728 | | - } |
729 | | - } else { |
730 | | - for (uint64_t i = 0; i < n; i++) { |
731 | | - pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; |
732 | | - } |
| 725 | + return false; |
733 | 726 | } |
734 | 727 |
|
735 | 728 | if (stereo) { |
736 | | - // convert to stereo, float |
737 | | - pcmf32s.resize(2); |
738 | | - |
739 | | - pcmf32s[0].resize(n); |
740 | | - pcmf32s[1].resize(n); |
741 | | - for (uint64_t i = 0; i < n; i++) { |
742 | | - pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; |
743 | | - pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; |
744 | | - } |
| 729 | + pcmf32s.resize(2); |
| 730 | + pcmf32s[0].resize(frame_count); |
| 731 | + pcmf32s[1].resize(frame_count); |
| 732 | + for (uint64_t i = 0; i < frame_count; i++) { |
| 733 | + pcmf32s[0][i] = pcmf32[2*i]; |
| 734 | + pcmf32s[1][i] = pcmf32[2*i + 1]; |
| 735 | + } |
745 | 736 | } |
746 | 737 |
|
| 738 | + ma_decoder_uninit(&decoder); |
| 739 | + |
747 | 740 | return true; |
748 | 741 | } |
749 | 742 |
|
@@ -909,3 +902,6 @@ bool speak_with_file(const std::string & command, const std::string & text, cons |
909 | 902 | } |
910 | 903 | return true; |
911 | 904 | } |
| 905 | + |
| 906 | +#undef STB_VORBIS_HEADER_ONLY |
| 907 | +#include "stb_vorbis.c" |
0 commit comments