|
27 | 27 | #include "stb/stb_vorbis.h" |
28 | 28 | #include "miniaudio.h" |
29 | 29 |
|
// Size, in bytes, of the stack buffer used to move PCM data from the decoder
// to the encoder while converting an audio file.
#define MA_DATA_CONVERTER_STACK_BUFFER_SIZE 4096

// Path of the temporary file that should be removed when the process exits.
// Empty when there is nothing to clean up.
static std::string delete_me;

// Exit handler: removes the file named by delete_me, if one was recorded.
static void on_exit(void) {
    if (delete_me.empty()) {
        return;
    }
    unlink(delete_me.c_str());
}
39 | | - |
40 | | -static ma_result perform_audio_conversion(ma_decoder* pDecoder, ma_encoder* pEncoder) { |
41 | | - ma_result rc = MA_SUCCESS; |
42 | | - for (;;) { |
43 | | - ma_uint8 pRawData[MA_DATA_CONVERTER_STACK_BUFFER_SIZE]; |
44 | | - ma_uint64 framesReadThisIteration; |
45 | | - ma_uint64 framesToReadThisIteration; |
46 | | - framesToReadThisIteration = sizeof(pRawData) / ma_get_bytes_per_frame(pDecoder->outputFormat, pDecoder->outputChannels); |
47 | | - rc = ma_decoder_read_pcm_frames(pDecoder, pRawData, framesToReadThisIteration, &framesReadThisIteration); |
48 | | - if (rc != MA_SUCCESS) { |
49 | | - break; |
50 | | - } |
51 | | - ma_encoder_write_pcm_frames(pEncoder, pRawData, framesReadThisIteration, NULL); |
52 | | - if (framesReadThisIteration < framesToReadThisIteration) { |
53 | | - break; |
54 | | - } |
55 | | - } |
56 | | - return rc; |
57 | | -} |
58 | | - |
59 | | -// converts audio file to signed 16-bit 16000hz wav |
60 | | -static std::string convert_audio_file(const std::string & fname, bool stereo) { |
61 | | - |
62 | | - // create temporary filename |
63 | | - std::string newpath; |
64 | | - newpath = __get_tmpdir(); |
65 | | - newpath += "/whisperfile."; |
66 | | - newpath += std::to_string(_rand64()); |
67 | | - newpath += ".wav"; |
68 | | - |
69 | | - // create decoder |
70 | | - ma_decoder_config decoderConfig = |
71 | | - ma_decoder_config_init(ma_format_s16, 1 + stereo, COMMON_SAMPLE_RATE); |
72 | | - decoderConfig.resampling.algorithm = ma_resample_algorithm_linear; |
73 | | - decoderConfig.resampling.linear.lpfOrder = 8; |
74 | | - |
75 | | - // open input file |
76 | | - ma_decoder decoder; |
77 | | - ma_result rc = ma_decoder_init_file(fname.c_str(), &decoderConfig, &decoder); |
78 | | - if (rc != MA_SUCCESS) { |
79 | | - fprintf(stderr, "%s: failed to open audio file: %s (we support .wav, .mp3, .flac, and .ogg)\n", |
80 | | - fname.c_str(), ma_result_description(rc)); |
81 | | - return ""; |
82 | | - } |
83 | | - |
84 | | - // create encoder |
85 | | - ma_encoder encoder; |
86 | | - ma_encoder_config encoderConfig = ma_encoder_config_init( |
87 | | - ma_encoding_format_wav, |
88 | | - decoder.outputFormat, |
89 | | - decoder.outputChannels, |
90 | | - decoder.outputSampleRate); |
91 | | - rc = ma_encoder_init_file(newpath.c_str(), &encoderConfig, &encoder); |
92 | | - if (rc != MA_SUCCESS) { |
93 | | - ma_decoder_uninit(&decoder); |
94 | | - fprintf(stderr, "%s: failed to open output file: %s\n", |
95 | | - newpath.c_str(), ma_result_description(rc)); |
96 | | - return ""; |
97 | | - } |
98 | | - |
99 | | - // perform the conversion |
100 | | - rc = perform_audio_conversion(&decoder, &encoder); |
101 | | - ma_encoder_uninit(&encoder); |
102 | | - ma_decoder_uninit(&decoder); |
103 | | - if (rc != MA_SUCCESS) { |
104 | | - fprintf(stderr, "%s: failed to convert audio file: %s\n", |
105 | | - fname.c_str(), ma_result_description(rc)); |
106 | | - return ""; |
107 | | - } |
108 | | - |
109 | | - // return new path |
110 | | - delete_me = newpath; |
111 | | - atexit(on_exit); |
112 | | - return newpath; |
113 | | -} |
114 | | - |
// Fallback used while loading audio: re-encode the input with
// convert_audio_file() and restart from the TryAgain label.
//
// Must be expanded inside a function returning bool that has `fname`,
// `stereo`, `did_conversion` and a `TryAgain:` label in scope. At most one
// conversion is attempted: if one already happened, an error is printed and
// the enclosing function returns false. It also returns false when the
// conversion itself fails.
#define TRY_CONVERSION                                                         \
    do {                                                                       \
        if (did_conversion) {                                                  \
            fprintf(stderr, "error: failed to open audio file\n");             \
            return false;                                                      \
        }                                                                      \
        const std::string converted_path_ = convert_audio_file(fname, stereo); \
        if (converted_path_.empty()) {                                         \
            return false;                                                      \
        }                                                                      \
        fname = converted_path_;                                               \
        did_conversion = true;                                                 \
        goto TryAgain;                                                         \
    } while (0)
129 | | - |
130 | | -bool read_wav(const std::string & fname_, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { |
131 | | - drwav wav; |
132 | | - std::vector<uint8_t> wav_data; // used for pipe input from stdin |
133 | | - std::string fname = fname_; |
134 | | - bool did_conversion = false; |
135 | | - |
136 | | -TryAgain: |
137 | | - if (fname == "-") { |
138 | | - { |
139 | | - #ifdef _WIN32 |
140 | | - _setmode(_fileno(stdin), _O_BINARY); |
141 | | - #endif |
142 | | - |
143 | | - uint8_t buf[1024]; |
144 | | - while (true) |
145 | | - { |
146 | | - const size_t n = fread(buf, 1, sizeof(buf), stdin); |
147 | | - if (n == 0) { |
148 | | - break; |
149 | | - } |
150 | | - wav_data.insert(wav_data.end(), buf, buf + n); |
151 | | - } |
152 | | - } |
153 | | - |
154 | | - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { |
155 | | - fprintf(stderr, "error: failed to open WAV file from stdin\n"); |
156 | | - return false; |
157 | | - } |
158 | | - |
159 | | - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); |
160 | | - } |
161 | | - else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { |
162 | | - tinylogf("%s: converting to wav...\n", fname.c_str()); |
163 | | - TRY_CONVERSION; |
164 | | - } |
165 | | - |
166 | | - if (stereo && wav.channels < 2) { |
167 | | - fprintf(stderr, "%s: audio file must be stereo for diarization\n", fname.c_str()); |
168 | | - drwav_uninit(&wav); |
169 | | - return false; |
170 | | - } |
171 | | - |
172 | | - if (wav.channels != 1 && wav.channels != 2) { |
173 | | - tinylogf("%s: audio file has %d channels\n", fname.c_str(), wav.channels); |
174 | | - drwav_uninit(&wav); |
175 | | - TRY_CONVERSION; |
176 | | - } |
177 | | - |
178 | | - if (stereo && wav.channels != 2) { |
179 | | - tinylogf("%s: audio file has %d channels (we want diarization)\n", fname.c_str(), wav.channels); |
180 | | - drwav_uninit(&wav); |
181 | | - TRY_CONVERSION; |
182 | | - } |
183 | | - |
184 | | - if (wav.sampleRate != COMMON_SAMPLE_RATE) { |
185 | | - tinylogf("%s: audio file has %d sample rate\n", fname.c_str(), wav.sampleRate); |
186 | | - drwav_uninit(&wav); |
187 | | - TRY_CONVERSION; |
188 | | - } |
189 | | - |
190 | | - if (wav.bitsPerSample != 16) { |
191 | | - tinylogf("%s: audio file has %d bits per sample\n", fname.c_str(), wav.bitsPerSample); |
192 | | - drwav_uninit(&wav); |
193 | | - TRY_CONVERSION; |
194 | | - } |
195 | | - |
196 | | - const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); |
197 | | - |
198 | | - std::vector<int16_t> pcm16; |
199 | | - pcm16.resize(n*wav.channels); |
200 | | - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); |
201 | | - drwav_uninit(&wav); |
202 | | - |
203 | | - // convert to mono, float |
204 | | - pcmf32.resize(n); |
205 | | - if (wav.channels == 1) { |
206 | | - for (uint64_t i = 0; i < n; i++) { |
207 | | - pcmf32[i] = float(pcm16[i])/32768.0f; |
208 | | - } |
209 | | - } else { |
210 | | - for (uint64_t i = 0; i < n; i++) { |
211 | | - pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; |
212 | | - } |
213 | | - } |
214 | | - |
215 | | - if (stereo) { |
216 | | - // convert to stereo, float |
217 | | - pcmf32s.resize(2); |
218 | | - |
219 | | - pcmf32s[0].resize(n); |
220 | | - pcmf32s[1].resize(n); |
221 | | - for (uint64_t i = 0; i < n; i++) { |
222 | | - pcmf32s[0][i] = float(pcm16[2*i])/32768.0f; |
223 | | - pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f; |
224 | | - } |
225 | | - } |
226 | | - |
227 | | - return true; |
228 | | -} |
229 | | - |
230 | 30 | void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) { |
231 | 31 | const float rc = 1.0f / (2.0f * M_PI * cutoff); |
232 | 32 | const float dt = 1.0f / sample_rate; |
|
0 commit comments