@@ -123,6 +123,62 @@ std::vector<float> readAudioFile(const char *filename, int targetSampleRate, AVS
123123 return audioFrames;
124124}
125125
126+ std::vector<float > resampleAudio (const std::vector<float >& inputAudio, int inputSampleRate, int targetSampleRate) {
127+ SwrContext *swrCtx = nullptr ;
128+ int ret;
129+ AVChannelLayout ch_layout;
130+ av_channel_layout_from_string (&ch_layout, " 2 channels" );
131+
132+ ret = swr_alloc_set_opts2 (&swrCtx, &ch_layout, AV_SAMPLE_FMT_FLT,
133+ targetSampleRate, &ch_layout, AV_SAMPLE_FMT_FLT,
134+ inputSampleRate, 0 , nullptr );
135+
136+ if (ret < 0 ) {
137+ char errbuf[AV_ERROR_MAX_STRING_SIZE];
138+ av_strerror (ret, errbuf, AV_ERROR_MAX_STRING_SIZE);
139+ geode::log::error (" Failed to set up swr context: {}" , errbuf);
140+ return {};
141+ }
142+ ret = swr_init (swrCtx);
143+ if (ret < 0 ) {
144+ char errbuf[AV_ERROR_MAX_STRING_SIZE];
145+ av_strerror (ret, errbuf, AV_ERROR_MAX_STRING_SIZE);
146+ geode::log::error (" Failed to initialize swr context: {}" , errbuf);
147+ return {};
148+ }
149+
150+ const int chunkSize = 4096 ;
151+ const int numChannels = 2 ;
152+
153+ int maxOutputSamples = av_rescale_rnd (chunkSize, targetSampleRate, inputSampleRate, AV_ROUND_UP);
154+ std::vector<float > outputAudio;
155+ std::vector<float > outputChunk (maxOutputSamples * numChannels);
156+
157+ const uint8_t * inData[1 ] = { nullptr };
158+ uint8_t * outData[1 ] = { reinterpret_cast <uint8_t *>(outputChunk.data ()) };
159+
160+ for (size_t i = 0 ; i < inputAudio.size (); i += chunkSize * numChannels) {
161+ size_t currentChunkSize = std::min ((size_t )(chunkSize * numChannels), inputAudio.size () - i);
162+
163+ inData[0 ] = reinterpret_cast <const uint8_t *>(&inputAudio[i]);
164+ int inputSamples = currentChunkSize / numChannels;
165+
166+ int resampledSamples = swr_convert (swrCtx, outData, maxOutputSamples, inData, inputSamples);
167+ if (ret < 0 ) {
168+ char errbuf[AV_ERROR_MAX_STRING_SIZE];
169+ av_strerror (ret, errbuf, AV_ERROR_MAX_STRING_SIZE);
170+ geode::log::error (" Failed to convert audio frame: {}" , errbuf);
171+ return {};
172+ }
173+
174+ outputAudio.insert (outputAudio.end (), outputChunk.begin (), outputChunk.begin () + resampledSamples * numChannels);
175+ }
176+
177+ swr_free (&swrCtx);
178+
179+ return outputAudio;
180+ }
181+
126182namespace ffmpeg {
127183 void AudioMixer::mixVideoAudio (std::filesystem::path videoFile, std::filesystem::path audioFile, std::filesystem::path outputMp4File) {
128184 constexpr int frameSize = 1024 ;
@@ -143,9 +199,7 @@ namespace ffmpeg {
143199 }
144200
145201 void AudioMixer::mixVideoRaw (std::filesystem::path videoFile, const std::vector<float >& raw, std::filesystem::path outputMp4File, uint32_t sampleRate) {
146- const int frameSize = 1024 ;
147-
148- geode::log::debug (" raw size {}" , raw.size ());
202+ const int frameSize = 1024 ;
149203
150204 AVFormatContext* videoFormatContext = nullptr ;
151205 if (avformat_open_input (&videoFormatContext, videoFile.string ().c_str (), nullptr , nullptr ) < 0 ) {
@@ -187,6 +241,17 @@ namespace ffmpeg {
187241
188242 constexpr int channels = 2 ;
189243
244+ avformat_find_stream_info (videoFormatContext, nullptr );
245+ auto duration = static_cast <double >(videoFormatContext->duration ) / AV_TIME_BASE;
246+ auto newSampleRate = raw.size () / duration / channels;
247+
248+ std::vector<float > resampled = resampleAudio (raw, newSampleRate, 44100 );
249+
250+ if (resampled.empty ()) {
251+ geode::log::error (" Failed to resample audio." );
252+ return ;
253+ }
254+
190255 outputAudioStream->codecpar ->codec_tag = 0 ;
191256 outputAudioStream->codecpar ->codec_type = AVMEDIA_TYPE_AUDIO;
192257 outputAudioStream->codecpar ->sample_rate = sampleRate;
@@ -242,9 +307,7 @@ namespace ffmpeg {
242307 avformat_close_input (&videoFormatContext);
243308 return ;
244309 }
245-
246- geode::log::debug (" 1 timebase {} {} {} {}" , videoFormatContext->streams [videoStreamIndex]->time_base .num , videoFormatContext->streams [videoStreamIndex]->time_base .den , outputVideoStream->time_base .num , outputVideoStream->time_base .den );
247-
310+
248311 AVPacket packet;
249312 while (true ) {
250313 if (av_read_frame (videoFormatContext, &packet) >= 0 ) {
@@ -268,8 +331,8 @@ namespace ffmpeg {
268331 audioFrame->format = AV_SAMPLE_FMT_FLTP;
269332 audioFrame->ch_layout = AV_CHANNEL_LAYOUT_STEREO;
270333
271- for (size_t i = 0 ; i < raw .size (); i += frameSize * channels) {
272- int samplesToEncode = std::min (frameSize, static_cast <int >((raw .size () - i) / channels));
334+ for (size_t i = 0 ; i < resampled .size (); i += frameSize * channels) {
335+ int samplesToEncode = std::min (frameSize, static_cast <int >((resampled .size () - i) / channels));
273336
274337 audioFrame->nb_samples = samplesToEncode;
275338 audioFrame->pts = pts;
@@ -282,8 +345,8 @@ namespace ffmpeg {
282345 }
283346
284347 for (int j = 0 ; j < samplesToEncode; ++j) {
285- reinterpret_cast <float *>(audioFrame->data [0 ])[j] = raw [i + j * channels];
286- reinterpret_cast <float *>(audioFrame->data [1 ])[j] = raw [i + j * channels + 1 ];
348+ reinterpret_cast <float *>(audioFrame->data [0 ])[j] = resampled [i + j * channels];
349+ reinterpret_cast <float *>(audioFrame->data [1 ])[j] = resampled [i + j * channels + 1 ];
287350 }
288351
289352 if (avcodec_send_frame (audio_codec_context_encoder, audioFrame) < 0 ) {
@@ -296,8 +359,6 @@ namespace ffmpeg {
296359 audioPacket.data = nullptr ;
297360 audioPacket.size = 0 ;
298361
299- geode::log::debug (" 2 timebase {} {} {} {}" , audio_codec_context_encoder->time_base .num , audio_codec_context_encoder->time_base .den , outputAudioStream->time_base .num , outputAudioStream->time_base .den );
300-
301362 while (true ) {
302363 int ret = avcodec_receive_packet (audio_codec_context_encoder, &audioPacket);
303364 if (ret == 0 ) {
0 commit comments