Skip to content

Commit 0bbadf2

Browse files
committed
resample raw audio
1 parent a143e7e commit 0bbadf2

File tree

1 file changed

+73
-12
lines changed

1 file changed

+73
-12
lines changed

src/audio_mixer.cpp

Lines changed: 73 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,62 @@ std::vector<float> readAudioFile(const char *filename, int targetSampleRate, AVS
123123
return audioFrames;
124124
}
125125

126+
std::vector<float> resampleAudio(const std::vector<float>& inputAudio, int inputSampleRate, int targetSampleRate) {
127+
SwrContext *swrCtx = nullptr;
128+
int ret;
129+
AVChannelLayout ch_layout;
130+
av_channel_layout_from_string(&ch_layout, "2 channels");
131+
132+
ret = swr_alloc_set_opts2(&swrCtx, &ch_layout, AV_SAMPLE_FMT_FLT,
133+
targetSampleRate, &ch_layout, AV_SAMPLE_FMT_FLT,
134+
inputSampleRate, 0, nullptr);
135+
136+
if (ret < 0) {
137+
char errbuf[AV_ERROR_MAX_STRING_SIZE];
138+
av_strerror(ret, errbuf, AV_ERROR_MAX_STRING_SIZE);
139+
geode::log::error("Failed to set up swr context: {}", errbuf);
140+
return {};
141+
}
142+
ret = swr_init(swrCtx);
143+
if (ret < 0) {
144+
char errbuf[AV_ERROR_MAX_STRING_SIZE];
145+
av_strerror(ret, errbuf, AV_ERROR_MAX_STRING_SIZE);
146+
geode::log::error("Failed to initialize swr context: {}", errbuf);
147+
return {};
148+
}
149+
150+
const int chunkSize = 4096;
151+
const int numChannels = 2;
152+
153+
int maxOutputSamples = av_rescale_rnd(chunkSize, targetSampleRate, inputSampleRate, AV_ROUND_UP);
154+
std::vector<float> outputAudio;
155+
std::vector<float> outputChunk(maxOutputSamples * numChannels);
156+
157+
const uint8_t* inData[1] = { nullptr };
158+
uint8_t* outData[1] = { reinterpret_cast<uint8_t*>(outputChunk.data()) };
159+
160+
for (size_t i = 0; i < inputAudio.size(); i += chunkSize * numChannels) {
161+
size_t currentChunkSize = std::min((size_t)(chunkSize * numChannels), inputAudio.size() - i);
162+
163+
inData[0] = reinterpret_cast<const uint8_t*>(&inputAudio[i]);
164+
int inputSamples = currentChunkSize / numChannels;
165+
166+
int resampledSamples = swr_convert(swrCtx, outData, maxOutputSamples, inData, inputSamples);
167+
if (ret < 0) {
168+
char errbuf[AV_ERROR_MAX_STRING_SIZE];
169+
av_strerror(ret, errbuf, AV_ERROR_MAX_STRING_SIZE);
170+
geode::log::error("Failed to convert audio frame: {}", errbuf);
171+
return {};
172+
}
173+
174+
outputAudio.insert(outputAudio.end(), outputChunk.begin(), outputChunk.begin() + resampledSamples * numChannels);
175+
}
176+
177+
swr_free(&swrCtx);
178+
179+
return outputAudio;
180+
}
181+
126182
namespace ffmpeg {
127183
void AudioMixer::mixVideoAudio(std::filesystem::path videoFile, std::filesystem::path audioFile, std::filesystem::path outputMp4File) {
128184
constexpr int frameSize = 1024;
@@ -143,9 +199,7 @@ namespace ffmpeg {
143199
}
144200

145201
void AudioMixer::mixVideoRaw(std::filesystem::path videoFile, const std::vector<float>& raw, std::filesystem::path outputMp4File, uint32_t sampleRate) {
146-
const int frameSize = 1024;
147-
148-
geode::log::debug("raw size {}", raw.size());
202+
const int frameSize = 1024;
149203

150204
AVFormatContext* videoFormatContext = nullptr;
151205
if (avformat_open_input(&videoFormatContext, videoFile.string().c_str(), nullptr, nullptr) < 0) {
@@ -187,6 +241,17 @@ namespace ffmpeg {
187241

188242
constexpr int channels = 2;
189243

244+
avformat_find_stream_info(videoFormatContext, nullptr);
245+
auto duration = static_cast<double>(videoFormatContext->duration) / AV_TIME_BASE;
246+
auto newSampleRate = raw.size() / duration / channels;
247+
248+
std::vector<float> resampled = resampleAudio(raw, newSampleRate, 44100);
249+
250+
if(resampled.empty()) {
251+
geode::log::error("Failed to resample audio.");
252+
return;
253+
}
254+
190255
outputAudioStream->codecpar->codec_tag = 0;
191256
outputAudioStream->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
192257
outputAudioStream->codecpar->sample_rate = sampleRate;
@@ -242,9 +307,7 @@ namespace ffmpeg {
242307
avformat_close_input(&videoFormatContext);
243308
return;
244309
}
245-
246-
geode::log::debug("1 timebase {} {} {} {}", videoFormatContext->streams[videoStreamIndex]->time_base.num, videoFormatContext->streams[videoStreamIndex]->time_base.den, outputVideoStream->time_base.num, outputVideoStream->time_base.den);
247-
310+
248311
AVPacket packet;
249312
while (true) {
250313
if (av_read_frame(videoFormatContext, &packet) >= 0) {
@@ -268,8 +331,8 @@ namespace ffmpeg {
268331
audioFrame->format = AV_SAMPLE_FMT_FLTP;
269332
audioFrame->ch_layout = AV_CHANNEL_LAYOUT_STEREO;
270333

271-
for (size_t i = 0; i < raw.size(); i += frameSize * channels) {
272-
int samplesToEncode = std::min(frameSize, static_cast<int>((raw.size() - i) / channels));
334+
for (size_t i = 0; i < resampled.size(); i += frameSize * channels) {
335+
int samplesToEncode = std::min(frameSize, static_cast<int>((resampled.size() - i) / channels));
273336

274337
audioFrame->nb_samples = samplesToEncode;
275338
audioFrame->pts = pts;
@@ -282,8 +345,8 @@ namespace ffmpeg {
282345
}
283346

284347
for (int j = 0; j < samplesToEncode; ++j) {
285-
reinterpret_cast<float*>(audioFrame->data[0])[j] = raw[i + j * channels];
286-
reinterpret_cast<float*>(audioFrame->data[1])[j] = raw[i + j * channels + 1];
348+
reinterpret_cast<float*>(audioFrame->data[0])[j] = resampled[i + j * channels];
349+
reinterpret_cast<float*>(audioFrame->data[1])[j] = resampled[i + j * channels + 1];
287350
}
288351

289352
if (avcodec_send_frame(audio_codec_context_encoder, audioFrame) < 0) {
@@ -296,8 +359,6 @@ namespace ffmpeg {
296359
audioPacket.data = nullptr;
297360
audioPacket.size = 0;
298361

299-
geode::log::debug("2 timebase {} {} {} {}", audio_codec_context_encoder->time_base.num, audio_codec_context_encoder->time_base.den, outputAudioStream->time_base.num, outputAudioStream->time_base.den);
300-
301362
while (true) {
302363
int ret = avcodec_receive_packet(audio_codec_context_encoder, &audioPacket);
303364
if (ret == 0) {

0 commit comments

Comments
 (0)