
Commit 20e811f

Commit message: 2025-03-18 nightly release (5713507)
Authored and committed by pytorchbot
1 parent 9d01356, commit 20e811f

File tree: 9 files changed, +250 additions, -49 deletions

src/torchcodec/decoders/_core/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -77,10 +77,10 @@ if(DEFINED ENV{BUILD_AGAINST_ALL_FFMPEG_FROM_S3})
   )


-  make_torchcodec_library(libtorchcodec4 ffmpeg4)
   make_torchcodec_library(libtorchcodec7 ffmpeg7)
   make_torchcodec_library(libtorchcodec6 ffmpeg6)
   make_torchcodec_library(libtorchcodec5 ffmpeg5)
+  make_torchcodec_library(libtorchcodec4 ffmpeg4)

 else()
   message(
@@ -97,6 +97,7 @@ else()
     libavformat
     libavcodec
     libavutil
+    libswresample
     libswscale
   )


src/torchcodec/decoders/_core/FFMPEGCommon.cpp

Lines changed: 52 additions & 1 deletion
@@ -60,7 +60,7 @@ int64_t getDuration(const AVFrame* frame) {
 #endif
 }

-int getNumChannels(const AVFrame* avFrame) {
+int getNumChannels(const UniqueAVFrame& avFrame) {
 #if LIBAVFILTER_VERSION_MAJOR > 8 || \
     (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
   return avFrame->ch_layout.nb_channels;
@@ -78,6 +78,57 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
 #endif
 }

+void setChannelLayout(
+    UniqueAVFrame& dstAVFrame,
+    const UniqueAVFrame& srcAVFrame) {
+#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
+  dstAVFrame->ch_layout = srcAVFrame->ch_layout;
+#else
+  dstAVFrame->channel_layout = srcAVFrame->channel_layout;
+#endif
+}
+
+SwrContext* allocateSwrContext(
+    UniqueAVCodecContext& avCodecContext,
+    int sampleRate,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat) {
+  SwrContext* swrContext = nullptr;
+#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
+  AVChannelLayout layout = avCodecContext->ch_layout;
+  auto status = swr_alloc_set_opts2(
+      &swrContext,
+      &layout,
+      desiredSampleFormat,
+      sampleRate,
+      &layout,
+      sourceSampleFormat,
+      sampleRate,
+      0,
+      nullptr);
+
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Couldn't create SwrContext: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+#else
+  int64_t layout = static_cast<int64_t>(avCodecContext->channel_layout);
+  swrContext = swr_alloc_set_opts(
+      nullptr,
+      layout,
+      desiredSampleFormat,
+      sampleRate,
+      layout,
+      sourceSampleFormat,
+      sampleRate,
+      0,
+      nullptr);
+#endif
+
+  TORCH_CHECK(swrContext != nullptr, "Couldn't create swrContext");
+  return swrContext;
+}
+
 AVIOBytesContext::AVIOBytesContext(
     const void* data,
     size_t dataSize,
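
Editorial note, not part of the commit: allocateSwrContext only allocates and configures the resampler; the caller still has to run swr_init and eventually free it (the VideoDecoder changes below wrap it in a UniqueSwrContext). As a self-contained illustration of the libswresample sequence these helpers build on, here is a minimal sketch against the FFmpeg 5+ API (swr_alloc_set_opts2); the stereo layout, 44100 Hz rate, and S16 input format are illustrative assumptions, not values taken from the commit.

// Minimal sketch (FFmpeg 5+ API): allocate, init, use, and free a SwrContext
// that converts interleaved S16 to planar float at the same rate and layout.
extern "C" {
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
#include <cstdio>

int main() {
  SwrContext* swrContext = nullptr;
  AVChannelLayout layout;
  av_channel_layout_default(&layout, 2); // stereo, illustrative only

  // Output options come first, input options second; only the sample format
  // differs here, mirroring what allocateSwrContext sets up in the diff.
  int status = swr_alloc_set_opts2(
      &swrContext,
      &layout, AV_SAMPLE_FMT_FLTP, 44100, // output
      &layout, AV_SAMPLE_FMT_S16, 44100, // input
      0, nullptr);
  if (status < 0 || swr_init(swrContext) < 0) {
    std::fprintf(stderr, "failed to set up SwrContext\n");
    swr_free(&swrContext);
    return 1;
  }

  // swr_convert(swrContext, outPlanes, outNumSamples, inPlanes, inNumSamples)
  // would run here per decoded frame; it returns the number of samples
  // written, or a negative AVERROR code.

  swr_free(&swrContext);
  av_channel_layout_uninit(&layout);
  return 0;
}

On FFmpeg 4, the same setup goes through swr_alloc_set_opts with an integer channel-layout mask, which is exactly the split allocateSwrContext hides behind its LIBAVFILTER_VERSION_MAJOR check.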

src/torchcodec/decoders/_core/FFMPEGCommon.h

Lines changed: 13 additions & 1 deletion
@@ -22,6 +22,7 @@ extern "C" {
 #include <libavutil/opt.h>
 #include <libavutil/pixfmt.h>
 #include <libavutil/version.h>
+#include <libswresample/swresample.h>
 #include <libswscale/swscale.h>
 }

@@ -67,6 +68,8 @@ using UniqueAVIOContext = std::
     unique_ptr<AVIOContext, Deleterp<AVIOContext, void, avio_context_free>>;
 using UniqueSwsContext =
     std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
+using UniqueSwrContext =
+    std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;

 // These 2 classes share the same underlying AVPacket object. They are meant to
 // be used in tandem, like so:
@@ -139,9 +142,18 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
 int64_t getDuration(const UniqueAVFrame& frame);
 int64_t getDuration(const AVFrame* frame);

-int getNumChannels(const AVFrame* avFrame);
+int getNumChannels(const UniqueAVFrame& avFrame);
 int getNumChannels(const UniqueAVCodecContext& avCodecContext);

+void setChannelLayout(
+    UniqueAVFrame& dstAVFrame,
+    const UniqueAVFrame& srcAVFrame);
+SwrContext* allocateSwrContext(
+    UniqueAVCodecContext& avCodecContext,
+    int sampleRate,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat);
+
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 110 additions & 24 deletions
@@ -23,6 +23,7 @@ extern "C" {
 #include <libavutil/imgutils.h>
 #include <libavutil/log.h>
 #include <libavutil/pixdesc.h>
+#include <libswresample/swresample.h>
 #include <libswscale/swscale.h>
 }

@@ -467,6 +468,7 @@ void VideoDecoder::addStream(
   TORCH_CHECK_EQ(retVal, AVSUCCESS);

   streamInfo.codecContext->thread_count = ffmpegThreadCount.value_or(0);
+  streamInfo.codecContext->pkt_timebase = streamInfo.stream->time_base;

   // TODO_CODE_QUALITY same as above.
   if (mediaType == AVMEDIA_TYPE_VIDEO && device.type() == torch::kCUDA) {
@@ -558,6 +560,12 @@ void VideoDecoder::addAudioStream(int streamIndex) {
       static_cast<int64_t>(streamInfo.codecContext->sample_rate);
   streamMetadata.numChannels =
       static_cast<int64_t>(getNumChannels(streamInfo.codecContext));
+
+  // FFmpeg docs say that the decoder will try to decode natively in this
+  // format, if it can. Docs don't say what the decoder does when it doesn't
+  // support that format, but it looks like it does nothing, so this probably
+  // doesn't hurt.
+  streamInfo.codecContext->request_sample_fmt = AV_SAMPLE_FMT_FLTP;
 }

 // --------------------------------------------------------------------------
@@ -566,13 +574,15 @@ void VideoDecoder::addAudioStream(int streamIndex) {

 VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
   auto output = getNextFrameInternal();
-  output.data = maybePermuteHWC2CHW(output.data);
+  if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
+    output.data = maybePermuteHWC2CHW(output.data);
+  }
   return output;
 }

 VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  validateActiveStream();
   AVFrameStream avFrameStream = decodeAVFrame(
       [this](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
   return convertAVFrameToFrameOutput(avFrameStream, preAllocatedOutputTensor);
@@ -868,7 +878,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
     // If we need to seek backwards, then we have to seek back to the beginning
     // of the stream.
     // TODO-AUDIO: document why this is needed in a big comment.
-    setCursorPtsInSeconds(INT64_MIN);
+    setCursorPtsInSecondsInternal(INT64_MIN);
   }

   // TODO-AUDIO Pre-allocate a long-enough tensor instead of creating a vec +
@@ -914,6 +924,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
 // --------------------------------------------------------------------------

 void VideoDecoder::setCursorPtsInSeconds(double seconds) {
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  setCursorPtsInSecondsInternal(seconds);
+}
+
+void VideoDecoder::setCursorPtsInSecondsInternal(double seconds) {
   cursorWasJustSet_ = true;
   cursor_ =
       secondsToClosestPts(seconds, streamInfos_[activeStreamIndex_].timeBase);
@@ -1342,37 +1357,89 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
       !preAllocatedOutputTensor.has_value(),
       "pre-allocated audio tensor not supported yet.");

-  const AVFrame* avFrame = avFrameStream.avFrame.get();
+  AVSampleFormat sourceSampleFormat =
+      static_cast<AVSampleFormat>(avFrameStream.avFrame->format);
+  AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
+
+  UniqueAVFrame convertedAVFrame;
+  if (sourceSampleFormat != desiredSampleFormat) {
+    convertedAVFrame = convertAudioAVFrameSampleFormat(
+        avFrameStream.avFrame, sourceSampleFormat, desiredSampleFormat);
+  }
+  const UniqueAVFrame& avFrame = (sourceSampleFormat != desiredSampleFormat)
+      ? convertedAVFrame
+      : avFrameStream.avFrame;
+
+  AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
+  TORCH_CHECK(
+      format == desiredSampleFormat,
+      "Something went wrong, the frame didn't get converted to the desired format. ",
+      "Desired format = ",
+      av_get_sample_fmt_name(desiredSampleFormat),
+      "source format = ",
+      av_get_sample_fmt_name(format));

   auto numSamples = avFrame->nb_samples; // per channel
   auto numChannels = getNumChannels(avFrame);
   torch::Tensor outputData =
       torch::empty({numChannels, numSamples}, torch::kFloat32);

-  AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
-  // TODO-AUDIO Implement all formats.
-  switch (format) {
-    case AV_SAMPLE_FMT_FLTP: {
-      uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
-      auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
-      for (auto channel = 0; channel < numChannels;
-           ++channel, outputChannelData += numBytesPerChannel) {
-        memcpy(
-            outputChannelData,
-            avFrame->extended_data[channel],
-            numBytesPerChannel);
-      }
-      break;
-    }
-    default:
-      TORCH_CHECK(
-          false,
-          "Unsupported audio format (yet!): ",
-          av_get_sample_fmt_name(format));
+  uint8_t* outputChannelData = static_cast<uint8_t*>(outputData.data_ptr());
+  auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
+  for (auto channel = 0; channel < numChannels;
+       ++channel, outputChannelData += numBytesPerChannel) {
+    memcpy(
+        outputChannelData, avFrame->extended_data[channel], numBytesPerChannel);
   }
   frameOutput.data = outputData;
 }

+UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat(
+    const UniqueAVFrame& avFrame,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat
+
+) {
+  auto& streamInfo = streamInfos_[activeStreamIndex_];
+  const auto& streamMetadata =
+      containerMetadata_.allStreamMetadata[activeStreamIndex_];
+  int sampleRate = static_cast<int>(streamMetadata.sampleRate.value());
+
+  if (!streamInfo.swrContext) {
+    createSwrContext(
+        streamInfo, sampleRate, sourceSampleFormat, desiredSampleFormat);
+  }
+
+  UniqueAVFrame convertedAVFrame(av_frame_alloc());
+  TORCH_CHECK(
+      convertedAVFrame,
+      "Could not allocate frame for sample format conversion.");
+
+  setChannelLayout(convertedAVFrame, avFrame);
+  convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
+  convertedAVFrame->sample_rate = avFrame->sample_rate;
+  convertedAVFrame->nb_samples = avFrame->nb_samples;
+
+  auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Could not allocate frame buffers for sample format conversion: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+
+  auto numSampleConverted = swr_convert(
+      streamInfo.swrContext.get(),
+      convertedAVFrame->data,
+      convertedAVFrame->nb_samples,
+      static_cast<const uint8_t**>(const_cast<const uint8_t**>(avFrame->data)),
+      avFrame->nb_samples);
+  TORCH_CHECK(
+      numSampleConverted > 0,
+      "Error in swr_convert: ",
+      getFFMPEGErrorStringFromErrorCode(numSampleConverted));
+
+  return convertedAVFrame;
+}
+
 // --------------------------------------------------------------------------
 // OUTPUT ALLOCATION AND SHAPE CONVERSION
 // --------------------------------------------------------------------------
@@ -1606,6 +1673,25 @@ void VideoDecoder::createSwsContext(
   streamInfo.swsContext.reset(swsContext);
 }

+void VideoDecoder::createSwrContext(
+    StreamInfo& streamInfo,
+    int sampleRate,
+    AVSampleFormat sourceSampleFormat,
+    AVSampleFormat desiredSampleFormat) {
+  auto swrContext = allocateSwrContext(
+      streamInfo.codecContext,
+      sampleRate,
+      sourceSampleFormat,
+      desiredSampleFormat);
+
+  auto status = swr_init(swrContext);
+  TORCH_CHECK(
+      status == AVSUCCESS,
+      "Couldn't initialize SwrContext: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  streamInfo.swrContext.reset(swrContext);
+}
+
 // --------------------------------------------------------------------------
 // PTS <-> INDEX CONVERSIONS
 // --------------------------------------------------------------------------
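
Editorial note, not part of the commit: once the frame is (or has been converted to) AV_SAMPLE_FMT_FLTP, each entry of extended_data points to one contiguous float plane per channel, which is why the rewritten convertAudioAVFrameToFrameOutputOnCPU can fill a (numChannels, numSamples) float32 tensor with one memcpy per channel and no longer needs the per-format switch. A standalone restatement of that copy; the helper name is hypothetical and the frame is assumed to already be planar float.

// Sketch: copy a planar-float AVFrame into a (numChannels, numSamples)
// float32 tensor, one contiguous channel plane at a time.
#include <torch/torch.h>
extern "C" {
#include <libavutil/frame.h>
#include <libavutil/samplefmt.h>
}
#include <cstring>

torch::Tensor planarFloatFrameToTensor(const AVFrame* frame, int numChannels) {
  const int numSamples = frame->nb_samples; // per channel
  torch::Tensor out = torch::empty({numChannels, numSamples}, torch::kFloat32);

  auto* dst = static_cast<uint8_t*>(out.data_ptr());
  const size_t numBytesPerChannel =
      numSamples * av_get_bytes_per_sample(AV_SAMPLE_FMT_FLTP);
  for (int channel = 0; channel < numChannels;
       ++channel, dst += numBytesPerChannel) {
    std::memcpy(dst, frame->extended_data[channel], numBytesPerChannel);
  }
  return out;
}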

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 13 additions & 0 deletions
@@ -355,6 +355,7 @@ class VideoDecoder {
     FilterGraphContext filterGraphContext;
     ColorConversionLibrary colorConversionLibrary = FILTERGRAPH;
     UniqueSwsContext swsContext;
+    UniqueSwrContext swrContext;

     // Used to know whether a new FilterGraphContext or UniqueSwsContext should
     // be created before decoding a new frame.
@@ -370,6 +371,7 @@ class VideoDecoder {
   // DECODING APIS AND RELATED UTILS
   // --------------------------------------------------------------------------

+  void setCursorPtsInSecondsInternal(double seconds);
   bool canWeAvoidSeeking() const;

   void maybeSeekToBeforeDesiredPts();
@@ -401,6 +403,11 @@ class VideoDecoder {
       const AVFrame* avFrame,
       torch::Tensor& outputTensor);

+  UniqueAVFrame convertAudioAVFrameSampleFormat(
+      const UniqueAVFrame& avFrame,
+      AVSampleFormat sourceSampleFormat,
+      AVSampleFormat desiredSampleFormat);
+
   // --------------------------------------------------------------------------
   // COLOR CONVERSION LIBRARIES HANDLERS CREATION
   // --------------------------------------------------------------------------
@@ -415,6 +422,12 @@ class VideoDecoder {
       const DecodedFrameContext& frameContext,
       const enum AVColorSpace colorspace);

+  void createSwrContext(
+      StreamInfo& streamInfo,
+      int sampleRate,
+      AVSampleFormat sourceSampleFormat,
+      AVSampleFormat desiredSampleFormat);
+
   // --------------------------------------------------------------------------
   // PTS <-> INDEX CONVERSIONS
   // --------------------------------------------------------------------------
