Skip to content

Commit 4ee20c6

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into reorg_header
2 parents e5713c0 + 746401c commit 4ee20c6

File tree

4 files changed

+60
-21
lines changed

4 files changed

+60
-21
lines changed

.github/workflows/linux_cuda_wheel.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ jobs:
6767
# include more python versions.
6868
python-version: ['3.9']
6969
cuda-version: ['11.8', '12.4', '12.6']
70-
ffmpeg-version-for-tests: ['5', '6', '7']
70+
# TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
71+
ffmpeg-version-for-tests: ['6', '7']
7172
container:
7273
image: "pytorch/manylinux2_28-builder:cuda${{ matrix.cuda-version }}"
7374
options: "--gpus all -e NVIDIA_DRIVER_CAPABILITIES=video,compute,utility"

src/torchcodec/decoders/_core/FFMPEGCommon.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ class AutoAVPacket {
9292

9393
public:
9494
AutoAVPacket();
95+
AutoAVPacket(const AutoAVPacket& other) = delete;
96+
AutoAVPacket& operator=(const AutoAVPacket& other) = delete;
9597
~AutoAVPacket();
9698
};
9799

@@ -100,7 +102,9 @@ class ReferenceAVPacket {
100102
AVPacket* avPacket_;
101103

102104
public:
103-
ReferenceAVPacket(AutoAVPacket& shared);
105+
explicit ReferenceAVPacket(AutoAVPacket& shared);
106+
ReferenceAVPacket(const ReferenceAVPacket& other) = delete;
107+
ReferenceAVPacket& operator=(const ReferenceAVPacket& other) = delete;
104108
~ReferenceAVPacket();
105109
AVPacket* get();
106110
AVPacket* operator->();

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -803,16 +803,20 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
803803
}
804804
}
805805

806-
VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
806+
VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
807807
std::function<bool(int, AVFrame*)> filterFunction) {
808808
if (activeStreamIndices_.size() == 0) {
809809
throw std::runtime_error("No active streams configured.");
810810
}
811+
811812
resetDecodeStats();
813+
814+
// Seek if needed.
812815
if (desiredPtsSeconds_.has_value()) {
813816
maybeSeekToBeforeDesiredPts();
814817
desiredPtsSeconds_ = std::nullopt;
815818
}
819+
816820
// Need to get the next frame or error from PopFrame.
817821
UniqueAVFrame avFrame(av_frame_alloc());
818822
AutoAVPacket autoAVPacket;
@@ -822,42 +826,58 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
822826
while (true) {
823827
frameStreamIndex = -1;
824828
bool gotPermanentErrorOnAnyActiveStream = false;
829+
830+
// Get a frame on an active stream. Note that we don't know ahead of time
831+
// which streams have frames to receive, so we linearly try the active
832+
// streams.
825833
for (int streamIndex : activeStreamIndices_) {
826834
StreamInfo& streamInfo = streamInfos_[streamIndex];
827835
ffmpegStatus =
828836
avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
829-
bool gotNonRetriableError =
830-
ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN);
831-
if (gotNonRetriableError) {
837+
838+
if (ffmpegStatus != AVSUCCESS && ffmpegStatus != AVERROR(EAGAIN)) {
832839
gotPermanentErrorOnAnyActiveStream = true;
833840
break;
834841
}
842+
835843
if (ffmpegStatus == AVSUCCESS) {
836844
frameStreamIndex = streamIndex;
837845
break;
838846
}
839847
}
848+
840849
if (gotPermanentErrorOnAnyActiveStream) {
841850
break;
842851
}
852+
843853
decodeStats_.numFramesReceivedByDecoder++;
844-
bool gotNeededFrame = ffmpegStatus == AVSUCCESS &&
845-
filterFunction(frameStreamIndex, avFrame.get());
846-
if (gotNeededFrame) {
854+
855+
// Is this the kind of frame we're looking for?
856+
if (ffmpegStatus == AVSUCCESS &&
857+
filterFunction(frameStreamIndex, avFrame.get())) {
858+
// Yes, this is the frame we'll return; break out of the decoding loop.
847859
break;
848860
} else if (ffmpegStatus == AVSUCCESS) {
849-
// No need to send more packets here as the decoder may have frames in
850-
// its buffer.
861+
// No, but we received a valid frame - just not the kind we're looking
862+
// for. The logic below will read packets and send them to the decoder.
863+
// But since we did just receive a frame, we should skip reading more
864+
// packets and sending them to the decoder and just try to receive more
865+
// frames from the decoder.
851866
continue;
852867
}
868+
853869
if (reachedEOF) {
854870
// We don't have any more packets to send to the decoder. So keep on
855871
// pulling frames from its internal buffers.
856872
continue;
857873
}
874+
875+
// We still haven't found the frame we're looking for. So let's read more
876+
// packets and send them to the decoder.
858877
ReferenceAVPacket packet(autoAVPacket);
859878
ffmpegStatus = av_read_frame(formatContext_.get(), packet.get());
860879
decodeStats_.numPacketsRead++;
880+
861881
if (ffmpegStatus == AVERROR_EOF) {
862882
// End of file reached. We must drain all codecs by sending a nullptr
863883
// packet.
@@ -872,27 +892,38 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
872892
getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
873893
}
874894
}
895+
896+
// We've reached the end of file so we can't read any more packets from
897+
// it, but the decoder may still have frames to read in its buffer.
898+
// Continue iterating to try reading frames.
875899
reachedEOF = true;
876900
continue;
877901
}
902+
878903
if (ffmpegStatus < AVSUCCESS) {
879904
throw std::runtime_error(
880905
"Could not read frame from input file: " +
881906
getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
882907
}
908+
883909
if (activeStreamIndices_.count(packet->stream_index) == 0) {
884910
// This packet is not for any of the active streams.
885911
continue;
886912
}
913+
914+
// We got a valid packet. Send it to the decoder, and we'll receive it in
915+
// the next iteration.
887916
ffmpegStatus = avcodec_send_packet(
888917
streamInfos_[packet->stream_index].codecContext.get(), packet.get());
889918
if (ffmpegStatus < AVSUCCESS) {
890919
throw std::runtime_error(
891920
"Could not push packet to decoder: " +
892921
getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
893922
}
923+
894924
decodeStats_.numPacketsSentToDecoder++;
895925
}
926+
896927
if (ffmpegStatus < AVSUCCESS) {
897928
if (reachedEOF || ffmpegStatus == AVERROR_EOF) {
898929
throw VideoDecoder::EndOfFileException(
@@ -903,6 +934,7 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
903934
"Could not receive frame from decoder: " +
904935
getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
905936
}
937+
906938
// Note that we don't flush the decoder when we reach EOF (even though that's
907939
// mentioned in https://ffmpeg.org/doxygen/trunk/group__lavc__encdec.html).
908940
// This is because we may have packets internally in the decoder that we
@@ -912,10 +944,8 @@ VideoDecoder::AVFrameStream VideoDecoder::getAVFrameUsingFilterFunction(
912944
StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
913945
activeStreamInfo.currentPts = avFrame->pts;
914946
activeStreamInfo.currentDuration = getDuration(avFrame);
915-
AVFrameStream avFrameStream;
916-
avFrameStream.streamIndex = frameStreamIndex;
917-
avFrameStream.avFrame = std::move(avFrame);
918-
return avFrameStream;
947+
948+
return AVFrameStream(std::move(avFrame), frameStreamIndex);
919949
}
920950

921951
VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
@@ -1079,8 +1109,8 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAtNoDemux(
10791109
}
10801110

10811111
setCursorPtsInSeconds(seconds);
1082-
AVFrameStream avFrameStream = getAVFrameUsingFilterFunction(
1083-
[seconds, this](int frameStreamIndex, AVFrame* avFrame) {
1112+
AVFrameStream avFrameStream =
1113+
decodeAVFrame([seconds, this](int frameStreamIndex, AVFrame* avFrame) {
10841114
StreamInfo& streamInfo = streamInfos_[frameStreamIndex];
10851115
double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
10861116
double frameEndTime = ptsToSeconds(
@@ -1480,8 +1510,8 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemux() {
14801510

14811511
VideoDecoder::FrameOutput VideoDecoder::getNextFrameNoDemuxInternal(
14821512
std::optional<torch::Tensor> preAllocatedOutputTensor) {
1483-
AVFrameStream avFrameStream = getAVFrameUsingFilterFunction(
1484-
[this](int frameStreamIndex, AVFrame* avFrame) {
1513+
AVFrameStream avFrameStream =
1514+
decodeAVFrame([this](int frameStreamIndex, AVFrame* avFrame) {
14851515
StreamInfo& activeStreamInfo = streamInfos_[frameStreamIndex];
14861516
return avFrame->pts >= activeStreamInfo.discardFramesBeforePts;
14871517
});

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,9 @@ class VideoDecoder {
247247
UniqueAVFrame avFrame;
248248
// The stream index of the decoded frame.
249249
int streamIndex;
250+
251+
explicit AVFrameStream(UniqueAVFrame&& a, int s)
252+
: avFrame(std::move(a)), streamIndex(s) {}
250253
};
251254

252255
// Once getFrameAtIndex supports the preAllocatedOutputTensor parameter, we
@@ -390,8 +393,9 @@ class VideoDecoder {
390393

391394
void maybeSeekToBeforeDesiredPts();
392395

393-
AVFrameStream getAVFrameUsingFilterFunction(
394-
std::function<bool(int, AVFrame*)>);
396+
AVFrameStream decodeAVFrame(
397+
std::function<bool(int, AVFrame*)> filterFunction);
398+
395399
// Once we create a decoder can update the metadata with the codec context.
396400
// For example, for video streams, we can add the height and width of the
397401
// decoded stream.

0 commit comments

Comments
 (0)