@@ -587,9 +587,9 @@ VideoDecoder::FrameOutput VideoDecoder::getNextFrame() {
587587VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal (
588588 std::optional<torch::Tensor> preAllocatedOutputTensor) {
589589 validateActiveStream ();
590- AVFrameStream avFrameStream = decodeAVFrame (
591- [this ](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
592- return convertAVFrameToFrameOutput (avFrameStream , preAllocatedOutputTensor);
590+ UniqueAVFrame avFrame = decodeAVFrame (
591+ [this ](const UniqueAVFrame& avFrame) { return avFrame->pts >= cursor_; });
592+ return convertAVFrameToFrameOutput (avFrame , preAllocatedOutputTensor);
593593}
594594
595595VideoDecoder::FrameOutput VideoDecoder::getFrameAtIndex (int64_t frameIndex) {
@@ -719,8 +719,8 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAt(double seconds) {
719719 }
720720
721721 setCursorPtsInSeconds (seconds);
722- AVFrameStream avFrameStream =
723- decodeAVFrame ([seconds, this ](AVFrame* avFrame) {
722+ UniqueAVFrame avFrame =
723+ decodeAVFrame ([seconds, this ](const UniqueAVFrame& avFrame) {
724724 StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
725725 double frameStartTime = ptsToSeconds (avFrame->pts , streamInfo.timeBase );
726726 double frameEndTime = ptsToSeconds (
@@ -739,7 +739,7 @@ VideoDecoder::FrameOutput VideoDecoder::getFramePlayedAt(double seconds) {
739739 });
740740
741741 // Convert the frame to tensor.
742- FrameOutput frameOutput = convertAVFrameToFrameOutput (avFrameStream );
742+ FrameOutput frameOutput = convertAVFrameToFrameOutput (avFrame );
743743 frameOutput.data = maybePermuteHWC2CHW (frameOutput.data );
744744 return frameOutput;
745745}
@@ -895,14 +895,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
895895 auto finished = false ;
896896 while (!finished) {
897897 try {
898- AVFrameStream avFrameStream = decodeAVFrame ([startPts](AVFrame* avFrame) {
899- return startPts < avFrame->pts + getDuration (avFrame);
900- });
901- // TODO: it's not great that we are getting a FrameOutput, which is
902- // intended for videos. We should consider bypassing
903- // convertAVFrameToFrameOutput and directly call
904- // convertAudioAVFrameToFrameOutputOnCPU.
905- auto frameOutput = convertAVFrameToFrameOutput (avFrameStream);
898+ UniqueAVFrame avFrame =
899+ decodeAVFrame ([startPts](const UniqueAVFrame& avFrame) {
900+ return startPts < avFrame->pts + getDuration (avFrame);
901+ });
902+ auto frameOutput = convertAVFrameToFrameOutput (avFrame);
906903 firstFramePtsSeconds =
907904 std::min (firstFramePtsSeconds, frameOutput.ptsSeconds );
908905 frames.push_back (frameOutput.data );
@@ -1039,8 +1036,8 @@ void VideoDecoder::maybeSeekToBeforeDesiredPts() {
10391036// LOW-LEVEL DECODING
10401037// --------------------------------------------------------------------------
10411038
1042- VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame (
1043- std::function<bool (AVFrame* )> filterFunction) {
1039+ UniqueAVFrame VideoDecoder::decodeAVFrame (
1040+ std::function<bool (const UniqueAVFrame& )> filterFunction) {
10441041 validateActiveStream ();
10451042
10461043 resetDecodeStats ();
@@ -1068,7 +1065,7 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
10681065
10691066 decodeStats_.numFramesReceivedByDecoder ++;
10701067 // Is this the kind of frame we're looking for?
1071- if (status == AVSUCCESS && filterFunction (avFrame. get () )) {
1068+ if (status == AVSUCCESS && filterFunction (avFrame)) {
10721069 // Yes, this is the frame we'll return; break out of the decoding loop.
10731070 break ;
10741071 } else if (status == AVSUCCESS) {
@@ -1154,37 +1151,35 @@ VideoDecoder::AVFrameStream VideoDecoder::decodeAVFrame(
11541151 streamInfo.lastDecodedAvFramePts = avFrame->pts ;
11551152 streamInfo.lastDecodedAvFrameDuration = getDuration (avFrame);
11561153
1157- return AVFrameStream ( std::move ( avFrame), activeStreamIndex_) ;
1154+ return avFrame;
11581155}
11591156
11601157// --------------------------------------------------------------------------
11611158// AVFRAME <-> FRAME OUTPUT CONVERSION
11621159// --------------------------------------------------------------------------
11631160
11641161VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput (
1165- VideoDecoder::AVFrameStream& avFrameStream ,
1162+ UniqueAVFrame& avFrame ,
11661163 std::optional<torch::Tensor> preAllocatedOutputTensor) {
11671164 // Convert the frame to tensor.
11681165 FrameOutput frameOutput;
1169- int streamIndex = avFrameStream.streamIndex ;
1170- AVFrame* avFrame = avFrameStream.avFrame .get ();
1171- frameOutput.streamIndex = streamIndex;
1172- auto & streamInfo = streamInfos_[streamIndex];
1166+ auto & streamInfo = streamInfos_[activeStreamIndex_];
11731167 frameOutput.ptsSeconds = ptsToSeconds (
1174- avFrame->pts , formatContext_->streams [streamIndex ]->time_base );
1168+ avFrame->pts , formatContext_->streams [activeStreamIndex_ ]->time_base );
11751169 frameOutput.durationSeconds = ptsToSeconds (
1176- getDuration (avFrame), formatContext_->streams [streamIndex]->time_base );
1170+ getDuration (avFrame),
1171+ formatContext_->streams [activeStreamIndex_]->time_base );
11771172 if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
11781173 convertAudioAVFrameToFrameOutputOnCPU (
1179- avFrameStream , frameOutput, preAllocatedOutputTensor);
1174+ avFrame , frameOutput, preAllocatedOutputTensor);
11801175 } else if (streamInfo.videoStreamOptions .device .type () == torch::kCPU ) {
11811176 convertAVFrameToFrameOutputOnCPU (
1182- avFrameStream , frameOutput, preAllocatedOutputTensor);
1177+ avFrame , frameOutput, preAllocatedOutputTensor);
11831178 } else if (streamInfo.videoStreamOptions .device .type () == torch::kCUDA ) {
11841179 convertAVFrameToFrameOutputOnCuda (
11851180 streamInfo.videoStreamOptions .device ,
11861181 streamInfo.videoStreamOptions ,
1187- avFrameStream ,
1182+ avFrame ,
11881183 frameOutput,
11891184 preAllocatedOutputTensor);
11901185 } else {
@@ -1205,14 +1200,13 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
12051200// Dimension order of the preAllocatedOutputTensor must be HWC, regardless of
12061201// `dimension_order` parameter. It's up to callers to re-shape it if needed.
12071202void VideoDecoder::convertAVFrameToFrameOutputOnCPU (
1208- VideoDecoder::AVFrameStream& avFrameStream ,
1203+ UniqueAVFrame& avFrame ,
12091204 FrameOutput& frameOutput,
12101205 std::optional<torch::Tensor> preAllocatedOutputTensor) {
1211- AVFrame* avFrame = avFrameStream.avFrame .get ();
12121206 auto & streamInfo = streamInfos_[activeStreamIndex_];
12131207
12141208 auto frameDims = getHeightAndWidthFromOptionsOrAVFrame (
1215- streamInfo.videoStreamOptions , * avFrame);
1209+ streamInfo.videoStreamOptions , avFrame);
12161210 int expectedOutputHeight = frameDims.height ;
12171211 int expectedOutputWidth = frameDims.width ;
12181212
@@ -1306,7 +1300,7 @@ void VideoDecoder::convertAVFrameToFrameOutputOnCPU(
13061300}
13071301
13081302int VideoDecoder::convertAVFrameToTensorUsingSwsScale (
1309- const AVFrame* avFrame,
1303+ const UniqueAVFrame& avFrame,
13101304 torch::Tensor& outputTensor) {
13111305 StreamInfo& activeStreamInfo = streamInfos_[activeStreamIndex_];
13121306 SwsContext* swsContext = activeStreamInfo.swsContext .get ();
@@ -1326,11 +1320,11 @@ int VideoDecoder::convertAVFrameToTensorUsingSwsScale(
13261320}
13271321
13281322torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph (
1329- const AVFrame* avFrame) {
1323+ const UniqueAVFrame& avFrame) {
13301324 FilterGraphContext& filterGraphContext =
13311325 streamInfos_[activeStreamIndex_].filterGraphContext ;
13321326 int status =
1333- av_buffersrc_write_frame (filterGraphContext.sourceContext , avFrame);
1327+ av_buffersrc_write_frame (filterGraphContext.sourceContext , avFrame. get () );
13341328 if (status < AVSUCCESS) {
13351329 throw std::runtime_error (" Failed to add frame to buffer source context" );
13361330 }
@@ -1354,18 +1348,18 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
13541348}
13551349
13561350void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU (
1357- VideoDecoder::AVFrameStream& avFrameStream ,
1351+ UniqueAVFrame& srcAVFrame ,
13581352 FrameOutput& frameOutput,
13591353 std::optional<torch::Tensor> preAllocatedOutputTensor) {
13601354 TORCH_CHECK (
13611355 !preAllocatedOutputTensor.has_value (),
13621356 " pre-allocated audio tensor not supported yet." );
13631357
13641358 AVSampleFormat sourceSampleFormat =
1365- static_cast <AVSampleFormat>(avFrameStream. avFrame ->format );
1359+ static_cast <AVSampleFormat>(srcAVFrame ->format );
13661360 AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
13671361
1368- int sourceSampleRate = avFrameStream. avFrame ->sample_rate ;
1362+ int sourceSampleRate = srcAVFrame ->sample_rate ;
13691363 int desiredSampleRate =
13701364 streamInfos_[activeStreamIndex_].audioStreamOptions .sampleRate .value_or (
13711365 sourceSampleRate);
@@ -1377,14 +1371,13 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
13771371 UniqueAVFrame convertedAVFrame;
13781372 if (mustConvert) {
13791373 convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate (
1380- avFrameStream. avFrame ,
1374+ srcAVFrame ,
13811375 sourceSampleFormat,
13821376 desiredSampleFormat,
13831377 sourceSampleRate,
13841378 desiredSampleRate);
13851379 }
1386- const UniqueAVFrame& avFrame =
1387- mustConvert ? convertedAVFrame : avFrameStream.avFrame ;
1380+ const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
13881381
13891382 AVSampleFormat format = static_cast <AVSampleFormat>(avFrame->format );
13901383 TORCH_CHECK (
@@ -1981,10 +1974,10 @@ FrameDims getHeightAndWidthFromOptionsOrMetadata(
19811974
19821975FrameDims getHeightAndWidthFromOptionsOrAVFrame (
19831976 const VideoDecoder::VideoStreamOptions& videoStreamOptions,
1984- const AVFrame & avFrame) {
1977+ const UniqueAVFrame & avFrame) {
19851978 return FrameDims (
1986- videoStreamOptions.height .value_or (avFrame. height ),
1987- videoStreamOptions.width .value_or (avFrame. width ));
1979+ videoStreamOptions.height .value_or (avFrame-> height ),
1980+ videoStreamOptions.width .value_or (avFrame-> width ));
19881981}
19891982
19901983} // namespace facebook::torchcodec
0 commit comments