@@ -169,20 +169,6 @@ void VideoDecoder::initializeDecoder() {
       }
       containerMetadata_.numVideoStreams++;
     } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
-      // TODO-AUDIO Remove this, we shouldn't need it. We should probably write
-      // a pts-based "getFramesPlayedInRange" from scratch without going back to
-      // indices.
-      int numSamplesPerFrame = avStream->codecpar->frame_size;
-      int sampleRate = avStream->codecpar->sample_rate;
-      if (numSamplesPerFrame > 0 && sampleRate > 0) {
-        // This should allow the approximate mode to do its magic.
-        // fps is numFrames / duration where
-        // - duration = numSamplesTotal / sampleRate and
-        // - numSamplesTotal = numSamplesPerFrame * numFrames
-        // so fps = numFrames * sampleRate / (numSamplesPerFrame * numFrames)
-        streamMetadata.averageFps =
-            static_cast<double>(sampleRate) / numSamplesPerFrame;
-      }
       containerMetadata_.numAudioStreams++;
     }
 
@@ -562,20 +548,9 @@ void VideoDecoder::addAudioStream(int streamIndex) {
   addStream(streamIndex, AVMEDIA_TYPE_AUDIO);
 
   auto& streamInfo = streamInfos_[activeStreamIndex_];
-
-  // TODO-AUDIO
-  TORCH_CHECK(
-      streamInfo.codecContext->frame_size > 0,
-      "No support for audio variable framerate yet.");
-
   auto& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
 
-  // TODO-AUDIO
-  TORCH_CHECK(
-      streamMetadata.averageFps.has_value(),
-      "frame_size or sample_rate aren't known. Cannot decode.");
-
   streamMetadata.sampleRate =
       static_cast<int64_t>(streamInfo.codecContext->sample_rate);
   streamMetadata.numChannels = getNumChannels(streamInfo.codecContext);
@@ -786,29 +761,18 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedAt(
 VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
     double startSeconds,
     double stopSeconds) {
-  validateActiveStream();
-  // Because we currently never seek with audio streams, we prevent users from
-  // calling this method twice. We could allow multiple calls in the future.
-  // Assuming 2 consecutive calls:
-  // ```
-  // getFramesPlayedInRange(startSeconds1, stopSeconds1);
-  // getFramesPlayedInRange(startSeconds2, stopSeconds2);
-  // ```
-  // We would need to seek back to 0 iff startSeconds2 <= stopSeconds1. This
-  // logic is not implemented for now, so we just error.
-
-  TORCH_CHECK(
-      streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO ||
-          !alreadyCalledGetFramesPlayedInRange_,
-      "Can only decode once with audio stream. Re-create a decoder object if needed.")
-  alreadyCalledGetFramesPlayedInRange_ = true;
-
+  validateActiveStream(AVMEDIA_TYPE_VIDEO);
+  const auto& streamMetadata =
+      containerMetadata_.allStreamMetadata[activeStreamIndex_];
   TORCH_CHECK(
       startSeconds <= stopSeconds,
       "Start seconds (" + std::to_string(startSeconds) +
           ") must be less than or equal to stop seconds (" +
           std::to_string(stopSeconds) + ".");
 
+  const auto& streamInfo = streamInfos_[activeStreamIndex_];
+  const auto& videoStreamOptions = streamInfo.videoStreamOptions;
+
   // Special case needed to implement a half-open range. At first glance, this
   // may seem unnecessary, as our search for stopFrame can return the end, and
   // we don't include stopFramIndex in our output. However, consider the
@@ -827,14 +791,11 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
   // values of the intervals will map to the same frame indices below. Hence, we
   // need this special case below.
   if (startSeconds == stopSeconds) {
-    FrameBatchOutput frameBatchOutput = makeFrameBatchOutput(0);
-    frameBatchOutput.data = maybePermuteOutputTensor(frameBatchOutput.data);
+    FrameBatchOutput frameBatchOutput(0, videoStreamOptions, streamMetadata);
+    frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
     return frameBatchOutput;
   }
 
-  const auto& streamMetadata =
-      containerMetadata_.allStreamMetadata[activeStreamIndex_];
-
   double minSeconds = getMinSeconds(streamMetadata);
   double maxSeconds = getMaxSeconds(streamMetadata);
   TORCH_CHECK(
@@ -865,14 +826,15 @@ VideoDecoder::FrameBatchOutput VideoDecoder::getFramesPlayedInRange(
   int64_t stopFrameIndex = secondsToIndexUpperBound(stopSeconds);
   int64_t numFrames = stopFrameIndex - startFrameIndex;
 
-  FrameBatchOutput frameBatchOutput = makeFrameBatchOutput(numFrames);
+  FrameBatchOutput frameBatchOutput(
+      numFrames, videoStreamOptions, streamMetadata);
   for (int64_t i = startFrameIndex, f = 0; i < stopFrameIndex; ++i, ++f) {
     FrameOutput frameOutput =
         getFrameAtIndexInternal(i, frameBatchOutput.data[f]);
     frameBatchOutput.ptsSeconds[f] = frameOutput.ptsSeconds;
     frameBatchOutput.durationSeconds[f] = frameOutput.durationSeconds;
   }
-  frameBatchOutput.data = maybePermuteOutputTensor(frameBatchOutput.data);
+  frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
 
   return frameBatchOutput;
 }
@@ -1405,41 +1367,6 @@ VideoDecoder::FrameBatchOutput::FrameBatchOutput(
       height, width, videoStreamOptions.device, numFrames);
 }
 
-VideoDecoder::FrameBatchOutput::FrameBatchOutput(
-    int64_t numFrames,
-    int64_t numChannels,
-    int64_t numSamples)
-    : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
-      durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
-  // TODO handle dtypes other than float
-  auto tensorOptions = torch::TensorOptions()
-                           .dtype(torch::kFloat32)
-                           .layout(torch::kStrided)
-                           .device(torch::kCPU);
-  data = torch::empty({numFrames, numChannels, numSamples}, tensorOptions);
-}
-
-VideoDecoder::FrameBatchOutput VideoDecoder::makeFrameBatchOutput(
-    int64_t numFrames) {
-  const auto& streamInfo = streamInfos_[activeStreamIndex_];
-  if (streamInfo.avMediaType == AVMEDIA_TYPE_VIDEO) {
-    const auto& videoStreamOptions = streamInfo.videoStreamOptions;
-    const auto& streamMetadata =
-        containerMetadata_.allStreamMetadata[activeStreamIndex_];
-    return FrameBatchOutput(numFrames, videoStreamOptions, streamMetadata);
-  } else {
-    // TODO-AUDIO
-    // We asserted that frame_size is non-zero when we added the stream, but it
-    // may not always be the case.
-    // When it's 0, we can't pre-allocate the output tensor as we don't know the
-    // number of samples per channel, and it may be non-constant. We'll have to
-    // find a way to make the batch-APIs work without pre-allocation.
-    int64_t numSamples = streamInfo.codecContext->frame_size;
-    int64_t numChannels = getNumChannels(streamInfo.codecContext);
-    return FrameBatchOutput(numFrames, numChannels, numSamples);
-  }
-}
-
 torch::Tensor allocateEmptyHWCTensor(
     int height,
     int width,
0 commit comments