@@ -567,13 +567,15 @@ void VideoDecoder::addAudioStream(int streamIndex) {
567567
568568VideoDecoder::FrameOutput VideoDecoder::getNextFrame () {
569569 auto output = getNextFrameInternal ();
570- output.data = maybePermuteHWC2CHW (output.data );
570+ if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
571+ output.data = maybePermuteHWC2CHW (output.data );
572+ }
571573 return output;
572574}
573575
574576VideoDecoder::FrameOutput VideoDecoder::getNextFrameInternal (
575577 std::optional<torch::Tensor> preAllocatedOutputTensor) {
576- validateActiveStream (AVMEDIA_TYPE_VIDEO );
578+ validateActiveStream ();
577579 AVFrameStream avFrameStream = decodeAVFrame (
578580 [this ](AVFrame* avFrame) { return avFrame->pts >= cursor_; });
579581 return convertAVFrameToFrameOutput (avFrameStream, preAllocatedOutputTensor);
@@ -869,7 +871,7 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
869871 // If we need to seek backwards, then we have to seek back to the beginning
870872 // of the stream.
871873 // TODO-AUDIO: document why this is needed in a big comment.
872- setCursorPtsInSeconds (INT64_MIN);
874+ setCursorPtsInSecondsInternal (INT64_MIN);
873875 }
874876
875877 // TODO-AUDIO Pre-allocate a long-enough tensor instead of creating a vec +
@@ -915,6 +917,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
915917// --------------------------------------------------------------------------
916918
917919void VideoDecoder::setCursorPtsInSeconds (double seconds) {
920+ validateActiveStream (AVMEDIA_TYPE_VIDEO);
921+ setCursorPtsInSecondsInternal (seconds);
922+ }
923+
924+ void VideoDecoder::setCursorPtsInSecondsInternal (double seconds) {
918925 cursorWasJustSet_ = true ;
919926 cursor_ =
920927 secondsToClosestPts (seconds, streamInfos_[activeStreamIndex_].timeBase );
0 commit comments