 namespace facebook::torchcodec {
 namespace {
 
-double ptsToSeconds(int64_t pts, int den) {
-  return static_cast<double>(pts) / den;
-}
-
 double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
-  return ptsToSeconds(pts, timeBase.den);
+  return static_cast<double>(pts) * timeBase.num / timeBase.den;
 }
 
 int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
-  return static_cast<int64_t>(std::round(seconds * timeBase.den));
+  return static_cast<int64_t>(
+      std::round(seconds * timeBase.den / timeBase.num));
+}
+
+// Some videos aren't properly encoded and do not specify pts values for
+// packets, and thus for frames. Unset values correspond to INT64_MIN. When
+// that happens, we fall back to the dts value, which hopefully exists and is
+// correct. Accessing AVFrame and AVPacket pts values should **always** go
+// through the helpers below. The "pts" fields in our structs like
+// FrameInfo.pts should then be interpreted as "pts if it exists, dts
+// otherwise".
+int64_t getPtsOrDts(ReferenceAVPacket& packet) {
+  return packet->pts == INT64_MIN ? packet->dts : packet->pts;
+}
+
+int64_t getPtsOrDts(const UniqueAVFrame& avFrame) {
+  return avFrame->pts == INT64_MIN ? avFrame->pkt_dts : avFrame->pts;
 }
 
 } // namespace
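
The numerator fix above matters whenever a stream's time base is not of the form 1/den. Below is a standalone sketch (not part of the diff) of the corrected round-trip, assuming an NTSC-style time base of 1001/30000 and a simplified `AVRational` standing in for FFmpeg's:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

// Simplified stand-in for FFmpeg's AVRational.
struct AVRational {
  int num;
  int den;
};

double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
  return static_cast<double>(pts) * timeBase.num / timeBase.den;
}

int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
  return static_cast<int64_t>(
      std::round(seconds * timeBase.den / timeBase.num));
}

int main() {
  // Each pts tick is 1001/30000 of a second, so pts 30 maps to 1.001s.
  // The pre-fix code divided by den only and would have returned 0.001s.
  AVRational timeBase{1001, 30000};
  assert(std::abs(ptsToSeconds(30, timeBase) - 1.001) < 1e-9);
  // The two conversions round-trip.
  assert(secondsToClosestPts(1.001, timeBase) == 30);
  return 0;
}
```
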
@@ -151,8 +162,9 @@ void SingleStreamDecoder::initializeDecoder() {
   }
 
   if (formatContext_->duration > 0) {
+    AVRational defaultTimeBase{1, AV_TIME_BASE};
     containerMetadata_.durationSeconds =
-        ptsToSeconds(formatContext_->duration, AV_TIME_BASE);
+        ptsToSeconds(formatContext_->duration, defaultTimeBase);
   }
 
   if (formatContext_->bit_rate > 0) {
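
Container-level duration is not in any stream's time base: FFmpeg expresses `AVFormatContext::duration` in `AV_TIME_BASE` units, and `AV_TIME_BASE` is 1,000,000, i.e. microseconds. Wrapping that constant in a rational lets the same `ptsToSeconds` overload handle both cases. A minimal standalone sketch:

```cpp
#include <cstdint>
#include <iostream>

// Simplified stand-in for FFmpeg's AVRational.
struct AVRational {
  int num;
  int den;
};

// Mirrors FFmpeg's AV_TIME_BASE: container timestamps are microsecond counts.
constexpr int kAvTimeBase = 1000000;

double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
  return static_cast<double>(pts) * timeBase.num / timeBase.den;
}

int main() {
  AVRational defaultTimeBase{1, kAvTimeBase};
  int64_t duration = 12345678; // microseconds, i.e. ~12.35 seconds
  std::cout << ptsToSeconds(duration, defaultTimeBase) << "\n"; // 12.3457
  return 0;
}
```
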
@@ -225,16 +237,16 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
     int streamIndex = packet->stream_index;
     auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex];
     streamMetadata.minPtsFromScan = std::min(
-        streamMetadata.minPtsFromScan.value_or(INT64_MAX), packet->pts);
+        streamMetadata.minPtsFromScan.value_or(INT64_MAX), getPtsOrDts(packet));
     streamMetadata.maxPtsFromScan = std::max(
         streamMetadata.maxPtsFromScan.value_or(INT64_MIN),
-        packet->pts + packet->duration);
+        getPtsOrDts(packet) + packet->duration);
     streamMetadata.numFramesFromScan =
         streamMetadata.numFramesFromScan.value_or(0) + 1;
 
     // Note that we set the other value in this struct, nextPts, only after
     // we have scanned all packets and sorted by pts.
-    FrameInfo frameInfo = {packet->pts};
+    FrameInfo frameInfo = {getPtsOrDts(packet)};
     if (packet->flags & AV_PKT_FLAG_KEY) {
       frameInfo.isKeyFrame = true;
       streamInfos_[streamIndex].keyFrames.push_back(frameInfo);
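
The scan keeps per-stream pts bounds in `std::optional` fields; `value_or(INT64_MAX)` and `value_or(INT64_MIN)` act as identity elements for `std::min`/`std::max`, so the first scanned packet initializes both bounds without a special case. A sketch of the pattern over hypothetical pts values:

```cpp
#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

int main() {
  std::optional<int64_t> minPts;
  std::optional<int64_t> maxPts;
  // Hypothetical pts-or-dts values as they come off a packet scan.
  std::vector<int64_t> scanned{3000, 0, 1500, 4500};
  for (int64_t pts : scanned) {
    // The sentinel loses every comparison, so the first value wins outright.
    minPts = std::min(minPts.value_or(INT64_MAX), pts);
    maxPts = std::max(maxPts.value_or(INT64_MIN), pts);
  }
  // minPts == 0, maxPts == 4500
  return (*minPts == 0 && *maxPts == 4500) ? 0 : 1;
}
```
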
@@ -495,8 +507,9 @@ FrameOutput SingleStreamDecoder::getNextFrame() {
 FrameOutput SingleStreamDecoder::getNextFrameInternal(
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
   validateActiveStream();
-  UniqueAVFrame avFrame = decodeAVFrame(
-      [this](const UniqueAVFrame& avFrame) { return avFrame->pts >= cursor_; });
+  UniqueAVFrame avFrame = decodeAVFrame([this](const UniqueAVFrame& avFrame) {
+    return getPtsOrDts(avFrame) >= cursor_;
+  });
   return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor);
 }
 
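
The stop predicate now compares the pts-or-dts value against the cursor, so a stream whose packets never set pts (FFmpeg's AV_NOPTS_VALUE, which reads as INT64_MIN through a signed int64) still makes progress. A standalone sketch of the fallback with a simplified frame type (names are illustrative, not FFmpeg's):

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for AVFrame with just the two timestamp fields.
struct FakeFrame {
  int64_t pts;
  int64_t pkt_dts;
};

int64_t getPtsOrDts(const FakeFrame& frame) {
  return frame.pts == INT64_MIN ? frame.pkt_dts : frame.pts;
}

int main() {
  FakeFrame wellFormed{9000, 9000};
  FakeFrame missingPts{INT64_MIN, 9000}; // pts unset, dts present
  assert(getPtsOrDts(wellFormed) == 9000);
  assert(getPtsOrDts(missingPts) == 9000); // dts stands in for the cursor test
  return 0;
}
```
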
@@ -632,9 +645,10 @@ FrameOutput SingleStreamDecoder::getFramePlayedAt(double seconds) {
   UniqueAVFrame avFrame =
       decodeAVFrame([seconds, this](const UniqueAVFrame& avFrame) {
         StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
-        double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
+        double frameStartTime =
+            ptsToSeconds(getPtsOrDts(avFrame), streamInfo.timeBase);
         double frameEndTime = ptsToSeconds(
-            avFrame->pts + getDuration(avFrame), streamInfo.timeBase);
+            getPtsOrDts(avFrame) + getDuration(avFrame), streamInfo.timeBase);
         if (frameStartTime > seconds) {
           // FFMPEG seeked past the frame we are looking for even though we
           // set max_ts to be our needed timestamp in avformat_seek_file()
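
A frame is played at time `t` exactly when `frameStartTime <= t < frameEndTime`; the `frameStartTime > seconds` branch handles the case where `avformat_seek_file()` overshoots the target. A sketch of the containment test, assuming a 1/30000 time base:

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for FFmpeg's AVRational.
struct AVRational {
  int num;
  int den;
};

double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
  return static_cast<double>(pts) * timeBase.num / timeBase.den;
}

// Hypothetical helper: does the frame starting at `pts` and lasting
// `duration` ticks cover wall-clock time `seconds`?
bool frameCovers(
    int64_t pts, int64_t duration, double seconds, const AVRational& tb) {
  double start = ptsToSeconds(pts, tb);
  double end = ptsToSeconds(pts + duration, tb);
  return start <= seconds && seconds < end;
}

int main() {
  AVRational tb{1, 30000}; // 30000 ticks per second
  // A 1000-tick (~33ms) frame starting at 1.0s covers [1.0, 1.0333...).
  assert(frameCovers(30000, 1000, 1.0, tb));
  assert(!frameCovers(30000, 1000, 1.04, tb));
  return 0;
}
```
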
@@ -861,8 +875,8 @@ AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
     try {
       UniqueAVFrame avFrame =
           decodeAVFrame([startPts, stopPts](const UniqueAVFrame& avFrame) {
-            return startPts < avFrame->pts + getDuration(avFrame) &&
-                stopPts > avFrame->pts;
+            return startPts < getPtsOrDts(avFrame) + getDuration(avFrame) &&
+                stopPts > getPtsOrDts(avFrame);
           });
       auto frameOutput = convertAVFrameToFrameOutput(avFrame);
       if (!firstFramePtsSeconds.has_value()) {
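
The audio predicate is the standard half-open interval overlap test: the frame spanning [framePts, framePts + duration) intersects the requested [startPts, stopPts) iff `startPts < frameEnd && stopPts > frameStart`. A small sketch of the boundary behavior:

```cpp
#include <cassert>
#include <cstdint>

// Half-open overlap: does [framePts, framePts + frameDuration) intersect
// [startPts, stopPts)?
bool overlaps(
    int64_t framePts,
    int64_t frameDuration,
    int64_t startPts,
    int64_t stopPts) {
  return startPts < framePts + frameDuration && stopPts > framePts;
}

int main() {
  // Frame covering pts [1000, 2000):
  assert(overlaps(1000, 1000, 1500, 3000)); // range starts inside the frame
  assert(!overlaps(1000, 1000, 2000, 3000)); // range starts at frame end
  assert(!overlaps(1000, 1000, 0, 1000)); // range ends at frame start
  return 0;
}
```
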
@@ -1132,7 +1146,7 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
   // haven't received as frames. Eventually we will either hit AVERROR_EOF from
   // av_receive_frame() or the user will have seeked to a different location in
   // the file and that will flush the decoder.
-  streamInfo.lastDecodedAvFramePts = avFrame->pts;
+  streamInfo.lastDecodedAvFramePts = getPtsOrDts(avFrame);
   streamInfo.lastDecodedAvFrameDuration = getDuration(avFrame);
 
   return avFrame;
@@ -1149,7 +1163,8 @@ FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
   FrameOutput frameOutput;
   auto& streamInfo = streamInfos_[activeStreamIndex_];
   frameOutput.ptsSeconds = ptsToSeconds(
-      avFrame->pts, formatContext_->streams[activeStreamIndex_]->time_base);
+      getPtsOrDts(avFrame),
+      formatContext_->streams[activeStreamIndex_]->time_base);
   frameOutput.durationSeconds = ptsToSeconds(
       getDuration(avFrame),
       formatContext_->streams[activeStreamIndex_]->time_base);