From 3e9ae6f439d3ec2c31014fbad040e1e5c8a77aaf Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 7 Nov 2025 11:29:08 +0000 Subject: [PATCH 1/5] Avoid seeking checks when decoding frames sequentially --- src/torchcodec/_core/SingleStreamDecoder.cpp | 12 +++++++++--- src/torchcodec/_core/SingleStreamDecoder.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 72cd7afac..0db09f058 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -618,9 +618,15 @@ FrameOutput SingleStreamDecoder::getFrameAtIndexInternal( } validateFrameIndex(streamMetadata, frameIndex); - int64_t pts = getPts(frameIndex); - setCursorPtsInSeconds(ptsToSeconds(pts, streamInfo.timeBase)); - return getNextFrameInternal(preAllocatedOutputTensor); + // Only set cursor if we're not decoding sequentially + if (frameIndex != lastDecodedFrameIndex_ + 1) { + int64_t pts = getPts(frameIndex); + setCursorPtsInSeconds(ptsToSeconds(pts, streamInfo.timeBase)); + } + + auto result = getNextFrameInternal(preAllocatedOutputTensor); + lastDecodedFrameIndex_ = frameIndex; + return result; } FrameBatchOutput SingleStreamDecoder::getFramesAtIndices( diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 4b41811ff..96768989a 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -345,6 +345,7 @@ class SingleStreamDecoder { bool cursorWasJustSet_ = false; int64_t lastDecodedAvFramePts_ = 0; int64_t lastDecodedAvFrameDuration_ = 0; + int64_t lastDecodedFrameIndex_ = INT64_MIN; // Stores various internal decoding stats. DecodeStats decodeStats_; From 86ec99ede58da518ced7d3fbfba02d841e72ce04 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 11 Nov 2025 15:56:20 +0000 Subject: [PATCH 2/5] Simplify seek skipping logic --- src/torchcodec/_core/SingleStreamDecoder.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 4a81c9a8f..b6cdf055b 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -1100,6 +1100,9 @@ I P P P I P P P I P P I P P I P */ bool SingleStreamDecoder::canWeAvoidSeeking() const { const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_); + if (!cursorWasJustSet_) { + return true; + } if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) { // For audio, we only need to seek if a backwards seek was requested // within getFramesPlayedInRangeAudio(), when setCursorPtsInSeconds() was @@ -1181,10 +1184,8 @@ UniqueAVFrame SingleStreamDecoder::decodeAVFrame( resetDecodeStats(); - if (cursorWasJustSet_) { - maybeSeekToBeforeDesiredPts(); - cursorWasJustSet_ = false; - } + maybeSeekToBeforeDesiredPts(); + cursorWasJustSet_ = false; UniqueAVFrame avFrame(av_frame_alloc()); AutoAVPacket autoAVPacket; From 42409a97c48e433a90eec9900ec59aa20a613bee Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 11 Nov 2025 17:28:55 +0000 Subject: [PATCH 3/5] comment --- src/torchcodec/_core/SingleStreamDecoder.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 1272e5c83..ca7b3dcb6 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -626,7 +626,9 @@ FrameOutput SingleStreamDecoder::getFrameAtIndexInternal( } validateFrameIndex(streamMetadata, frameIndex); - // Only set cursor if we're not decoding sequentially + // Only set cursor if we're not decoding sequentially: when decoding + // sequentially, we don't need to seek anywhere, so by *not* setting the + // cursor we allow canWeAvoidSeeking() to return true early. if (frameIndex != lastDecodedFrameIndex_ + 1) { int64_t pts = getPts(frameIndex); setCursorPtsInSeconds(ptsToSeconds(pts, streamInfo.timeBase)); From 22952417efed05236d581569fbf0b80f7995940d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Nov 2025 11:01:40 +0000 Subject: [PATCH 4/5] Fix --- src/torchcodec/_core/SingleStreamDecoder.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 74d92c584..32d6f9d99 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -1108,9 +1108,6 @@ I P P P I P P P I P P I P P I P */ bool SingleStreamDecoder::canWeAvoidSeeking() const { const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_); - if (!cursorWasJustSet_) { - return true; - } if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) { // For audio, we only need to seek if a backwards seek was requested // within getFramesPlayedInRangeAudio(), when setCursorPtsInSeconds() was From 4ace1d3df3a2d4c98717e9a8fba36e090a85725b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 12 Nov 2025 11:15:19 +0000 Subject: [PATCH 5/5] Comments --- src/torchcodec/_core/SingleStreamDecoder.cpp | 68 +++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 32d6f9d99..08dc8f980 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -1081,32 +1081,17 @@ void SingleStreamDecoder::setCursor(int64_t pts) { cursor_ = pts; } -/* -Videos have I frames and non-I frames (P and B frames). Non-I frames need data -from the previous I frame to be decoded. - -Imagine the cursor is at a random frame with PTS=lastDecodedAvFramePts (x for -brevity) and we wish to seek to a user-specified PTS=y. - -If y < x, we don't have a choice but to seek backwards to the highest I frame -before y. - -If y > x, we have two choices: - -1. We could keep decoding forward until we hit y. Illustrated below: - -I P P P I P P P I P P I P P I P - x y - -2. We could try to jump to an I frame between x and y (indicated by j below). -And then start decoding until we encounter y. Illustrated below: - -I P P P I P P P I P P I P P I P - x j y - -(2) is more efficient than (1) if there is an I frame between x and y. -*/ bool SingleStreamDecoder::canWeAvoidSeeking() const { + // Returns true if we can avoid seeking in the AVFormatContext based on + // heuristics that rely on the target cursor_ and the last decoded frame. + // Seeking is expensive, so we try to avoid it when possible. + // Note that this function itself isn't always that cheap to call: in + // particular the calls to getKeyFrameIndexForPts below in approximate mode + // are sometimes slow. + // TODO we should understand why (is it because it reads the file?) and + // potentially optimize it. E.g. we may not want to ever seek, or even *check* + // if we need to seek in some cases, like if we're going to decode 80% of the + // frames anyway. const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_); if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) { // For audio, we only need to seek if a backwards seek was requested @@ -1129,13 +1114,34 @@ bool SingleStreamDecoder::canWeAvoidSeeking() const { // implement caching. return false; } - // We are seeking forwards. - // We can only skip a seek if both lastDecodedAvFramePts and - // cursor_ share the same keyframe. - int lastDecodedAvFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts_); + // We are seeking forwards. We can skip a seek if both the last decoded frame + // and cursor_ share the same keyframe: + // Videos have I frames and non-I frames (P and B frames). Non-I frames need + // data from the previous I frame to be decoded. + // + // Imagine the cursor is at a random frame with PTS=lastDecodedAvFramePts (x + // for brevity) and we wish to seek to a user-specified PTS=y. + // + // If y < x, we don't have a choice but to seek backwards to the highest I + // frame before y. + // + // If y > x, we have two choices: + // + // 1. We could keep decoding forward until we hit y. Illustrated below: + // + // I P P P I P P P I P P I P + // x y + // + // 2. We could try to jump to an I frame between x and y (indicated by j + // below). And then start decoding until we encounter y. Illustrated below: + // + // I P P P I P P P I P P I P + // x j y + // (2) is only more efficient than (1) if there is an I frame between x and y. + int lastKeyFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts_); int targetKeyFrameIndex = getKeyFrameIndexForPts(cursor_); - return lastDecodedAvFrameIndex >= 0 && targetKeyFrameIndex >= 0 && - lastDecodedAvFrameIndex == targetKeyFrameIndex; + return lastKeyFrameIndex >= 0 && targetKeyFrameIndex >= 0 && + lastKeyFrameIndex == targetKeyFrameIndex; } // This method looks at currentPts and desiredPts and seeks in the