From a60968ce4d6bb0a90e35f2ddffd25c040017397c Mon Sep 17 00:00:00 2001 From: danielflores3 Date: Thu, 29 May 2025 11:16:54 -0700 Subject: [PATCH 01/13] Update C++ metadata names to match python --- src/torchcodec/_core/Metadata.h | 6 ++--- src/torchcodec/_core/SingleStreamDecoder.cpp | 14 +++++----- src/torchcodec/_core/_metadata.py | 10 +++---- src/torchcodec/_core/custom_ops.cpp | 28 ++++++++++---------- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h index a8f300f49..a5b8a8f17 100644 --- a/src/torchcodec/_core/Metadata.h +++ b/src/torchcodec/_core/Metadata.h @@ -25,9 +25,9 @@ struct StreamMetadata { AVMediaType mediaType; std::optional codecId; std::optional codecName; - std::optional durationSeconds; + std::optional durationSecondsFromHeader; std::optional beginStreamFromHeader; - std::optional numFrames; + std::optional numFramesFromHeader; std::optional numKeyFrames; std::optional averageFps; std::optional bitRate; @@ -58,7 +58,7 @@ struct ContainerMetadata { int numVideoStreams = 0; // Note that this is the container-level duration, which is usually the max // of all stream durations available in the container. - std::optional durationSeconds; + std::optional durationSecondsFromHeader; // Total BitRate level information at the container level in bit/s std::optional bitRate; // If set, this is the index to the default audio stream. diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 9bc003a9b..c1a03567f 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -125,11 +125,11 @@ void SingleStreamDecoder::initializeDecoder() { int64_t frameCount = avStream->nb_frames; if (frameCount > 0) { - streamMetadata.numFrames = frameCount; + streamMetadata.numFramesFromHeader = frameCount; } if (avStream->duration > 0 && avStream->time_base.den > 0) { - streamMetadata.durationSeconds = + streamMetadata.durationSecondsFromHeader = av_q2d(avStream->time_base) * avStream->duration; } if (avStream->start_time != AV_NOPTS_VALUE) { @@ -163,7 +163,7 @@ void SingleStreamDecoder::initializeDecoder() { if (formatContext_->duration > 0) { AVRational defaultTimeBase{1, AV_TIME_BASE}; - containerMetadata_.durationSeconds = + containerMetadata_.durationSecondsFromHeader = ptsToSeconds(formatContext_->duration, defaultTimeBase); } @@ -1463,9 +1463,9 @@ int64_t SingleStreamDecoder::getNumFrames( return streamMetadata.numFramesFromScan.value(); case SeekMode::approximate: { TORCH_CHECK( - streamMetadata.numFrames.has_value(), + streamMetadata.numFramesFromHeader.has_value(), "Cannot use approximate mode since we couldn't find the number of frames from the metadata."); - return streamMetadata.numFrames.value(); + return streamMetadata.numFramesFromHeader.value(); } default: throw std::runtime_error("Unknown SeekMode"); @@ -1491,9 +1491,9 @@ double SingleStreamDecoder::getMaxSeconds( return streamMetadata.maxPtsSecondsFromScan.value(); case SeekMode::approximate: { TORCH_CHECK( - streamMetadata.durationSeconds.has_value(), + streamMetadata.durationSecondsFromHeader.has_value(), "Cannot use approximate mode since we couldn't find the duration from the metadata."); - return streamMetadata.durationSeconds.value(); + return streamMetadata.durationSecondsFromHeader.value(); } default: throw std::runtime_error("Unknown SeekMode"); diff --git a/src/torchcodec/_core/_metadata.py b/src/torchcodec/_core/_metadata.py index 58c163669..591fe259f 100644 --- a/src/torchcodec/_core/_metadata.py +++ b/src/torchcodec/_core/_metadata.py @@ -225,9 +225,9 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata: for stream_index in range(container_dict["numStreams"]): stream_dict = json.loads(_get_stream_json_metadata(decoder, stream_index)) common_meta = dict( - duration_seconds_from_header=stream_dict.get("durationSeconds"), + duration_seconds_from_header=stream_dict.get("durationSecondsFromHeader"), bit_rate=stream_dict.get("bitRate"), - begin_stream_seconds_from_header=stream_dict.get("beginStreamFromHeader"), + begin_stream_seconds_from_header=stream_dict.get("beginStreamSecondsFromHeader"), codec=stream_dict.get("codec"), stream_index=stream_index, ) @@ -242,9 +242,9 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata: ), width=stream_dict.get("width"), height=stream_dict.get("height"), - num_frames_from_header=stream_dict.get("numFrames"), + num_frames_from_header=stream_dict.get("numFramesFromHeader"), num_frames_from_content=stream_dict.get("numFramesFromScan"), - average_fps_from_header=stream_dict.get("averageFps"), + average_fps_from_header=stream_dict.get("averageFpsFromHeader"), **common_meta, ) ) @@ -264,7 +264,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata: streams_metadata.append(StreamMetadata(**common_meta)) return ContainerMetadata( - duration_seconds_from_header=container_dict.get("durationSeconds"), + duration_seconds_from_header=container_dict.get("durationSecondsFromHeader"), bit_rate_from_header=container_dict.get("bitRate"), best_video_stream_index=container_dict.get("bestVideoStreamIndex"), best_audio_stream_index=container_dict.get("bestAudioStreamIndex"), diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 4a1c414b0..8794ccac0 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -456,18 +456,18 @@ std::string get_json_metadata(at::Tensor& decoder) { std::map metadataMap; // serialize the metadata into a string std::stringstream ss; - double durationSeconds = 0; + double durationSecondsFromHeader = 0; if (maybeBestVideoStreamIndex.has_value() && videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex] - .durationSeconds.has_value()) { - durationSeconds = + .durationSecondsFromHeader.has_value()) { + durationSecondsFromHeader = videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex] - .durationSeconds.value_or(0); + .durationSecondsFromHeader.value_or(0); } else { // Fallback to container-level duration if stream duration is not found. - durationSeconds = videoMetadata.durationSeconds.value_or(0); + durationSecondsFromHeader = videoMetadata.durationSecondsFromHeader.value_or(0); } - metadataMap["durationSeconds"] = std::to_string(durationSeconds); + metadataMap["durationSecondsFromHeader"] = std::to_string(durationSecondsFromHeader); if (videoMetadata.bitRate.has_value()) { metadataMap["bitRate"] = std::to_string(videoMetadata.bitRate.value()); @@ -523,8 +523,8 @@ std::string get_container_json_metadata(at::Tensor& decoder) { std::map map; - if (containerMetadata.durationSeconds.has_value()) { - map["durationSeconds"] = std::to_string(*containerMetadata.durationSeconds); + if (containerMetadata.durationSecondsFromHeader.has_value()) { + map["durationSecondsFromHeader"] = std::to_string(*containerMetadata.durationSecondsFromHeader); } if (containerMetadata.bitRate.has_value()) { @@ -562,8 +562,8 @@ std::string get_stream_json_metadata( std::map map; - if (streamMetadata.durationSeconds.has_value()) { - map["durationSeconds"] = std::to_string(*streamMetadata.durationSeconds); + if (streamMetadata.durationSecondsFromHeader.has_value()) { + map["durationSecondsFromHeader"] = std::to_string(*streamMetadata.durationSecondsFromHeader); } if (streamMetadata.bitRate.has_value()) { map["bitRate"] = std::to_string(*streamMetadata.bitRate); @@ -572,11 +572,11 @@ std::string get_stream_json_metadata( map["numFramesFromScan"] = std::to_string(*streamMetadata.numFramesFromScan); } - if (streamMetadata.numFrames.has_value()) { - map["numFrames"] = std::to_string(*streamMetadata.numFrames); + if (streamMetadata.numFramesFromHeader.has_value()) { + map["numFramesFromHeader"] = std::to_string(*streamMetadata.numFramesFromHeader); } if (streamMetadata.beginStreamFromHeader.has_value()) { - map["beginStreamFromHeader"] = + map["beginStreamSecondsFromHeader"] = std::to_string(*streamMetadata.beginStreamFromHeader); } if (streamMetadata.minPtsSecondsFromScan.has_value()) { @@ -597,7 +597,7 @@ std::string get_stream_json_metadata( map["height"] = std::to_string(*streamMetadata.height); } if (streamMetadata.averageFps.has_value()) { - map["averageFps"] = std::to_string(*streamMetadata.averageFps); + map["averageFpsFromHeader"] = std::to_string(*streamMetadata.averageFps); } if (streamMetadata.sampleRate.has_value()) { map["sampleRate"] = std::to_string(*streamMetadata.sampleRate); From 6a9eca166e33583e4d2cb192d2605e2752b31264 Mon Sep 17 00:00:00 2001 From: danielflores3 Date: Thu, 29 May 2025 11:16:54 -0700 Subject: [PATCH 02/13] Update C++ metadata names to match python --- src/torchcodec/_core/Metadata.h | 4 +-- src/torchcodec/_core/SingleStreamDecoder.cpp | 19 +++++----- src/torchcodec/_core/custom_ops.cpp | 38 +++++++++++++------- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h index a5b8a8f17..0bdcdeaba 100644 --- a/src/torchcodec/_core/Metadata.h +++ b/src/torchcodec/_core/Metadata.h @@ -26,10 +26,10 @@ struct StreamMetadata { std::optional codecId; std::optional codecName; std::optional durationSecondsFromHeader; - std::optional beginStreamFromHeader; + std::optional beginStreamSecondsFromHeader; std::optional numFramesFromHeader; std::optional numKeyFrames; - std::optional averageFps; + std::optional averageFpsFromHeader; std::optional bitRate; // More accurate duration, obtained by scanning the file. diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index c1a03567f..aa4292bc0 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -133,14 +133,14 @@ void SingleStreamDecoder::initializeDecoder() { av_q2d(avStream->time_base) * avStream->duration; } if (avStream->start_time != AV_NOPTS_VALUE) { - streamMetadata.beginStreamFromHeader = + streamMetadata.beginStreamSecondsFromHeader = av_q2d(avStream->time_base) * avStream->start_time; } if (avStream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) { double fps = av_q2d(avStream->r_frame_rate); if (fps > 0) { - streamMetadata.averageFps = fps; + streamMetadata.averageFpsFromHeader = fps; } containerMetadata_.numVideoStreams++; } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) { @@ -445,7 +445,7 @@ void SingleStreamDecoder::addVideoStream( containerMetadata_.allStreamMetadata[activeStreamIndex_]; if (seekMode_ == SeekMode::approximate && - !streamMetadata.averageFps.has_value()) { + !streamMetadata.averageFpsFromHeader.has_value()) { throw std::runtime_error( "Seek mode is approximate, but stream " + std::to_string(activeStreamIndex_) + @@ -1397,9 +1397,9 @@ int64_t SingleStreamDecoder::secondsToIndexLowerBound(double seconds) { auto& streamMetadata = containerMetadata_.allStreamMetadata[activeStreamIndex_]; TORCH_CHECK( - streamMetadata.averageFps.has_value(), + streamMetadata.averageFpsFromHeader.has_value(), "Cannot use approximate mode since we couldn't find the average fps from the metadata."); - return std::floor(seconds * streamMetadata.averageFps.value()); + return std::floor(seconds * streamMetadata.averageFpsFromHeader.value()); } default: throw std::runtime_error("Unknown SeekMode"); @@ -1424,9 +1424,9 @@ int64_t SingleStreamDecoder::secondsToIndexUpperBound(double seconds) { auto& streamMetadata = containerMetadata_.allStreamMetadata[activeStreamIndex_]; TORCH_CHECK( - streamMetadata.averageFps.has_value(), + streamMetadata.averageFpsFromHeader.has_value(), "Cannot use approximate mode since we couldn't find the average fps from the metadata."); - return std::ceil(seconds * streamMetadata.averageFps.value()); + return std::ceil(seconds * streamMetadata.averageFpsFromHeader.value()); } default: throw std::runtime_error("Unknown SeekMode"); @@ -1442,10 +1442,11 @@ int64_t SingleStreamDecoder::getPts(int64_t frameIndex) { auto& streamMetadata = containerMetadata_.allStreamMetadata[activeStreamIndex_]; TORCH_CHECK( - streamMetadata.averageFps.has_value(), + streamMetadata.averageFpsFromHeader.has_value(), "Cannot use approximate mode since we couldn't find the average fps from the metadata."); return secondsToClosestPts( - frameIndex / streamMetadata.averageFps.value(), streamInfo.timeBase); + frameIndex / streamMetadata.averageFpsFromHeader.value(), + streamInfo.timeBase); } default: throw std::runtime_error("Unknown SeekMode"); diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 8794ccac0..8d1758304 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -460,14 +460,20 @@ std::string get_json_metadata(at::Tensor& decoder) { if (maybeBestVideoStreamIndex.has_value() && videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex] .durationSecondsFromHeader.has_value()) { +<<<<<<< HEAD durationSecondsFromHeader = +======= + durationSecondsFromHeader = +>>>>>>> e932590 (Update C++ metadata names to match python) videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex] .durationSecondsFromHeader.value_or(0); } else { // Fallback to container-level duration if stream duration is not found. - durationSecondsFromHeader = videoMetadata.durationSecondsFromHeader.value_or(0); + durationSecondsFromHeader = + videoMetadata.durationSecondsFromHeader.value_or(0); } - metadataMap["durationSecondsFromHeader"] = std::to_string(durationSecondsFromHeader); + metadataMap["durationSecondsFromHeader"] = + std::to_string(durationSecondsFromHeader); if (videoMetadata.bitRate.has_value()) { metadataMap["bitRate"] = std::to_string(videoMetadata.bitRate.value()); @@ -479,8 +485,9 @@ std::string get_json_metadata(at::Tensor& decoder) { if (streamMetadata.numFramesFromScan.has_value()) { metadataMap["numFrames"] = std::to_string(*streamMetadata.numFramesFromScan); - } else if (streamMetadata.numFrames.has_value()) { - metadataMap["numFrames"] = std::to_string(*streamMetadata.numFrames); + } else if (streamMetadata.numFramesFromHeader.has_value()) { + metadataMap["numFrames"] = + std::to_string(*streamMetadata.numFramesFromHeader); } if (streamMetadata.minPtsSecondsFromScan.has_value()) { metadataMap["minPtsSecondsFromScan"] = @@ -499,8 +506,9 @@ std::string get_json_metadata(at::Tensor& decoder) { if (streamMetadata.height.has_value()) { metadataMap["height"] = std::to_string(*streamMetadata.height); } - if (streamMetadata.averageFps.has_value()) { - metadataMap["averageFps"] = std::to_string(*streamMetadata.averageFps); + if (streamMetadata.averageFpsFromHeader.has_value()) { + metadataMap["averageFpsFromHeader"] = + std::to_string(*streamMetadata.averageFpsFromHeader); } } if (videoMetadata.bestVideoStreamIndex.has_value()) { @@ -524,7 +532,8 @@ std::string get_container_json_metadata(at::Tensor& decoder) { std::map map; if (containerMetadata.durationSecondsFromHeader.has_value()) { - map["durationSecondsFromHeader"] = std::to_string(*containerMetadata.durationSecondsFromHeader); + map["durationSecondsFromHeader"] = + std::to_string(*containerMetadata.durationSecondsFromHeader); } if (containerMetadata.bitRate.has_value()) { @@ -563,7 +572,8 @@ std::string get_stream_json_metadata( std::map map; if (streamMetadata.durationSecondsFromHeader.has_value()) { - map["durationSecondsFromHeader"] = std::to_string(*streamMetadata.durationSecondsFromHeader); + map["durationSecondsFromHeader"] = + std::to_string(*streamMetadata.durationSecondsFromHeader); } if (streamMetadata.bitRate.has_value()) { map["bitRate"] = std::to_string(*streamMetadata.bitRate); @@ -573,11 +583,12 @@ std::string get_stream_json_metadata( std::to_string(*streamMetadata.numFramesFromScan); } if (streamMetadata.numFramesFromHeader.has_value()) { - map["numFramesFromHeader"] = std::to_string(*streamMetadata.numFramesFromHeader); + map["numFramesFromHeader"] = + std::to_string(*streamMetadata.numFramesFromHeader); } - if (streamMetadata.beginStreamFromHeader.has_value()) { + if (streamMetadata.beginStreamSecondsFromHeader.has_value()) { map["beginStreamSecondsFromHeader"] = - std::to_string(*streamMetadata.beginStreamFromHeader); + std::to_string(*streamMetadata.beginStreamSecondsFromHeader); } if (streamMetadata.minPtsSecondsFromScan.has_value()) { map["minPtsSecondsFromScan"] = @@ -596,8 +607,9 @@ std::string get_stream_json_metadata( if (streamMetadata.height.has_value()) { map["height"] = std::to_string(*streamMetadata.height); } - if (streamMetadata.averageFps.has_value()) { - map["averageFpsFromHeader"] = std::to_string(*streamMetadata.averageFps); + if (streamMetadata.averageFpsFromHeader.has_value()) { + map["averageFpsFromHeader"] = + std::to_string(*streamMetadata.averageFpsFromHeader); } if (streamMetadata.sampleRate.has_value()) { map["sampleRate"] = std::to_string(*streamMetadata.sampleRate); From df6072d20e9b9361249b6031fbc077e155a3011b Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 30 May 2025 08:39:09 -0700 Subject: [PATCH 03/13] resolve merge conflicts --- src/torchcodec/_core/custom_ops.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 8d1758304..388bd5eff 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -460,11 +460,7 @@ std::string get_json_metadata(at::Tensor& decoder) { if (maybeBestVideoStreamIndex.has_value() && videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex] .durationSecondsFromHeader.has_value()) { -<<<<<<< HEAD - durationSecondsFromHeader = -======= durationSecondsFromHeader = ->>>>>>> e932590 (Update C++ metadata names to match python) videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex] .durationSecondsFromHeader.value_or(0); } else { From 1a8b3f8de22f8e676fc6ffcdfb225d71b75552b7 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 30 May 2025 09:45:15 -0700 Subject: [PATCH 04/13] fixes to pass metadata tests --- output.mp3 | 0 src/torchcodec/_core/custom_ops.cpp | 4 ++-- src/torchcodec/_samplers/video_clip_sampler.py | 6 +++--- test/test_ops.py | 10 ++++++---- 4 files changed, 11 insertions(+), 9 deletions(-) create mode 100644 output.mp3 diff --git a/output.mp3 b/output.mp3 new file mode 100644 index 000000000..e69de29bb diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 388bd5eff..56d599056 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -479,10 +479,10 @@ std::string get_json_metadata(at::Tensor& decoder) { auto streamMetadata = videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex]; if (streamMetadata.numFramesFromScan.has_value()) { - metadataMap["numFrames"] = + metadataMap["numFramesFromHeader"] = std::to_string(*streamMetadata.numFramesFromScan); } else if (streamMetadata.numFramesFromHeader.has_value()) { - metadataMap["numFrames"] = + metadataMap["numFramesFromHeader"] = std::to_string(*streamMetadata.numFramesFromHeader); } if (streamMetadata.minPtsSecondsFromScan.has_value()) { diff --git a/src/torchcodec/_samplers/video_clip_sampler.py b/src/torchcodec/_samplers/video_clip_sampler.py index 8a92400f2..9fc49cae5 100644 --- a/src/torchcodec/_samplers/video_clip_sampler.py +++ b/src/torchcodec/_samplers/video_clip_sampler.py @@ -213,7 +213,7 @@ def _get_clips_for_index_based_sampling( sample_end_index = ( min( index_based_sampler_args.sample_end_index + 1, - metadata_json["numFrames"], + metadata_json["numFramesFromHeader"], ) - index_based_sampler_args.video_frame_dilation * index_based_sampler_args.frames_per_clip @@ -263,13 +263,13 @@ def _get_start_seconds( Returns: (`List[float]`): List of the sampled clip start position in seconds """ - video_duration_in_seconds = metadata_json["durationSeconds"] + video_duration_in_seconds = metadata_json["durationSecondsFromHeader"] clip_duration_in_seconds = ( time_based_sampler_args.frames_per_clip * time_based_sampler_args.video_frame_dilation + 1 - ) / metadata_json["averageFps"] + ) / metadata_json["averageFpsFromHeader"] minPtsSecondsFromScan = ( metadata_json["minPtsSecondsFromScan"] diff --git a/test/test_ops.py b/test/test_ops.py index 77be702b6..af9b9f2f3 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -215,7 +215,7 @@ def test_pts_apis_against_index_ref(self, device): metadata = get_json_metadata(decoder) metadata_dict = json.loads(metadata) - num_frames = metadata_dict["numFrames"] + num_frames = metadata_dict["numFramesFromHeader"] assert num_frames == 390 _, all_pts_seconds_ref, _ = zip( @@ -395,9 +395,11 @@ def test_video_get_json_metadata(self): metadata_dict = json.loads(metadata) # We should be able to see all of this metadata without adding a video stream - assert metadata_dict["durationSeconds"] == pytest.approx(13.013, abs=0.001) - assert metadata_dict["numFrames"] == 390 - assert metadata_dict["averageFps"] == pytest.approx(29.97, abs=0.001) + assert metadata_dict["durationSecondsFromHeader"] == pytest.approx( + 13.013, abs=0.001 + ) + assert metadata_dict["numFramesFromHeader"] == 390 + assert metadata_dict["averageFpsFromHeader"] == pytest.approx(29.97, abs=0.001) assert metadata_dict["codec"] == "h264" ffmpeg_dict = get_ffmpeg_library_versions() if ffmpeg_dict["libavformat"][0] >= 60: From 4219df45f32bba4ba81574b56f0ac3154c628703 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 30 May 2025 10:31:01 -0700 Subject: [PATCH 05/13] Update minPtsSecondsFromScan, maxPtsSecondsFromScan, numFramesFromScan --- src/torchcodec/_core/Metadata.h | 6 ++-- src/torchcodec/_core/SingleStreamDecoder.cpp | 16 ++++----- src/torchcodec/_core/SingleStreamDecoder.h | 2 +- src/torchcodec/_core/_metadata.py | 10 +++--- src/torchcodec/_core/custom_ops.cpp | 34 +++++++++---------- .../_samplers/video_clip_sampler.py | 16 ++++----- test/VideoDecoderTest.cpp | 12 +++---- test/test_ops.py | 4 +-- 8 files changed, 51 insertions(+), 49 deletions(-) diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h index 0bdcdeaba..6b71dbd25 100644 --- a/src/torchcodec/_core/Metadata.h +++ b/src/torchcodec/_core/Metadata.h @@ -37,10 +37,10 @@ struct StreamMetadata { std::optional minPtsFromScan; std::optional maxPtsFromScan; // These presentation timestamps are in seconds. - std::optional minPtsSecondsFromScan; - std::optional maxPtsSecondsFromScan; + std::optional beginStreamSecondsFromContent; + std::optional endStreamFromContentSeconds; // This can be useful for index-based seeking. - std::optional numFramesFromScan; + std::optional numFramesFromContent; // Video-only fields derived from the AVCodecContext. std::optional width; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index aa4292bc0..3b1d77836 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -241,8 +241,8 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() { streamMetadata.maxPtsFromScan = std::max( streamMetadata.maxPtsFromScan.value_or(INT64_MIN), getPtsOrDts(packet) + packet->duration); - streamMetadata.numFramesFromScan = - streamMetadata.numFramesFromScan.value_or(0) + 1; + streamMetadata.numFramesFromContent = + streamMetadata.numFramesFromContent.value_or(0) + 1; // Note that we set the other value in this struct, nextPts, only after // we have scanned all packets and sorted by pts. @@ -262,15 +262,15 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() { auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex]; auto avStream = formatContext_->streams[streamIndex]; - streamMetadata.numFramesFromScan = + streamMetadata.numFramesFromContent = streamInfos_[streamIndex].allFrames.size(); if (streamMetadata.minPtsFromScan.has_value()) { - streamMetadata.minPtsSecondsFromScan = + streamMetadata.beginStreamSecondsFromContent = *streamMetadata.minPtsFromScan * av_q2d(avStream->time_base); } if (streamMetadata.maxPtsFromScan.has_value()) { - streamMetadata.maxPtsSecondsFromScan = + streamMetadata.endStreamFromContentSeconds = *streamMetadata.maxPtsFromScan * av_q2d(avStream->time_base); } } @@ -1461,7 +1461,7 @@ int64_t SingleStreamDecoder::getNumFrames( const StreamMetadata& streamMetadata) { switch (seekMode_) { case SeekMode::exact: - return streamMetadata.numFramesFromScan.value(); + return streamMetadata.numFramesFromContent.value(); case SeekMode::approximate: { TORCH_CHECK( streamMetadata.numFramesFromHeader.has_value(), @@ -1477,7 +1477,7 @@ double SingleStreamDecoder::getMinSeconds( const StreamMetadata& streamMetadata) { switch (seekMode_) { case SeekMode::exact: - return streamMetadata.minPtsSecondsFromScan.value(); + return streamMetadata.beginStreamSecondsFromContent.value(); case SeekMode::approximate: return 0; default: @@ -1489,7 +1489,7 @@ double SingleStreamDecoder::getMaxSeconds( const StreamMetadata& streamMetadata) { switch (seekMode_) { case SeekMode::exact: - return streamMetadata.maxPtsSecondsFromScan.value(); + return streamMetadata.endStreamFromContentSeconds.value(); case SeekMode::approximate: { TORCH_CHECK( streamMetadata.durationSecondsFromHeader.has_value(), diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index cbacb8477..4ec618824 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -121,7 +121,7 @@ class SingleStreamDecoder { // // Valid values for startSeconds and stopSeconds are: // - // [minPtsSecondsFromScan, maxPtsSecondsFromScan) + // [beginStreamSecondsFromContent, maxPtsSecondsFromScan) FrameBatchOutput getFramesPlayedInRange( double startSeconds, double stopSeconds); diff --git a/src/torchcodec/_core/_metadata.py b/src/torchcodec/_core/_metadata.py index 591fe259f..25d236dde 100644 --- a/src/torchcodec/_core/_metadata.py +++ b/src/torchcodec/_core/_metadata.py @@ -227,7 +227,9 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata: common_meta = dict( duration_seconds_from_header=stream_dict.get("durationSecondsFromHeader"), bit_rate=stream_dict.get("bitRate"), - begin_stream_seconds_from_header=stream_dict.get("beginStreamSecondsFromHeader"), + begin_stream_seconds_from_header=stream_dict.get( + "beginStreamSecondsFromHeader" + ), codec=stream_dict.get("codec"), stream_index=stream_index, ) @@ -235,15 +237,15 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata: streams_metadata.append( VideoStreamMetadata( begin_stream_seconds_from_content=stream_dict.get( - "minPtsSecondsFromScan" + "beginStreamSecondsFromContent" ), end_stream_seconds_from_content=stream_dict.get( - "maxPtsSecondsFromScan" + "endStreamFromContentSeconds" ), width=stream_dict.get("width"), height=stream_dict.get("height"), num_frames_from_header=stream_dict.get("numFramesFromHeader"), - num_frames_from_content=stream_dict.get("numFramesFromScan"), + num_frames_from_content=stream_dict.get("numFramesFromContent"), average_fps_from_header=stream_dict.get("averageFpsFromHeader"), **common_meta, ) diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 56d599056..ef79d1ae4 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -478,20 +478,20 @@ std::string get_json_metadata(at::Tensor& decoder) { if (maybeBestVideoStreamIndex.has_value()) { auto streamMetadata = videoMetadata.allStreamMetadata[*maybeBestVideoStreamIndex]; - if (streamMetadata.numFramesFromScan.has_value()) { + if (streamMetadata.numFramesFromContent.has_value()) { metadataMap["numFramesFromHeader"] = - std::to_string(*streamMetadata.numFramesFromScan); + std::to_string(*streamMetadata.numFramesFromContent); } else if (streamMetadata.numFramesFromHeader.has_value()) { metadataMap["numFramesFromHeader"] = std::to_string(*streamMetadata.numFramesFromHeader); } - if (streamMetadata.minPtsSecondsFromScan.has_value()) { - metadataMap["minPtsSecondsFromScan"] = - std::to_string(*streamMetadata.minPtsSecondsFromScan); + if (streamMetadata.beginStreamSecondsFromContent.has_value()) { + metadataMap["beginStreamSecondsFromContent"] = + std::to_string(*streamMetadata.beginStreamSecondsFromContent); } - if (streamMetadata.maxPtsSecondsFromScan.has_value()) { - metadataMap["maxPtsSecondsFromScan"] = - std::to_string(*streamMetadata.maxPtsSecondsFromScan); + if (streamMetadata.endStreamFromContentSeconds.has_value()) { + metadataMap["endStreamFromContentSeconds"] = + std::to_string(*streamMetadata.endStreamFromContentSeconds); } if (streamMetadata.codecName.has_value()) { metadataMap["codec"] = quoteValue(streamMetadata.codecName.value()); @@ -574,9 +574,9 @@ std::string get_stream_json_metadata( if (streamMetadata.bitRate.has_value()) { map["bitRate"] = std::to_string(*streamMetadata.bitRate); } - if (streamMetadata.numFramesFromScan.has_value()) { - map["numFramesFromScan"] = - std::to_string(*streamMetadata.numFramesFromScan); + if (streamMetadata.numFramesFromContent.has_value()) { + map["numFramesFromContent"] = + std::to_string(*streamMetadata.numFramesFromContent); } if (streamMetadata.numFramesFromHeader.has_value()) { map["numFramesFromHeader"] = @@ -586,13 +586,13 @@ std::string get_stream_json_metadata( map["beginStreamSecondsFromHeader"] = std::to_string(*streamMetadata.beginStreamSecondsFromHeader); } - if (streamMetadata.minPtsSecondsFromScan.has_value()) { - map["minPtsSecondsFromScan"] = - std::to_string(*streamMetadata.minPtsSecondsFromScan); + if (streamMetadata.beginStreamSecondsFromContent.has_value()) { + map["beginStreamSecondsFromContent"] = + std::to_string(*streamMetadata.beginStreamSecondsFromContent); } - if (streamMetadata.maxPtsSecondsFromScan.has_value()) { - map["maxPtsSecondsFromScan"] = - std::to_string(*streamMetadata.maxPtsSecondsFromScan); + if (streamMetadata.endStreamFromContentSeconds.has_value()) { + map["endStreamFromContentSeconds"] = + std::to_string(*streamMetadata.endStreamFromContentSeconds); } if (streamMetadata.codecName.has_value()) { map["codec"] = quoteValue(streamMetadata.codecName.value()); diff --git a/src/torchcodec/_samplers/video_clip_sampler.py b/src/torchcodec/_samplers/video_clip_sampler.py index 9fc49cae5..bb2d69328 100644 --- a/src/torchcodec/_samplers/video_clip_sampler.py +++ b/src/torchcodec/_samplers/video_clip_sampler.py @@ -271,18 +271,18 @@ def _get_start_seconds( + 1 ) / metadata_json["averageFpsFromHeader"] - minPtsSecondsFromScan = ( - metadata_json["minPtsSecondsFromScan"] - if metadata_json["minPtsSecondsFromScan"] + beginStreamSecondsFromContent = ( + metadata_json["beginStreamSecondsFromContent"] + if metadata_json["beginStreamSecondsFromContent"] else 0 ) - maxPtsSecondsFromScan = ( - metadata_json["maxPtsSecondsFromScan"] - if metadata_json["maxPtsSecondsFromScan"] > 0 + endStreamFromContentSeconds = ( + metadata_json["endStreamFromContentSeconds"] + if metadata_json["endStreamFromContentSeconds"] > 0 else video_duration_in_seconds ) last_possible_clip_start_in_seconds = ( - maxPtsSecondsFromScan - clip_duration_in_seconds + endStreamFromContentSeconds - clip_duration_in_seconds ) if last_possible_clip_start_in_seconds < 0: raise VideoTooShortException( @@ -292,7 +292,7 @@ def _get_start_seconds( clip_starts_in_seconds: List[float] = [] sample_start_second = max( time_based_sampler_args.sample_start_second, - minPtsSecondsFromScan, + beginStreamSecondsFromContent, ) sample_end_second = min( last_possible_clip_start_in_seconds, diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp index a30609c2a..9b98f364a 100644 --- a/test/VideoDecoderTest.cpp +++ b/test/VideoDecoderTest.cpp @@ -84,15 +84,15 @@ TEST_P(SingleStreamDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) { EXPECT_NEAR(*videoStream.bitRate, 128783, 1e-1); EXPECT_NEAR(*videoStream.durationSeconds, 13.013, 1e-1); EXPECT_EQ(videoStream.numFrames, 390); - EXPECT_FALSE(videoStream.minPtsSecondsFromScan.has_value()); - EXPECT_FALSE(videoStream.maxPtsSecondsFromScan.has_value()); - EXPECT_FALSE(videoStream.numFramesFromScan.has_value()); + EXPECT_FALSE(videoStream.beginStreamSecondsFromContent.has_value()); + EXPECT_FALSE(videoStream.endStreamFromContentSeconds.has_value()); + EXPECT_FALSE(videoStream.numFramesFromContent.has_value()); decoder->scanFileAndUpdateMetadataAndIndex(); metadata = decoder->getContainerMetadata(); const auto& videoStream1 = metadata.allStreamMetadata[3]; - EXPECT_EQ(*videoStream1.minPtsSecondsFromScan, 0); - EXPECT_EQ(*videoStream1.maxPtsSecondsFromScan, 13.013); - EXPECT_EQ(*videoStream1.numFramesFromScan, 390); + EXPECT_EQ(*videoStream1.beginStreamSecondsFromContent, 0); + EXPECT_EQ(*videoStream1.endStreamFromContentSeconds, 13.013); + EXPECT_EQ(*videoStream1.numFramesFromContent, 390); } TEST(SingleStreamDecoderTest, MissingVideoFileThrowsException) { diff --git a/test/test_ops.py b/test/test_ops.py index af9b9f2f3..b72d9fe74 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -414,8 +414,8 @@ def test_video_get_json_metadata_with_stream(self): metadata_dict = json.loads(metadata) assert metadata_dict["width"] == 480 assert metadata_dict["height"] == 270 - assert metadata_dict["minPtsSecondsFromScan"] == 0 - assert metadata_dict["maxPtsSecondsFromScan"] == 13.013 + assert metadata_dict["beginStreamSecondsFromContent"] == 0 + assert metadata_dict["endStreamFromContentSeconds"] == 13.013 def test_get_ffmpeg_version(self): ffmpeg_dict = get_ffmpeg_library_versions() From 80ae01d6f84030d858fce998b0a301ba13b27e68 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 30 May 2025 10:38:58 -0700 Subject: [PATCH 06/13] Fix name endStreamSecondsFromContent --- src/torchcodec/_core/Metadata.h | 2 +- src/torchcodec/_core/SingleStreamDecoder.cpp | 4 ++-- src/torchcodec/_core/_metadata.py | 2 +- src/torchcodec/_core/custom_ops.cpp | 12 ++++++------ src/torchcodec/_samplers/video_clip_sampler.py | 8 ++++---- test/VideoDecoderTest.cpp | 4 ++-- test/test_ops.py | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h index 6b71dbd25..3dc3b3f58 100644 --- a/src/torchcodec/_core/Metadata.h +++ b/src/torchcodec/_core/Metadata.h @@ -38,7 +38,7 @@ struct StreamMetadata { std::optional maxPtsFromScan; // These presentation timestamps are in seconds. std::optional beginStreamSecondsFromContent; - std::optional endStreamFromContentSeconds; + std::optional endStreamSecondsFromContent; // This can be useful for index-based seeking. std::optional numFramesFromContent; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 3b1d77836..ab6ee1729 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -270,7 +270,7 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() { *streamMetadata.minPtsFromScan * av_q2d(avStream->time_base); } if (streamMetadata.maxPtsFromScan.has_value()) { - streamMetadata.endStreamFromContentSeconds = + streamMetadata.endStreamSecondsFromContent = *streamMetadata.maxPtsFromScan * av_q2d(avStream->time_base); } } @@ -1489,7 +1489,7 @@ double SingleStreamDecoder::getMaxSeconds( const StreamMetadata& streamMetadata) { switch (seekMode_) { case SeekMode::exact: - return streamMetadata.endStreamFromContentSeconds.value(); + return streamMetadata.endStreamSecondsFromContent.value(); case SeekMode::approximate: { TORCH_CHECK( streamMetadata.durationSecondsFromHeader.has_value(), diff --git a/src/torchcodec/_core/_metadata.py b/src/torchcodec/_core/_metadata.py index 25d236dde..c15e86e74 100644 --- a/src/torchcodec/_core/_metadata.py +++ b/src/torchcodec/_core/_metadata.py @@ -240,7 +240,7 @@ def get_container_metadata(decoder: torch.Tensor) -> ContainerMetadata: "beginStreamSecondsFromContent" ), end_stream_seconds_from_content=stream_dict.get( - "endStreamFromContentSeconds" + "endStreamSecondsFromContent" ), width=stream_dict.get("width"), height=stream_dict.get("height"), diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index ef79d1ae4..81d3f602a 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -489,9 +489,9 @@ std::string get_json_metadata(at::Tensor& decoder) { metadataMap["beginStreamSecondsFromContent"] = std::to_string(*streamMetadata.beginStreamSecondsFromContent); } - if (streamMetadata.endStreamFromContentSeconds.has_value()) { - metadataMap["endStreamFromContentSeconds"] = - std::to_string(*streamMetadata.endStreamFromContentSeconds); + if (streamMetadata.endStreamSecondsFromContent.has_value()) { + metadataMap["endStreamSecondsFromContent"] = + std::to_string(*streamMetadata.endStreamSecondsFromContent); } if (streamMetadata.codecName.has_value()) { metadataMap["codec"] = quoteValue(streamMetadata.codecName.value()); @@ -590,9 +590,9 @@ std::string get_stream_json_metadata( map["beginStreamSecondsFromContent"] = std::to_string(*streamMetadata.beginStreamSecondsFromContent); } - if (streamMetadata.endStreamFromContentSeconds.has_value()) { - map["endStreamFromContentSeconds"] = - std::to_string(*streamMetadata.endStreamFromContentSeconds); + if (streamMetadata.endStreamSecondsFromContent.has_value()) { + map["endStreamSecondsFromContent"] = + std::to_string(*streamMetadata.endStreamSecondsFromContent); } if (streamMetadata.codecName.has_value()) { map["codec"] = quoteValue(streamMetadata.codecName.value()); diff --git a/src/torchcodec/_samplers/video_clip_sampler.py b/src/torchcodec/_samplers/video_clip_sampler.py index bb2d69328..664a1b6c2 100644 --- a/src/torchcodec/_samplers/video_clip_sampler.py +++ b/src/torchcodec/_samplers/video_clip_sampler.py @@ -276,13 +276,13 @@ def _get_start_seconds( if metadata_json["beginStreamSecondsFromContent"] else 0 ) - endStreamFromContentSeconds = ( - metadata_json["endStreamFromContentSeconds"] - if metadata_json["endStreamFromContentSeconds"] > 0 + endStreamSecondsFromContent = ( + metadata_json["endStreamSecondsFromContent"] + if metadata_json["endStreamSecondsFromContent"] > 0 else video_duration_in_seconds ) last_possible_clip_start_in_seconds = ( - endStreamFromContentSeconds - clip_duration_in_seconds + endStreamSecondsFromContent - clip_duration_in_seconds ) if last_possible_clip_start_in_seconds < 0: raise VideoTooShortException( diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp index 9b98f364a..fa20c358a 100644 --- a/test/VideoDecoderTest.cpp +++ b/test/VideoDecoderTest.cpp @@ -85,13 +85,13 @@ TEST_P(SingleStreamDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) { EXPECT_NEAR(*videoStream.durationSeconds, 13.013, 1e-1); EXPECT_EQ(videoStream.numFrames, 390); EXPECT_FALSE(videoStream.beginStreamSecondsFromContent.has_value()); - EXPECT_FALSE(videoStream.endStreamFromContentSeconds.has_value()); + EXPECT_FALSE(videoStream.endStreamSecondsFromContent.has_value()); EXPECT_FALSE(videoStream.numFramesFromContent.has_value()); decoder->scanFileAndUpdateMetadataAndIndex(); metadata = decoder->getContainerMetadata(); const auto& videoStream1 = metadata.allStreamMetadata[3]; EXPECT_EQ(*videoStream1.beginStreamSecondsFromContent, 0); - EXPECT_EQ(*videoStream1.endStreamFromContentSeconds, 13.013); + EXPECT_EQ(*videoStream1.endStreamSecondsFromContent, 13.013); EXPECT_EQ(*videoStream1.numFramesFromContent, 390); } diff --git a/test/test_ops.py b/test/test_ops.py index b72d9fe74..750124835 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -415,7 +415,7 @@ def test_video_get_json_metadata_with_stream(self): assert metadata_dict["width"] == 480 assert metadata_dict["height"] == 270 assert metadata_dict["beginStreamSecondsFromContent"] == 0 - assert metadata_dict["endStreamFromContentSeconds"] == 13.013 + assert metadata_dict["endStreamSecondsFromContent"] == 13.013 def test_get_ffmpeg_version(self): ffmpeg_dict = get_ffmpeg_library_versions() From 18ad24ff5b3c196abdc120059b321e4e337bf952 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 30 May 2025 10:46:31 -0700 Subject: [PATCH 07/13] Update comment to reflect change --- src/torchcodec/_core/SingleStreamDecoder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 4ec618824..16afa3ecc 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -121,7 +121,7 @@ class SingleStreamDecoder { // // Valid values for startSeconds and stopSeconds are: // - // [beginStreamSecondsFromContent, maxPtsSecondsFromScan) + // [beginStreamSecondsFromContent, endStreamSecondsFromContent) FrameBatchOutput getFramesPlayedInRange( double startSeconds, double stopSeconds); From 0e85a5f037c8f73ee91725793492a0f6582940d4 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 30 May 2025 10:48:50 -0700 Subject: [PATCH 08/13] remove test artifact --- output.mp3 | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 output.mp3 diff --git a/output.mp3 b/output.mp3 deleted file mode 100644 index e69de29bb..000000000 From 4327916ef6535d9ec2b0f8088bfb4aa115b343e7 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Fri, 30 May 2025 12:08:54 -0700 Subject: [PATCH 09/13] Update fields in c++ tests --- test/VideoDecoderTest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp index fa20c358a..a1e51d88b 100644 --- a/test/VideoDecoderTest.cpp +++ b/test/VideoDecoderTest.cpp @@ -80,10 +80,10 @@ TEST_P(SingleStreamDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) { const auto& videoStream = metadata.allStreamMetadata[3]; EXPECT_EQ(videoStream.mediaType, AVMEDIA_TYPE_VIDEO); EXPECT_EQ(videoStream.codecName, "h264"); - EXPECT_NEAR(*videoStream.averageFps, 29.97f, 1e-1); + EXPECT_NEAR(*videoStream.averageFpsFromHeader, 29.97f, 1e-1); EXPECT_NEAR(*videoStream.bitRate, 128783, 1e-1); - EXPECT_NEAR(*videoStream.durationSeconds, 13.013, 1e-1); - EXPECT_EQ(videoStream.numFrames, 390); + EXPECT_NEAR(*videoStream.durationSecondsFromHeader, 13.013, 1e-1); + EXPECT_EQ(videoStream.numFramesFromHeader, 390); EXPECT_FALSE(videoStream.beginStreamSecondsFromContent.has_value()); EXPECT_FALSE(videoStream.endStreamSecondsFromContent.has_value()); EXPECT_FALSE(videoStream.numFramesFromContent.has_value()); @@ -434,7 +434,7 @@ TEST_P(SingleStreamDecoderTest, GetAudioMetadata) { const auto& audioStream = metadata.allStreamMetadata[0]; EXPECT_EQ(audioStream.mediaType, AVMEDIA_TYPE_AUDIO); - EXPECT_NEAR(*audioStream.durationSeconds, 13.25, 1e-1); + EXPECT_NEAR(*audioStream.durationSecondsFromHeader, 13.25, 1e-1); } INSTANTIATE_TEST_SUITE_P( From 553e74a1582b39f86a0bee5e64b0599b0525f77c Mon Sep 17 00:00:00 2001 From: Dan-Flores Date: Mon, 2 Jun 2025 09:34:38 -0400 Subject: [PATCH 10/13] Add pytest tmp_path (#706) --- test/test_encoders.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_encoders.py b/test/test_encoders.py index 5e98ff4f1..bf5f9cc6b 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -56,9 +56,11 @@ def test_bad_input(self): encoder.to_tensor(format=bad_format) @pytest.mark.parametrize("method", ("to_file", "to_tensor")) - def test_bad_input_parametrized(self, method): + def test_bad_input_parametrized(self, method, tmp_path): valid_params = ( - dict(dest="output.mp3") if method == "to_file" else dict(format="mp3") + dict(dest=str(tmp_path / "output.mp3")) + if method == "to_file" + else dict(format="mp3") ) decoder = AudioEncoder(self.decode(NASA_AUDIO_MP3), sample_rate=10) From 223b1d0701df2580e1806f30776bcac192a157ab Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Mon, 2 Jun 2025 07:32:44 -0700 Subject: [PATCH 11/13] Update minPtsFromScan and maxPtsFromScan --- src/torchcodec/_core/Metadata.h | 4 ++-- src/torchcodec/_core/SingleStreamDecoder.cpp | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h index 3dc3b3f58..c622a5770 100644 --- a/src/torchcodec/_core/Metadata.h +++ b/src/torchcodec/_core/Metadata.h @@ -34,8 +34,8 @@ struct StreamMetadata { // More accurate duration, obtained by scanning the file. // These presentation timestamps are in time base. - std::optional minPtsFromScan; - std::optional maxPtsFromScan; + std::optional beginStreamPtsFromContent; + std::optional endStreamPtsFromContent; // These presentation timestamps are in seconds. std::optional beginStreamSecondsFromContent; std::optional endStreamSecondsFromContent; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index ab6ee1729..4acd2c4a3 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -236,10 +236,11 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() { // record its relevant metadata. int streamIndex = packet->stream_index; auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex]; - streamMetadata.minPtsFromScan = std::min( - streamMetadata.minPtsFromScan.value_or(INT64_MAX), getPtsOrDts(packet)); - streamMetadata.maxPtsFromScan = std::max( - streamMetadata.maxPtsFromScan.value_or(INT64_MIN), + streamMetadata.beginStreamPtsFromContent = std::min( + streamMetadata.beginStreamPtsFromContent.value_or(INT64_MAX), + getPtsOrDts(packet)); + streamMetadata.endStreamPtsFromContent = std::max( + streamMetadata.endStreamPtsFromContent.value_or(INT64_MIN), getPtsOrDts(packet) + packet->duration); streamMetadata.numFramesFromContent = streamMetadata.numFramesFromContent.value_or(0) + 1; @@ -265,13 +266,14 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() { streamMetadata.numFramesFromContent = streamInfos_[streamIndex].allFrames.size(); - if (streamMetadata.minPtsFromScan.has_value()) { + if (streamMetadata.beginStreamPtsFromContent.has_value()) { streamMetadata.beginStreamSecondsFromContent = - *streamMetadata.minPtsFromScan * av_q2d(avStream->time_base); + *streamMetadata.beginStreamPtsFromContent * + av_q2d(avStream->time_base); } - if (streamMetadata.maxPtsFromScan.has_value()) { + if (streamMetadata.endStreamPtsFromContent.has_value()) { streamMetadata.endStreamSecondsFromContent = - *streamMetadata.maxPtsFromScan * av_q2d(avStream->time_base); + *streamMetadata.endStreamPtsFromContent * av_q2d(avStream->time_base); } } From 81364665605c6aac34a532e92839c9e3f62b8431 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Mon, 2 Jun 2025 07:59:26 -0700 Subject: [PATCH 12/13] Rename begin/endStreamSecondsFromContent to begin/endStreamPtsSecondsFromContent to retain PTS in variable name --- src/torchcodec/_core/Metadata.h | 4 ++-- src/torchcodec/_core/SingleStreamDecoder.cpp | 8 ++++---- src/torchcodec/_core/SingleStreamDecoder.h | 2 +- src/torchcodec/_core/custom_ops.cpp | 16 ++++++++-------- test/VideoDecoderTest.cpp | 8 ++++---- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/torchcodec/_core/Metadata.h b/src/torchcodec/_core/Metadata.h index c622a5770..dcbf7f896 100644 --- a/src/torchcodec/_core/Metadata.h +++ b/src/torchcodec/_core/Metadata.h @@ -37,8 +37,8 @@ struct StreamMetadata { std::optional beginStreamPtsFromContent; std::optional endStreamPtsFromContent; // These presentation timestamps are in seconds. - std::optional beginStreamSecondsFromContent; - std::optional endStreamSecondsFromContent; + std::optional beginStreamPtsSecondsFromContent; + std::optional endStreamPtsSecondsFromContent; // This can be useful for index-based seeking. std::optional numFramesFromContent; diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 4acd2c4a3..d8a6c8bad 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -267,12 +267,12 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() { streamInfos_[streamIndex].allFrames.size(); if (streamMetadata.beginStreamPtsFromContent.has_value()) { - streamMetadata.beginStreamSecondsFromContent = + streamMetadata.beginStreamPtsSecondsFromContent = *streamMetadata.beginStreamPtsFromContent * av_q2d(avStream->time_base); } if (streamMetadata.endStreamPtsFromContent.has_value()) { - streamMetadata.endStreamSecondsFromContent = + streamMetadata.endStreamPtsSecondsFromContent = *streamMetadata.endStreamPtsFromContent * av_q2d(avStream->time_base); } } @@ -1479,7 +1479,7 @@ double SingleStreamDecoder::getMinSeconds( const StreamMetadata& streamMetadata) { switch (seekMode_) { case SeekMode::exact: - return streamMetadata.beginStreamSecondsFromContent.value(); + return streamMetadata.beginStreamPtsSecondsFromContent.value(); case SeekMode::approximate: return 0; default: @@ -1491,7 +1491,7 @@ double SingleStreamDecoder::getMaxSeconds( const StreamMetadata& streamMetadata) { switch (seekMode_) { case SeekMode::exact: - return streamMetadata.endStreamSecondsFromContent.value(); + return streamMetadata.endStreamPtsSecondsFromContent.value(); case SeekMode::approximate: { TORCH_CHECK( streamMetadata.durationSecondsFromHeader.has_value(), diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h index 16afa3ecc..30a717b55 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.h +++ b/src/torchcodec/_core/SingleStreamDecoder.h @@ -121,7 +121,7 @@ class SingleStreamDecoder { // // Valid values for startSeconds and stopSeconds are: // - // [beginStreamSecondsFromContent, endStreamSecondsFromContent) + // [beginStreamPtsSecondsFromContent, endStreamPtsSecondsFromContent) FrameBatchOutput getFramesPlayedInRange( double startSeconds, double stopSeconds); diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp index 81d3f602a..192abba45 100644 --- a/src/torchcodec/_core/custom_ops.cpp +++ b/src/torchcodec/_core/custom_ops.cpp @@ -485,13 +485,13 @@ std::string get_json_metadata(at::Tensor& decoder) { metadataMap["numFramesFromHeader"] = std::to_string(*streamMetadata.numFramesFromHeader); } - if (streamMetadata.beginStreamSecondsFromContent.has_value()) { + if (streamMetadata.beginStreamPtsSecondsFromContent.has_value()) { metadataMap["beginStreamSecondsFromContent"] = - std::to_string(*streamMetadata.beginStreamSecondsFromContent); + std::to_string(*streamMetadata.beginStreamPtsSecondsFromContent); } - if (streamMetadata.endStreamSecondsFromContent.has_value()) { + if (streamMetadata.endStreamPtsSecondsFromContent.has_value()) { metadataMap["endStreamSecondsFromContent"] = - std::to_string(*streamMetadata.endStreamSecondsFromContent); + std::to_string(*streamMetadata.endStreamPtsSecondsFromContent); } if (streamMetadata.codecName.has_value()) { metadataMap["codec"] = quoteValue(streamMetadata.codecName.value()); @@ -586,13 +586,13 @@ std::string get_stream_json_metadata( map["beginStreamSecondsFromHeader"] = std::to_string(*streamMetadata.beginStreamSecondsFromHeader); } - if (streamMetadata.beginStreamSecondsFromContent.has_value()) { + if (streamMetadata.beginStreamPtsSecondsFromContent.has_value()) { map["beginStreamSecondsFromContent"] = - std::to_string(*streamMetadata.beginStreamSecondsFromContent); + std::to_string(*streamMetadata.beginStreamPtsSecondsFromContent); } - if (streamMetadata.endStreamSecondsFromContent.has_value()) { + if (streamMetadata.endStreamPtsSecondsFromContent.has_value()) { map["endStreamSecondsFromContent"] = - std::to_string(*streamMetadata.endStreamSecondsFromContent); + std::to_string(*streamMetadata.endStreamPtsSecondsFromContent); } if (streamMetadata.codecName.has_value()) { map["codec"] = quoteValue(streamMetadata.codecName.value()); diff --git a/test/VideoDecoderTest.cpp b/test/VideoDecoderTest.cpp index a1e51d88b..fd9f2535d 100644 --- a/test/VideoDecoderTest.cpp +++ b/test/VideoDecoderTest.cpp @@ -84,14 +84,14 @@ TEST_P(SingleStreamDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) { EXPECT_NEAR(*videoStream.bitRate, 128783, 1e-1); EXPECT_NEAR(*videoStream.durationSecondsFromHeader, 13.013, 1e-1); EXPECT_EQ(videoStream.numFramesFromHeader, 390); - EXPECT_FALSE(videoStream.beginStreamSecondsFromContent.has_value()); - EXPECT_FALSE(videoStream.endStreamSecondsFromContent.has_value()); + EXPECT_FALSE(videoStream.beginStreamPtsSecondsFromContent.has_value()); + EXPECT_FALSE(videoStream.endStreamPtsSecondsFromContent.has_value()); EXPECT_FALSE(videoStream.numFramesFromContent.has_value()); decoder->scanFileAndUpdateMetadataAndIndex(); metadata = decoder->getContainerMetadata(); const auto& videoStream1 = metadata.allStreamMetadata[3]; - EXPECT_EQ(*videoStream1.beginStreamSecondsFromContent, 0); - EXPECT_EQ(*videoStream1.endStreamSecondsFromContent, 13.013); + EXPECT_EQ(*videoStream1.beginStreamPtsSecondsFromContent, 0); + EXPECT_EQ(*videoStream1.endStreamPtsSecondsFromContent, 13.013); EXPECT_EQ(*videoStream1.numFramesFromContent, 390); } From 3966b94170b139bd71f192cd879fb91e47e3d2e2 Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Wed, 4 Jun 2025 10:34:20 -0700 Subject: [PATCH 13/13] correct merge changes --- src/torchcodec/_core/SingleStreamDecoder.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp index 0b640e202..a66281cdb 100644 --- a/src/torchcodec/_core/SingleStreamDecoder.cpp +++ b/src/torchcodec/_core/SingleStreamDecoder.cpp @@ -1490,10 +1490,7 @@ std::optional SingleStreamDecoder::getNumFrames( case SeekMode::exact: return streamMetadata.numFramesFromContent.value(); case SeekMode::approximate: { - TORCH_CHECK( - streamMetadata.numFramesFromHeader.has_value(), - "Cannot use approximate mode since we couldn't find the number of frames from the metadata."); - return streamMetadata.numFramesFromHeader.value(); + return streamMetadata.numFramesFromHeader; } default: throw std::runtime_error("Unknown SeekMode"); @@ -1518,10 +1515,7 @@ std::optional SingleStreamDecoder::getMaxSeconds( case SeekMode::exact: return streamMetadata.endStreamPtsSecondsFromContent.value(); case SeekMode::approximate: { - TORCH_CHECK( - streamMetadata.durationSecondsFromHeader.has_value(), - "Cannot use approximate mode since we couldn't find the duration from the metadata."); - return streamMetadata.durationSecondsFromHeader.value(); + return streamMetadata.durationSecondsFromHeader; } default: throw std::runtime_error("Unknown SeekMode");