
Commit 99b0d4f

Merge branch 'main' of github.com:pytorch/torchcodec into approx

2 parents: 737e1b6 + b7f8e0c
File tree: 7 files changed (+106 −135 lines)


.github/workflows/build_ffmpeg.yaml

Lines changed: 3 additions & 0 deletions

@@ -29,6 +29,9 @@ jobs:
       matrix:
         ffmpeg-version: ["4.4.4", "5.1.4", "6.1.1", "7.0.1"]
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
     with:
       job-name: Build
       upload-artifact: ffmpeg-lgpl

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 85 additions & 92 deletions

@@ -600,6 +600,8 @@ void VideoDecoder::scanFileAndUpdateMetadataAndIndex() {
     streamMetadata.maxPtsFromScan = std::max(
         streamMetadata.maxPtsFromScan.value_or(INT64_MIN),
         packet->pts + packet->duration);
+    streamMetadata.numFramesFromScan =
+        streamMetadata.numFramesFromScan.value_or(0) + 1;
 
     // Note that we set the other value in this struct, nextPts, only after
     // we have scanned all packets and sorted by pts.
@@ -612,19 +614,20 @@ void VideoDecoder::scanFileAndUpdateMetadataAndIndex() {
 
   // Set all per-stream metadata that requires knowing the content of all
   // packets.
-  for (size_t i = 0; i < containerMetadata_.streams.size(); ++i) {
-    auto& streamMetadata = containerMetadata_.streams[i];
-    auto stream = formatContext_->streams[i];
+  for (size_t streamIndex = 0; streamIndex < containerMetadata_.streams.size();
+       ++streamIndex) {
+    auto& streamMetadata = containerMetadata_.streams[streamIndex];
+    auto avStream = formatContext_->streams[streamIndex];
 
-    streamMetadata.numFramesFromScan = streams_[i].allFrames.size();
+    streamMetadata.numFramesFromScan = streams_[streamIndex].allFrames.size();
 
     if (streamMetadata.minPtsFromScan.has_value()) {
      streamMetadata.minPtsSecondsFromScan =
-          *streamMetadata.minPtsFromScan * av_q2d(stream->time_base);
+          *streamMetadata.minPtsFromScan * av_q2d(avStream->time_base);
     }
     if (streamMetadata.maxPtsFromScan.has_value()) {
       streamMetadata.maxPtsSecondsFromScan =
-          *streamMetadata.maxPtsFromScan * av_q2d(stream->time_base);
+          *streamMetadata.maxPtsFromScan * av_q2d(avStream->time_base);
     }
   }
 
@@ -638,23 +641,23 @@ void VideoDecoder::scanFileAndUpdateMetadataAndIndex() {
   }
 
   // Sort all frames by their pts.
-  for (auto& [streamIndex, stream] : streams_) {
+  for (auto& [streamIndex, streamInfo] : streams_) {
     std::sort(
-        stream.keyFrames.begin(),
-        stream.keyFrames.end(),
+        streamInfo.keyFrames.begin(),
+        streamInfo.keyFrames.end(),
         [](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
           return frameInfo1.pts < frameInfo2.pts;
         });
     std::sort(
-        stream.allFrames.begin(),
-        stream.allFrames.end(),
+        streamInfo.allFrames.begin(),
+        streamInfo.allFrames.end(),
         [](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
           return frameInfo1.pts < frameInfo2.pts;
         });
 
-    for (size_t i = 0; i < stream.allFrames.size(); ++i) {
-      if (i + 1 < stream.allFrames.size()) {
-        stream.allFrames[i].nextPts = stream.allFrames[i + 1].pts;
+    for (size_t i = 0; i < streamInfo.allFrames.size(); ++i) {
+      if (i + 1 < streamInfo.allFrames.size()) {
+        streamInfo.allFrames[i].nextPts = streamInfo.allFrames[i + 1].pts;
       }
     }
   }
@@ -911,11 +914,9 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
   AVFrame* frame = rawOutput.frame.get();
   output.streamIndex = streamIndex;
   auto& streamInfo = streams_[streamIndex];
-  output.streamType = streams_[streamIndex].stream->codecpar->codec_type;
-  output.pts = frame->pts;
+  TORCH_CHECK(streamInfo.stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO);
   output.ptsSeconds =
       ptsToSeconds(frame->pts, formatContext_->streams[streamIndex]->time_base);
-  output.duration = getDuration(frame);
   output.durationSeconds = ptsToSeconds(
       getDuration(frame), formatContext_->streams[streamIndex]->time_base);
   // TODO: we should fold preAllocatedOutputTensor into RawDecodedOutput.
@@ -972,86 +973,78 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
   }
 
   torch::Tensor outputTensor;
-  if (output.streamType == AVMEDIA_TYPE_VIDEO) {
-    // We need to compare the current frame context with our previous frame
-    // context. If they are different, then we need to re-create our colorspace
-    // conversion objects. We create our colorspace conversion objects late so
-    // that we don't have to depend on the unreliable metadata in the header.
-    // And we sometimes re-create them because it's possible for frame
-    // resolution to change mid-stream. Finally, we want to reuse the colorspace
-    // conversion objects as much as possible for performance reasons.
-    enum AVPixelFormat frameFormat =
-        static_cast<enum AVPixelFormat>(frame->format);
-    auto frameContext = DecodedFrameContext{
-        frame->width,
-        frame->height,
-        frameFormat,
-        expectedOutputWidth,
-        expectedOutputHeight};
+  // We need to compare the current frame context with our previous frame
+  // context. If they are different, then we need to re-create our colorspace
+  // conversion objects. We create our colorspace conversion objects late so
+  // that we don't have to depend on the unreliable metadata in the header.
+  // And we sometimes re-create them because it's possible for frame
+  // resolution to change mid-stream. Finally, we want to reuse the colorspace
+  // conversion objects as much as possible for performance reasons.
+  enum AVPixelFormat frameFormat =
+      static_cast<enum AVPixelFormat>(frame->format);
+  auto frameContext = DecodedFrameContext{
+      frame->width,
+      frame->height,
+      frameFormat,
+      expectedOutputWidth,
+      expectedOutputHeight};
 
-  if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
-    outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
-        expectedOutputHeight, expectedOutputWidth, torch::kCPU));
+  if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
+    outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
+        expectedOutputHeight, expectedOutputWidth, torch::kCPU));
 
-      if (!streamInfo.swsContext ||
-          streamInfo.prevFrameContext != frameContext) {
-        createSwsContext(streamInfo, frameContext, frame->colorspace);
-        streamInfo.prevFrameContext = frameContext;
-      }
-      int resultHeight =
-          convertFrameToTensorUsingSwsScale(streamIndex, frame, outputTensor);
-      // If this check failed, it would mean that the frame wasn't reshaped to
-      // the expected height.
-      // TODO: Can we do the same check for width?
-      TORCH_CHECK(
-          resultHeight == expectedOutputHeight,
-          "resultHeight != expectedOutputHeight: ",
-          resultHeight,
-          " != ",
-          expectedOutputHeight);
+    if (!streamInfo.swsContext || streamInfo.prevFrameContext != frameContext) {
+      createSwsContext(streamInfo, frameContext, frame->colorspace);
+      streamInfo.prevFrameContext = frameContext;
+    }
+    int resultHeight =
+        convertFrameToTensorUsingSwsScale(streamIndex, frame, outputTensor);
+    // If this check failed, it would mean that the frame wasn't reshaped to
+    // the expected height.
+    // TODO: Can we do the same check for width?
+    TORCH_CHECK(
+        resultHeight == expectedOutputHeight,
+        "resultHeight != expectedOutputHeight: ",
+        resultHeight,
+        " != ",
+        expectedOutputHeight);
+
+    output.frame = outputTensor;
+  } else if (
+      streamInfo.colorConversionLibrary ==
+      ColorConversionLibrary::FILTERGRAPH) {
+    if (!streamInfo.filterState.filterGraph ||
+        streamInfo.prevFrameContext != frameContext) {
+      createFilterGraph(streamInfo, expectedOutputHeight, expectedOutputWidth);
+      streamInfo.prevFrameContext = frameContext;
+    }
+    outputTensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame);
 
-      output.frame = outputTensor;
-    } else if (
-        streamInfo.colorConversionLibrary ==
-        ColorConversionLibrary::FILTERGRAPH) {
-      if (!streamInfo.filterState.filterGraph ||
-          streamInfo.prevFrameContext != frameContext) {
-        createFilterGraph(
-            streamInfo, expectedOutputHeight, expectedOutputWidth);
-        streamInfo.prevFrameContext = frameContext;
-      }
-      outputTensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame);
-
-      // Similarly to above, if this check fails it means the frame wasn't
-      // reshaped to its expected dimensions by filtergraph.
-      auto shape = outputTensor.sizes();
-      TORCH_CHECK(
-          (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
-              (shape[1] == expectedOutputWidth) && (shape[2] == 3),
-          "Expected output tensor of shape ",
-          expectedOutputHeight,
-          "x",
-          expectedOutputWidth,
-          "x3, got ",
-          shape);
-
-      if (preAllocatedOutputTensor.has_value()) {
-        // We have already validated that preAllocatedOutputTensor and
-        // outputTensor have the same shape.
-        preAllocatedOutputTensor.value().copy_(outputTensor);
-        output.frame = preAllocatedOutputTensor.value();
-      } else {
-        output.frame = outputTensor;
-      }
+    // Similarly to above, if this check fails it means the frame wasn't
+    // reshaped to its expected dimensions by filtergraph.
+    auto shape = outputTensor.sizes();
+    TORCH_CHECK(
+        (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
+            (shape[1] == expectedOutputWidth) && (shape[2] == 3),
+        "Expected output tensor of shape ",
+        expectedOutputHeight,
+        "x",
+        expectedOutputWidth,
+        "x3, got ",
+        shape);
+
+    if (preAllocatedOutputTensor.has_value()) {
+      // We have already validated that preAllocatedOutputTensor and
+      // outputTensor have the same shape.
+      preAllocatedOutputTensor.value().copy_(outputTensor);
+      output.frame = preAllocatedOutputTensor.value();
    } else {
-      throw std::runtime_error(
-          "Invalid color conversion library: " +
-          std::to_string(static_cast<int>(streamInfo.colorConversionLibrary)));
+      output.frame = outputTensor;
    }
-  } else if (output.streamType == AVMEDIA_TYPE_AUDIO) {
-    // TODO: https://github.com/pytorch-labs/torchcodec/issues/85 implement
-    // audio decoding.
-    throw std::runtime_error("Audio is not supported yet.");
+  } else {
+    throw std::runtime_error(
+        "Invalid color conversion library: " +
+        std::to_string(static_cast<int>(streamInfo.colorConversionLibrary)));
   }
 }
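A note on the hunk above: the retained comment block describes a lazy cache-and-recreate pattern, where the colorspace conversion object is built on first use and rebuilt only when the per-frame context changes. Below is a minimal Python sketch of that pattern; it assumes nothing about torchcodec's actual API, and all names in it are illustrative.

from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class FrameContext:
    # Mirrors the fields of DecodedFrameContext in the diff above.
    width: int
    height: int
    pixel_format: int
    output_width: int
    output_height: int


class ColorConverter:
    """Illustrative stand-in for an swscale/filtergraph conversion object."""

    def __init__(self, context: FrameContext):
        self.context = context


class StreamState:
    def __init__(self) -> None:
        self.converter: Optional[ColorConverter] = None
        self.prev_context: Optional[FrameContext] = None

    def get_converter(self, context: FrameContext) -> ColorConverter:
        # Create the conversion object lazily, and re-create it only when
        # the frame context changes (e.g. a mid-stream resolution change).
        # Otherwise reuse it, which is the performance-sensitive path.
        if self.converter is None or self.prev_context != context:
            self.converter = ColorConverter(context)
            self.prev_context = context
        return self.converter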

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 2 additions & 28 deletions

@@ -164,48 +164,22 @@ class VideoDecoder {
   // Calling getNextFrameNoDemuxInternal() will return the first frame at
   // or after this position.
   void setCursorPtsInSeconds(double seconds);
-  // This is an internal structure that is used to store the decoded output
-  // from decoding a frame through color conversion. Example usage is:
-  //
-  //   RawDecodedOutput rawOutput = getDecodedOutputWithFilter();
-  //   // Now allocate a single tensor or a batch tensor.
-  //   torch::Tensor userOutput = torch::empty(...);
-  //   // Now fill in `data` and `size`.
-  //   rawOutput.data = userOutput.data_ptr();
-  //   // Now run the color conversion.
-  //   convertFrameToBufferUsingSwsScale(rawOutput);
-  //
-  // This structure ensures we always keep the streamIndex and frame together
-  // with the data output. Note that AVFrame itself doesn't retain the
-  // streamIndex.
+  // This structure ensures we always keep the streamIndex and AVFrame together
+  // Note that AVFrame itself doesn't retain the streamIndex.
   struct RawDecodedOutput {
     // The actual decoded output as a unique pointer to an AVFrame.
     UniqueAVFrame frame;
     // The stream index of the decoded frame.
     int streamIndex;
-    // This is an unowned pointer that we copy the frame data to after color
-    // conversion.
-    // For a single tensor this points to the start of data_ptr. For a batch
-    // tensor it may point to the middle of the allocated batch tensor.
-    void* data = nullptr;
-    // We carry around the size to ensure we don't stomp on memory while doing
-    // color conversion.
-    size_t size = 0;
   };
   struct DecodedOutput {
     // The actual decoded output as a Tensor.
     torch::Tensor frame;
-    // Could be AVMEDIA_TYPE_VIDEO or AVMEDIA_TYPE_AUDIO.
-    AVMediaType streamType;
     // The stream index of the decoded frame. Used to distinguish
     // between streams that are of the same type.
     int streamIndex;
-    // The presentation timestamp of the decoded frame in time base.
-    int64_t pts;
     // The presentation timestamp of the decoded frame in seconds.
     double ptsSeconds;
-    // The duration of the decoded frame in time base.
-    int64_t duration;
    // The duration of the decoded frame in seconds.
     double durationSeconds;
   };
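With `pts` and `duration` removed from `DecodedOutput`, timestamps are exposed only in seconds. Here is a hedged sketch of the underlying conversion (what `ptsToSeconds` performs in the VideoDecoder.cpp diff above), assuming a rational time base; the expected value matches the `1'001. / 30'000` check in the VideoDecoderTest.cpp diff below.

from fractions import Fraction


def pts_to_seconds(pts: int, time_base: Fraction) -> float:
    # A pts counts ticks of the stream's time base, so multiplying by the
    # time base converts it to seconds.
    return float(pts * time_base)


# A 29.97 fps stream commonly has a 1/30000 time base with frames 1001
# ticks apart, so the second frame lands at 1001/30000 seconds.
assert pts_to_seconds(1001, Fraction(1, 30000)) == 1001 / 30000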

src/torchcodec/decoders/_core/_metadata.py

Lines changed: 14 additions & 4 deletions

@@ -110,17 +110,27 @@ def average_fps(self) -> Optional[float]:
 
     @property
     def begin_stream_seconds(self) -> float:
-        """TODO."""
+        """Beginning of the stream, in seconds (float). Conceptually, this
+        corresponds to the first frame's :term:`pts`. If
+        ``begin_stream_seconds_from_content`` is not None, then it is returned.
+        Otherwise, this value is 0.
+        """
         if self.begin_stream_seconds_from_content is None:
             return 0
-        return self.begin_stream_seconds_from_content
+        else:
+            return self.begin_stream_seconds_from_content
 
     @property
     def end_stream_seconds(self) -> Optional[float]:
-        """TODO."""
+        """End of the stream, in seconds (float or None).
+        Conceptually, this corresponds to last_frame.pts + last_frame.duration.
+        If ``end_stream_seconds_from_content`` is not None, then that value is
+        returned. Otherwise, returns ``duration_seconds``.
+        """
         if self.end_stream_seconds_from_content is None:
             return self.duration_seconds
-        return self.end_stream_seconds_from_content
+        else:
+            return self.end_stream_seconds_from_content
 
     def __repr__(self):
         # Overridden because properites are not printed by default.
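The two docstrings added above each describe a fallback. A minimal, self-contained sketch of just that logic, using a simplified stand-in for the real StreamMetadata class (the field names follow the diff; everything else, including treating ``duration_seconds`` as a plain field, is illustrative):

from dataclasses import dataclass
from typing import Optional


@dataclass
class StreamMetadataSketch:
    begin_stream_seconds_from_content: Optional[float] = None
    end_stream_seconds_from_content: Optional[float] = None
    duration_seconds: Optional[float] = None

    @property
    def begin_stream_seconds(self) -> float:
        # Fall back to 0 when no from-content value is available.
        if self.begin_stream_seconds_from_content is None:
            return 0
        else:
            return self.begin_stream_seconds_from_content

    @property
    def end_stream_seconds(self) -> Optional[float]:
        # Fall back to the container duration when no from-content value
        # is available.
        if self.end_stream_seconds_from_content is None:
            return self.duration_seconds
        else:
            return self.end_stream_seconds_from_content


meta = StreamMetadataSketch(duration_seconds=10.0)
assert meta.begin_stream_seconds == 0
assert meta.end_stream_seconds == 10.0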

test/decoders/VideoDecoderTest.cpp

Lines changed: 0 additions & 2 deletions

@@ -174,12 +174,10 @@ TEST_P(VideoDecoderTest, ReturnsFirstTwoFramesOfVideo) {
   torch::Tensor tensor0FromOurDecoder = output.frame;
   EXPECT_EQ(tensor0FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
   EXPECT_EQ(output.ptsSeconds, 0.0);
-  EXPECT_EQ(output.pts, 0);
   output = ourDecoder->getNextFrameNoDemux();
   torch::Tensor tensor1FromOurDecoder = output.frame;
   EXPECT_EQ(tensor1FromOurDecoder.sizes(), std::vector<long>({3, 270, 480}));
   EXPECT_EQ(output.ptsSeconds, 1'001. / 30'000);
-  EXPECT_EQ(output.pts, 1001);
 
   torch::Tensor tensor0FromFFMPEG =
       readTensorFromDisk("nasa_13013.mp4.stream3.frame000000.pt");

test/decoders/test_video_decoder_ops.py

Lines changed: 2 additions & 6 deletions

@@ -528,9 +528,7 @@ def test_color_conversion_library_with_scaling(
     if height_scaling_factor != 1.0:
         assert target_height != input_video.height
 
-    filtergraph_decoder = create_from_file(
-        str(input_video.path)
-    )
+    filtergraph_decoder = create_from_file(str(input_video.path))
     _add_video_stream(
         filtergraph_decoder,
         width=target_width,
@@ -539,9 +537,7 @@ def test_color_conversion_library_with_scaling(
     )
     filtergraph_frame0, _, _ = get_next_frame(filtergraph_decoder)
 
-    swscale_decoder = create_from_file(
-        str(input_video.path)
-    )
+    swscale_decoder = create_from_file(str(input_video.path))
     _add_video_stream(
         swscale_decoder,
         width=target_width,

test/samplers/test_samplers.py

Lines changed: 0 additions & 3 deletions

@@ -592,17 +592,14 @@ def restore_metadata():
     with restore_metadata():
         decoder.metadata.end_stream_seconds_from_content = None
         decoder.metadata.duration_seconds_from_header = None
-        decoder.metadata.duration_seconds_from_content = None
         with pytest.raises(
             ValueError, match="Could not infer stream end from video metadata"
         ):
             sampler(decoder)
 
     with restore_metadata():
-        decoder.metadata.begin_stream_seconds_from_content = None
         decoder.metadata.end_stream_seconds_from_content = None
         decoder.metadata.average_fps_from_header = None
-        decoder.metadata.duration_seconds_from_header = None
        with pytest.raises(ValueError, match="Could not infer average fps"):
             sampler(decoder)
