
Commit 0b1d162

Merge branch 'main' of github.com:pytorch/torchcodec into find-nvcuvid

2 parents: 0f0e612 + c2e202d

12 files changed (+153, -58 lines)

.github/workflows/linux_cuda_wheel.yaml

Lines changed: 3 additions & 2 deletions
@@ -67,9 +67,10 @@ jobs:
         # For the actual release we should add that label and change this to
         # include more python versions.
         python-version: ['3.10']
-        # We test against 12.6 and 12.9 to avoid having too big of a CI matrix,
+        # We test against 12.6 to avoid having too big of a CI matrix,
         # but for releases we should add 12.8.
-        cuda-version: ['12.6', '12.9']
+        # TODO add 13.0!
+        cuda-version: ['12.6']
         # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
         ffmpeg-version-for-tests: ['4.4.2', '6', '7']

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 28 additions & 19 deletions
@@ -15,6 +15,18 @@ static bool g_cpu = registerDeviceInterface(

 } // namespace

+CpuDeviceInterface::SwsFrameContext::SwsFrameContext(
+    int inputWidth,
+    int inputHeight,
+    AVPixelFormat inputFormat,
+    int outputWidth,
+    int outputHeight)
+    : inputWidth(inputWidth),
+      inputHeight(inputHeight),
+      inputFormat(inputFormat),
+      outputWidth(outputWidth),
+      outputHeight(outputHeight) {}
+
 bool CpuDeviceInterface::SwsFrameContext::operator==(
     const CpuDeviceInterface::SwsFrameContext& other) const {
   return inputWidth == other.inputWidth && inputHeight == other.inputHeight &&

@@ -97,13 +109,12 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   // And we sometimes re-create them because it's possible for frame
   // resolution to change mid-stream. Finally, we want to reuse the colorspace
   // conversion objects as much as possible for performance reasons.
-  SwsFrameContext swsFrameContext;
-
-  swsFrameContext.inputWidth = avFrame->width;
-  swsFrameContext.inputHeight = avFrame->height;
-  swsFrameContext.inputFormat = frameFormat;
-  swsFrameContext.outputWidth = expectedOutputWidth;
-  swsFrameContext.outputHeight = expectedOutputHeight;
+  SwsFrameContext swsFrameContext(
+      avFrame->width,
+      avFrame->height,
+      frameFormat,
+      expectedOutputWidth,
+      expectedOutputHeight);

   outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
       expectedOutputHeight, expectedOutputWidth, torch::kCPU));

@@ -128,22 +139,20 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
     // See comment above in swscale branch about the filterGraphContext_
     // creation.
-    FiltersContext filtersContext;
-
-    filtersContext.inputWidth = avFrame->width;
-    filtersContext.inputHeight = avFrame->height;
-    filtersContext.inputFormat = frameFormat;
-    filtersContext.inputAspectRatio = avFrame->sample_aspect_ratio;
-    filtersContext.outputWidth = expectedOutputWidth;
-    filtersContext.outputHeight = expectedOutputHeight;
-    filtersContext.outputFormat = AV_PIX_FMT_RGB24;
-    filtersContext.timeBase = timeBase;
-
     std::stringstream filters;
     filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
     filters << ":sws_flags=bilinear";

-    filtersContext.filtergraphStr = filters.str();
+    FiltersContext filtersContext(
+        avFrame->width,
+        avFrame->height,
+        frameFormat,
+        avFrame->sample_aspect_ratio,
+        expectedOutputWidth,
+        expectedOutputHeight,
+        AV_PIX_FMT_RGB24,
+        filters.str(),
+        timeBase);

     if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
       filterGraphContext_ =
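Note (illustration, not part of the diff): the SwsFrameContext / FiltersContext structs exist so that the expensive swscale and filtergraph objects are rebuilt only when the frame geometry changes mid-stream, and reused otherwise. A minimal, self-contained sketch of that caching pattern with hypothetical names (ConversionContext, Converter and FrameConverterCache are not TorchCodec types):

    #include <memory>

    // The parameters a conversion object depends on, with value equality so we
    // can tell when the cached converter is stale.
    struct ConversionContext {
      int inputWidth = 0;
      int inputHeight = 0;
      int outputWidth = 0;
      int outputHeight = 0;

      bool operator==(const ConversionContext& other) const {
        return inputWidth == other.inputWidth &&
            inputHeight == other.inputHeight &&
            outputWidth == other.outputWidth &&
            outputHeight == other.outputHeight;
      }
      bool operator!=(const ConversionContext& other) const {
        return !(*this == other);
      }
    };

    struct Converter {
      explicit Converter(const ConversionContext& /*ctx*/) {
        // Expensive setup (e.g. sws_getContext or filtergraph creation) goes here.
      }
    };

    class FrameConverterCache {
     public:
      Converter& get(const ConversionContext& ctx) {
        // Re-create the converter only if the geometry changed mid-stream.
        if (!converter_ || prevContext_ != ctx) {
          converter_ = std::make_unique<Converter>(ctx);
          prevContext_ = ctx;
        }
        return *converter_;
      }

     private:
      std::unique_ptr<Converter> converter_;
      ConversionContext prevContext_;
    };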

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 13 additions & 5 deletions
@@ -43,11 +43,19 @@ class CpuDeviceInterface : public DeviceInterface {
       const UniqueAVFrame& avFrame);

   struct SwsFrameContext {
-    int inputWidth;
-    int inputHeight;
-    AVPixelFormat inputFormat;
-    int outputWidth;
-    int outputHeight;
+    int inputWidth = 0;
+    int inputHeight = 0;
+    AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
+    int outputWidth = 0;
+    int outputHeight = 0;
+
+    SwsFrameContext() = default;
+    SwsFrameContext(
+        int inputWidth,
+        int inputHeight,
+        AVPixelFormat inputFormat,
+        int outputWidth,
+        int outputHeight);

     bool operator==(const SwsFrameContext&) const;
     bool operator!=(const SwsFrameContext&) const;
   };

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 26 additions & 1 deletion
@@ -275,7 +275,32 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   }

   torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_);
-  nppCtx_->hStream = at::cuda::getCurrentCUDAStream(deviceIndex).stream();
+
+  // Create a CUDA event and attach it to the AVFrame's CUDA stream. That's the
+  // NVDEC stream, i.e. the CUDA stream that the frame was decoded on.
+  // We will be waiting for this event to complete before calling the NPP
+  // functions, to ensure NVDEC has finished decoding the frame before running
+  // the NPP color-conversion.
+  // Note that our code is generic and assumes that the NVDEC's stream can be
+  // arbitrary, but unfortunately we know it's hardcoded to be the default
+  // stream by FFmpeg:
+  // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
+  TORCH_CHECK(
+      hwFramesCtx->device_ctx != nullptr,
+      "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
+  auto cudaDeviceCtx =
+      static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
+  at::cuda::CUDAEvent nvdecDoneEvent;
+  at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
+      c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
+  nvdecDoneEvent.record(nvdecStream);
+
+  // Don't start NPP work before NVDEC is done decoding the frame!
+  at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
+  nvdecDoneEvent.block(nppStream);
+
+  // Create the NPP context if we haven't yet.
+  nppCtx_->hStream = nppStream.stream();
   cudaError_t err =
       cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
   TORCH_CHECK(
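Note (illustration, not part of the diff): the record/block calls above are the standard way to order work across two CUDA streams without blocking the CPU. A minimal sketch of the same pattern using the plain CUDA runtime API instead of ATen's CUDAEvent/CUDAStream wrappers; waitForProducer and the stream arguments are placeholders for whatever the decode and color-conversion streams are in a real application:

    #include <cuda_runtime.h>

    void waitForProducer(cudaStream_t producerStream, cudaStream_t consumerStream) {
      cudaEvent_t producerDone;
      // Timing is not needed here; a disabled-timing event is cheaper to record.
      cudaEventCreateWithFlags(&producerDone, cudaEventDisableTiming);

      // Mark the point in the producer stream after which its queued work is done.
      cudaEventRecord(producerDone, producerStream);

      // Make the consumer stream wait on the GPU, without blocking the CPU,
      // until the producer stream has reached that point.
      cudaStreamWaitEvent(consumerStream, producerDone, 0);

      // Destruction is safe immediately; CUDA defers it until the event is consumed.
      cudaEventDestroy(producerDone);
    }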

src/torchcodec/_core/DeviceInterface.h

Lines changed: 0 additions & 8 deletions
@@ -17,14 +17,6 @@

 namespace facebook::torchcodec {

-// Note that all these device functions should only be called if the device is
-// not a CPU device. CPU device functions are already implemented in the
-// SingleStreamDecoder implementation.
-// These functions should only be called from within an if block like this:
-// if (device.type() != torch::kCPU) {
-//   deviceFunction(device, ...);
-// }
-
 class DeviceInterface {
  public:
  DeviceInterface(const torch::Device& device) : device_(device) {}

src/torchcodec/_core/FFMPEGCommon.cpp

Lines changed: 9 additions & 1 deletion
@@ -61,7 +61,15 @@ int getNumChannels(const UniqueAVFrame& avFrame) {
     (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
   return avFrame->ch_layout.nb_channels;
 #else
-  return av_get_channel_layout_nb_channels(avFrame->channel_layout);
+  int numChannels = av_get_channel_layout_nb_channels(avFrame->channel_layout);
+  // Handle FFmpeg 4 bug where channel_layout and numChannels are 0 or unset.
+  // Set values based on avFrame->channels, which appears to be correct,
+  // to allow successful initialization of SwrContext.
+  if (numChannels == 0 && avFrame->channels > 0) {
+    avFrame->channel_layout = av_get_default_channel_layout(avFrame->channels);
+    numChannels = avFrame->channels;
+  }
+  return numChannels;
 #endif
 }
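Note (illustration, not part of the diff): the fallback above uses two avutil calls that exist in FFmpeg 4 but are superseded by the AVChannelLayout API in newer releases. A self-contained sketch of the same fallback as a standalone helper, assuming an FFmpeg 4 build; getNumChannelsWithFallback is a hypothetical name:

    extern "C" {
    #include <libavutil/channel_layout.h>
    #include <libavutil/frame.h>
    }

    // If the decoder left channel_layout unset (a known FFmpeg 4 quirk), derive a
    // default layout from the channel count so that downstream resampler setup
    // still has something valid to work with.
    int getNumChannelsWithFallback(AVFrame* avFrame) {
      int numChannels = av_get_channel_layout_nb_channels(avFrame->channel_layout);
      if (numChannels == 0 && avFrame->channels > 0) {
        avFrame->channel_layout = av_get_default_channel_layout(avFrame->channels);
        numChannels = avFrame->channels;
      }
      return numChannels;
    }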

src/torchcodec/_core/FilterGraph.cpp

Lines changed: 20 additions & 0 deletions
@@ -13,6 +13,26 @@ extern "C" {

 namespace facebook::torchcodec {

+FiltersContext::FiltersContext(
+    int inputWidth,
+    int inputHeight,
+    AVPixelFormat inputFormat,
+    AVRational inputAspectRatio,
+    int outputWidth,
+    int outputHeight,
+    AVPixelFormat outputFormat,
+    const std::string& filtergraphStr,
+    AVRational timeBase)
+    : inputWidth(inputWidth),
+      inputHeight(inputHeight),
+      inputFormat(inputFormat),
+      inputAspectRatio(inputAspectRatio),
+      outputWidth(outputWidth),
+      outputHeight(outputHeight),
+      outputFormat(outputFormat),
+      filtergraphStr(filtergraphStr),
+      timeBase(timeBase) {}
+
 bool operator==(const AVRational& lhs, const AVRational& rhs) {
   return lhs.num == rhs.num && lhs.den == rhs.den;
 }

src/torchcodec/_core/FilterGraph.h

Lines changed: 14 additions & 1 deletion
@@ -19,11 +19,24 @@ struct FiltersContext {
   int outputWidth = 0;
   int outputHeight = 0;
   AVPixelFormat outputFormat = AV_PIX_FMT_NONE;
-
   std::string filtergraphStr;
   AVRational timeBase = {0, 0};
   UniqueAVBufferRef hwFramesCtx;

+  FiltersContext() = default;
+  FiltersContext(FiltersContext&&) = default;
+  FiltersContext& operator=(FiltersContext&&) = default;
+  FiltersContext(
+      int inputWidth,
+      int inputHeight,
+      AVPixelFormat inputFormat,
+      AVRational inputAspectRatio,
+      int outputWidth,
+      int outputHeight,
+      AVPixelFormat outputFormat,
+      const std::string& filtergraphStr,
+      AVRational timeBase);
+
   bool operator==(const FiltersContext&) const;
   bool operator!=(const FiltersContext&) const;
 };
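Note (an observation, not from the commit): FiltersContext owns a UniqueAVBufferRef, a unique_ptr-style handle, so the struct is movable but not copyable; defaulting the move operations keeps cheap ownership transfer available now that a user-declared constructor exists. A tiny generic sketch of that move-only pattern with placeholder names:

    #include <memory>
    #include <string>
    #include <utility>

    // A context that owns a unique resource: copying is deleted because of the
    // unique_ptr member, while moving transfers ownership cheaply.
    struct OwningContext {
      std::string description;
      std::unique_ptr<int> resource;

      OwningContext() = default;
      OwningContext(OwningContext&&) = default;
      OwningContext& operator=(OwningContext&&) = default;
    };

    int main() {
      OwningContext a;
      a.description = "example";
      a.resource = std::make_unique<int>(42);

      OwningContext b = std::move(a); // OK: ownership moves from a to b.
      // OwningContext c = b;         // Would not compile: copy is deleted.
      return 0;
    }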

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 5 additions & 8 deletions
@@ -243,13 +243,6 @@ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
     return;
   }

-  for (unsigned int i = 0; i < formatContext_->nb_streams; ++i) {
-    // We want to scan and update the metadata of all streams.
-    TORCH_CHECK(
-        formatContext_->streams[i]->discard != AVDISCARD_ALL,
-        "Did you add a stream before you called for a scan?");
-  }
-
   AutoAVPacket autoAVPacket;
   while (true) {
     ReferenceAVPacket packet(autoAVPacket);

@@ -1253,7 +1246,11 @@ FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
       formatContext_->streams[activeStreamIndex_]->time_base);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
     convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput);
-  } else if (deviceInterface_) {
+  } else {
+    TORCH_CHECK(
+        deviceInterface_ != nullptr,
+        "No device interface available for video decoding. This ",
+        "shouldn't happen, please report.");
     deviceInterface_->convertAVFrameToFrameOutput(
         streamInfo.videoStreamOptions,
         streamInfo.timeBase,
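Note (illustration, not part of the diff): replacing the "else if (deviceInterface_)" guard with an unconditional "else" plus TORCH_CHECK turns a silently skipped branch into a loud internal-invariant failure. A minimal sketch of that style, assuming a translation unit built against libtorch; convertFrame is a hypothetical function:

    #include <c10/util/Exception.h>

    // TORCH_CHECK throws a c10::Error with the concatenated message when the
    // condition is false, so impossible states fail loudly instead of being
    // silently skipped.
    void convertFrame(void* deviceInterface) {
      TORCH_CHECK(
          deviceInterface != nullptr,
          "No device interface available for video decoding. ",
          "This shouldn't happen, please report.");
      // ... safe to use deviceInterface from here on ...
    }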

src/torchcodec/decoders/_video_decoder.py

Lines changed: 12 additions & 0 deletions
@@ -247,6 +247,12 @@ def get_frames_at(self, indices: list[int]) -> FrameBatch:
         Returns:
             FrameBatch: The frames at the given indices.
         """
+        if isinstance(indices, torch.Tensor):
+            # TODO we should avoid converting tensors to lists and just let the
+            # core ops and C++ code natively accept tensors. See
+            # https://github.com/pytorch/torchcodec/issues/879
+            indices = indices.to(torch.int).tolist()
+
         data, pts_seconds, duration_seconds = core.get_frames_at_indices(
             self._decoder, frame_indices=indices
         )

@@ -322,6 +328,12 @@ def get_frames_played_at(self, seconds: list[float]) -> FrameBatch:
         Returns:
             FrameBatch: The frames that are played at ``seconds``.
         """
+        if isinstance(seconds, torch.Tensor):
+            # TODO we should avoid converting tensors to lists and just let the
+            # core ops and C++ code natively accept tensors. See
+            # https://github.com/pytorch/torchcodec/issues/879
+            seconds = seconds.to(torch.float).tolist()
+
         data, pts_seconds, duration_seconds = core.get_frames_by_pts(
             self._decoder, timestamps=seconds
         )
