
Commit d2e9bde

Refactor device interface, again.
1 parent ee3b9b7 commit d2e9bde

6 files changed: +55 -32 lines changed

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 5 additions & 2 deletions
@@ -48,12 +48,15 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
 
 void CpuDeviceInterface::initialize(
     [[maybe_unused]] AVCodecContext* codecContext,
+    const AVRational& timeBase) {
+  timeBase_ = timeBase;
+}
+
+void CpuDeviceInterface::initializeVideo(
     const VideoStreamOptions& videoStreamOptions,
     const std::vector<std::unique_ptr<Transform>>& transforms,
-    const AVRational& timeBase,
     const std::optional<FrameDims>& resizedOutputDims) {
   videoStreamOptions_ = videoStreamOptions;
-  timeBase_ = timeBase;
   resizedOutputDims_ = resizedOutputDims;
 
   // We can only use swscale when we have a single resize transform. Note that

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 11 additions & 1 deletion
@@ -25,9 +25,11 @@ class CpuDeviceInterface : public DeviceInterface {
 
   virtual void initialize(
       [[maybe_unused]] AVCodecContext* codecContext,
+      const AVRational& timeBase) override;
+
+  virtual void initializeVideo(
       const VideoStreamOptions& videoStreamOptions,
       const std::vector<std::unique_ptr<Transform>>& transforms,
-      const AVRational& timeBase,
       const std::optional<FrameDims>& resizedOutputDims) override;
 
   void convertAVFrameToFrameOutput(
@@ -73,6 +75,14 @@ class CpuDeviceInterface : public DeviceInterface {
 
   VideoStreamOptions videoStreamOptions_;
   AVRational timeBase_;
+
+  // If the resized output dimensions are present, then we always use those as
+  // the output frame's dimensions. If they are not present, then we use the
+  // dimensions of the raw decoded frame. Note that we do not know the
+  // dimensions of the raw decoded frame until very late; we learn it in
+  // convertAVFrameToFrameOutput(). Deciding the final output frame's actual
+  // dimensions late allows us to handle video streams with variable
+  // resolutions.
   std::optional<FrameDims> resizedOutputDims_;
 
   // Color-conversion objects. Only one of filterGraph_ and swsContext_ should
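The new comment on resizedOutputDims_ describes a decision that is deferred until conversion time. Below is a minimal, self-contained sketch of that decision; the FrameDims stand-in and the chooseOutputDims name are illustrative only, not the actual TorchCodec code.

#include <optional>

// Toy stand-in for torchcodec's FrameDims; the real struct may differ.
struct FrameDims {
  int height;
  int width;
};

// Pick the output dimensions as late as possible: use the pre-computed
// resized dimensions when present, otherwise fall back to the dimensions of
// the raw decoded frame. Falling back per-frame is what lets streams with
// variable resolutions work.
FrameDims chooseOutputDims(
    const std::optional<FrameDims>& resizedOutputDims,
    int decodedHeight,
    int decodedWidth) {
  return resizedOutputDims.value_or(FrameDims{decodedHeight, decodedWidth});
}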

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 13 additions & 11 deletions
@@ -195,18 +195,20 @@ CudaDeviceInterface::~CudaDeviceInterface() {
 
 void CudaDeviceInterface::initialize(
     AVCodecContext* codecContext,
-    const VideoStreamOptions& videoStreamOptions,
-    [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>& transforms,
-    const AVRational& timeBase,
-    [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims) {
+    const AVRational& timeBase) {
   TORCH_CHECK(ctx_, "FFmpeg HW device has not been initialized");
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
-
   codecContext->hw_device_ctx = av_buffer_ref(ctx_.get());
-  videoStreamOptions_ = videoStreamOptions;
   timeBase_ = timeBase;
 }
 
+void CudaDeviceInterface::initializeVideo(
+    const VideoStreamOptions& videoStreamOptions,
+    [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>& transforms,
+    [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims) {
+  videoStreamOptions_ = videoStreamOptions;
+}
+
 UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12(
     UniqueAVFrame& avFrame) {
   // We need FFmpeg filters to handle those conversion cases which are not
@@ -220,13 +222,13 @@ UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12(
     return std::move(avFrame);
   }
 
+  auto hwFramesCtx =
+      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
   TORCH_CHECK(
-      avFrame->hw_frames_ctx != nullptr,
+      hwFramesCtx != nullptr,
       "The AVFrame does not have a hw_frames_ctx. "
       "That's unexpected, please report this to the TorchCodec repo.");
 
-  auto hwFramesCtx =
-      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
   AVPixelFormat actualFormat = hwFramesCtx->sw_format;
 
   // If the frame is already in NV12 format, we don't need to do anything.
@@ -355,10 +357,10 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
   TORCH_CHECK(
       cpuInterface != nullptr, "Failed to create CPU device interface");
   cpuInterface->initialize(
-      nullptr,
+      /*codecContext=*/nullptr, timeBase_);
+  cpuInterface->initializeVideo(
       VideoStreamOptions(),
       {},
-      timeBase_,
       /*resizedOutputDims=*/std::nullopt);
 
   cpuInterface->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
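The maybeConvertAVFrameToNV12() hunk above reads the software pixel format out of the frame's hardware frames context to decide whether any conversion is needed. A minimal sketch of that check in plain FFmpeg terms follows; the needsNV12Conversion helper is illustrative only, not a TorchCodec function.

extern "C" {
#include <libavutil/frame.h>
#include <libavutil/hwcontext.h>
#include <libavutil/pixfmt.h>
}

// Returns true when a hardware-decoded frame still needs conversion to NV12.
// The underlying (software) pixel format of a hardware frame lives in the
// AVHWFramesContext attached to the frame, not in avFrame->format.
bool needsNV12Conversion(const AVFrame* avFrame) {
  if (avFrame->hw_frames_ctx == nullptr) {
    return false;  // Not a hardware frame; nothing to convert here.
  }
  auto* hwFramesCtx =
      reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
  return hwFramesCtx->sw_format != AV_PIX_FMT_NV12;
}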

src/torchcodec/_core/CudaDeviceInterface.h

Lines changed: 4 additions & 3 deletions
@@ -20,12 +20,13 @@ class CudaDeviceInterface : public DeviceInterface {
 
   std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;
 
-  void initialize(
-      AVCodecContext* codecContext,
+  void initialize(AVCodecContext* codecContext, const AVRational& timeBase)
+      override;
+
+  void initializeVideo(
       const VideoStreamOptions& videoStreamOptions,
       [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>&
           transforms,
-      const AVRational& timeBase,
       [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims)
       override;
 

src/torchcodec/_core/DeviceInterface.h

Lines changed: 10 additions & 6 deletions
@@ -30,14 +30,18 @@ class DeviceInterface {
 
   virtual std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) = 0;
 
-  // Initialize the hardware device that is specified in `device`. Some builds
-  // support CUDA and others only support CPU.
+  // Initialize the device with parameters generic to all kinds of decoding.
   virtual void initialize(
       AVCodecContext* codecContext,
-      const VideoStreamOptions& videoStreamOptions,
-      const std::vector<std::unique_ptr<Transform>>& transforms,
-      const AVRational& timeBase,
-      const std::optional<FrameDims>& resizedOutputDims) = 0;
+      const AVRational& timeBase) = 0;
+
+  // Initialize the device with parameters specific to video decoding. There is
+  // a default empty implementation.
+  virtual void initializeVideo(
+      [[maybe_unused]] const VideoStreamOptions& videoStreamOptions,
+      [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>&
+          transforms,
+      [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims) {}
 
   virtual void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
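The split in DeviceInterface.h keeps initialize() as a mandatory pure virtual while initializeVideo() is opt-in thanks to its default empty body. Below is a self-contained toy sketch of that pattern; the Toy* classes and simplified parameters are illustrative only, not the real TorchCodec types.

#include <iostream>

struct ToyDeviceInterface {
  virtual ~ToyDeviceInterface() = default;

  // Generic to all kinds of decoding: every interface must implement it.
  virtual void initialize(int timeBaseNum, int timeBaseDen) = 0;

  // Specific to video decoding: default empty implementation, so interfaces
  // and call sites that never decode video can simply ignore it.
  virtual void initializeVideo(int outputWidth, int outputHeight) {}
};

struct ToyCpuInterface : ToyDeviceInterface {
  void initialize(int timeBaseNum, int timeBaseDen) override {
    std::cout << "time base: " << timeBaseNum << "/" << timeBaseDen << "\n";
  }
  void initializeVideo(int outputWidth, int outputHeight) override {
    std::cout << "output: " << outputWidth << "x" << outputHeight << "\n";
  }
};

int main() {
  ToyCpuInterface cpu;
  cpu.initialize(1, 90000);       // always called, for any stream type
  cpu.initializeVideo(640, 480);  // only called when adding a video stream
  return 0;
}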

src/torchcodec/_core/SingleStreamDecoder.cpp

Lines changed: 12 additions & 9 deletions
@@ -436,13 +436,20 @@ void SingleStreamDecoder::addStream(
   TORCH_CHECK(codecContext != nullptr);
   streamInfo.codecContext.reset(codecContext);
 
+  deviceInterface_->initialize(
+      streamInfo.codecContext.get(), streamInfo.timeBase);
+
   int retVal = avcodec_parameters_to_context(
       streamInfo.codecContext.get(), streamInfo.stream->codecpar);
   TORCH_CHECK_EQ(retVal, AVSUCCESS);
 
   streamInfo.codecContext->thread_count = ffmpegThreadCount.value_or(0);
   streamInfo.codecContext->pkt_timebase = streamInfo.stream->time_base;
 
+  // Note that we must make sure to call avcodec_open2() AFTER we initialize
+  // the device interface. Device initialization tells the codec context which
+  // device to use. If we initialize the device interface after avcodec_open2(),
+  // then all decoding may fall back to the CPU.
   retVal = avcodec_open2(streamInfo.codecContext.get(), avCodec, nullptr);
   TORCH_CHECK(retVal >= AVSUCCESS, getFFMPEGErrorStringFromErrorCode(retVal));
 
@@ -510,18 +517,14 @@ void SingleStreamDecoder::addVideoStream(
     if (transform->getOutputFrameDims().has_value()) {
       resizedOutputDims_ = transform->getOutputFrameDims().value();
     }
+
+    // Note that we are claiming ownership of the transform objects passed in to
+    // us.
     transforms_.push_back(std::unique_ptr<Transform>(transform));
   }
 
-  // We initialize the device context late because we want to know a lot of
-  // information that we can only know after resolving the codec, opening the
-  // stream and inspecting the metadata.
-  deviceInterface_->initialize(
-      streamInfo.codecContext.get(),
-      videoStreamOptions,
-      transforms_,
-      streamInfo.timeBase,
-      resizedOutputDims_);
+  deviceInterface_->initializeVideo(
+      videoStreamOptions, transforms_, resizedOutputDims_);
 }
 
 void SingleStreamDecoder::addAudioStream(
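The new comment in addStream() pins down an ordering constraint: the device has to be attached to the codec context before avcodec_open2(). A minimal sketch of that constraint in plain FFmpeg terms follows; openCodecWithCuda is an assumed illustrative helper, not TorchCodec code.

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/hwcontext.h>
}

// Open a codec context with CUDA acceleration. The key point is ordering:
// hw_device_ctx must be set BEFORE avcodec_open2(); otherwise the codec opens
// without a hardware device and decoding silently falls back to the CPU.
int openCodecWithCuda(AVCodecContext* codecContext, const AVCodec* codec) {
  AVBufferRef* hwDeviceCtx = nullptr;
  int err = av_hwdevice_ctx_create(
      &hwDeviceCtx, AV_HWDEVICE_TYPE_CUDA, nullptr, nullptr, 0);
  if (err < 0) {
    return err;
  }

  // Attach the device to the codec context, then open the codec.
  codecContext->hw_device_ctx = av_buffer_ref(hwDeviceCtx);
  av_buffer_unref(&hwDeviceCtx);  // codecContext now holds its own reference
  return avcodec_open2(codecContext, codec, nullptr);
}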
