Refactor C++ decoder initialization (#533)

scotts · web-flow · commit 5e325a290278 · 2025-03-04T11:01:56.000-05:00
1. We had a struct, AVInput, which was not necessary. Removing it allows us to simplify
setting up a VideoDecoder that uses an AVIOContext that reads directly from a provided
buffer of bytes.
2. Normalize style of names and clarify purpose.
3. Use TORCH_CHECK instead of throwing exceptions.
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.cpp b/src/torchcodec/decoders/_core/FFMPEGCommon.cpp
@@ -62,28 +62,26 @@ int64_t getDuration(const AVFrame* frame) {
 
 AVIOBytesContext::AVIOBytesContext(
     const void* data,
-    size_t data_size,
-    size_t tempBufferSize) {
-  auto buffer = static_cast<uint8_t*>(av_malloc(tempBufferSize));
-  if (!buffer) {
-    throw std::runtime_error(
-        "Failed to allocate buffer of size " + std::to_string(tempBufferSize));
-  }
-  bufferData_.data = static_cast<const uint8_t*>(data);
-  bufferData_.size = data_size;
-  bufferData_.current = 0;
+    size_t dataSize,
+    size_t bufferSize)
+    : bufferData_{static_cast<const uint8_t*>(data), dataSize, 0} {
+  auto buffer = static_cast<uint8_t*>(av_malloc(bufferSize));
+  TORCH_CHECK(
+      buffer != nullptr,
+      "Failed to allocate buffer of size " + std::to_string(bufferSize));
 
   avioContext_.reset(avio_alloc_context(
       buffer,
-      tempBufferSize,
+      bufferSize,
       0,
       &bufferData_,
       &AVIOBytesContext::read,
       nullptr,
       &AVIOBytesContext::seek));
+
   if (!avioContext_) {
     av_freep(&buffer);
-    throw std::runtime_error("Failed to allocate AVIOContext");
+    TORCH_CHECK(false, "Failed to allocate AVIOContext");
   }
 }
 
@@ -99,14 +97,14 @@ AVIOContext* AVIOBytesContext::getAVIO() {
 
 // The signature of this function is defined by FFMPEG.
 int AVIOBytesContext::read(void* opaque, uint8_t* buf, int buf_size) {
-  struct AVIOBufferData* bufferData =
-      static_cast<struct AVIOBufferData*>(opaque);
+  auto bufferData = static_cast<AVIOBufferData*>(opaque);
   TORCH_CHECK(
       bufferData->current <= bufferData->size,
       "Tried to read outside of the buffer: current=",
       bufferData->current,
       ", size=",
       bufferData->size);
+
   buf_size =
       FFMIN(buf_size, static_cast<int>(bufferData->size - bufferData->current));
   TORCH_CHECK(
@@ -117,6 +115,7 @@ int AVIOBytesContext::read(void* opaque, uint8_t* buf, int buf_size) {
       bufferData->size,
       ", current=",
       bufferData->current);
+
   if (!buf_size) {
     return AVERROR_EOF;
   }
@@ -127,7 +126,7 @@ int AVIOBytesContext::read(void* opaque, uint8_t* buf, int buf_size) {
 
 // The signature of this function is defined by FFMPEG.
 int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
-  AVIOBufferData* bufferData = (AVIOBufferData*)opaque;
+  auto bufferData = static_cast<AVIOBufferData*>(opaque);
   int64_t ret = -1;
 
   switch (whence) {
@@ -141,6 +140,7 @@ int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
     default:
       break;
   }
+
   return ret;
 }
 
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.h b/src/torchcodec/decoders/_core/FFMPEGCommon.h
@@ -155,7 +155,7 @@ struct AVIOBufferData {
 // memory buffer that is passed in.
 class AVIOBytesContext {
  public:
-  AVIOBytesContext(const void* data, size_t data_size, size_t tempBufferSize);
+  AVIOBytesContext(const void* data, size_t dataSize, size_t bufferSize);
   ~AVIOBytesContext();
 
   // Returns the AVIOContext that can be passed to FFMPEG.
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -40,11 +40,6 @@ int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
   return static_cast<int64_t>(std::round(seconds * timeBase.den));
 }
 
-struct AVInput {
-  UniqueAVFormatContext formatContext;
-  std::unique_ptr<AVIOBytesContext> ioBytesContext;
-};
-
 std::vector<std::string> splitStringWithDelimiters(
     const std::string& str,
     const std::string& delims) {
@@ -72,50 +67,47 @@ VideoDecoder::VideoDecoder(const std::string& videoFilePath, SeekMode seekMode)
     : seekMode_(seekMode) {
   av_log_set_level(AV_LOG_QUIET);
 
-  AVFormatContext* formatContext = nullptr;
-  int open_ret = avformat_open_input(
-      &formatContext, videoFilePath.c_str(), nullptr, nullptr);
-  if (open_ret != 0) {
-    throw std::invalid_argument(
-        "Could not open input file: " + videoFilePath + " " +
-        getFFMPEGErrorStringFromErrorCode(open_ret));
-  }
-  TORCH_CHECK(formatContext != nullptr);
-  AVInput input;
-  input.formatContext.reset(formatContext);
-  formatContext_ = std::move(input.formatContext);
+  AVFormatContext* rawContext = nullptr;
+  int ffmpegStatus =
+      avformat_open_input(&rawContext, videoFilePath.c_str(), nullptr, nullptr);
+  TORCH_CHECK(
+      ffmpegStatus == 0,
+      "Could not open input file: " + videoFilePath + " " +
+          getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
+  TORCH_CHECK(rawContext != nullptr);
+  formatContext_.reset(rawContext);
 
   initializeDecoder();
 }
 
-VideoDecoder::VideoDecoder(const void* buffer, size_t length, SeekMode seekMode)
+VideoDecoder::VideoDecoder(const void* data, size_t length, SeekMode seekMode)
     : seekMode_(seekMode) {
-  TORCH_CHECK(buffer != nullptr, "Video buffer cannot be nullptr!");
+  TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
 
   av_log_set_level(AV_LOG_QUIET);
 
-  AVInput input;
-  input.formatContext.reset(avformat_alloc_context());
-  TORCH_CHECK(
-      input.formatContext.get() != nullptr, "Unable to alloc avformat context");
-  constexpr int kAVIOInternalTemporaryBufferSize = 64 * 1024;
-  input.ioBytesContext.reset(
-      new AVIOBytesContext(buffer, length, kAVIOInternalTemporaryBufferSize));
-  if (!input.ioBytesContext) {
-    throw std::runtime_error("Failed to create AVIOBytesContext");
-  }
-  input.formatContext->pb = input.ioBytesContext->getAVIO();
-  AVFormatContext* tempFormatContext = input.formatContext.release();
-  int open_ret =
-      avformat_open_input(&tempFormatContext, nullptr, nullptr, nullptr);
-  input.formatContext.reset(tempFormatContext);
-  if (open_ret != 0) {
-    throw std::runtime_error(
-        std::string("Failed to open input buffer: ") +
-        getFFMPEGErrorStringFromErrorCode(open_ret));
+  constexpr int bufferSize = 64 * 1024;
+  ioBytesContext_.reset(new AVIOBytesContext(data, length, bufferSize));
+  TORCH_CHECK(ioBytesContext_, "Failed to create AVIOBytesContext");
+
+  // Because FFmpeg requires a reference to a pointer in the call to open, we
+  // can't use a unique pointer here. Note that means we must call free if open
+  // fails.
+  AVFormatContext* rawContext = avformat_alloc_context();
+  TORCH_CHECK(rawContext != nullptr, "Unable to alloc avformat context");
+
+  rawContext->pb = ioBytesContext_->getAVIO();
+  int ffmpegStatus =
+      avformat_open_input(&rawContext, nullptr, nullptr, nullptr);
+  if (ffmpegStatus != 0) {
+    avformat_free_context(rawContext);
+    TORCH_CHECK(
+        false,
+        "Failed to open input buffer: " +
+            getFFMPEGErrorStringFromErrorCode(ffmpegStatus));
   }
-  formatContext_ = std::move(input.formatContext);
-  ioBytesContext_ = std::move(input.ioBytesContext);
+
+  formatContext_.reset(rawContext);
 
   initializeDecoder();
 }
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -34,10 +34,10 @@ class VideoDecoder {
       const std::string& videoFilePath,
       SeekMode seekMode = SeekMode::exact);
 
-  // Creates a VideoDecoder from a given buffer. Note that the buffer is not
-  // owned by the VideoDecoder.
+  // Creates a VideoDecoder from a given buffer of data. Note that the data is
+  // not owned by the VideoDecoder.
   explicit VideoDecoder(
-      const void* buffer,
+      const void* data,
       size_t length,
       SeekMode seekMode = SeekMode::exact);
 
diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp
@@ -93,8 +93,7 @@ TEST_P(VideoDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) {
 }
 
 TEST(VideoDecoderTest, MissingVideoFileThrowsException) {
-  EXPECT_THROW(
-      VideoDecoder("/this/file/does/not/exist"), std::invalid_argument);
+  EXPECT_THROW(VideoDecoder("/this/file/does/not/exist"), c10::Error);
 }
 
 void dumpTensorToDisk(

Original file line number	Diff line number	Diff line change
`@@ -93,8 +93,7 @@ TEST_P(VideoDecoderTest, ReturnsFpsAndDurationForVideoInMetadata) {`
`93`	`93`	`}`
`94`	`94`
`95`	`95`	`TEST(VideoDecoderTest, MissingVideoFileThrowsException) {`
`96`		`- EXPECT_THROW(`
`97`		`- VideoDecoder("/this/file/does/not/exist"), std::invalid_argument);`
	`96`	`+ EXPECT_THROW(VideoDecoder("/this/file/does/not/exist"), c10::Error);`
`98`	`97`	`}`
`99`	`98`
`100`	`99`	`void dumpTensorToDisk(`