Refactor AVIOContextHolder

scotts · scotts · commit 681b9cc1587a · 2025-03-15T12:48:07.000-07:00
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.cpp b/src/torchcodec/decoders/_core/FFMPEGCommon.cpp
@@ -78,14 +78,14 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
 #endif
 }
 
-AVIOBytesContext::AVIOBytesContext(
-    const void* data,
-    int64_t dataSize,
-    int bufferSize)
-    : dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
-  TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
-  TORCH_CHECK(dataSize > 0, "Video data size must be positive");
-
+void AVIOContextHolder::createAVIOContext(
+    AVIOReadFunction read,
+    AVIOSeekFunction seek,
+    void* heldData,
+    int bufferSize) {
+  TORCH_CHECK(
+      bufferSize > 0,
+      "Buffer size must be greater than 0; is " + std::to_string(bufferSize));
   auto buffer = static_cast<uint8_t*>(av_malloc(bufferSize));
   TORCH_CHECK(
       buffer != nullptr,
@@ -95,74 +95,25 @@ AVIOBytesContext::AVIOBytesContext(
       buffer,
       bufferSize,
       0,
-      &dataContext_,
-      &AVIOBytesContext::read,
-      nullptr,
-      &AVIOBytesContext::seek));
+      heldData,
+      read,
+      nullptr, // write function; not supported yet
+      seek));
 
   if (!avioContext_) {
     av_freep(&buffer);
     TORCH_CHECK(false, "Failed to allocate AVIOContext");
   }
 }
 
-AVIOBytesContext::~AVIOBytesContext() {
+AVIOContextHolder::~AVIOContextHolder() {
   if (avioContext_) {
     av_freep(&avioContext_->buffer);
   }
 }
 
-AVIOContext* AVIOBytesContext::getAVIOContext() const {
+AVIOContext* AVIOContextHolder::getAVIOContext() {
   return avioContext_.get();
 }
 
-// The signature of this function is defined by FFmpeg.
-int AVIOBytesContext::read(void* opaque, uint8_t* buf, int buf_size) {
-  auto dataContext = static_cast<DataContext*>(opaque);
-  TORCH_CHECK(
-      dataContext->current <= dataContext->size,
-      "Tried to read outside of the buffer: current=",
-      dataContext->current,
-      ", size=",
-      dataContext->size);
-
-  buf_size = FFMIN(
-      buf_size, static_cast<int>(dataContext->size - dataContext->current));
-  TORCH_CHECK(
-      buf_size >= 0,
-      "Tried to read negative bytes: buf_size=",
-      buf_size,
-      ", size=",
-      dataContext->size,
-      ", current=",
-      dataContext->current);
-
-  if (!buf_size) {
-    return AVERROR_EOF;
-  }
-  memcpy(buf, dataContext->data + dataContext->current, buf_size);
-  dataContext->current += buf_size;
-  return buf_size;
-}
-
-// The signature of this function is defined by FFmpeg.
-int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
-  auto dataContext = static_cast<DataContext*>(opaque);
-  int64_t ret = -1;
-
-  switch (whence) {
-    case AVSEEK_SIZE:
-      ret = dataContext->size;
-      break;
-    case SEEK_SET:
-      dataContext->current = offset;
-      ret = offset;
-      break;
-    default:
-      break;
-  }
-
-  return ret;
-}
-
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.h b/src/torchcodec/decoders/_core/FFMPEGCommon.h
@@ -145,43 +145,27 @@ int getNumChannels(const UniqueAVCodecContext& avCodecContext);
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();
 
+using AVIOReadFunction = int (*)(void*, uint8_t*, int);
+using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);
+
 // TODO: explain purpose of context holder
 class AVIOContextHolder {
  public:
-  virtual ~AVIOContextHolder(){};
-  virtual AVIOContext* getAVIOContext() const = 0;
-};
-
-// TODO: make comment below better
-// A struct that holds state for reading bytes from an IO context.
-// We give this to FFMPEG and it will pass it back to us when it needs to read
-// or seek in the memory buffer.
-//
-// A class that can be used as AVFormatContext's IO context. It reads from a
-// memory buffer that is passed in.
-class AVIOBytesContext : public AVIOContextHolder {
- public:
-  AVIOBytesContext(const void* data, int64_t dataSize, int bufferSize);
-  virtual ~AVIOBytesContext();
+  virtual ~AVIOContextHolder();
+  AVIOContext* getAVIOContext();
 
-  // Returns the AVIOContext that can be passed to FFMPEG.
-  virtual AVIOContext* getAVIOContext() const override;
-
-  // The signature of this function is defined by FFMPEG.
-  static int read(void* opaque, uint8_t* buf, int buf_size);
-
-  // The signature of this function is defined by FFMPEG.
-  static int64_t seek(void* opaque, int64_t offset, int whence);
+ protected:
+  void createAVIOContext(
+      AVIOReadFunction read,
+      AVIOSeekFunction seek,
+      void* heldData,
+      int bufferSize = defaultBufferSize);
 
  private:
-  struct DataContext {
-    const uint8_t* data;
-    int64_t size;
-    int64_t current;
-  };
-
   UniqueAVIOContext avioContext_;
-  DataContext dataContext_;
+
+  // Size of the buffer handed to avio_alloc_context() by default: 64 KB.
+  static constexpr int defaultBufferSize = 64 * 1024;
 };
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/decoders/_core/PyBindOps.cpp b/src/torchcodec/decoders/_core/PyBindOps.cpp
@@ -28,11 +28,9 @@ struct PyObjectDeleter {
 
 class AVIOFileLikeContext : public AVIOContextHolder {
  public:
-  AVIOFileLikeContext(py::object fileLike, int bufferSize)
-      : fileLikeContext_{
-            std::unique_ptr<py::object, PyObjectDeleter>(
-                new py::object(fileLike)),
-            bufferSize} {
+  explicit AVIOFileLikeContext(py::object fileLike)
+      : fileLikeContext_{std::unique_ptr<py::object, PyObjectDeleter>(
+            new py::object(fileLike))} {
     {
       // TODO: Is it necessary to acquire the GIL here? Is it maybe even
       // harmful? At the moment, this is only called from within a pybind
@@ -45,40 +43,11 @@ class AVIOFileLikeContext : public AVIOContextHolder {
           py::hasattr(fileLike, "seek"),
           "File like object must implement a seek method.");
     }
-
-    auto buffer = static_cast<uint8_t*>(av_malloc(bufferSize));
-    TORCH_CHECK(
-        buffer != nullptr,
-        "Failed to allocate buffer of size " + std::to_string(bufferSize));
-
-    avioContext_.reset(avio_alloc_context(
-        buffer,
-        bufferSize,
-        0,
-        &fileLikeContext_,
-        &AVIOFileLikeContext::read,
-        nullptr,
-        &AVIOFileLikeContext::seek));
-
-    if (!avioContext_) {
-      av_freep(&buffer);
-      TORCH_CHECK(false, "Failed to allocate AVIOContext");
-    }
-  }
-
-  virtual ~AVIOFileLikeContext() {
-    if (avioContext_) {
-      av_freep(&avioContext_->buffer);
-    }
-  }
-
-  virtual AVIOContext* getAVIOContext() const override {
-    return avioContext_.get();
+    createAVIOContext(&read, &seek, &fileLikeContext_);
   }
 
   static int read(void* opaque, uint8_t* buf, int buf_size) {
     auto fileLikeContext = static_cast<FileLikeContext*>(opaque);
-    buf_size = FFMIN(buf_size, fileLikeContext->bufferSize);
 
     int num_read = 0;
     while (num_read < buf_size) {
@@ -126,10 +95,8 @@ class AVIOFileLikeContext : public AVIOContextHolder {
     //
     //   https://pybind11.readthedocs.io/en/stable/advanced/misc.html#common-sources-of-global-interpreter-lock-errors
     std::unique_ptr<py::object, PyObjectDeleter> fileLike;
-    int bufferSize;
   };
 
-  UniqueAVIOContext avioContext_;
   FileLikeContext fileLikeContext_;
 };
 
@@ -150,9 +117,7 @@ int64_t create_from_file_like(
     realSeek = seekModeFromString(seek_mode.value());
   }
 
-  constexpr int bufferSize = 64 * 1024;
-  auto contextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, bufferSize);
+  auto contextHolder = std::make_unique<AVIOFileLikeContext>(file_like);
 
   VideoDecoder* decoder = new VideoDecoder(std::move(contextHolder), realSeek);
   return reinterpret_cast<int64_t>(decoder);
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -64,6 +64,82 @@ TORCH_LIBRARY(torchcodec_ns, m) {
 }
 
 namespace {
+
+// An AVIOContextHolder that serves reads and seeks from an in-memory buffer.
+// FFmpeg is handed a pointer to the private DataContext (buffer pointer, size
+// and current position) as its opaque value, and passes it back to the static
+// read/seek callbacks below whenever it needs bytes or wants to reposition.
+//
+// Only the pointer to the buffer is stored, not a copy of its contents, so
+// the caller must keep the buffer alive while this object is in use.
+class AVIOBytesContext : public AVIOContextHolder {
+ public:
+  explicit AVIOBytesContext(const void* data, int64_t dataSize)
+      : dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
+    TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
+    TORCH_CHECK(dataSize > 0, "Video data size must be positive");
+    createAVIOContext(&read, &seek, &dataContext_);
+  }
+
+  // FFmpeg read callback (signature fixed by FFmpeg); EOF at buffer end.
+  static int read(void* opaque, uint8_t* buf, int buf_size) {
+    auto dataContext = static_cast<DataContext*>(opaque);
+    TORCH_CHECK(
+        dataContext->current <= dataContext->size,
+        "Tried to read outside of the buffer: current=",
+        dataContext->current,
+        ", size=",
+        dataContext->size);
+    // Take the min in 64 bits; narrowing the remainder to int first can wrap.
+    buf_size = static_cast<int>(FFMIN(
+        int64_t{buf_size}, dataContext->size - dataContext->current));
+    TORCH_CHECK(
+        buf_size >= 0,
+        "Tried to read negative bytes: buf_size=",
+        buf_size,
+        ", size=",
+        dataContext->size,
+        ", current=",
+        dataContext->current);
+
+    if (!buf_size) {
+      return AVERROR_EOF;
+    }
+    memcpy(buf, dataContext->data + dataContext->current, buf_size);
+    dataContext->current += buf_size;
+    return buf_size;
+  }
+
+  // The signature of this function is defined by FFMPEG.
+  static int64_t seek(void* opaque, int64_t offset, int whence) {
+    auto dataContext = static_cast<DataContext*>(opaque);
+    int64_t ret = -1;
+
+    switch (whence) {
+      case AVSEEK_SIZE:
+        ret = dataContext->size;
+        break;
+      case SEEK_SET:
+        dataContext->current = offset;
+        ret = offset;
+        break;
+      default:
+        break;
+    }
+
+    return ret;
+  }
+
+ private:
+  struct DataContext {
+    const uint8_t* data;
+    int64_t size;
+    int64_t current;
+  };
+
+  DataContext dataContext_;
+};
+
 at::Tensor wrapDecoderPointerToTensor(
     std::unique_ptr<VideoDecoder> uniqueDecoder) {
   VideoDecoder* decoder = uniqueDecoder.release();
@@ -135,9 +211,7 @@ at::Tensor create_from_tensor(
     realSeek = seekModeFromString(seek_mode.value());
   }
 
-  constexpr int bufferSize = 64 * 1024;
-  auto contextHolder =
-      std::make_unique<AVIOBytesContext>(data, length, bufferSize);
+  auto contextHolder = std::make_unique<AVIOBytesContext>(data, length);
 
   std::unique_ptr<VideoDecoder> uniqueDecoder =
       std::make_unique<VideoDecoder>(std::move(contextHolder), realSeek);