Support encoding into tensor

NicolasHug · NicolasHug · commit cfea9abc890c · 2025-04-11T14:27:19.000+01:00
diff --git a/src/torchcodec/_core/AVIOBytesContext.cpp b/src/torchcodec/_core/AVIOBytesContext.cpp
@@ -13,7 +13,7 @@ AVIOBytesContext::AVIOBytesContext(const void* data, int64_t dataSize)
     : dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
   TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
   TORCH_CHECK(dataSize > 0, "Video data size must be positive");
-  createAVIOContext(&read, &seek, &dataContext_);
+  createAVIOContext(&read, nullptr, &seek, &dataContext_);
 }
 
 // The signature of this function is defined by FFMPEG.
@@ -67,4 +67,46 @@ int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
   return ret;
 }
 
+AVIOToTensorContext::AVIOToTensorContext() // Sets up an AVIO context that writes into a tensor allocated here.
+    : dataContext_{torch::empty({OUTPUT_TENSOR_SIZE}, {torch::kUInt8}), 0} { // uint8 buffer of OUTPUT_TENSOR_SIZE elements; write position starts at 0
+  createAVIOContext(nullptr, &write, &seek, &dataContext_); // no read callback: only write and seek are provided; &dataContext_ is handed to the callbacks as opaque state
+}
+
+// Write callback: copies buf_size bytes from buf into the output tensor at the current position, advances that position and returns buf_size. The signature of this function is defined by FFMPEG.
+int AVIOToTensorContext::write(void* opaque, uint8_t* buf, int buf_size) {
+  auto dataContext = static_cast<DataContext*>(opaque); // opaque is the heldData pointer given to createAVIOContext (&dataContext_ in the constructor)
+  TORCH_CHECK(
+      dataContext->current + buf_size <= OUTPUT_TENSOR_SIZE, // NOTE(review): only the upper bound is checked; seek() stores offsets unvalidated, so confirm current can never be negative here
+      "Can't encode more, output tensor needs to be re-allocated and this isn't supported yet."); // the buffer has a fixed size, so a write that would run past it is rejected
+  uint8_t* outputTensorData = dataContext->outputTensor.data_ptr<uint8_t>();
+  std::memcpy(outputTensorData + dataContext->current, buf, buf_size); // raw byte copy at the current write position
+  dataContext->current += static_cast<int64_t>(buf_size);
+  return buf_size; // every requested byte was written
+}
+
+// Seek callback: handles AVSEEK_SIZE and SEEK_SET only; any other whence returns -1. The signature of this function is defined by FFMPEG.
+int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
+  auto dataContext = static_cast<DataContext*>(opaque); // same DataContext that write() receives
+  int64_t ret = -1; // result for any whence value not handled below
+
+  switch (whence) {
+    case AVSEEK_SIZE:
+      ret = dataContext->outputTensor.numel(); // NOTE(review): this is the full buffer capacity, not the number of bytes written so far; confirm that is intended
+      break;
+    case SEEK_SET:
+      dataContext->current = offset; // NOTE(review): offset is stored without a bounds check against [0, numel()]; confirm callers never pass an out-of-range value
+      ret = offset;
+      break;
+    default: // other whence values (e.g. SEEK_CUR, SEEK_END) are not handled
+      break;
+  }
+
+  return ret;
+}
+
+torch::Tensor AVIOToTensorContext::getOutputTensor() { // Returns the leading dataContext_.current elements of the output buffer.
+  return dataContext_.outputTensor.narrow(
+      /*dim=*/0, /*start=*/0, /*length=*/dataContext_.current); // NOTE(review): length is the current position, not the furthest byte written; confirm a trailing backward seek cannot leave it short
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/AVIOBytesContext.h b/src/torchcodec/_core/AVIOBytesContext.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <torch/types.h>
 #include "src/torchcodec/_core/AVIOContextHolder.h"
 
 namespace facebook::torchcodec {
@@ -29,4 +30,25 @@ class AVIOBytesContext : public AVIOContextHolder {
   DataContext dataContext_;
 };
 
+class AVIOToTensorContext : public AVIOContextHolder { // AVIO context whose write callback stores bytes into a torch::Tensor.
+ public:
+  explicit AVIOToTensorContext(); // allocates the output tensor and registers the write/seek callbacks
+  torch::Tensor getOutputTensor(); // output tensor narrowed to the current write position
+
+ private:
+  // Should this class be tensor-aware? Or should we just store a uint8* buffer
+  // instead of the tensor? If it's not tensor-aware it means we need to do the
+  // (re)allocation outside of it. Same for the call to narrow().
+  struct DataContext { // state handed to the FFmpeg callbacks through the opaque pointer
+    torch::Tensor outputTensor; // uint8 buffer that write() copies into
+    int64_t current; // position of the next write, in elements of outputTensor
+  };
+
+  static const int OUTPUT_TENSOR_SIZE = 5'000'000; // TODO-ENCODING handle this (fixed capacity of the output tensor; write() fails beyond it)
+  static int write(void* opaque, uint8_t* buf, int buf_size); // FFmpeg write callback
+  static int64_t seek(void* opaque, int64_t offset, int whence); // FFmpeg seek callback
+
+  DataContext dataContext_; // its address is passed to createAVIOContext as the held data
+};
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/AVIOContextHolder.cpp b/src/torchcodec/_core/AVIOContextHolder.cpp
@@ -11,6 +11,7 @@ namespace facebook::torchcodec {
 
 void AVIOContextHolder::createAVIOContext(
     AVIOReadFunction read,
+    AVIOWriteFunction write,
     AVIOSeekFunction seek,
     void* heldData,
     int bufferSize) {
@@ -22,13 +23,17 @@ void AVIOContextHolder::createAVIOContext(
       buffer != nullptr,
       "Failed to allocate buffer of size " + std::to_string(bufferSize));
 
+  TORCH_CHECK(
+      write != nullptr ^ (read != nullptr && seek != nullptr),
+      "read and seek methods must be defined, or write method must be defined. "
+      "But not both!")
   avioContext_.reset(avio_alloc_context(
       buffer,
       bufferSize,
-      0,
+      /*write_flag=*/write != nullptr,
       heldData,
       read,
-      nullptr, // write function; not supported yet
+      write,
       seek));
 
   if (!avioContext_) {
diff --git a/src/torchcodec/_core/AVIOContextHolder.h b/src/torchcodec/_core/AVIOContextHolder.h
@@ -19,9 +19,9 @@ namespace facebook::torchcodec {
 //      freed.
 //   2. It is a base class for AVIOContext specializations. When specializing a
 //      AVIOContext, we need to provide four things:
-//        1. A read callback function.
-//        2. A seek callback function.
-//        3. A write callback function. (Not supported yet; it's for encoding.)
+//        1. A read callback function, for decoding.
+//        2. A seek callback function, for decoding and encoding.
+//        3. A write callback function, for encoding.
 //        4. A pointer to some context object that has the same lifetime as the
 //           AVIOContext itself. This context object holds the custom state that
 //           tracks the custom behavior of reading, seeking and writing. It is
@@ -46,11 +46,13 @@ class AVIOContextHolder {
 
   // These signatures are defined by FFmpeg.
   using AVIOReadFunction = int (*)(void*, uint8_t*, int);
+  using AVIOWriteFunction = int (*)(void*, uint8_t*, int);
   using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);
 
   // Deriving classes should call this function in their constructor.
   void createAVIOContext(
       AVIOReadFunction read,
+      AVIOWriteFunction write,
       AVIOSeekFunction seek,
       void* heldData,
       int bufferSize = defaultBufferSize);
diff --git a/src/torchcodec/_core/AVIOFileLikeContext.cpp b/src/torchcodec/_core/AVIOFileLikeContext.cpp
@@ -23,7 +23,7 @@ AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
         py::hasattr(fileLike, "seek"),
         "File like object must implement a seek method.");
   }
-  createAVIOContext(&read, &seek, &fileLike_);
+  createAVIOContext(&read, nullptr, &seek, &fileLike_);
 }
 
 int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -8,7 +8,8 @@ find_package(pybind11 REQUIRED)
 find_package(Torch REQUIRED)
 find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic -Werror ${TORCH_CXX_FLAGS}")
+# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic -Werror ${TORCH_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 
 function(make_torchcodec_sublibrary
     library_name
@@ -60,11 +61,13 @@ function(make_torchcodec_libraries
     set(decoder_sources
         AVIOContextHolder.cpp
         FFMPEGCommon.cpp
-	DeviceInterface.cpp
+	    DeviceInterface.cpp
         SingleStreamDecoder.cpp
         # TODO: lib name should probably not be "*_decoder*" now that it also
         # contains an encoder
         Encoder.cpp
+        # TODO-Encoding remove from here. Should only be needed in custom_ops.cpp
+        AVIOBytesContext.cpp
     )
 
     if(ENABLE_CUDA)
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -1,5 +1,6 @@
 #include <sstream>
 
+#include "src/torchcodec/_core/AVIOBytesContext.h"
 #include "src/torchcodec/_core/Encoder.h"
 #include "torch/types.h"
 
@@ -40,9 +41,13 @@ AudioEncoder::~AudioEncoder() {}
 AudioEncoder::AudioEncoder(
     const torch::Tensor wf,
     int sampleRate,
-    std::string_view fileName,
+    std::optional<std::string_view> fileName,
+    std::optional<std::string_view> formatName,
     std::optional<int64_t> bit_rate)
     : wf_(wf) {
+  TORCH_CHECK(
+      fileName.has_value() ^ formatName.has_value(),
+      "Pass one of filename OR format, not both.");
   TORCH_CHECK(
       wf_.dtype() == torch::kFloat32,
       "waveform must have float32 dtype, got ",
@@ -54,25 +59,32 @@ AudioEncoder::AudioEncoder(
 
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
-  auto status = avformat_alloc_output_context2(
-      &avFormatContext, nullptr, nullptr, fileName.data());
+  int status = AVSUCCESS;
+  if (fileName.has_value()) {
+    status = avformat_alloc_output_context2(
+        &avFormatContext, nullptr, nullptr, fileName->data());
+  } else {
+    status = avformat_alloc_output_context2(
+        &avFormatContext, nullptr, formatName->data(), nullptr);
+  }
   TORCH_CHECK(
       avFormatContext != nullptr,
       "Couldn't allocate AVFormatContext. ",
       "Check the desired extension? ",
       getFFMPEGErrorStringFromErrorCode(status));
   avFormatContext_.reset(avFormatContext);
 
-  // TODO-ENCODING: Should also support encoding into bytes (use
-  // AVIOBytesContext)
-  TORCH_CHECK(
-      !(avFormatContext->oformat->flags & AVFMT_NOFILE),
-      "AVFMT_NOFILE is set. We only support writing to a file.");
-  status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
-  TORCH_CHECK(
-      status >= 0,
-      "avio_open failed: ",
-      getFFMPEGErrorStringFromErrorCode(status));
+  if (fileName.has_value()) {
+    status =
+        avio_open(&avFormatContext_->pb, fileName->data(), AVIO_FLAG_WRITE);
+    TORCH_CHECK(
+        status >= 0,
+        "avio_open failed: ",
+        getFFMPEGErrorStringFromErrorCode(status));
+  } else {
+    avioContextHolder_ = std::make_unique<AVIOToTensorContext>();
+    avFormatContext->pb = avioContextHolder_->getAVIOContext();
+  }
 
   // We use the AVFormatContext's default codec for that
   // specific format/container.
@@ -168,7 +180,18 @@ AVSampleFormat AudioEncoder::findOutputSampleFormat(const AVCodec& avCodec) {
   return avCodec.sample_fmts[0];
 }
 
+torch::Tensor AudioEncoder::encodeToTensor() { // Runs encode() and returns the bytes collected by the tensor-backed AVIO context.
+  TORCH_CHECK(
+      avioContextHolder_ != nullptr, // the constructor only creates it when no fileName was given
+      "Cannot encode to tensor, avio context doesn't exist.");
+  encode(); // output goes through the AVIO context installed on the format context in the constructor
+  return avioContextHolder_->getOutputTensor();
+}
+
 void AudioEncoder::encode() {
+  // TODO-ENCODING: Need to check, but consecutive calls to encode() are
+  // probably invalid. We can address this once we (re)design the public and
+  // private encoding APIs.
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
   //  Default to 256 like in torchaudio
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <torch/types.h>
+#include "src/torchcodec/_core/AVIOBytesContext.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 
 namespace facebook::torchcodec {
@@ -19,9 +20,11 @@ class AudioEncoder {
       // match this, and that's up to the user. If sample rates don't match,
       // encoding will still work but audio will be distorted.
       int sampleRate,
-      std::string_view fileName,
+      std::optional<std::string_view> fileName,
+      std::optional<std::string_view> formatName,
       std::optional<int64_t> bit_rate = std::nullopt);
   void encode();
+  torch::Tensor encodeToTensor();
 
  private:
   void encodeInnerLoop(
@@ -36,5 +39,8 @@ class AudioEncoder {
   UniqueSwrContext swrContext_;
 
   const torch::Tensor wf_;
+
+  // Stores the AVIOContext for the output tensor buffer.
+  std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
 };
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/__init__.py b/src/torchcodec/_core/__init__.py
@@ -18,12 +18,12 @@
     _test_frame_pts_equality,
     add_audio_stream,
     add_video_stream,
-    create_audio_encoder,
     create_from_bytes,
     create_from_file,
     create_from_file_like,
     create_from_tensor,
-    encode_audio,
+    encode_audio_to_file,
+    encode_audio_to_tensor,
     get_ffmpeg_library_versions,
     get_frame_at_index,
     get_frame_at_pts,
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
diff --git a/test/test_ops.py b/test/test_ops.py

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)`
`23`	`23`	`py::hasattr(fileLike, "seek"),`
`24`	`24`	`"File like object must implement a seek method.");`
`25`	`25`	`}`
`26`		`- createAVIOContext(&read, &seek, &fileLike_);`
	`26`	`+ createAVIOContext(&read, nullptr, &seek, &fileLike_);`
`27`	`27`	`}`
`28`	`28`
`29`	`29`	`int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {`