meta-pytorch
diff --git a/‎docs/source/api_ref_decoders.rst‎
Lines changed: 6 additions & 0 deletions b/‎docs/source/api_ref_decoders.rst‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/decoding/basic_cuda_example.py‎
Lines changed: 5 additions & 3 deletions b/‎examples/decoding/basic_cuda_example.py‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎src/torchcodec/_core/CudaDeviceInterface.cpp‎
Lines changed: 32 additions & 60 deletions b/‎src/torchcodec/_core/CudaDeviceInterface.cpp‎
Lines changed: 32 additions & 60 deletions
diff --git a/‎src/torchcodec/_core/CudaDeviceInterface.h‎
Lines changed: 1 addition & 1 deletion b/‎src/torchcodec/_core/CudaDeviceInterface.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 41 additions & 29 deletions b/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 41 additions & 29 deletions
diff --git a/‎src/torchcodec/_core/Encoder.h‎
Lines changed: 1 addition & 1 deletion b/‎src/torchcodec/_core/Encoder.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/_core/FFMPEGCommon.cpp‎
Lines changed: 20 additions & 0 deletions b/‎src/torchcodec/_core/FFMPEGCommon.cpp‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/FFMPEGCommon.h‎
Lines changed: 1 addition & 0 deletions b/‎src/torchcodec/_core/FFMPEGCommon.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/torchcodec/_core/StreamOptions.h‎
Lines changed: 3 additions & 3 deletions b/‎src/torchcodec/_core/StreamOptions.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/torchcodec/_core/custom_ops.cpp‎
Lines changed: 4 additions & 2 deletions b/‎src/torchcodec/_core/custom_ops.cpp‎
Lines changed: 4 additions & 2 deletions
@@ -19,6 +19,12 @@ For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_decoding_a
     VideoDecoder
     AudioDecoder
 
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+    :template: function.rst
+
+    set_cuda_backend
 
 .. autosummary::
     :toctree: generated/
 
@@ -94,9 +94,10 @@
 #
 # To use the CUDA decoder, you need to pass in a cuda device to the decoder.
 #
-from torchcodec.decoders import VideoDecoder
+from torchcodec.decoders import set_cuda_backend, VideoDecoder
 
-decoder = VideoDecoder(video_file, device="cuda")
+with set_cuda_backend("beta"):  # Use the BETA backend, it's faster!
+    decoder = VideoDecoder(video_file, device="cuda")
 frame = decoder[0]
 
 # %%
@@ -120,7 +121,8 @@
 # against equivalent results from the CPU decoders.
 timestamps = [12, 19, 45, 131, 180]
 cpu_decoder = VideoDecoder(video_file, device="cpu")
-cuda_decoder = VideoDecoder(video_file, device="cuda")
+with set_cuda_backend("beta"):
+    cuda_decoder = VideoDecoder(video_file, device="cuda")
 cpu_frames = cpu_decoder.get_frames_played_at(timestamps).data
 cuda_frames = cuda_decoder.get_frames_played_at(timestamps).data
 
 
@@ -41,27 +41,44 @@ const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
 PerGpuCache<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>
     g_cached_hw_device_ctxs(MAX_CUDA_GPUS, MAX_CONTEXTS_PER_GPU_IN_CACHE);
 
+int getFlagsAVHardwareDeviceContextCreate() {
+// Flags for av_hwdevice_ctx_create(): libavutil >= 58.26.100 can reuse the
+// existing cuda context (much faster, lower memory); older versions get 0.
 #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
+  return AV_CUDA_USE_CURRENT_CONTEXT;
+#else
+  return 0;
+#endif
+}
+
+UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
+  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
+  TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
+  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+
+  UniqueAVBufferRef hardwareDeviceCtx = g_cached_hw_device_ctxs.get(device);
+  if (hardwareDeviceCtx) {
+    return hardwareDeviceCtx;
+  }
 
-AVBufferRef* getFFMPEGContextFromExistingCudaContext(
-    const torch::Device& device,
-    torch::DeviceIndex nonNegativeDeviceIndex,
-    enum AVHWDeviceType type) {
+  // Create hardware device context
   c10::cuda::CUDAGuard deviceGuard(device);
   // Valid values for the argument to cudaSetDevice are 0 to maxDevices - 1:
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g159587909ffa0791bbe4b40187a4c6bb
   // So we ensure the deviceIndex is not negative.
   // We set the device because we may be called from a different thread than
   // the one that initialized the cuda context.
   cudaSetDevice(nonNegativeDeviceIndex);
-  AVBufferRef* hw_device_ctx = nullptr;
+  AVBufferRef* hardwareDeviceCtxRaw = nullptr;
   std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
+
   int err = av_hwdevice_ctx_create(
-      &hw_device_ctx,
+      &hardwareDeviceCtxRaw,
       type,
       deviceOrdinal.c_str(),
       nullptr,
-      AV_CUDA_USE_CURRENT_CONTEXT);
+      getFlagsAVHardwareDeviceContextCreate());
+
   if (err < 0) {
     /* clang-format off */
     TORCH_CHECK(
@@ -72,53 +89,8 @@ AVBufferRef* getFFMPEGContextFromExistingCudaContext(
         "). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(err));
     /* clang-format on */
   }
-  return hw_device_ctx;
-}
-
-#else
-
-AVBufferRef* getFFMPEGContextFromNewCudaContext(
-    [[maybe_unused]] const torch::Device& device,
-    torch::DeviceIndex nonNegativeDeviceIndex,
-    enum AVHWDeviceType type) {
-  AVBufferRef* hw_device_ctx = nullptr;
-  std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
-  int err = av_hwdevice_ctx_create(
-      &hw_device_ctx, type, deviceOrdinal.c_str(), nullptr, 0);
-  if (err < 0) {
-    TORCH_CHECK(
-        false,
-        "Failed to create specified HW device",
-        getFFMPEGErrorStringFromErrorCode(err));
-  }
-  return hw_device_ctx;
-}
 
-#endif
-
-UniqueAVBufferRef getCudaContext(const torch::Device& device) {
-  enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
-  TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
-  torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
-
-  UniqueAVBufferRef hw_device_ctx = g_cached_hw_device_ctxs.get(device);
-  if (hw_device_ctx) {
-    return hw_device_ctx;
-  }
-
-  // 58.26.100 introduced the concept of reusing the existing cuda context
-  // which is much faster and lower memory than creating a new cuda context.
-  // So we try to use that if it is available.
-  // FFMPEG 6.1.2 appears to be the earliest release that contains version
-  // 58.26.100 of avutil.
-  // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
-#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
-  return UniqueAVBufferRef(getFFMPEGContextFromExistingCudaContext(
-      device, nonNegativeDeviceIndex, type));
-#else
-  return UniqueAVBufferRef(
-      getFFMPEGContextFromNewCudaContext(device, nonNegativeDeviceIndex, type));
-#endif
+  return UniqueAVBufferRef(hardwareDeviceCtxRaw);
 }
 
 } // namespace
@@ -131,15 +103,14 @@ CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
 
   initializeCudaContextWithPytorch(device_);
 
-  // TODO rename this, this is a hardware device context, not a CUDA context!
-  // See https://github.com/meta-pytorch/torchcodec/issues/924
-  ctx_ = getCudaContext(device_);
+  hardwareDeviceCtx_ = getHardwareDeviceContext(device_);
   nppCtx_ = getNppStreamContext(device_);
 }
 
 // Offers the hardware device context back to the per-GPU cache (it is only
 // kept if the cache has capacity) so a later getHardwareDeviceContext() call
 // for the same device can reuse it, then hands the NPP stream context to
 // returnNppStreamContextToCache().
 CudaDeviceInterface::~CudaDeviceInterface() {
-  if (ctx_) {
-    g_cached_hw_device_ctxs.addIfCacheHasCapacity(device_, std::move(ctx_));
+  if (hardwareDeviceCtx_) {
+    g_cached_hw_device_ctxs.addIfCacheHasCapacity(
+        device_, std::move(hardwareDeviceCtx_));
   }
   returnNppStreamContextToCache(device_, std::move(nppCtx_));
 }
@@ -170,9 +141,10 @@ void CudaDeviceInterface::initializeVideo(
 
 // Attaches this interface's hardware device context to codecContext.
 // Fails (TORCH_CHECK) if the hardware device context was never created or if
 // codecContext is null. The codec context is given a new reference obtained
 // with av_buffer_ref(), so this object keeps its own reference.
 void CudaDeviceInterface::registerHardwareDeviceWithCodec(
     AVCodecContext* codecContext) {
-  TORCH_CHECK(ctx_, "FFmpeg HW device has not been initialized");
+  TORCH_CHECK(
+      hardwareDeviceCtx_, "Hardware device context has not been initialized");
   TORCH_CHECK(codecContext != nullptr, "codecContext is null");
-  codecContext->hw_device_ctx = av_buffer_ref(ctx_.get());
+  codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
 }
 
 UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
 
@@ -52,7 +52,7 @@ class CudaDeviceInterface : public DeviceInterface {
   VideoStreamOptions videoStreamOptions_;
   AVRational timeBase_;
 
-  UniqueAVBufferRef ctx_;
+  UniqueAVBufferRef hardwareDeviceCtx_;
   UniqueNppContext nppCtx_;
 
   // This filtergraph instance is only used for NV12 format conversion in
 
@@ -4,6 +4,10 @@
 #include "src/torchcodec/_core/Encoder.h"
 #include "torch/types.h"
 
+extern "C" {
+#include <libavutil/pixdesc.h>
+}
+
 namespace facebook::torchcodec {
 
 namespace {
@@ -587,15 +591,6 @@ void VideoEncoder::initializeEncoder(
   TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
   avCodecContext_.reset(avCodecContext);
 
-  // Set encoding options
-  // TODO-VideoEncoder: Allow bitrate to be set
-  std::optional<int> desiredBitRate = videoStreamOptions.bitRate;
-  if (desiredBitRate.has_value()) {
-    TORCH_CHECK(
-        *desiredBitRate >= 0, "bit_rate=", *desiredBitRate, " must be >= 0.");
-  }
-  avCodecContext_->bit_rate = desiredBitRate.value_or(0);
-
   // Store dimension order and input pixel format
   // TODO-VideoEncoder: Remove assumption that tensor in NCHW format
   auto sizes = frames_.sizes();
@@ -608,9 +603,15 @@ void VideoEncoder::initializeEncoder(
   outWidth_ = inWidth_;
   outHeight_ = inHeight_;
 
-  // Use YUV420P as default output format
   // TODO-VideoEncoder: Enable other pixel formats
-  outPixelFormat_ = AV_PIX_FMT_YUV420P;
+  // Let FFmpeg choose best pixel format to minimize loss
+  outPixelFormat_ = avcodec_find_best_pix_fmt_of_list(
+      getSupportedPixelFormats(*avCodec), // List of supported formats
+      AV_PIX_FMT_GBRP, // We reorder input to GBRP currently
+      0, // No alpha channel
+      nullptr // Discard conversion loss information
+  );
+  TORCH_CHECK(outPixelFormat_ != -1, "Failed to find best pix fmt")
 
   // Configure codec parameters
   avCodecContext_->codec_id = avCodec->id;
@@ -621,37 +622,39 @@ void VideoEncoder::initializeEncoder(
   avCodecContext_->time_base = {1, inFrameRate_};
   avCodecContext_->framerate = {inFrameRate_, 1};
 
-  // TODO-VideoEncoder: Allow GOP size and max B-frames to be set
-  if (videoStreamOptions.gopSize.has_value()) {
-    avCodecContext_->gop_size = *videoStreamOptions.gopSize;
-  } else {
-    avCodecContext_->gop_size = 12; // Default GOP size
+  // Set flag for containers that require extradata to be in the codec context
+  if (avFormatContext_->oformat->flags & AVFMT_GLOBALHEADER) {
+    avCodecContext_->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
   }
 
-  if (videoStreamOptions.maxBFrames.has_value()) {
-    avCodecContext_->max_b_frames = *videoStreamOptions.maxBFrames;
-  } else {
-    avCodecContext_->max_b_frames = 0; // No max B-frames to reduce compression
+  // Apply videoStreamOptions
+  AVDictionary* options = nullptr;
+  if (videoStreamOptions.crf.has_value()) {
+    av_dict_set(
+        &options,
+        "crf",
+        std::to_string(videoStreamOptions.crf.value()).c_str(),
+        0);
   }
+  int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
+  av_dict_free(&options);
 
-  int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
   TORCH_CHECK(
       status == AVSUCCESS,
       "avcodec_open2 failed: ",
       getFFMPEGErrorStringFromErrorCode(status));
 
-  AVStream* avStream = avformat_new_stream(avFormatContext_.get(), nullptr);
-  TORCH_CHECK(avStream != nullptr, "Couldn't create new stream.");
+  avStream_ = avformat_new_stream(avFormatContext_.get(), nullptr);
+  TORCH_CHECK(avStream_ != nullptr, "Couldn't create new stream.");
 
   // Set the stream time base to encode correct frame timestamps
-  avStream->time_base = avCodecContext_->time_base;
+  avStream_->time_base = avCodecContext_->time_base;
   status = avcodec_parameters_from_context(
-      avStream->codecpar, avCodecContext_.get());
+      avStream_->codecpar, avCodecContext_.get());
   TORCH_CHECK(
       status == AVSUCCESS,
       "avcodec_parameters_from_context failed: ",
       getFFMPEGErrorStringFromErrorCode(status));
-  streamIndex_ = avStream->index;
 }
 
 void VideoEncoder::encode() {
@@ -694,7 +697,7 @@ UniqueAVFrame VideoEncoder::convertTensorToAVFrame(
         outWidth_,
         outHeight_,
         outPixelFormat_,
-        SWS_BILINEAR,
+        SWS_BICUBIC, // Used by FFmpeg CLI
         nullptr,
         nullptr,
         nullptr));
@@ -757,7 +760,7 @@ void VideoEncoder::encodeFrame(
       "Error while sending frame: ",
       getFFMPEGErrorStringFromErrorCode(status));
 
-  while (true) {
+  while (status >= 0) {
     ReferenceAVPacket packet(autoAVPacket);
     status = avcodec_receive_packet(avCodecContext_.get(), packet.get());
     if (status == AVERROR(EAGAIN) || status == AVERROR_EOF) {
@@ -776,7 +779,16 @@ void VideoEncoder::encodeFrame(
         "Error receiving packet: ",
         getFFMPEGErrorStringFromErrorCode(status));
 
-    packet->stream_index = streamIndex_;
+    // The code below is borrowed from torchaudio:
+    // https://github.com/pytorch/audio/blob/b6a3368a45aaafe05f1a6a9f10c68adc5e944d9e/src/libtorio/ffmpeg/stream_writer/encoder.cpp#L46
+    // Setting packet->duration to 1 allows the last frame to be properly
+    // encoded, and needs to be set before calling av_packet_rescale_ts.
+    if (packet->duration == 0) {
+      packet->duration = 1;
+    }
+    av_packet_rescale_ts(
+        packet.get(), avCodecContext_->time_base, avStream_->time_base);
+    packet->stream_index = avStream_->index;
 
     status = av_interleaved_write_frame(avFormatContext_.get(), packet.get());
     TORCH_CHECK(
 
@@ -153,7 +153,7 @@ class VideoEncoder {
 
   UniqueEncodingAVFormatContext avFormatContext_;
   UniqueAVCodecContext avCodecContext_;
-  int streamIndex_ = -1;
+  // Non-owning pointer to the output stream created by avformat_new_stream()
+  // in initializeEncoder(). Initialized to nullptr so it never holds an
+  // indeterminate value before the encoder is initialized.
+  AVStream* avStream_ = nullptr;
   UniqueSwsContext swsContext_;
 
   const torch::Tensor frames_;
 
@@ -90,6 +90,26 @@ const int* getSupportedSampleRates(const AVCodec& avCodec) {
   return supportedSampleRates;
 }
 
+// Returns the list of pixel formats that avCodec reports as supported.
+//
+// On FFmpeg >= 7.1 the list is queried with avcodec_get_supported_config();
+// on older versions it is read from AVCodec::pix_fmts. Either source may
+// yield a null list, and callers such as avcodec_find_best_pix_fmt_of_list()
+// iterate the list without a null check, so a null result is rejected here
+// for both code paths.
+//
+// Throws (via TORCH_CHECK) if the query fails or the list is null.
+const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec) {
+  const AVPixelFormat* supportedPixelFormats = nullptr;
+#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
+  int numPixelFormats = 0;
+  int ret = avcodec_get_supported_config(
+      nullptr,
+      &avCodec,
+      AV_CODEC_CONFIG_PIX_FORMAT,
+      0,
+      reinterpret_cast<const void**>(&supportedPixelFormats),
+      &numPixelFormats);
+  TORCH_CHECK(ret >= 0, "Couldn't get supported pixel formats from encoder.");
+#else
+  supportedPixelFormats = avCodec.pix_fmts;
+#endif
+  // Applies to both branches: never hand a null list back to the caller.
+  TORCH_CHECK(
+      supportedPixelFormats != nullptr,
+      "Couldn't get supported pixel formats from encoder.");
+  return supportedPixelFormats;
+}
+
 const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec) {
   const AVSampleFormat* supportedSampleFormats = nullptr;
 #if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
 
@@ -168,6 +168,7 @@ void setDuration(const UniqueAVFrame& frame, int64_t duration);
 
 const int* getSupportedSampleRates(const AVCodec& avCodec);
 const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec);
+const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec);
 
 int getNumChannels(const UniqueAVFrame& avFrame);
 int getNumChannels(const UniqueAVCodecContext& avCodecContext);
 
@@ -45,9 +45,9 @@ struct VideoStreamOptions {
   std::string_view deviceVariant = "default";
 
   // Encoding options
-  std::optional<int> bitRate;
-  std::optional<int> gopSize;
-  std::optional<int> maxBFrames;
+  // TODO-VideoEncoder: Consider adding other optional fields here
+  // (bit rate, gop size, max b frames, preset)
+  std::optional<int> crf;
 };
 
 struct AudioStreamOptions {
 
@@ -33,7 +33,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
   m.def(
-      "encode_video_to_file(Tensor frames, int frame_rate, str filename) -> ()");
+      "encode_video_to_file(Tensor frames, int frame_rate, str filename, int? crf=None) -> ()");
   m.def(
       "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> Tensor");
   m.def(
@@ -501,8 +501,10 @@ OpsAudioFramesOutput get_frames_by_pts_in_range_audio(
 void encode_video_to_file(
     const at::Tensor& frames,
     int64_t frame_rate,
-    std::string_view file_name) {
+    std::string_view file_name,
+    std::optional<int64_t> crf = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.crf = crf;
   VideoEncoder(
       frames,
       validateInt64ToInt(frame_rate, "frame_rate"),