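Handle options: rename VideoStreamDecoderOptions -> VideoStreamOptions and AudioStreamDecoderOptions -> AudioStreamOptions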

NicolasHug · NicolasHug · commit d914615914b4 · 2025-01-22T12:32:29.000Z
diff --git a/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp b/src/torchcodec/decoders/_core/CPUOnlyDevice.cpp
@@ -16,7 +16,7 @@ namespace facebook::torchcodec {
 
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
-    [[maybe_unused]] const VideoDecoder::VideoStreamDecoderOptions& options,
+    [[maybe_unused]] const VideoDecoder::VideoStreamOptions& options,
     [[maybe_unused]] VideoDecoder::RawDecodedOutput& rawOutput,
     [[maybe_unused]] VideoDecoder::DecodedOutput& output,
     [[maybe_unused]] std::optional<torch::Tensor> preAllocatedOutputTensor) {
diff --git a/src/torchcodec/decoders/_core/CudaDevice.cpp b/src/torchcodec/decoders/_core/CudaDevice.cpp
@@ -185,7 +185,7 @@ void initializeContextOnCuda(
 
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& options,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
diff --git a/src/torchcodec/decoders/_core/DeviceInterface.h b/src/torchcodec/decoders/_core/DeviceInterface.h
@@ -31,7 +31,7 @@ void initializeContextOnCuda(
 
 void convertAVFrameToDecodedOutputOnCuda(
     const torch::Device& device,
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& options,
     VideoDecoder::RawDecodedOutput& rawOutput,
     VideoDecoder::DecodedOutput& output,
     std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -124,7 +124,7 @@ VideoDecoder::ColorConversionLibrary getDefaultColorConversionLibrary(
 torch::Tensor VideoDecoder::maybePermuteHWC2CHW(
     int streamIndex,
     torch::Tensor& hwcTensor) {
-  if (streamInfos_[streamIndex].options.dimensionOrder == "NHWC") {
+  if (streamInfos_[streamIndex].videoStreamOptions.dimensionOrder == "NHWC") {
     return hwcTensor;
   }
   auto numDimensions = hwcTensor.dim();
@@ -141,7 +141,7 @@ torch::Tensor VideoDecoder::maybePermuteHWC2CHW(
   }
 }
 
-VideoDecoder::VideoStreamDecoderOptions::VideoStreamDecoderOptions(
+VideoDecoder::VideoStreamOptions::VideoStreamOptions(
     const std::string& optionsString) {
   std::vector<std::string> tokens =
       splitStringWithDelimiters(optionsString, ",");
@@ -194,14 +194,14 @@ VideoDecoder::VideoStreamDecoderOptions::VideoStreamDecoderOptions(
 
 VideoDecoder::BatchDecodedOutput::BatchDecodedOutput(
     int64_t numFrames,
-    const VideoStreamDecoderOptions& options,
+    const VideoStreamOptions& videoStreamOptions,
     const StreamMetadata& streamMetadata)
     : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
       durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
-  auto frameDims = getHeightAndWidthFromOptionsOrMetadata(options, streamMetadata);
+  auto frameDims = getHeightAndWidthFromOptionsOrMetadata(videoStreamOptions, streamMetadata);
   int height = frameDims.height;
   int width = frameDims.width;
-  frames = allocateEmptyHWCTensor(height, width, options.device, numFrames);
+  frames = allocateEmptyHWCTensor(height, width, videoStreamOptions.device, numFrames);
 }
 
 bool VideoDecoder::DecodedFrameContext::operator==(
@@ -338,9 +338,9 @@ void VideoDecoder::createFilterGraph(
   filterState.filterGraph.reset(avfilter_graph_alloc());
   TORCH_CHECK(filterState.filterGraph.get() != nullptr);
 
-  if (streamInfo.options.ffmpegThreadCount.has_value()) {
+  if (streamInfo.videoStreamOptions.ffmpegThreadCount.has_value()) {
     filterState.filterGraph->nb_threads =
-        streamInfo.options.ffmpegThreadCount.value();
+        streamInfo.videoStreamOptions.ffmpegThreadCount.value();
   }
 
   const AVFilter* buffersrc = avfilter_get_by_name("buffer");
@@ -444,7 +444,7 @@ int VideoDecoder::getBestStreamIndex(AVMediaType mediaType) {
 
 void VideoDecoder::addVideoStreamDecoder(
     int preferredStreamIndex,
-    const VideoStreamDecoderOptions& options) {
+    const VideoStreamOptions& videoStreamOptions) {
   if (activeStreamIndices_.count(preferredStreamIndex) > 0) {
     throw std::invalid_argument(
         "Stream with index " + std::to_string(preferredStreamIndex) +
@@ -484,26 +484,26 @@ void VideoDecoder::addVideoStreamDecoder(
         " is not a video stream.");
   }
 
-  if (options.device.type() == torch::kCUDA) {
-    codec = findCudaCodec(options.device, streamInfo.stream->codecpar->codec_id)
+  if (videoStreamOptions.device.type() == torch::kCUDA) {
+    codec = findCudaCodec(videoStreamOptions.device, streamInfo.stream->codecpar->codec_id)
                 .value_or(codec);
   }
 
   AVCodecContext* codecContext = avcodec_alloc_context3(codec);
   TORCH_CHECK(codecContext != nullptr);
-  codecContext->thread_count = options.ffmpegThreadCount.value_or(0);
+  codecContext->thread_count = videoStreamOptions.ffmpegThreadCount.value_or(0);
   streamInfo.codecContext.reset(codecContext);
 
   int retVal = avcodec_parameters_to_context(
       streamInfo.codecContext.get(), streamInfo.stream->codecpar);
   TORCH_CHECK_EQ(retVal, AVSUCCESS);
 
-  if (options.device.type() == torch::kCPU) {
+  if (videoStreamOptions.device.type() == torch::kCPU) {
     // No more initialization needed for CPU.
-  } else if (options.device.type() == torch::kCUDA) {
-    initializeContextOnCuda(options.device, codecContext);
+  } else if (videoStreamOptions.device.type() == torch::kCUDA) {
+    initializeContextOnCuda(videoStreamOptions.device, codecContext);
   } else {
-    TORCH_CHECK(false, "Invalid device type: " + options.device.str());
+    TORCH_CHECK(false, "Invalid device type: " + videoStreamOptions.device.str());
   }
 
   retVal = avcodec_open2(streamInfo.codecContext.get(), codec, nullptr);
@@ -514,7 +514,7 @@ void VideoDecoder::addVideoStreamDecoder(
   codecContext->time_base = streamInfo.stream->time_base;
   activeStreamIndices_.insert(streamIndex);
   updateMetadataWithCodecContext(streamInfo.streamIndex, codecContext);
-  streamInfo.options = options;
+  streamInfo.videoStreamOptions = videoStreamOptions;
 
   // By default, we want to use swscale for color conversion because it is
   // faster. However, it has width requirements, so we may need to fall back
@@ -523,10 +523,10 @@ void VideoDecoder::addVideoStreamDecoder(
   // swscale's width requirements to be violated. We don't expose the ability to
   // choose color conversion library publicly; we only use this ability
   // internally.
-  int width = options.width.value_or(codecContext->width);
+  int width = videoStreamOptions.width.value_or(codecContext->width);
   auto defaultLibrary = getDefaultColorConversionLibrary(width);
   streamInfo.colorConversionLibrary =
-      options.colorConversionLibrary.value_or(defaultLibrary);
+      videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
 }
 
 void VideoDecoder::updateMetadataWithCodecContext(
@@ -920,19 +920,19 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
   output.durationSeconds = ptsToSeconds(
       getDuration(avFrame), formatContext_->streams[streamIndex]->time_base);
   // TODO: we should fold preAllocatedOutputTensor into RawDecodedOutput.
-  if (streamInfo.options.device.type() == torch::kCPU) {
+  if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
     convertAVFrameToDecodedOutputOnCPU(
         rawOutput, output, preAllocatedOutputTensor);
-  } else if (streamInfo.options.device.type() == torch::kCUDA) {
+  } else if (streamInfo.videoStreamOptions.device.type() == torch::kCUDA) {
     convertAVFrameToDecodedOutputOnCuda(
-        streamInfo.options.device,
-        streamInfo.options,
+        streamInfo.videoStreamOptions.device,
+        streamInfo.videoStreamOptions,
         rawOutput,
         output,
         preAllocatedOutputTensor);
   } else {
     TORCH_CHECK(
-        false, "Invalid device type: " + streamInfo.options.device.str());
+        false, "Invalid device type: " + streamInfo.videoStreamOptions.device.str());
   }
   return output;
 }
@@ -955,7 +955,7 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
   auto& streamInfo = streamInfos_[streamIndex];
 
   auto frameDims =
-      getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, *avFrame);
+      getHeightAndWidthFromOptionsOrAVFrame(streamInfo.videoStreamOptions, *avFrame);
   int expectedOutputHeight = frameDims.height;
   int expectedOutputWidth = frameDims.width;
 
@@ -1262,8 +1262,8 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
 
   const auto& streamMetadata = containerMetadata_.streamMetadatas[streamIndex];
   const auto& streamInfo = streamInfos_[streamIndex];
-  const auto& options = streamInfo.options;
-  BatchDecodedOutput output(frameIndices.size(), options, streamMetadata);
+  const auto& videoStreamOptions = streamInfo.videoStreamOptions;
+  BatchDecodedOutput output(frameIndices.size(), videoStreamOptions, streamMetadata);
 
   auto previousIndexInVideo = -1;
   for (size_t f = 0; f < frameIndices.size(); ++f) {
@@ -1344,8 +1344,8 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesInRange(
       step > 0, "Step must be greater than 0; is " + std::to_string(step));
 
   int64_t numOutputFrames = std::ceil((stop - start) / double(step));
-  const auto& options = streamInfo.options;
-  BatchDecodedOutput output(numOutputFrames, options, streamMetadata);
+  const auto& videoStreamOptions = streamInfo.videoStreamOptions;
+  BatchDecodedOutput output(numOutputFrames, videoStreamOptions, streamMetadata);
 
   for (int64_t i = start, f = 0; i < stop; i += step, ++f) {
     DecodedOutput singleOut =
@@ -1372,7 +1372,7 @@ VideoDecoder::getFramesPlayedByTimestampInRange(
           std::to_string(stopSeconds) + ".");
 
   const auto& streamInfo = streamInfos_[streamIndex];
-  const auto& options = streamInfo.options;
+  const auto& videoStreamOptions = streamInfo.videoStreamOptions;
 
   // Special case needed to implement a half-open range. At first glance, this
   // may seem unnecessary, as our search for stopFrame can return the end, and
@@ -1392,7 +1392,7 @@ VideoDecoder::getFramesPlayedByTimestampInRange(
   // values of the intervals will map to the same frame indices below. Hence, we
   // need this special case below.
   if (startSeconds == stopSeconds) {
-    BatchDecodedOutput output(0, options, streamMetadata);
+    BatchDecodedOutput output(0, videoStreamOptions, streamMetadata);
     output.frames = maybePermuteHWC2CHW(streamIndex, output.frames);
     return output;
   }
@@ -1429,7 +1429,7 @@ VideoDecoder::getFramesPlayedByTimestampInRange(
       secondsToIndexUpperBound(stopSeconds, streamInfo, streamMetadata);
   int64_t numFrames = stopFrameIndex - startFrameIndex;
 
-  BatchDecodedOutput output(numFrames, options, streamMetadata);
+  BatchDecodedOutput output(numFrames, videoStreamOptions, streamMetadata);
   for (int64_t i = startFrameIndex, f = 0; i < stopFrameIndex; ++i, ++f) {
     DecodedOutput singleOut =
         getFrameAtIndexInternal(streamIndex, i, output.frames[f]);
@@ -1584,7 +1584,7 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
 
 VideoDecoder::~VideoDecoder() {
   for (auto& [streamIndex, streamInfo] : streamInfos_) {
-    auto& device = streamInfo.options.device;
+    auto& device = streamInfo.videoStreamOptions.device;
     if (device.type() == torch::kCPU) {
     } else if (device.type() == torch::kCUDA) {
       releaseContextOnCuda(device, streamInfo.codecContext.get());
@@ -1599,19 +1599,19 @@ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) {
 }
 
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& videoStreamOptions,
     const VideoDecoder::StreamMetadata& streamMetadata) {
   return FrameDims(
-      options.height.value_or(*streamMetadata.height),
-      options.width.value_or(*streamMetadata.width));
+      videoStreamOptions.height.value_or(*streamMetadata.height),
+      videoStreamOptions.width.value_or(*streamMetadata.width));
 }
 
 FrameDims getHeightAndWidthFromOptionsOrAVFrame(
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& videoStreamOptions,
     const AVFrame& avFrame) {
   return FrameDims(
-      options.height.value_or(avFrame.height),
-      options.width.value_or(avFrame.width));
+      videoStreamOptions.height.value_or(avFrame.height),
+      videoStreamOptions.width.value_or(avFrame.width));
 }
 
 torch::Tensor allocateEmptyHWCTensor(
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -130,9 +130,9 @@ class VideoDecoder {
     // Use the libswscale library for color conversion.
     SWSCALE
   };
-  struct VideoStreamDecoderOptions {
-    VideoStreamDecoderOptions() {}
-    explicit VideoStreamDecoderOptions(const std::string& optionsString);
+  struct VideoStreamOptions {
+    VideoStreamOptions() {}
+    explicit VideoStreamOptions(const std::string& optionsString);
     // Number of threads we pass to FFMPEG for decoding.
     // 0 means FFMPEG will choose the number of threads automatically to fully
     // utilize all cores. If not set, it will be the default FFMPEG behavior for
@@ -149,13 +149,13 @@ class VideoDecoder {
     // By default we use CPU for decoding for both C++ and python users.
     torch::Device device = torch::kCPU;
   };
-  struct AudioStreamDecoderOptions {};
+  struct AudioStreamOptions {};
   void addVideoStreamDecoder(
       int streamIndex,
-      const VideoStreamDecoderOptions& options = VideoStreamDecoderOptions());
+      const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
   void addAudioStreamDecoder(
       int streamIndex,
-      const AudioStreamDecoderOptions& options = AudioStreamDecoderOptions());
+      const AudioStreamOptions& audioStreamOptions = AudioStreamOptions());
 
   torch::Tensor maybePermuteHWC2CHW(int streamIndex, torch::Tensor& hwcTensor);
 
@@ -214,7 +214,7 @@ class VideoDecoder {
 
     explicit BatchDecodedOutput(
         int64_t numFrames,
-        const VideoStreamDecoderOptions& options,
+        const VideoStreamOptions& videoStreamOptions,
         const StreamMetadata& streamMetadata);
   };
 
@@ -313,7 +313,7 @@ class VideoDecoder {
     // this pts to the user when they request a frame.
     // We update this field if the user requested a seek.
     int64_t discardFramesBeforePts = INT64_MIN;
-    VideoStreamDecoderOptions options;
+    VideoStreamOptions videoStreamOptions;
     // The filter state associated with this stream (for video streams). The
     // actual graph will be nullptr for inactive streams.
     FilterState filterState;
@@ -488,11 +488,11 @@ struct FrameDims {
 FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
 
 FrameDims getHeightAndWidthFromOptionsOrMetadata(
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& videoStreamOptions,
     const VideoDecoder::StreamMetadata& streamMetadata);
 
 FrameDims getHeightAndWidthFromOptionsOrAVFrame(
-    const VideoDecoder::VideoStreamDecoderOptions& options,
+    const VideoDecoder::VideoStreamOptions& videoStreamOptions,
     const AVFrame& avFrame);
 
 torch::Tensor allocateEmptyHWCTensor(
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -180,23 +180,23 @@ void _add_video_stream(
     std::optional<int64_t> stream_index,
     std::optional<std::string_view> device,
     std::optional<std::string_view> color_conversion_library) {
-  VideoDecoder::VideoStreamDecoderOptions options;
-  options.width = width;
-  options.height = height;
-  options.ffmpegThreadCount = num_threads;
+  VideoDecoder::VideoStreamOptions videoStreamOptions;
+  videoStreamOptions.width = width;
+  videoStreamOptions.height = height;
+  videoStreamOptions.ffmpegThreadCount = num_threads;
 
   if (dimension_order.has_value()) {
     std::string stdDimensionOrder{dimension_order.value()};
     TORCH_CHECK(stdDimensionOrder == "NHWC" || stdDimensionOrder == "NCHW");
-    options.dimensionOrder = stdDimensionOrder;
+    videoStreamOptions.dimensionOrder = stdDimensionOrder;
   }
   if (color_conversion_library.has_value()) {
     std::string stdColorConversionLibrary{color_conversion_library.value()};
     if (stdColorConversionLibrary == "filtergraph") {
-      options.colorConversionLibrary =
+      videoStreamOptions.colorConversionLibrary =
           VideoDecoder::ColorConversionLibrary::FILTERGRAPH;
     } else if (stdColorConversionLibrary == "swscale") {
-      options.colorConversionLibrary =
+      videoStreamOptions.colorConversionLibrary =
           VideoDecoder::ColorConversionLibrary::SWSCALE;
     } else {
       throw std::runtime_error(
@@ -206,10 +206,10 @@ void _add_video_stream(
   }
   if (device.has_value()) {
     if (device.value() == "cpu") {
-      options.device = torch::Device(torch::kCPU);
+      videoStreamOptions.device = torch::Device(torch::kCPU);
     } else if (device.value().rfind("cuda", 0) == 0) { // starts with "cuda"
       std::string deviceStr(device.value());
-      options.device = torch::Device(deviceStr);
+      videoStreamOptions.device = torch::Device(deviceStr);
     } else {
       throw std::runtime_error(
           "Invalid device=" + std::string(device.value()) +
@@ -218,7 +218,7 @@ void _add_video_stream(
   }
 
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  videoDecoder->addVideoStreamDecoder(stream_index.value_or(-1), options);
+  videoDecoder->addVideoStreamDecoder(stream_index.value_or(-1), videoStreamOptions);
 }
 
 void seek_to_pts(at::Tensor& decoder, double seconds) {
diff --git a/test/decoders/VideoDecoderTest.cpp b/test/decoders/VideoDecoderTest.cpp