Commit 1541ab8

Add resize transform tests; make transforms happen before color conversion
1 parent 3a2df84 commit 1541ab8

15 files changed: +153, -135 lines

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 13 additions & 4 deletions
@@ -94,6 +94,12 @@ void CpuDeviceInterface::initializeVideo(
   // If we have any transforms, replace filters_ with the filter strings from
   // the transforms. As noted above, we decide between swscale and filtergraph
   // when we actually decode a frame.
+  //
+  // Note: We explicitly add the format conversion filter at the end to ensure
+  // that color conversion happens AFTER the transforms, not before. This
+  // matches the behavior of the reference generation in the test suite.
+  // Without this, FFmpeg's automatic format negotiation might insert the
+  // conversion before the transforms, which would produce different results.
   std::stringstream filters;
   bool first = true;
   for (const auto& transform : transforms) {
@@ -104,7 +110,10 @@ void CpuDeviceInterface::initializeVideo(
     first = false;
   }
   if (!transforms.empty()) {
-    filters_ = filters.str();
+    // Note that we ensure that the transforms come BEFORE the format
+    // conversion. This means that the transforms are applied in the frame's
+    // original pixel format and colorspace.
+    filters_ = filters.str() + filters_;
   }

   initialized_ = true;
@@ -324,17 +333,17 @@ void CpuDeviceInterface::createSwsContext(
 torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
     const UniqueAVFrame& avFrame,
     const FrameDims& outputDims) {
-  enum AVPixelFormat frameFormat =
+  enum AVPixelFormat avFrameFormat =
       static_cast<enum AVPixelFormat>(avFrame->format);

   FiltersContext filtersContext(
       avFrame->width,
       avFrame->height,
-      frameFormat,
+      avFrameFormat,
       avFrame->sample_aspect_ratio,
       outputDims.width,
       outputDims.height,
-      AV_PIX_FMT_RGB24,
+      /*outputFormat=*/AV_PIX_FMT_RGB24,
       filters_,
       timeBase_);

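To make the new ordering concrete, here is a minimal Python sketch of the string assembly the hunk above performs: user transforms form a comma-separated chain, and the preconfigured format=rgb24 conversion stays at the tail. The separator handling is an assumption, since the loop body that uses the first flag is elided from this hunk; the crop string is borrowed from the test resources further down.

    # Sketch of the assembly in CpuDeviceInterface::initializeVideo: filters_
    # starts as the format conversion, and user transforms are prepended so
    # they run in the frame's original pixel format and colorspace.
    def build_filtergraph_string(transform_filters: list[str]) -> str:
        filters_ = "format=rgb24"  # the default set in CpuDeviceInterface.h
        if transform_filters:
            # Assumed separator handling; the loop with its `first` flag is elided above.
            filters_ = ",".join(transform_filters) + "," + filters_
        return filters_

    assert build_filtergraph_string([]) == "format=rgb24"
    assert (
        build_filtergraph_string(["crop=300:200:50:35:exact=1"])
        == "crop=300:200:50:35:exact=1,format=rgb24"
    )
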
src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 25 additions & 8 deletions
@@ -109,15 +109,32 @@ class CpuDeviceInterface : public DeviceInterface {
   UniqueSwsContext swsContext_;
   SwsFrameContext prevSwsFrameContext_;

-  // The filter we supply to filterGraph_, if it is used. The default is the
-  // copy filter, which just copies the input to the output. Computationally, it
-  // should be a no-op. If we get no user-provided transforms, we will use the
-  // copy filter. Otherwise, we will construct the string from the transforms.
+  // We pass the filters to FFmpeg's filtergraph API. It is a simple pipeline
+  // of what FFmpeg calls "filters" to apply to decoded frames before returning
+  // them. In the PyTorch ecosystem, we call these "transforms". During
+  // initialization, we convert the user-supplied transforms into this string of
+  // filters.
   //
-  // Note that even if we only use the copy filter, we still get the desired
-  // colorspace conversion. We construct the filtergraph with its output sink
-  // set to RGB24.
-  std::string filters_ = "copy";
+  // Note that we start with the format conversion, and then we ensure that the
+  // user-supplied filters always happen BEFORE the format conversion. We want
+  // the user-supplied filters to operate on frames in their original pixel
+  // format and colorspace.
+  //
+  // The reason why is not obvious: when users do not need to perform any
+  // transforms, or the only transform they apply is a single resize, we can
+  // sometimes just call swscale directly; see getColorConversionLibrary() for
+  // the full conditions. A single call to swscale's sws_scale() will always do
+  // the scaling (resize) in the frame's original pixel format and colorspace.
+  // In order for calling swscale directly to be an optimization, we must make
+  // sure that the behavior between calling it directly and using filtergraph
+  // is identical.
+  //
+  // If we had to apply transforms in the output pixel format and colorspace,
+  // we could achieve that by calling sws_scale() twice: once to do the resize
+  // and another time to do the format conversion. But that goes against the
+  // whole point of calling sws_scale() directly, as it's a performance
+  // optimization.
+  std::string filters_ = "format=rgb24";

   // The flags we supply to swsContext_, if it used. The flags control the
   // resizing algorithm. We default to bilinear. Users can override this with a
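The equivalence argument above rests on the fact that resizing before and after RGB conversion produces different pixels. That claim is easy to check from the command line; a hedged sketch, assuming an ffmpeg binary on PATH and some local input.mp4 (neither is part of this commit):

    # Decode one frame twice: resized BEFORE vs AFTER the rgb24 conversion.
    # scale-then-format resizes in the source pixel format (typically yuv420p);
    # format-then-scale resizes in rgb24. The two outputs generally differ.
    import subprocess

    def first_frame(vf: str, out: str) -> None:
        subprocess.run(
            ["ffmpeg", "-y", "-i", "input.mp4", "-vf", vf, "-frames:v", "1", out],
            check=True,
        )

    first_frame("scale=480:270:flags=bilinear,format=rgb24", "resize_first.bmp")
    first_frame("format=rgb24,scale=480:270:flags=bilinear", "convert_first.bmp")

    with open("resize_first.bmp", "rb") as a, open("convert_first.bmp", "rb") as b:
        print("identical" if a.read() == b.read() else "pixel values differ")
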

src/torchcodec/_core/FFMPEGCommon.cpp

Lines changed: 21 additions & 24 deletions
@@ -399,68 +399,65 @@ SwrContext* createSwrContext(
   return swrContext;
 }

-AVFilterContext* createBuffersinkFilter(
+AVFilterContext* createAVFilterContextWithOptions(
     AVFilterGraph* filterGraph,
-    enum AVPixelFormat outputFormat) {
-  const AVFilter* buffersink = avfilter_get_by_name("buffersink");
-  TORCH_CHECK(buffersink != nullptr, "Failed to get buffersink filter.");
-
-  AVFilterContext* sinkContext = nullptr;
-  int status;
+    const AVFilter* buffer,
+    const enum AVPixelFormat outputFormat) {
+  AVFilterContext* avFilterContext = nullptr;
   const char* filterName = "out";

-  enum AVPixelFormat pix_fmts[] = {outputFormat, AV_PIX_FMT_NONE};
+  enum AVPixelFormat pixFmts[] = {outputFormat, AV_PIX_FMT_NONE};

   // av_opt_set_int_list was replaced by av_opt_set_array() in FFmpeg 8.
 #if LIBAVUTIL_VERSION_MAJOR >= 60 // FFmpeg >= 8
   // Output options like pixel_formats must be set before filter init
-  sinkContext =
-      avfilter_graph_alloc_filter(filterGraph, buffersink, filterName);
+  avFilterContext =
+      avfilter_graph_alloc_filter(filterGraph, buffer, filterName);
   TORCH_CHECK(
-      sinkContext != nullptr, "Failed to allocate buffersink filter context.");
+      avFilterContext != nullptr, "Failed to allocate buffer filter context.");

   // When setting pix_fmts, only the first element is used, so nb_elems = 1
   // AV_PIX_FMT_NONE acts as a terminator for the array in av_opt_set_int_list
-  status = av_opt_set_array(
-      sinkContext,
+  int status = av_opt_set_array(
+      avFilterContext,
       "pixel_formats",
       AV_OPT_SEARCH_CHILDREN,
       0, // start_elem
       1, // nb_elems
       AV_OPT_TYPE_PIXEL_FMT,
-      pix_fmts);
+      pixFmts);
   TORCH_CHECK(
       status >= 0,
-      "Failed to set pixel format for buffersink filter: ",
+      "Failed to set pixel format for buffer filter: ",
       getFFMPEGErrorStringFromErrorCode(status));

-  status = avfilter_init_str(sinkContext, nullptr);
+  status = avfilter_init_str(avFilterContext, nullptr);
   TORCH_CHECK(
       status >= 0,
-      "Failed to initialize buffersink filter: ",
+      "Failed to initialize buffer filter: ",
       getFFMPEGErrorStringFromErrorCode(status));
 #else // FFmpeg <= 7
   // For older FFmpeg versions, create filter and then set options
-  status = avfilter_graph_create_filter(
-      &sinkContext, buffersink, filterName, nullptr, nullptr, filterGraph);
+  int status = avfilter_graph_create_filter(
+      &avFilterContext, buffer, filterName, nullptr, nullptr, filterGraph);
   TORCH_CHECK(
       status >= 0,
-      "Failed to create buffersink filter: ",
+      "Failed to create buffer filter: ",
       getFFMPEGErrorStringFromErrorCode(status));

   status = av_opt_set_int_list(
-      sinkContext,
+      avFilterContext,
       "pix_fmts",
-      pix_fmts,
+      pixFmts,
       AV_PIX_FMT_NONE,
       AV_OPT_SEARCH_CHILDREN);
   TORCH_CHECK(
       status >= 0,
-      "Failed to set pixel formats for buffersink filter: ",
+      "Failed to set pixel formats for buffer filter: ",
       getFFMPEGErrorStringFromErrorCode(status));
 #endif

-  return sinkContext;
+  return avFilterContext;
 }

 UniqueAVFrame convertAudioAVFrameSamples(
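Which side of the LIBAVUTIL_VERSION_MAJOR >= 60 gate above applies depends on the FFmpeg build being linked. A small diagnostic sketch: avutil_version() and the top-16-bits major extraction are libavutil's public API, while the library name lookup is platform-dependent, so treat this as illustration only:

    # Probe which branch of the version gate a given installation would take.
    import ctypes
    import ctypes.util

    libname = ctypes.util.find_library("avutil")
    assert libname is not None, "libavutil not found"
    avutil = ctypes.CDLL(libname)
    avutil.avutil_version.restype = ctypes.c_uint

    # AV_VERSION_INT packs the major version into the top 16 bits.
    major = avutil.avutil_version() >> 16
    print(f"libavutil major version: {major}")
    print("av_opt_set_array" if major >= 60 else "av_opt_set_int_list")
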

src/torchcodec/_core/FFMPEGCommon.h

Lines changed: 3 additions & 2 deletions
@@ -246,8 +246,9 @@ int64_t computeSafeDuration(
     const AVRational& frameRate,
     const AVRational& timeBase);

-AVFilterContext* createBuffersinkFilter(
+AVFilterContext* createAVFilterContextWithOptions(
     AVFilterGraph* filterGraph,
-    enum AVPixelFormat outputFormat);
+    const AVFilter* buffer,
+    const enum AVPixelFormat outputFormat);

 } // namespace facebook::torchcodec

src/torchcodec/_core/FilterGraph.cpp

Lines changed: 16 additions & 7 deletions
@@ -63,8 +63,8 @@ FilterGraph::FilterGraph(
     filterGraph_->nb_threads = videoStreamOptions.ffmpegThreadCount.value();
   }

-  const AVFilter* buffersrc = avfilter_get_by_name("buffer");
-
+  // Configure the source context.
+  const AVFilter* bufferSrc = avfilter_get_by_name("buffer");
   UniqueAVBufferSrcParameters srcParams(av_buffersrc_parameters_alloc());
   TORCH_CHECK(srcParams, "Failed to allocate buffersrc params");

@@ -78,7 +78,7 @@ FilterGraph::FilterGraph(
   }

   sourceContext_ =
-      avfilter_graph_alloc_filter(filterGraph_.get(), buffersrc, "in");
+      avfilter_graph_alloc_filter(filterGraph_.get(), bufferSrc, "in");
   TORCH_CHECK(sourceContext_, "Failed to allocate filter graph");

   int status = av_buffersrc_parameters_set(sourceContext_, srcParams.get());
@@ -93,23 +93,31 @@ FilterGraph::FilterGraph(
       "Failed to create filter graph : ",
       getFFMPEGErrorStringFromErrorCode(status));

-  sinkContext_ =
-      createBuffersinkFilter(filterGraph_.get(), filtersContext.outputFormat);
+  // Configure the sink context.
+  const AVFilter* bufferSink = avfilter_get_by_name("buffersink");
+  TORCH_CHECK(bufferSink != nullptr, "Failed to get buffersink filter.");
+
+  sinkContext_ = createAVFilterContextWithOptions(
+      filterGraph_.get(), bufferSink, filtersContext.outputFormat);
   TORCH_CHECK(
       sinkContext_ != nullptr, "Failed to create and configure buffersink");

+  // Create the filtergraph nodes based on the source and sink contexts.
   UniqueAVFilterInOut outputs(avfilter_inout_alloc());
-  UniqueAVFilterInOut inputs(avfilter_inout_alloc());
-
   outputs->name = av_strdup("in");
   outputs->filter_ctx = sourceContext_;
   outputs->pad_idx = 0;
   outputs->next = nullptr;
+
+  UniqueAVFilterInOut inputs(avfilter_inout_alloc());
   inputs->name = av_strdup("out");
   inputs->filter_ctx = sinkContext_;
   inputs->pad_idx = 0;
   inputs->next = nullptr;

+  // Create the filtergraph specified by the filtergraph string in the context
+  // of the inputs and outputs. Note the dance we have to do with release and
+  // resetting the output and input nodes because FFmpeg modifies them in place.
   AVFilterInOut* outputsTmp = outputs.release();
   AVFilterInOut* inputsTmp = inputs.release();
   status = avfilter_graph_parse_ptr(
@@ -126,6 +134,7 @@
       getFFMPEGErrorStringFromErrorCode(status),
       ", provided filters: " + filtersContext.filtergraphStr);

+  // Check filtergraph validity and configure links and formats.
   status = avfilter_graph_config(filterGraph_.get(), nullptr);
   TORCH_CHECK(
       status >= 0,
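For reference, the endpoint labels the code strdup's ("in" for the buffer source, "out" for the buffersink) are the same labels the ffmpeg CLI uses around a -vf expression, and they can be written out explicitly to make the correspondence visible. A sketch assuming ffmpeg on PATH; input.mp4 and out.bmp are placeholders, not repo files:

    # CLI analogue of the graph built above: buffer source -> parsed filter
    # chain -> buffersink, with the chain running before the rgb24 conversion.
    import subprocess

    filtergraph = "[in]crop=300:200:50:35:exact=1,format=rgb24[out]"
    subprocess.run(
        ["ffmpeg", "-y", "-i", "input.mp4", "-vf", filtergraph, "-frames:v", "1", "out.bmp"],
        check=True,
    )
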
src/torchcodec/decoders/_video_decoder.py

Lines changed: 1 addition & 21 deletions
@@ -8,7 +8,7 @@
 import json
 import numbers
 from pathlib import Path
-from typing import Any, List, Literal, Optional, Tuple, Union
+from typing import Literal, Optional, Tuple, Union

 import torch
 from torch import device as torch_device, Tensor
@@ -103,7 +103,6 @@ def __init__(
         dimension_order: Literal["NCHW", "NHWC"] = "NCHW",
         num_ffmpeg_threads: int = 1,
         device: Optional[Union[str, torch_device]] = "cpu",
-        transforms: List[Any] = [],  # TRANSFORMS TODO: what is the user-facing type?
         seek_mode: Literal["exact", "approximate"] = "exact",
         custom_frame_mappings: Optional[
             Union[str, bytes, io.RawIOBase, io.BufferedReader]
@@ -149,16 +148,13 @@ def __init__(

         device_variant = _get_cuda_backend()

-        transform_specs = make_transform_specs(transforms)
-
         core.add_video_stream(
             self._decoder,
             stream_index=stream_index,
             dimension_order=dimension_order,
             num_threads=num_ffmpeg_threads,
             device=device,
             device_variant=device_variant,
-            transform_specs=transform_specs,
             custom_frame_mappings=custom_frame_mappings_data,
         )

@@ -436,22 +432,6 @@ def _get_and_validate_stream_metadata(
     )


-def make_transform_specs(transforms: List[Any]) -> str:
-    from torchvision.transforms import v2
-
-    transform_specs = []
-    for transform in transforms:
-        if isinstance(transform, v2.Resize):
-            if len(transform.size) != 2:
-                raise ValueError(
-                    f"Resize transform must have a (height, width) pair for the size, got {transform.size}."
-                )
-            transform_specs.append(f"resize, {transform.size[0]}, {transform.size[1]}")
-        else:
-            raise ValueError(f"Unsupported transform {transform}.")
-    return ";".join(transform_specs)
-
-
 def _read_custom_frame_mappings(
     custom_frame_mappings: Union[str, bytes, io.RawIOBase, io.BufferedReader]
 ) -> tuple[Tensor, Tensor, Tensor]:
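The removed helper still documents the spec format the core layer parses: one "resize, height, width" entry per transform, entries joined by ";". A sketch of exercising the core API directly, under the assumptions that core.add_video_stream keeps its transform_specs parameter (only the public wrapper plumbing is removed in this commit) and that create_from_file is the matching constructor; the file path is a placeholder:

    # Build a spec the way the removed make_transform_specs did and hand it
    # straight to the core API, bypassing the (removed) public transforms arg.
    from torchcodec import _core as core

    decoder = core.create_from_file("input.mp4")  # placeholder path
    core.add_video_stream(
        decoder,
        transform_specs="resize, 270, 480",  # "resize, height, width"
    )
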

test/generate_reference_resources.py

Lines changed: 21 additions & 8 deletions
@@ -52,15 +52,15 @@ def generate_frame_by_index(
     output_bmp = f"{base_path}.bmp"

     # Note that we have an exlicit format conversion to rgb24 in our filtergraph specification,
-    # which always happens BEFORE any of the filters that we receive as input. We do this to
-    # ensure that the color conversion happens BEFORE the filters, matching the behavior of the
+    # which always happens AFTER any of the filters that we receive as input. We do this to
+    # ensure that the color conversion happens AFTER the filters, matching the behavior of the
     # torchcodec filtergraph implementation.
-    #
-    # Not doing this would result in the color conversion happening AFTER the filters, which
-    # would result in different color values for the same frame.
-    filtergraph = f"select='eq(n\\,{frame_index})',format=rgb24"
+    select = f"select='eq(n\\,{frame_index})'"
+    format = "format=rgb24"
     if filters is not None:
-        filtergraph = filtergraph + f",{filters}"
+        filtergraph = ",".join([select, filters, format])
+    else:
+        filtergraph = ",".join([select, format])

     cmd = [
         "ffmpeg",
@@ -99,7 +99,7 @@ def generate_frame_by_timestamp(
     convert_image_to_tensor(output_path)


-def generate_nasa_13013_references():
+def generate_nasa_13013_references_by_index():
     # Note: The naming scheme used here must match the naming scheme used to load
     # tensors in ./utils.py.
     streams = [0, 3]
@@ -108,13 +108,17 @@ def generate_nasa_13013_references():
         for frame in frames:
             generate_frame_by_index(NASA_VIDEO, frame_index=frame, stream_index=stream)

+
+def generate_nasa_13013_references_by_timestamp():
     # Extract individual frames at specific timestamps, including the last frame of the video.
     seek_timestamp = [6.0, 6.1, 10.0, 12.979633]
     timestamp_name = [f"{seek_timestamp:06f}" for seek_timestamp in seek_timestamp]
     for timestamp, name in zip(seek_timestamp, timestamp_name):
         output_bmp = f"{NASA_VIDEO.path}.time{name}.bmp"
         generate_frame_by_timestamp(NASA_VIDEO.path, timestamp, output_bmp)

+
+def generate_nasa_13013_references_crop():
     # Extract frames with specific filters. We have tests that assume these exact filters.
     frames = [0, 15, 200, 389]
     crop_filter = "crop=300:200:50:35:exact=1"
@@ -123,6 +127,8 @@
                 NASA_VIDEO, frame_index=frame, stream_index=3, filters=crop_filter
             )

+
+def generate_nasa_13013_references_resize():
     frames = [17, 230, 389]
     # Note that the resize algorithm passed to flags is exposed to users,
     # but bilinear is the default we use.
@@ -133,6 +139,13 @@
         )


+def generate_nasa_13013_references():
+    generate_nasa_13013_references_by_index()
+    generate_nasa_13013_references_by_timestamp()
+    generate_nasa_13013_references_crop()
+    generate_nasa_13013_references_resize()
+
+
 def generate_h265_video_references():
     # This video was generated by running the following:
     #   conda install -c conda-forge x265
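As a sanity check, the refactored assembly above yields reference filtergraphs with the user filter between frame selection and color conversion. A small sketch reproducing the string for one of the crop references, using only values that appear in the hunks above:

    # Reproduce generate_frame_by_index's assembly for frame 389 of the crop
    # references: selection first, then the user filter, then color conversion.
    frame_index = 389
    filters = "crop=300:200:50:35:exact=1"

    select = f"select='eq(n\\,{frame_index})'"
    fmt = "format=rgb24"
    filtergraph = ",".join([select, filters, fmt]) if filters else ",".join([select, fmt])

    assert filtergraph == "select='eq(n\\,389)',crop=300:200:50:35:exact=1,format=rgb24"
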
3 binary files not shown.
