Merged
Changes from 7 commits
11 changes: 7 additions & 4 deletions src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -80,7 +80,10 @@ void CpuDeviceInterface::initializeVideo(
first = false;
}
if (!transforms.empty()) {
filters_ = filters.str();
// Note that we ensure that the transforms come BEFORE the format
// conversion. This means that the transforms are applied in the frame's
// original pixel format and colorspace.
filters_ = filters.str() + "," + filters_;
}

initialized_ = true;
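A minimal sketch (hypothetical free function; the real logic lives in initializeVideo() and writes to the filters_ member) of how the final filter string is assembled so that user-supplied transforms run before the explicit format conversion:

```cpp
#include <sstream>
#include <string>
#include <vector>

// Sketch only: join the user-supplied transforms, then append the format
// conversion so the transforms run in the frame's original pixel format
// and colorspace.
std::string assembleFilters(const std::vector<std::string>& transforms) {
  std::string filters = "format=rgb24"; // the default when there are no transforms
  if (transforms.empty()) {
    return filters;
  }
  std::ostringstream userFilters;
  bool first = true;
  for (const auto& transform : transforms) {
    if (!first) {
      userFilters << ",";
    }
    userFilters << transform;
    first = false;
  }
  return userFilters.str() + "," + filters;
}

// assembleFilters({"crop=300:200:50:35:exact=1"})
//   -> "crop=300:200:50:35:exact=1,format=rgb24"
```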
@@ -256,17 +259,17 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
const UniqueAVFrame& avFrame,
const FrameDims& outputDims) {
enum AVPixelFormat frameFormat =
enum AVPixelFormat avFrameFormat =
static_cast<enum AVPixelFormat>(avFrame->format);

FiltersContext filtersContext(
avFrame->width,
avFrame->height,
frameFormat,
avFrameFormat,
avFrame->sample_aspect_ratio,
outputDims.width,
outputDims.height,
AV_PIX_FMT_RGB24,
/*outputFormat=*/AV_PIX_FMT_RGB24,
filters_,
timeBase_);

32 changes: 24 additions & 8 deletions src/torchcodec/_core/CpuDeviceInterface.h
@@ -87,15 +87,31 @@ class CpuDeviceInterface : public DeviceInterface {
UniqueSwsContext swsContext_;
SwsFrameContext prevSwsFrameContext_;

// The filter we supply to filterGraph_, if it is used. The default is the
// copy filter, which just copies the input to the output. Computationally, it
// should be a no-op. If we get no user-provided transforms, we will use the
// copy filter. Otherwise, we will construct the string from the transforms.
// We pass the filters to FFmpeg's filtergraph API. It is a simple pipeline
// of what FFmpeg calls "filters" to apply to decoded frames before returning
// them. In the PyTorch ecosystem, we call these "transforms". During
// initialization, we convert the user-supplied transforms into this string of
// filters.
//
// Note that even if we only use the copy filter, we still get the desired
// colorspace conversion. We construct the filtergraph with its output sink
// set to RGB24.
std::string filters_ = "copy";
// Note that the string starts out as just the format conversion; we then
// ensure that the user-supplied filters always come BEFORE the format
// conversion. We want the user-supplied filters to operate on frames in
// their original pixel format and colorspace.
//
// The reason why is not obvious: when users do not need to perform any
// transforms, or the only transform they apply is a single resize, we can
// sometimes just call swscale directly; see getColorConversionLibrary() for
// the full conditions. A single call to swscale's sws_scale() will always do
// the scaling (resize) in the frame's original pixel format and colorspace.
// In order for calling swscale directly to be an optimization, we must make
// sure that the behavior between calling it directly and using filtergraph
// is identical.
//
// If we had to apply transforms in the output pixel format and colorspace,
// we could achieve that by calling sws_scale() twice: once to do the resize
// and another time to do the format conversion. But that would be slower,
// defeating the whole point of calling sws_scale() directly.
std::string filters_ = "format=rgb24";
Contributor Author:
@NicolasHug, do you think I should pull this explanation out into a named section, and then refer to it in a few different places?

Contributor:
I think it's fine here. I'd just suggest removing (or rewriting?) the last paragraph because it seems redundant with the second.


Is there any difference between "copy" and "format=rgb24" when passing AV_PIX_FMT_RGB24? I think we have made this slight change now.

filtersContext(
      avFrame->width,
      avFrame->height,
      avFrameFormat,
      avFrame->sample_aspect_ratio,
      outputDims.width,
      outputDims.height,
      /*outputFormat=*/AV_PIX_FMT_RGB24,
      filters_,  # "copy" vs "format=rgb24" ????
      timeBase_);

I might suggest leaving the default as "copy"?

Contributor Author:
I used "copy" before because the empty string is not a valid filtergraph (I tried). So we needed to provide something, and "copy" is the closest I could find to a no-op. That, however, was back when the transforms were okay to happen in the input colorspace; we were okay with the automatically inserted format conversions.

Now that we want the transforms to happen in RGB24, we need to explicitly specify the format conversion. For the no-transformations case, there is no difference between explicitly specifying "format=rgb24" and not. However, when there are transformations, the explicit "format=rgb24" makes sure that the transforms happen in the RGB24 colorspace. Without it, filtergraph will automatically insert a format filter after our transforms to do the conversion we specified when we set the output node to AV_PIX_FMT_RGB24.

With all that said, I don't love the current logic as it does require more knowledge. I'm going to try to make it more clear without introducing "copy".
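An illustrative contrast of the placements being discussed (the filter strings here are hypothetical examples, not values from this PR):

```cpp
// Illustration only; the real string is assembled in
// CpuDeviceInterface::initializeVideo().

// Transform runs on the original (e.g. yuv420p) frames; this is what the
// final version of this PR produces.
const char* transformFirst = "crop=300:200:50:35,format=rgb24";

// Transform runs on RGB24 frames.
const char* convertFirst = "format=rgb24,crop=300:200:50:35";

// No explicit conversion: filtergraph auto-inserts one to satisfy the
// AV_PIX_FMT_RGB24 output sink, after our transforms.
const char* implicitConvert = "crop=300:200:50:35";
```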


// The flags we supply to swsContext_, if it is used. The flags control the
// resizing algorithm. We default to bilinear. Users can override this with a
45 changes: 21 additions & 24 deletions src/torchcodec/_core/FFMPEGCommon.cpp
@@ -399,68 +399,65 @@ SwrContext* createSwrContext(
return swrContext;
}

AVFilterContext* createBuffersinkFilter(
AVFilterContext* createAVFilterContextWithOptions(
AVFilterGraph* filterGraph,
enum AVPixelFormat outputFormat) {
const AVFilter* buffersink = avfilter_get_by_name("buffersink");
TORCH_CHECK(buffersink != nullptr, "Failed to get buffersink filter.");

AVFilterContext* sinkContext = nullptr;
int status;
const AVFilter* buffer,
const enum AVPixelFormat outputFormat) {
AVFilterContext* avFilterContext = nullptr;
const char* filterName = "out";

enum AVPixelFormat pix_fmts[] = {outputFormat, AV_PIX_FMT_NONE};
enum AVPixelFormat pixFmts[] = {outputFormat, AV_PIX_FMT_NONE};

// av_opt_set_int_list was replaced by av_opt_set_array() in FFmpeg 8.
#if LIBAVUTIL_VERSION_MAJOR >= 60 // FFmpeg >= 8
// Output options like pixel_formats must be set before filter init
sinkContext =
avfilter_graph_alloc_filter(filterGraph, buffersink, filterName);
avFilterContext =
avfilter_graph_alloc_filter(filterGraph, buffer, filterName);
TORCH_CHECK(
sinkContext != nullptr, "Failed to allocate buffersink filter context.");
avFilterContext != nullptr, "Failed to allocate buffer filter context.");

// When setting pix_fmts, only the first element is used, so nb_elems = 1
// AV_PIX_FMT_NONE acts as a terminator for the array in av_opt_set_int_list
status = av_opt_set_array(
sinkContext,
int status = av_opt_set_array(
avFilterContext,
"pixel_formats",
AV_OPT_SEARCH_CHILDREN,
0, // start_elem
1, // nb_elems
AV_OPT_TYPE_PIXEL_FMT,
pix_fmts);
pixFmts);
TORCH_CHECK(
status >= 0,
"Failed to set pixel format for buffersink filter: ",
"Failed to set pixel format for buffer filter: ",
getFFMPEGErrorStringFromErrorCode(status));

status = avfilter_init_str(sinkContext, nullptr);
status = avfilter_init_str(avFilterContext, nullptr);
TORCH_CHECK(
status >= 0,
"Failed to initialize buffersink filter: ",
"Failed to initialize buffer filter: ",
getFFMPEGErrorStringFromErrorCode(status));
#else // FFmpeg <= 7
// For older FFmpeg versions, create filter and then set options
status = avfilter_graph_create_filter(
&sinkContext, buffersink, filterName, nullptr, nullptr, filterGraph);
int status = avfilter_graph_create_filter(
&avFilterContext, buffer, filterName, nullptr, nullptr, filterGraph);
TORCH_CHECK(
status >= 0,
"Failed to create buffersink filter: ",
"Failed to create buffer filter: ",
getFFMPEGErrorStringFromErrorCode(status));

status = av_opt_set_int_list(
sinkContext,
avFilterContext,
"pix_fmts",
pix_fmts,
pixFmts,
AV_PIX_FMT_NONE,
AV_OPT_SEARCH_CHILDREN);
TORCH_CHECK(
status >= 0,
"Failed to set pixel formats for buffersink filter: ",
"Failed to set pixel formats for buffer filter: ",
getFFMPEGErrorStringFromErrorCode(status));
#endif

return sinkContext;
return avFilterContext;
}

UniqueAVFrame convertAudioAVFrameSamples(
5 changes: 3 additions & 2 deletions src/torchcodec/_core/FFMPEGCommon.h
@@ -247,9 +247,10 @@ int64_t computeSafeDuration(
const AVRational& frameRate,
const AVRational& timeBase);

AVFilterContext* createBuffersinkFilter(
AVFilterContext* createAVFilterContextWithOptions(
AVFilterGraph* filterGraph,
enum AVPixelFormat outputFormat);
const AVFilter* buffer,
const enum AVPixelFormat outputFormat);

struct SwsFrameContext {
int inputWidth = 0;
23 changes: 16 additions & 7 deletions src/torchcodec/_core/FilterGraph.cpp
@@ -63,8 +63,8 @@ FilterGraph::FilterGraph(
filterGraph_->nb_threads = videoStreamOptions.ffmpegThreadCount.value();
}

const AVFilter* buffersrc = avfilter_get_by_name("buffer");

// Configure the source context.
const AVFilter* bufferSrc = avfilter_get_by_name("buffer");
UniqueAVBufferSrcParameters srcParams(av_buffersrc_parameters_alloc());
TORCH_CHECK(srcParams, "Failed to allocate buffersrc params");

@@ -78,7 +78,7 @@
}

sourceContext_ =
avfilter_graph_alloc_filter(filterGraph_.get(), buffersrc, "in");
avfilter_graph_alloc_filter(filterGraph_.get(), bufferSrc, "in");
TORCH_CHECK(sourceContext_, "Failed to allocate filter graph");

int status = av_buffersrc_parameters_set(sourceContext_, srcParams.get());
@@ -93,23 +93,31 @@ FilterGraph::FilterGraph(
"Failed to create filter graph : ",
getFFMPEGErrorStringFromErrorCode(status));

sinkContext_ =
createBuffersinkFilter(filterGraph_.get(), filtersContext.outputFormat);
// Configure the sink context.
const AVFilter* bufferSink = avfilter_get_by_name("buffersink");
TORCH_CHECK(bufferSink != nullptr, "Failed to get buffersink filter.");

sinkContext_ = createAVFilterContextWithOptions(
filterGraph_.get(), bufferSink, filtersContext.outputFormat);
TORCH_CHECK(
sinkContext_ != nullptr, "Failed to create and configure buffersink");

// Create the filtergraph nodes based on the source and sink contexts.
UniqueAVFilterInOut outputs(avfilter_inout_alloc());
UniqueAVFilterInOut inputs(avfilter_inout_alloc());

outputs->name = av_strdup("in");
outputs->filter_ctx = sourceContext_;
outputs->pad_idx = 0;
outputs->next = nullptr;

UniqueAVFilterInOut inputs(avfilter_inout_alloc());
inputs->name = av_strdup("out");
inputs->filter_ctx = sinkContext_;
inputs->pad_idx = 0;
inputs->next = nullptr;

// Create the filtergraph specified by the filtergraph string in the context
// of the inputs and outputs. Note the dance we have to do, releasing and then
// resetting the output and input nodes, because FFmpeg modifies them in place.
AVFilterInOut* outputsTmp = outputs.release();
AVFilterInOut* inputsTmp = inputs.release();
status = avfilter_graph_parse_ptr(
@@ -126,6 +134,7 @@ FilterGraph::FilterGraph(
getFFMPEGErrorStringFromErrorCode(status),
", provided filters: " + filtersContext.filtergraphStr);

// Check filtergraph validity and configure links and formats.
status = avfilter_graph_config(filterGraph_.get(), nullptr);
TORCH_CHECK(
status >= 0,
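A minimal sketch of the release/reset dance noted in the comment above, with the re-owning steps that the folded lines hide filled in under assumptions: avfilter_graph_parse_ptr() may modify or free the in/out lists, so we hand it raw pointers and then re-own whatever comes back.

```cpp
// Sketch only; mirrors the visible hunk, with the assumed reset steps added.
AVFilterInOut* outputsTmp = outputs.release();
AVFilterInOut* inputsTmp = inputs.release();
status = avfilter_graph_parse_ptr(
    filterGraph_.get(),
    filtersContext.filtergraphStr.c_str(),
    &inputsTmp,
    &outputsTmp,
    nullptr);
outputs.reset(outputsTmp); // re-own so the unique handles free the nodes
inputs.reset(inputsTmp);
```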
2 changes: 1 addition & 1 deletion src/torchcodec/_core/Transform.cpp
@@ -42,7 +42,7 @@ int toSwsInterpolation(ResizeTransform::InterpolationMode mode) {
std::string ResizeTransform::getFilterGraphCpu() const {
return "scale=" + std::to_string(outputDims_.width) + ":" +
std::to_string(outputDims_.height) +
":sws_flags=" + toFilterGraphInterpolation(interpolationMode_);
":flags=" + toFilterGraphInterpolation(interpolationMode_);
Contributor Author:
From the FFmpeg docs:

Libavfilter will automatically insert scale filters where format conversion is required. It is possible to specify swscale flags for those automatically inserted scalers by prepending sws_flags=flags; to the filtergraph description.

Whereas flags is the specific parameter to scale. They end up being semantically equivalent, but it's clearer to use the scale option here.
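A small sketch contrasting the two spellings (strings are illustrative):

```cpp
// "sws_flags=...;" is a graph-wide prefix: per the docs quoted above, it
// also applies to the scale filters that libavfilter inserts automatically.
const char* graphWide = "sws_flags=bilinear;scale=240:135";

// "flags" is an option on this particular scale filter, which is what the
// resize transform actually intends.
const char* perFilter = "scale=240:135:flags=bilinear";
```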

}

std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
43 changes: 33 additions & 10 deletions test/generate_reference_resources.py
@@ -51,16 +51,17 @@ def generate_frame_by_index(
)
output_bmp = f"{base_path}.bmp"

# Note that we have an explicit format conversion to rgb24 in our filtergraph specification,
# which always happens BEFORE any of the filters that we receive as input. We do this to
# ensure that the color conversion happens BEFORE the filters, matching the behavior of the
# torchcodec filtergraph implementation.
#
# Not doing this would result in the color conversion happening AFTER the filters, which
# would result in different color values for the same frame.
filtergraph = f"select='eq(n\\,{frame_index})',format=rgb24"
# Note that we have an explicit format conversion to rgb24 in our filtergraph
# specification, and we always place the user-supplied filters BEFORE the
# format conversion. We do this to ensure that the filters are applied in
# the pixel format and colorspace of the input frames. This behavior matches
# the TorchCodec implementation.
select = f"select='eq(n\\,{frame_index})'"
format = "format=rgb24"
if filters is not None:
filtergraph = filtergraph + f",{filters}"
filtergraph = ",".join([select, filters, format])
else:
filtergraph = ",".join([select, format])

cmd = [
"ffmpeg",
@@ -99,7 +100,7 @@ def generate_frame_by_timestamp(
convert_image_to_tensor(output_path)


def generate_nasa_13013_references():
def generate_nasa_13013_references_by_index():
# Note: The naming scheme used here must match the naming scheme used to load
# tensors in ./utils.py.
streams = [0, 3]
@@ -108,13 +109,17 @@ def generate_nasa_13013_references():
for frame in frames:
generate_frame_by_index(NASA_VIDEO, frame_index=frame, stream_index=stream)


def generate_nasa_13013_references_by_timestamp():
# Extract individual frames at specific timestamps, including the last frame of the video.
seek_timestamp = [6.0, 6.1, 10.0, 12.979633]
timestamp_name = [f"{seek_timestamp:06f}" for seek_timestamp in seek_timestamp]
for timestamp, name in zip(seek_timestamp, timestamp_name):
output_bmp = f"{NASA_VIDEO.path}.time{name}.bmp"
generate_frame_by_timestamp(NASA_VIDEO.path, timestamp, output_bmp)


def generate_nasa_13013_references_crop():
# Extract frames with specific filters. We have tests that assume these exact filters.
frames = [0, 15, 200, 389]
crop_filter = "crop=300:200:50:35:exact=1"
@@ -124,6 +129,24 @@ def generate_nasa_13013_references():
)


def generate_nasa_13013_references_resize():
frames = [17, 230, 389]
# Note that the resize algorithm passed to flags is exposed to users,
# but bilinear is the default we use.
resize_filter = "scale=240:135:flags=bilinear"
for frame in frames:
generate_frame_by_index(
NASA_VIDEO, frame_index=frame, stream_index=3, filters=resize_filter
)


def generate_nasa_13013_references():
generate_nasa_13013_references_by_index()
generate_nasa_13013_references_by_timestamp()
generate_nasa_13013_references_crop()
generate_nasa_13013_references_resize()


def generate_h265_video_references():
# This video was generated by running the following:
# conda install -c conda-forge x265
(7 changed binary reference files not shown.)