
Commit 0e61aba

Full 180

1 parent f853e3a commit 0e61aba
14 files changed: +23 -65 lines changed

src/torchcodec/_core/CpuDeviceInterface.cpp
Lines changed: 12 additions & 14 deletions

@@ -46,8 +46,7 @@ void CpuDeviceInterface::initializeVideo(
   // We calculate this value during initilization but we don't refer to it until
   // getColorConversionLibrary() is called. Calculating this value during
   // initialization saves us from having to save all of the transforms.
-  areTransformsSwScaleCompatible_ = transforms.empty() ||
-      (transforms.size() == 1 && transforms[0]->isResize());
+  areTransformsSwScaleCompatible_ = transforms.empty();
 
   // Note that we do not expose this capability in the public API, only through
   // the core API.
@@ -57,16 +56,6 @@ void CpuDeviceInterface::initializeVideo(
   userRequestedSwScale_ = videoStreamOptions_.colorConversionLibrary ==
       ColorConversionLibrary::SWSCALE;
 
-  // We can only use swscale when we have a single resize transform. Note that
-  // we actually decide on whether or not to actually use swscale at the last
-  // possible moment, when we actually convert the frame. This is because we
-  // need to know the actual frame dimensions.
-  if (transforms.size() == 1 && transforms[0]->isResize()) {
-    auto resize = dynamic_cast<ResizeTransform*>(transforms[0].get());
-    TORCH_CHECK(resize != nullptr, "ResizeTransform expected but not found!")
-    swsFlags_ = resize->getSwsFlags();
-  }
-
   // If we have any transforms, replace filters_ with the filter strings from
   // the transforms. As noted above, we decide between swscale and filtergraph
   // when we actually decode a frame.
@@ -83,7 +72,7 @@ void CpuDeviceInterface::initializeVideo(
     // Note that we ensure that the transforms come BEFORE the format
     // conversion. This means that the transforms are applied in the frame's
     // original pixel format and colorspace.
-    filters_ = filters.str() + "," + filters_;
+    filters_ += "," + filters.str();
   }
 
   initialized_ = true;
@@ -221,6 +210,11 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(avFrame->format);
 
+  TORCH_CHECK(
+      avFrame->height == outputDims.height &&
+          avFrame->width == outputDims.width,
+      "Input dimensions are not equal to output dimensions; resize for sws_scale() is not yet supported.");
+
   // We need to compare the current frame context with our previous frame
   // context. If they are different, then we need to re-create our colorspace
   // conversion objects. We create our colorspace conversion objects late so
@@ -237,7 +231,11 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
 
   if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
     swsContext_ = createSwsContext(
-        swsFrameContext, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);
+        swsFrameContext,
+        avFrame->colorspace,
+        /*outputFormat=*/AV_PIX_FMT_RGB24,
+        /*swsFlags=*/0); // We don't set any flags because we don't yet use
+                         // sws_scale() for resizing.
     prevSwsFrameContext_ = swsFrameContext;
   }
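
The upshot of these hunks is that the swscale path now does pixel-format conversion only: resizing always goes through filtergraph, so by the time sws_scale() runs, the frame dimensions already equal the output dimensions. For reference, here is a minimal standalone sketch of that kind of same-size RGB24 conversion with FFmpeg's C API. It is not torchcodec's createSwsContext (which also configures the colorspace from the SwsFrameContext); the scaler flag is irrelevant here because nothing is scaled.

extern "C" {
#include <libavutil/frame.h>
#include <libswscale/swscale.h>
}

#include <cstdint>
#include <stdexcept>
#include <vector>

// Convert one decoded AVFrame to packed RGB24 at its original size, which is
// the only job the swscale path has after this commit.
std::vector<uint8_t> convertFrameToRgb24(const AVFrame* src) {
  SwsContext* ctx = sws_getContext(
      src->width, src->height, static_cast<AVPixelFormat>(src->format),
      src->width, src->height, AV_PIX_FMT_RGB24, // same dims: no resize
      SWS_BILINEAR, nullptr, nullptr, nullptr);
  if (!ctx) {
    throw std::runtime_error("sws_getContext failed");
  }

  std::vector<uint8_t> rgb(static_cast<size_t>(src->width) * src->height * 3);
  uint8_t* dstData[4] = {rgb.data(), nullptr, nullptr, nullptr};
  int dstLinesize[4] = {3 * src->width, 0, 0, 0};

  sws_scale(ctx, src->data, src->linesize, 0, src->height, dstData, dstLinesize);
  sws_freeContext(ctx);
  return rgb;
}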

src/torchcodec/_core/CpuDeviceInterface.h
Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ class CpuDeviceInterface : public DeviceInterface {
   // initialization, we convert the user-supplied transforms into this string of
   // filters.
   //
+  // TODO: make sure Scott corrects the below:
   // Note that we start with just the format conversion, and then we ensure that
   // the user-supplied filters always happen BEFORE the format conversion. We
   // want the user-supplied filters to operate on frames in their original pixel
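
The TODO exists because, after the filters_ += "," + filters.str(); change above, the format conversion now comes first and the user-supplied transform filters are appended after it, so this comment block is stale. A tiny sketch of the new assembly order follows; the concrete "format=rgb24" seed and the "scale=..." transform string are illustrative assumptions, not values copied from torchcodec.

#include <iostream>
#include <string>

int main() {
  // filters_ starts out as just the format conversion (per the header comment
  // above); transform filters are then appended after it, mirroring
  // filters_ += "," + filters.str();
  std::string filters = "format=rgb24";
  const std::string transformFilters = "scale=640:360:flags=bilinear";

  filters += "," + transformFilters;

  // Prints: format=rgb24,scale=640:360:flags=bilinear
  // i.e. the transforms now run after the conversion to RGB24, which is what
  // the updated test/generate_reference_resources.py mirrors.
  std::cout << filters << "\n";
  return 0;
}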

src/torchcodec/_core/FFMPEGCommon.h
Lines changed: 2 additions & 2 deletions

@@ -275,7 +275,7 @@ struct SwsFrameContext {
 UniqueSwsContext createSwsContext(
     const SwsFrameContext& swsFrameContext,
     AVColorSpace colorspace,
-    AVPixelFormat outputFormat = AV_PIX_FMT_RGB24,
-    int swsFlags = SWS_BILINEAR);
+    AVPixelFormat outputFormat,
+    int swsFlags);
 
 } // namespace facebook::torchcodec

src/torchcodec/_core/Transform.cpp
Lines changed: 2 additions & 20 deletions

@@ -25,21 +25,11 @@ std::string toFilterGraphInterpolation(
   }
 }
 
-int toSwsInterpolation(ResizeTransform::InterpolationMode mode) {
-  switch (mode) {
-    case ResizeTransform::InterpolationMode::BILINEAR:
-      return SWS_BILINEAR;
-    default:
-      TORCH_CHECK(
-          false,
-          "Unknown interpolation mode: " +
-              std::to_string(static_cast<int>(mode)));
-  }
-}
-
 } // namespace
 
 std::string ResizeTransform::getFilterGraphCpu() const {
+  // Note that we turn on gamma correct scaling. This produces results that are
+  // closer to what TorchVision's resize produces.
   return "scale=" + std::to_string(outputDims_.width) + ":" +
       std::to_string(outputDims_.height) +
       ":flags=" + toFilterGraphInterpolation(interpolationMode_);
@@ -49,14 +39,6 @@ std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
   return outputDims_;
 }
 
-bool ResizeTransform::isResize() const {
-  return true;
-}
-
-int ResizeTransform::getSwsFlags() const {
-  return toSwsInterpolation(interpolationMode_);
-}
-
 CropTransform::CropTransform(const FrameDims& dims, int x, int y)
     : outputDims_(dims), x_(x), y_(y) {
   TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
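
With toSwsInterpolation() and getSwsFlags() gone, the interpolation mode is expressed only as a scale-filter option in the filtergraph string. Below is a small sketch of what getFilterGraphCpu() produces for a concrete 640x360 resize; the "bilinear" scaler name is an assumption about what toFilterGraphInterpolation() returns for BILINEAR. Since libavfilter's scale filter hands its flags option to swscale internally, the same algorithm is still used, just selected by name rather than by an SWS_* flag.

#include <iostream>
#include <string>

int main() {
  const int width = 640;
  const int height = 360;
  const std::string interpolation = "bilinear"; // assumed mapping for BILINEAR

  const std::string filter = "scale=" + std::to_string(width) + ":" +
      std::to_string(height) + ":flags=" + interpolation;

  // Prints: scale=640:360:flags=bilinear
  // After this commit, resizing happens only through this filtergraph path,
  // never through sws_scale() directly.
  std::cout << filter << "\n";
  return 0;
}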

src/torchcodec/_core/Transform.h
Lines changed: 0 additions & 9 deletions

@@ -29,12 +29,6 @@ class Transform {
     return std::nullopt;
   }
 
-  // The ResizeTransform is special, because it is the only transform that
-  // swscale can handle.
-  virtual bool isResize() const {
-    return false;
-  }
-
   // The validity of some transforms depends on the characteristics of the
   // AVStream they're being applied to. For example, some transforms will
   // specify coordinates inside a frame, we need to validate that those are
@@ -58,9 +52,6 @@ class ResizeTransform : public Transform {
 
   std::string getFilterGraphCpu() const override;
   std::optional<FrameDims> getOutputFrameDims() const override;
-  bool isResize() const override;
-
-  int getSwsFlags() const;
 
  private:
  FrameDims outputDims_;

test/generate_reference_resources.py
Lines changed: 3 additions & 4 deletions

@@ -52,14 +52,13 @@ def generate_frame_by_index(
     output_bmp = f"{base_path}.bmp"
 
     # Note that we have an exlicit format conversion to rgb24 in our filtergraph
-    # specification, and we always place the user-supplied filters BEFORE the
+    # specification, and we always place the user-supplied filters AFTER the
     # format conversion. We do this to ensure that the filters are applied in
-    # the pixel format and colorspace of the input frames. This behavior matches
-    # the TorchCodec implementation.
+    # RGB24 colorspace, which matches TorchCodec's behavior.
     select = f"select='eq(n\\,{frame_index})'"
     format = "format=rgb24"
     if filters is not None:
-        filtergraph = ",".join([select, filters, format])
+        filtergraph = ",".join([select, format, filters])
     else:
         filtergraph = ",".join([select, format])

4 binary files changed (contents not shown).
