WIPWIPWIPWIWPWIP

NicolasHug · NicolasHug · commit ee5e41bccd4c · 2025-10-30T18:44:46.000Z
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -96,6 +96,7 @@ function(make_torchcodec_libraries
         Encoder.cpp
         ValidationUtils.cpp
         Transform.cpp
+        SwsContext.cpp
     )
 
     if(ENABLE_CUDA)
diff --git a/src/torchcodec/_core/CUDACommon.cpp b/src/torchcodec/_core/CUDACommon.cpp
@@ -367,7 +367,7 @@ UniqueAVFrame transferCpuFrameToGpuNV12(
       cpuFrame, outputDims, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
 
   int convertedHeight = sws_scale(
-      swsContext.get(),
+      swsContext,
       cpuFrame->data,
       cpuFrame->linesize,
       0,
diff --git a/src/torchcodec/_core/CpuDeviceInterface.cpp b/src/torchcodec/_core/CpuDeviceInterface.cpp
@@ -215,7 +215,7 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
     const UniqueAVFrame& avFrame,
     torch::Tensor& outputTensor,
     const FrameDims& outputDims) {
-  // Get or create swscale context. The SwsContext class manages caching
+  // Get or create swscale context. The SwsScaler class manages caching
   // and recreation logic internally based on frame properties.
   auto swsContext = swsCtx_.getOrCreateContext(
       avFrame, outputDims, avFrame->colorspace, AV_PIX_FMT_RGB24, swsFlags_);
@@ -225,7 +225,7 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwScale(
   int expectedOutputWidth = outputTensor.sizes()[1];
   int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0};
   int resultHeight = sws_scale(
-      swsContext.get(),
+      swsContext,
       avFrame->data,
       avFrame->linesize,
       0,
diff --git a/src/torchcodec/_core/SwsContext.cpp b/src/torchcodec/_core/SwsContext.cpp
@@ -0,0 +1,64 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "src/torchcodec/_core/SwsContext.h"
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+
+extern "C" {
+#include <libswscale/swscale.h>
+}
+
+namespace facebook::torchcodec {
+
+// Stores the input size, input pixel format and output size verbatim; each
+// parameter initializes the member of the same name.
+SwsFrameContext::SwsFrameContext(
+    int inputWidth,
+    int inputHeight,
+    AVPixelFormat inputFormat,
+    int outputWidth,
+    int outputHeight)
+    : inputWidth{inputWidth}, inputHeight{inputHeight},
+      inputFormat{inputFormat},
+      outputWidth{outputWidth}, outputHeight{outputHeight} {}
+
+bool SwsFrameContext::operator==(const SwsFrameContext& other) const {
+  return inputFormat == other.inputFormat && inputWidth == other.inputWidth &&
+      inputHeight == other.inputHeight && outputHeight == other.outputHeight &&
+      outputWidth == other.outputWidth;
+}
+
+bool SwsFrameContext::operator!=(const SwsFrameContext& other) const {
+  return !operator==(other);
+}
+
+SwsContext* SwsScaler::getOrCreateContext(
+    const UniqueAVFrame& avFrame,
+    const FrameDims& outputDims,
+    AVColorSpace colorspace,
+    AVPixelFormat outputFormat,
+    int swsFlags) {
+  // Snapshot of the properties that key the cached context.
+  SwsFrameContext wanted(
+      avFrame->width,
+      avFrame->height,
+      static_cast<AVPixelFormat>(avFrame->format),
+      outputDims.width,
+      outputDims.height);
+
+  // NOTE(review): colorspace, outputFormat and swsFlags are not part of the
+  // key, so a change in only those reuses the cached context -- confirm.
+  const bool cacheHit = swsContext_ && prevFrameContext_ == wanted;
+  if (!cacheHit) {
+    // First call, or input size/format or output size differs: rebuild.
+    swsContext_ = createSwsContext(wanted, colorspace, outputFormat, swsFlags);
+    prevFrameContext_ = wanted;
+  }
+
+  return swsContext_.get();
+}
+
+} // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/SwsContext.h b/src/torchcodec/_core/SwsContext.h
@@ -0,0 +1,60 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+extern "C" {
+#include <libswscale/swscale.h>
+}
+
+#include "src/torchcodec/_core/Frame.h"
+
+namespace facebook::torchcodec {
+
+// Properties of one swscale conversion: input size and pixel format, and
+// output size. Compared between calls to detect when a context is stale.
+struct SwsFrameContext {
+  int inputWidth; // width of the frame being converted
+  int inputHeight; // height of the frame being converted
+  AVPixelFormat inputFormat; // pixel format of the frame being converted
+  int outputWidth; // requested width of the converted output
+  int outputHeight; // requested height of the converted output
+
+  SwsFrameContext(
+      int inputWidth,
+      int inputHeight,
+      AVPixelFormat inputFormat,
+      int outputWidth,
+      int outputHeight);
+
+  bool operator==(const SwsFrameContext& other) const; // all five fields match
+  bool operator!=(const SwsFrameContext& other) const; // negation of operator==
+};
+
+// Manages swscale context creation and caching across multiple conversions.
+// Reuses the context while input size/format and output size stay the same.
+class SwsScaler {
+ public:
+  SwsScaler() = default;
+  ~SwsScaler() = default;
+
+  // Returns a non-owning pointer to the cached swscale context, (re)creating it
+  // when the input size/format or output size differs from the previous call.
+  // NOTE(review): recreating presumably frees the old context, so pointers from
+  // earlier calls may dangle afterwards -- confirm against UniqueSwsContext.
+  SwsContext* getOrCreateContext(
+      const UniqueAVFrame& avFrame,
+      const FrameDims& outputDims,
+      AVColorSpace colorspace,
+      AVPixelFormat outputFormat,
+      int swsFlags = SWS_BILINEAR);
+
+ private:
+  UniqueSwsContext swsContext_;
+  SwsFrameContext prevFrameContext_ = SwsFrameContext(0, 0, AV_PIX_FMT_NONE, 0, 0);
+};
+
+} // namespace facebook::torchcodec

Original file line number	Diff line number	Diff line change
`@@ -96,6 +96,7 @@ function(make_torchcodec_libraries`
`96`	`96`	`Encoder.cpp`
`97`	`97`	`ValidationUtils.cpp`
`98`	`98`	`Transform.cpp`
	`99`	`+ SwsContext.cpp`
`99`	`100`	`)`
`100`	`101`
`101`	`102`	`if(ENABLE_CUDA)`