2025-10-11 nightly release (e5b2eef)

pytorchbot · pytorchbot · commit e820cb0d48d1 · 2025-10-11T11:35:32.000Z
diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
@@ -71,7 +71,7 @@ jobs:
         # but for releases we should add 12.8.
         cuda-version: ['12.6', '13.0']
         # TODO: put back ffmpeg 5 https://github.com/pytorch/torchcodec/issues/325
-        ffmpeg-version-for-tests: ['4.4.2', '6', '7']
+        ffmpeg-version-for-tests: ['4.4.2', '6', '7', '8.0']
 
     container:
       image: "pytorch/manylinux2_28-builder:cuda${{ matrix.cuda-version }}"
diff --git a/.github/workflows/linux_wheel.yaml b/.github/workflows/linux_wheel.yaml
@@ -63,7 +63,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ['3.10']
-        ffmpeg-version-for-tests: ['4.4.2', '5.1.2', '6.1.1', '7.0.1']
+        ffmpeg-version-for-tests: ['4.4.2', '5.1.2', '6.1.1', '7.0.1', '8.0']
     needs: build
     steps:
       - uses: actions/download-artifact@v4
diff --git a/.github/workflows/macos_wheel.yaml b/.github/workflows/macos_wheel.yaml
@@ -65,7 +65,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ['3.10']
-        ffmpeg-version-for-tests: ['4.4.2', '5.1.2', '6.1.1', '7.0.1']
+        ffmpeg-version-for-tests: ['4.4.2', '5.1.2', '6.1.1', '7.0.1', '8.0']
     needs: build
     steps:
       - name: Download wheel
diff --git a/.github/workflows/windows_wheel.yaml b/.github/workflows/windows_wheel.yaml
@@ -71,6 +71,7 @@ jobs:
         # TODO: FFmpeg 5 on Windows segfaults in avcodec_open2() when passing
         # bad parameters.
         # See https://github.com/pytorch/torchcodec/pull/806
+        # TODO: Support FFmpeg 8 on Windows
         ffmpeg-version-for-tests: ['4.4.2', '6.1.1', '7.0.1']
     needs: build
     steps:
diff --git a/benchmarks/decoders/gpu_benchmark.py b/benchmarks/decoders/gpu_benchmark.py
@@ -29,18 +29,17 @@ def decode_full_video(video_path, decode_device_string, resize_device_string):
     num_threads = None
     if "cuda" in decode_device_string:
         num_threads = 1
-    width = None
-    height = None
+
+    resize_spec = ""
     if "native" in resize_device_string:
-        width = RESIZED_WIDTH
-        height = RESIZED_HEIGHT
+        resize_spec = f"resize, {RESIZED_HEIGHT}, {RESIZED_WIDTH}"
+
     torchcodec._core._add_video_stream(
         decoder,
         stream_index=-1,
         device=decode_device_string,
         num_threads=num_threads,
-        width=width,
-        height=height,
+        transform_specs=resize_spec,
     )
 
     start_time = time.time()
diff --git a/src/torchcodec/_core/CMakeLists.txt b/src/torchcodec/_core/CMakeLists.txt
@@ -263,11 +263,12 @@ if(DEFINED ENV{BUILD_AGAINST_ALL_FFMPEG_FROM_S3})
         you still need a different FFmpeg to be installed for run time!"
     )
 
-    # This will expose the ffmpeg4, ffmpeg5, ffmpeg6, and ffmpeg7 targets
+    # This will expose the ffmpeg4, ffmpeg5, ffmpeg6, ffmpeg7, and ffmpeg8 targets
     include(
         ${CMAKE_CURRENT_SOURCE_DIR}/fetch_and_expose_non_gpl_ffmpeg_libs.cmake
     )
 
+    make_torchcodec_libraries(8 ffmpeg8)
     make_torchcodec_libraries(7 ffmpeg7)
     make_torchcodec_libraries(6 ffmpeg6)
     make_torchcodec_libraries(4 ffmpeg4)
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -43,9 +43,9 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
   m.def(
-      "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
+      "_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
   m.def(
-      "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
+      "add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
   m.def(
       "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -183,6 +183,69 @@ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
   }
 }
 
+// Parses `str` as a base-10 integer and returns it, requiring the value to be
+// strictly positive. Leading and trailing whitespace is tolerated because
+// transform specs separate fields with ", "; any other leftover characters
+// (e.g. "12abc" or "1.5") are rejected rather than silently truncated.
+//
+// Fails via TORCH_CHECK when the string is not an integer, does not fit in an
+// int, or is not greater than zero.
+int checkedToPositiveInt(const std::string& str) {
+  int ret = 0;
+  // Number of characters std::stoi consumed, including any leading whitespace
+  // it skipped.
+  size_t numParsed = 0;
+  try {
+    ret = std::stoi(str, &numParsed);
+  } catch (const std::invalid_argument&) {
+    TORCH_CHECK(false, "String cannot be converted to an int:" + str);
+  } catch (const std::out_of_range&) {
+    TORCH_CHECK(false, "String would become integer out of range:" + str);
+  }
+  // std::stoi stops at the first character that is not part of the number, so
+  // anything other than whitespace after that point means the input was not a
+  // clean integer.
+  TORCH_CHECK(
+      str.find_first_not_of(" \t\n\r\f\v", numParsed) == std::string::npos,
+      "String cannot be converted to an int:" + str);
+  TORCH_CHECK(ret > 0, "String must be a positive integer:" + str);
+  return ret;
+}
+
+// Builds a resize transform from an already-tokenized spec of the form:
+//
+//   "resize, <height>, <width>"
+//
+// The spec must hold exactly three tokens: the literal name "resize" followed
+// by <height> and <width>, each of which must parse as a positive integer.
+// Note the order: height comes before width.
+//
+// Returns a pointer to a ResizeTransform allocated with `new`; this function
+// does not free it, so the caller takes over its lifetime.
+Transform* makeResizeTransform(
+    const std::vector<std::string>& resizeTransformSpec) {
+  TORCH_CHECK(
+      resizeTransformSpec.size() == 3,
+      "resizeTransformSpec must have 3 elements including its name");
+  // Height is validated first, so a spec with two bad values reports the
+  // height error.
+  const int targetHeight = checkedToPositiveInt(resizeTransformSpec[1]);
+  const int targetWidth = checkedToPositiveInt(resizeTransformSpec[2]);
+  return new ResizeTransform(FrameDims(targetHeight, targetWidth));
+}
+
+// Splits `str` on every occurrence of `delimiter` and returns the pieces in
+// order. Pieces are returned verbatim: surrounding whitespace is kept and
+// adjacent delimiters yield empty pieces. An empty input yields no pieces, and
+// a delimiter at the very end of the input does not add a trailing empty piece.
+std::vector<std::string> split(const std::string& str, char delimiter) {
+  std::vector<std::string> pieces;
+  std::string::size_type begin = 0;
+  while (begin < str.size()) {
+    const std::string::size_type end = str.find(delimiter, begin);
+    if (end == std::string::npos) {
+      // No more delimiters: the rest of the string is the last piece.
+      pieces.push_back(str.substr(begin));
+      break;
+    }
+    pieces.push_back(str.substr(begin, end - begin));
+    begin = end + 1;
+  }
+  return pieces;
+}
+
+// The transformSpecsRaw string is always in the format:
+//
+//   "name1, param1, param2, ...; name2, param1, param2, ...; ..."
+//
+// Where "nameX" is the name of the transform, and "paramX" are the parameters.
+// Whitespace around a transform name is ignored, so the space after each ';'
+// shown above is optional. An empty string produces no transforms.
+//
+// Returns one Transform allocated with `new` per spec, in spec order. Fails
+// via TORCH_CHECK on an empty spec or an unknown transform name.
+std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
+  std::vector<Transform*> transforms;
+  std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
+  for (const std::string& transformSpecRaw : transformSpecs) {
+    std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
+    TORCH_CHECK(
+        transformSpec.size() >= 1,
+        "Invalid transform spec: " + transformSpecRaw);
+
+    // split() keeps the whitespace that follows each ';' and ',' separator.
+    // Numeric parameters survive that, but an exact name comparison would not,
+    // so strip spaces and tabs from both ends of the name first.
+    std::string name = transformSpec[0];
+    name.erase(0, name.find_first_not_of(" \t"));
+    name.erase(name.find_last_not_of(" \t") + 1);
+    if (name == "resize") {
+      transforms.push_back(makeResizeTransform(transformSpec));
+    } else {
+      TORCH_CHECK(false, "Invalid transform name: " + name);
+    }
+  }
+  return transforms;
+}
+
 } // namespace
 
 // ==============================
@@ -252,36 +315,18 @@ at::Tensor _create_from_file_like(
 
 void _add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
     std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
         custom_frame_mappings = std::nullopt,
     std::optional<std::string_view> color_conversion_library = std::nullopt) {
   VideoStreamOptions videoStreamOptions;
   videoStreamOptions.ffmpegThreadCount = num_threads;
 
-  // TODO: Eliminate this temporary bridge code. This exists because we have
-  //       not yet exposed the transforms API on the Python side. We also want
-  //       to remove the `width` and `height` arguments from the Python API.
-  //
-  // TEMPORARY BRIDGE CODE START
-  TORCH_CHECK(
-      width.has_value() == height.has_value(),
-      "width and height must both be set or unset.");
-  std::vector<Transform*> transforms;
-  if (width.has_value()) {
-    transforms.push_back(
-        new ResizeTransform(FrameDims(height.value(), width.value())));
-    width.reset();
-    height.reset();
-  }
-  // TEMPORARY BRIDGE CODE END
-
   if (dimension_order.has_value()) {
     std::string stdDimensionOrder{dimension_order.value()};
     TORCH_CHECK(stdDimensionOrder == "NHWC" || stdDimensionOrder == "NCHW");
@@ -309,6 +354,9 @@ void _add_video_stream(
   videoStreamOptions.device = torch::Device(std::string(device));
   videoStreamOptions.deviceVariant = device_variant;
 
+  std::vector<Transform*> transforms =
+      makeTransforms(std::string(transform_specs));
+
   std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
       custom_frame_mappings.has_value()
       ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
@@ -324,24 +372,22 @@ void _add_video_stream(
 // Add a new video stream at `stream_index` using the provided options.
 void add_video_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> width = std::nullopt,
-    std::optional<int64_t> height = std::nullopt,
     std::optional<int64_t> num_threads = std::nullopt,
     std::optional<std::string_view> dimension_order = std::nullopt,
     std::optional<int64_t> stream_index = std::nullopt,
     std::string_view device = "cpu",
     std::string_view device_variant = "default",
+    std::string_view transform_specs = "",
     const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
         custom_frame_mappings = std::nullopt) {
   _add_video_stream(
       decoder,
-      width,
-      height,
       num_threads,
       dimension_order,
       stream_index,
       device,
       device_variant,
+      transform_specs,
       custom_frame_mappings);
 }
 
diff --git a/src/torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake b/src/torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake
@@ -44,6 +44,10 @@ if (LINUX)
         f7_sha256
         1cb946d8b7c6393c2c3ebe1f900b8de7a2885fe614c45d4ec32c9833084f2f26
     )
+    set(
+        f8_sha256
+        c55b3c1a4b5e4d5fdd7c632bea3ab6f45b4e37cc8e0999dda3f84a8ed8defad8
+    )
     set(
        f4_library_file_names
        libavutil.so.56
@@ -84,6 +88,16 @@ if (LINUX)
        libswscale.so.8
        libswresample.so.5
     )
+    set(
+       f8_library_file_names
+       libavutil.so.60
+       libavcodec.so.62
+       libavformat.so.62
+       libavdevice.so.62
+       libavfilter.so.11
+       libswscale.so.9
+       libswresample.so.6
+    )
 elseif (APPLE)
     set(lib_dir "lib")
     set(
@@ -106,6 +120,10 @@ elseif (APPLE)
         f7_sha256
         48a4fc8ce098305cfd4a58f40889249c523ca3c285f66ba704b5bad0e3ada53a
     )
+    set(
+        f8_sha256
+        beb936b76f25d2621228a12cdb67c9ae3d1eff7aa713ef8d1167ebf0c25bd5ec
+    )
 
     set(
        f4_library_file_names
@@ -147,6 +165,16 @@ elseif (APPLE)
        libswscale.8.dylib
        libswresample.5.dylib
     )
+    set(
+       f8_library_file_names
+       libavutil.60.dylib
+       libavcodec.62.dylib
+       libavformat.62.dylib
+       libavdevice.62.dylib
+       libavfilter.11.dylib
+       libswscale.9.dylib
+       libswresample.6.dylib
+    )
 
 elseif (WIN32)
     set(lib_dir "bin")
@@ -170,6 +198,10 @@ elseif (WIN32)
         f7_sha256
         ae391ace382330e912793b70b68529ee7c91026d2869b4df7e7c3e7d3656bdd5
     )
+    set(
+        f8_sha256
+        bac845ac79876b104959cb0e7b9dec772a261116344dd17d2f97e7ddfac4a73f
+    )
 
     set(
         f4_library_file_names
@@ -211,6 +243,16 @@ elseif (WIN32)
         swscale.lib
         swresample.lib
     )
+    set(
+        f8_library_file_names
+        avutil.lib
+        avcodec.lib
+        avformat.lib
+        avdevice.lib
+        avfilter.lib
+        swscale.lib
+        swresample.lib
+    )
 else()
     message(
         FATAL_ERROR
@@ -242,19 +284,27 @@ FetchContent_Declare(
     URL_HASH
     SHA256=${f7_sha256}
 )
+FetchContent_Declare(
+    f8
+    URL ${platform_url}/8.0.tar.gz
+    URL_HASH
+    SHA256=${f8_sha256}
+)
 
-FetchContent_MakeAvailable(f4 f5 f6 f7)
+FetchContent_MakeAvailable(f4 f5 f6 f7 f8)
 
 add_library(ffmpeg4 INTERFACE)
 add_library(ffmpeg5 INTERFACE)
 add_library(ffmpeg6 INTERFACE)
 add_library(ffmpeg7 INTERFACE)
+add_library(ffmpeg8 INTERFACE)
 
 # Note: the f?_SOURCE_DIR variables were set by FetchContent_MakeAvailable
 target_include_directories(ffmpeg4 INTERFACE ${f4_SOURCE_DIR}/include)
 target_include_directories(ffmpeg5 INTERFACE ${f5_SOURCE_DIR}/include)
 target_include_directories(ffmpeg6 INTERFACE ${f6_SOURCE_DIR}/include)
 target_include_directories(ffmpeg7 INTERFACE ${f7_SOURCE_DIR}/include)
+target_include_directories(ffmpeg8 INTERFACE ${f8_SOURCE_DIR}/include)
 
 
 list(
@@ -277,6 +327,11 @@ list(
     PREPEND ${f7_SOURCE_DIR}/${lib_dir}/
     OUTPUT_VARIABLE f7_library_paths
 )
+list(
+    TRANSFORM f8_library_file_names
+    PREPEND ${f8_SOURCE_DIR}/${lib_dir}/
+    OUTPUT_VARIABLE f8_library_paths
+)
 
 target_link_libraries(
     ffmpeg4
@@ -298,3 +353,8 @@ target_link_libraries(
     INTERFACE
     ${f7_library_paths}
 )
+target_link_libraries(
+    ffmpeg8
+    INTERFACE
+    ${f8_library_paths}
+)
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -299,13 +299,12 @@ def create_from_tensor_abstract(
 def _add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -318,13 +317,12 @@ def _add_video_stream_abstract(
 def add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
     device: str = "cpu",
     device_variant: str = "default",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
diff --git a/src/torchcodec/_samplers/video_clip_sampler.py b/src/torchcodec/_samplers/video_clip_sampler.py
diff --git a/test/test_decoders.py b/test/test_decoders.py
diff --git a/test/test_metadata.py b/test/test_metadata.py
diff --git a/test/test_ops.py b/test/test_ops.py

Original file line number	Diff line number	Diff line change
`@@ -263,11 +263,12 @@ if(DEFINED ENV{BUILD_AGAINST_ALL_FFMPEG_FROM_S3})`
`263`	`263`	`you still need a different FFmpeg to be installed for run time!"`
`264`	`264`	`)`
`265`	`265`
`266`		`- # This will expose the ffmpeg4, ffmpeg5, ffmpeg6, and ffmpeg7 targets`
	`266`	`+ # This will expose the ffmpeg4, ffmpeg5, ffmpeg6, ffmpeg7, and ffmpeg8 targets`
`267`	`267`	`include(`
`268`	`268`	`${CMAKE_CURRENT_SOURCE_DIR}/fetch_and_expose_non_gpl_ffmpeg_libs.cmake`
`269`	`269`	`)`
`270`	`270`
	`271`	`+ make_torchcodec_libraries(8 ffmpeg8)`
`271`	`272`	`make_torchcodec_libraries(7 ffmpeg7)`
`272`	`273`	`make_torchcodec_libraries(6 ffmpeg6)`
`273`	`274`	`make_torchcodec_libraries(4 ffmpeg4)`