put back read_video on pyav backend

NicolasHug · NicolasHug · commit a796e70f5dfe · 2025-08-26T10:46:10.000+01:00
diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py
@@ -5,7 +5,7 @@
 from typing import Any, Callable, cast, Optional, TypeVar, Union
 
 import torch
-from torchvision.io import _probe_video_from_file, _read_video_from_file, read_video, read_video_timestamps
+from torchvision.io import  read_video, read_video_timestamps
 
 from .utils import tqdm
 
@@ -305,11 +305,7 @@ def get_clip(self, idx: int) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]
         video_path = self.video_paths[video_idx]
         clip_pts = self.clips[video_idx][clip_idx]
 
-        from torchvision import get_video_backend
-
-        backend = get_video_backend()
-
-        if backend == "pyav":
+        if True:
             # check for invalid options
             if self._video_width != 0:
                 raise ValueError("pyav backend doesn't support _video_width != 0")
@@ -322,43 +318,10 @@ def get_clip(self, idx: int) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]
             if self._audio_samples != 0:
                 raise ValueError("pyav backend doesn't support _audio_samples != 0")
 
-        if backend == "pyav":
+        if True:
             start_pts = clip_pts[0].item()
             end_pts = clip_pts[-1].item()
             video, audio, info = read_video(video_path, start_pts, end_pts)
-        else:
-            _info = _probe_video_from_file(video_path)
-            video_fps = _info.video_fps
-            audio_fps = None
-
-            video_start_pts = cast(int, clip_pts[0].item())
-            video_end_pts = cast(int, clip_pts[-1].item())
-
-            audio_start_pts, audio_end_pts = 0, -1
-            audio_timebase = Fraction(0, 1)
-            video_timebase = Fraction(_info.video_timebase.numerator, _info.video_timebase.denominator)
-            if _info.has_audio:
-                audio_timebase = Fraction(_info.audio_timebase.numerator, _info.audio_timebase.denominator)
-                audio_start_pts = pts_convert(video_start_pts, video_timebase, audio_timebase, math.floor)
-                audio_end_pts = pts_convert(video_end_pts, video_timebase, audio_timebase, math.ceil)
-                audio_fps = _info.audio_sample_rate
-            video, audio, _ = _read_video_from_file(
-                video_path,
-                video_width=self._video_width,
-                video_height=self._video_height,
-                video_min_dimension=self._video_min_dimension,
-                video_max_dimension=self._video_max_dimension,
-                video_pts_range=(video_start_pts, video_end_pts),
-                video_timebase=video_timebase,
-                audio_samples=self._audio_samples,
-                audio_channels=self._audio_channels,
-                audio_pts_range=(audio_start_pts, audio_end_pts),
-                audio_timebase=audio_timebase,
-            )
-
-            info = {"video_fps": video_fps}
-            if audio_fps is not None:
-                info["audio_fps"] = audio_fps
 
         if self.frame_rate is not None:
             resampling_idx = self.resampling_idxs[video_idx][clip_idx]
diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py
@@ -15,11 +15,12 @@
     write_jpeg,
     write_png,
 )
-from .video import write_video
+from .video import write_video, read_video
 
 
 __all__ = [
     "write_video",
+    "read_video",
     "ImageReadMode",
     "decode_image",
     "decode_jpeg",
diff --git a/torchvision/io/video.py b/torchvision/io/video.py
@@ -45,9 +45,6 @@ def _check_av_available() -> None:
         raise av
 
 
-
-
-
 def write_video(
     filename: str,
     video_array: torch.Tensor,
@@ -168,3 +165,192 @@ def write_video(
         for packet in stream.encode():
             container.mux(packet)
 
+
+def read_video(
+    filename: str,
+    start_pts: Union[float, Fraction] = 0,
+    end_pts: Optional[Union[float, Fraction]] = None,
+    pts_unit: str = "pts",
+    output_format: str = "THWC",
+) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]]:
+    """[DEPRECATED] Reads a video from a file, returning both the video frames and the audio frames
+
+    .. warning::
+
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.24.  We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch
+
+    Args:
+        filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts.
+        start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
+            The start presentation time of the video
+        end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
+            The end presentation time
+        pts_unit (str, optional): unit in which start_pts and end_pts values will be interpreted,
+            either 'pts' or 'sec'. Defaults to 'pts'.
+        output_format (str, optional): The format of the output video tensors. Can be either "THWC" (default) or "TCHW".
+
+    Returns:
+        vframes (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames
+        aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points
+        info (Dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int)
+    """
+    _raise_video_deprecation_warning()
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(read_video)
+
+    output_format = output_format.upper()
+    if output_format not in ("THWC", "TCHW"):
+        raise ValueError(f"output_format should be either 'THWC' or 'TCHW', got {output_format}.")
+
+    if True:  # ignore, this is to avoid a bigger diff in https://github.com/pytorch/vision/pull/9189
+        _check_av_available()
+
+        if end_pts is None:
+            end_pts = float("inf")
+
+        if end_pts < start_pts:
+            raise ValueError(
+                f"end_pts should be larger than start_pts, got start_pts={start_pts} and end_pts={end_pts}"
+            )
+
+        info = {}
+        video_frames = []
+        audio_frames = []
+        # Fallback timebase, replaced below when the container has an audio stream.
+        audio_timebase = _video_opt.default_timebase
+
+        try:
+            with av.open(filename, metadata_errors="ignore") as container:
+                if container.streams.audio:
+                    audio_timebase = container.streams.audio[0].time_base
+                if container.streams.video:
+                    video_frames = _read_from_stream(
+                        container,
+                        start_pts,
+                        end_pts,
+                        pts_unit,
+                        container.streams.video[0],
+                        {"video": 0},
+                    )
+                    video_fps = container.streams.video[0].average_rate
+                    # guard against potentially corrupted files
+                    if video_fps is not None:
+                        info["video_fps"] = float(video_fps)
+
+                if container.streams.audio:
+                    audio_frames = _read_from_stream(
+                        container,
+                        start_pts,
+                        end_pts,
+                        pts_unit,
+                        container.streams.audio[0],
+                        {"audio": 0},
+                    )
+                    info["audio_fps"] = container.streams.audio[0].rate
+
+        except FFmpegError:
+            # TODO raise a warning?
+            pass
+
+        vframes_list = [frame.to_rgb().to_ndarray() for frame in video_frames]
+        aframes_list = [frame.to_ndarray() for frame in audio_frames]
+
+        if vframes_list:
+            vframes = torch.as_tensor(np.stack(vframes_list))
+        else:
+            vframes = torch.empty((0, 1, 1, 3), dtype=torch.uint8)
+
+        if aframes_list:
+            aframes = np.concatenate(aframes_list, 1)
+            aframes = torch.as_tensor(aframes)
+            # Bounds given in seconds are converted to audio-timebase units before aligning.
+            if pts_unit == "sec":
+                start_pts = int(math.floor(start_pts * (1 / audio_timebase)))
+                if end_pts != float("inf"):
+                    end_pts = int(math.ceil(end_pts * (1 / audio_timebase)))
+            aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts)
+        else:
+            aframes = torch.empty((1, 0), dtype=torch.float32)
+
+    if output_format == "TCHW":
+        # [T,H,W,C] --> [T,C,H,W]
+        vframes = vframes.permute(0, 3, 1, 2)
+
+    return vframes, aframes, info
+
+
+def _can_read_timestamps_from_packets(container: "av.container.Container") -> bool:
+    """Return True when the first stream's codec extradata contains the ``b"Lavc"`` marker."""
+    codec_extradata = container.streams[0].codec_context.extradata
+    # Without extradata there is nothing to inspect, so report False.
+    if codec_extradata is None:
+        return False
+    return b"Lavc" in codec_extradata
+
+
+def _decode_video_timestamps(container: "av.container.Container") -> list[int]:
+    """Return the non-None ``pts`` of video stream 0, via ``demux`` when allowed, else via ``decode``."""
+    # Reading timestamps through ``demux`` is the fast path; ``decode`` is the fallback.
+    use_packets = _can_read_timestamps_from_packets(container)
+    source = container.demux(video=0) if use_packets else container.decode(video=0)
+    return [item.pts for item in source if item.pts is not None]
+
+
+def read_video_timestamps(filename: str, pts_unit: str = "pts") -> tuple[list[int], Optional[float]]:
+    """[DEPRECATED] List the video frames timestamps.
+
+    .. warning::
+
+        DEPRECATED: All the video decoding and encoding capabilities of torchvision
+        are deprecated from version 0.22 and will be removed in version 0.25.  We
+        recommend that you migrate to
+        `TorchCodec <https://github.com/pytorch/torchcodec>`__, where we'll
+        consolidate the future decoding/encoding capabilities of PyTorch
+
+    Note that the function decodes the whole video frame-by-frame.
+
+    Args:
+        filename (str): path to the video file
+        pts_unit (str, optional): unit in which timestamp values will be returned
+            either 'pts' or 'sec'. Defaults to 'pts'.
+
+    Returns:
+        pts (List[int] if pts_unit = 'pts', List[Fraction] if pts_unit = 'sec'):
+            presentation timestamps for each one of the frames in the video.
+        video_fps (float, optional): the frame rate for the video
+
+    """
+    _raise_video_deprecation_warning()
+    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
+        _log_api_usage_once(read_video_timestamps)
+
+    _check_av_available()
+
+    video_fps = None
+    pts = []
+
+    try:
+        with av.open(filename, metadata_errors="ignore") as container:
+            if container.streams.video:
+                video_stream = container.streams.video[0]
+                video_time_base = video_stream.time_base
+                try:
+                    pts = _decode_video_timestamps(container)
+                except FFmpegError:
+                    warnings.warn(f"Failed decoding frames for file {filename}")
+                if video_stream.average_rate is not None:  # may be None for corrupted files
+                    video_fps = float(video_stream.average_rate)
+    except FFmpegError as e:
+        msg = f"Failed to open container for {filename}; Caught error: {e}"
+        warnings.warn(msg, RuntimeWarning)
+
+    pts.sort()
+
+    if pts_unit == "sec":
+        pts = [x * video_time_base for x in pts]
+
+    return pts, video_fps