[Cherry-Pick for 0.20] Revamp decoding docs (pytorch#8633) (pytorch#8666)

NicolasHug · web-flow · commit 2d8a288f78b5 · 2024-10-01T13:51:05.000+01:00
diff --git a/docs/source/io.rst b/docs/source/io.rst
@@ -3,33 +3,46 @@ Decoding / Encoding images and videos
 
 .. currentmodule:: torchvision.io
 
-The :mod:`torchvision.io` package provides functions for performing IO
-operations. They are currently specific to reading and writing images and
-videos.
+The :mod:`torchvision.io` module provides utilities for decoding and encoding
+images and videos.
 
-Images
-------
+Image Decoding
+--------------
 
 Torchvision currently supports decoding JPEG, PNG, WEBP and GIF images. JPEG
 decoding can also be done on CUDA GPUs.
 
-For encoding, JPEG (cpu and CUDA) and PNG are supported.
+The main entry point is the :func:`~torchvision.io.decode_image` function, which
+you can use as an alternative to ``PIL.Image.open()``. It will decode images
+straight into image Tensors, thus saving you the conversion and allowing you to
+run transforms/preprocessing natively on tensors.
+
+.. code::
+
+    from torchvision.io import decode_image
+
+    img = decode_image("path_to_image", mode="RGB")
+    img.dtype  # torch.uint8
+
+    # Or
+    raw_encoded_bytes = ...  # read encoded bytes from your file system
+    img = decode_image(raw_encoded_bytes, mode="RGB")
+
+
+:func:`~torchvision.io.decode_image` will automatically detect the image format,
+and call the corresponding decoder. You can also use the lower-level
+format-specific decoders, which can be more powerful, e.g. if you want to
+decode JPEGs on CUDA.
 
 .. autosummary::
     :toctree: generated/
     :template: function.rst
 
     decode_image
-    encode_jpeg
     decode_jpeg
-    write_jpeg
+    decode_png
     decode_gif
     decode_webp
-    encode_png
-    decode_png
-    write_png
-    read_file
-    write_file
 
 .. autosummary::
     :toctree: generated/
@@ -41,14 +54,47 @@ Obsolete decoding function:
 
 .. autosummary::
     :toctree: generated/
-    :template: class.rst
+    :template: function.rst
 
     read_image
 
+Image Encoding
+--------------
+
+For encoding, JPEG (cpu and CUDA) and PNG are supported.
+
+
+.. autosummary::
+    :toctree: generated/
+    :template: function.rst
+
+    encode_jpeg
+    write_jpeg
+    encode_png
+    write_png
+
+IO operations
+-------------
+
+.. autosummary::
+    :toctree: generated/
+    :template: function.rst
+
+    read_file
+    write_file
 
 Video
 -----
 
+.. warning::
+
+    Torchvision supports video decoding through different APIs listed below,
+    some of which are still in BETA stage. In the near future, we intend to
+    centralize PyTorch's video decoding capabilities within the `torchcodec
+    <https://github.com/pytorch/torchcodec>`_ project. We encourage you to try
+    it out and share your feedback, as the torchvision video decoders will
+    eventually be deprecated.
+
 .. autosummary::
     :toctree: generated/
     :template: function.rst
@@ -58,45 +104,14 @@ Video
     write_video
 
 
-Fine-grained video API
-^^^^^^^^^^^^^^^^^^^^^^
+**Fine-grained video API**
 
 In addition to the :mod:`read_video` function, we provide a high-performance 
 lower-level API for more fine-grained control compared to the :mod:`read_video` function.
 It does all this whilst fully supporting torchscript.
 
-.. betastatus:: fine-grained video API
-
 .. autosummary::
     :toctree: generated/
     :template: class.rst
 
     VideoReader
-
-
-Example of inspecting a video:
-
-.. code:: python
-
-    import torchvision
-    video_path = "path to a test video"
-    # Constructor allocates memory and a threaded decoder
-    # instance per video. At the moment it takes two arguments:
-    # path to the video file, and a wanted stream.
-    reader = torchvision.io.VideoReader(video_path, "video")
-
-    # The information about the video can be retrieved using the 
-    # `get_metadata()` method. It returns a dictionary for every stream, with
-    # duration and other relevant metadata (often frame rate)
-    reader_md = reader.get_metadata()
-
-    # metadata is structured as a dict of dicts with following structure
-    # {"stream_type": {"attribute": [attribute per stream]}}
-    #
-    # following would print out the list of frame rates for every present video stream
-    print(reader_md["video"]["fps"])
-
-    # we explicitly select the stream we would like to operate on. In
-    # the constructor we select a default video stream, but
-    # in practice, we can set whichever stream we would like 
-    video.set_current_stream("video:0")
diff --git a/torchvision/io/image.py b/torchvision/io/image.py
@@ -20,19 +20,25 @@
 
 
 class ImageReadMode(Enum):
-    """
-    Support for various modes while reading images.
+    """Allow automatic conversion to RGB, RGBA, etc while decoding.
+
+    .. note::
+
+        You don't need to use this enum: you can just pass strings to all
+        ``mode`` parameters, e.g. ``mode="RGB"``.
 
-    Use ``ImageReadMode.UNCHANGED`` for loading the image as-is,
-    ``ImageReadMode.GRAY`` for converting to grayscale,
-    ``ImageReadMode.GRAY_ALPHA`` for grayscale with transparency,
-    ``ImageReadMode.RGB`` for RGB and ``ImageReadMode.RGB_ALPHA`` for
-    RGB with transparency.
+    The different available modes are the following.
+
+    - UNCHANGED: loads the image as-is
+    - RGB: converts to RGB
+    - RGBA: converts to RGB with transparency (also aliased as RGB_ALPHA)
+    - GRAY: converts to grayscale
+    - GRAY_ALPHA: converts to grayscale with transparency
 
     .. note::
 
-        Some decoders won't support all possible values, e.g. a decoder may only
-        support "RGB" and "RGBA" mode.
+        Some decoders won't support all possible values, e.g. GRAY and
+        GRAY_ALPHA are only supported for PNG and JPEG images.
     """
 
     UNCHANGED = 0
@@ -45,8 +51,7 @@ class ImageReadMode(Enum):
 
 def read_file(path: str) -> torch.Tensor:
     """
-    Reads and outputs the bytes contents of a file as a uint8 Tensor
-    with one dimension.
+    Return the bytes contents of a file as a uint8 1D Tensor.
 
     Args:
         path (str or ``pathlib.Path``): the path to the file to be read
@@ -62,8 +67,7 @@ def read_file(path: str) -> torch.Tensor:
 
 def write_file(filename: str, data: torch.Tensor) -> None:
     """
-    Writes the contents of an uint8 tensor with one dimension to a
-    file.
+    Write the content of a uint8 1D tensor to a file.
 
     Args:
         filename (str or ``pathlib.Path``): the path to the file to be written
@@ -93,10 +97,9 @@ def decode_png(
     Args:
         input (Tensor[1]): a one dimensional uint8 tensor containing
             the raw bytes of the PNG image.
-        mode (str or ImageReadMode): the read mode used for optionally
-            converting the image. Default: ``ImageReadMode.UNCHANGED``.
-            See `ImageReadMode` class for more information on various
-            available modes.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
         apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
             Default: False.
 
@@ -156,8 +159,7 @@ def decode_jpeg(
     device: Union[str, torch.device] = "cpu",
     apply_exif_orientation: bool = False,
 ) -> Union[torch.Tensor, List[torch.Tensor]]:
-    """
-    Decode JPEG image(s) into 3 dimensional RGB or grayscale Tensor(s).
+    """Decode JPEG image(s) into 3D RGB or grayscale Tensor(s), on CPU or CUDA.
 
     The values of the output tensor are uint8 between 0 and 255.
 
@@ -171,12 +173,9 @@ def decode_jpeg(
         input (Tensor[1] or list[Tensor[1]]): a (list of) one dimensional uint8 tensor(s) containing
             the raw bytes of the JPEG image. The tensor(s) must be on CPU,
             regardless of the ``device`` parameter.
-        mode (str or ImageReadMode): the read mode used for optionally
-            converting the image(s). The supported modes are: ``ImageReadMode.UNCHANGED``,
-            ``ImageReadMode.GRAY`` and ``ImageReadMode.RGB``
-            Default: ``ImageReadMode.UNCHANGED``.
-            See ``ImageReadMode`` class for more information on various
-            available modes.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
         device (str or torch.device): The device on which the decoded image will
             be stored. If a cuda device is specified, the image will be decoded
             with `nvjpeg <https://developer.nvidia.com/nvjpeg>`_. This is only
@@ -228,9 +227,7 @@ def decode_jpeg(
 def encode_jpeg(
     input: Union[torch.Tensor, List[torch.Tensor]], quality: int = 75
 ) -> Union[torch.Tensor, List[torch.Tensor]]:
-    """
-    Takes a (list of) input tensor(s) in CHW layout and returns a (list of) buffer(s) with the contents
-    of the corresponding JPEG file(s).
+    """Encode RGB tensor(s) into raw encoded jpeg bytes, on CPU or CUDA.
 
     .. note::
         Passing a list of CUDA tensors is more efficient than repeated individual calls to ``encode_jpeg``.
@@ -286,7 +283,7 @@ def decode_image(
     mode: ImageReadMode = ImageReadMode.UNCHANGED,
     apply_exif_orientation: bool = False,
 ) -> torch.Tensor:
-    """Decode an image into a tensor.
+    """Decode an image into a uint8 tensor, from a path or from raw encoded bytes.
 
     Currently supported image formats are jpeg, png, gif and webp.
 
@@ -303,10 +300,9 @@ def decode_image(
         input (Tensor or str or ``pathlib.Path``): The image to decode. If a
             tensor is passed, it must be one dimensional uint8 tensor containing
             the raw bytes of the image. Otherwise, this must be a path to the image file.
-        mode (str or ImageReadMode): the read mode used for optionally converting the image.
-            Default: ``ImageReadMode.UNCHANGED``.
-            See ``ImageReadMode`` class for more information on various
-            available modes. Only applies to JPEG and PNG images.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
         apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
            Only applies to JPEG and PNG images. Default: False.
 
@@ -367,9 +363,9 @@ def decode_webp(
     Args:
         input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
             the raw bytes of the WEBP image.
-        mode (str or ImageReadMode): The read mode used for optionally
-            converting the image color space. Default: ``ImageReadMode.UNCHANGED``.
-            Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
 
     Returns:
         Decoded image (Tensor[image_channels, image_height, image_width])
@@ -398,9 +394,9 @@ def _decode_avif(
     Args:
         input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
             the raw bytes of the AVIF image.
-        mode (str or ImageReadMode): The read mode used for optionally
-            converting the image color space. Default: ``ImageReadMode.UNCHANGED``.
-            Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
 
     Returns:
         Decoded image (Tensor[image_channels, image_height, image_width])
@@ -426,9 +422,9 @@ def _decode_heic(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHAN
     Args:
         input (Tensor[1]): a one dimensional contiguous uint8 tensor containing
             the raw bytes of the HEIC image.
-        mode (str or ImageReadMode): The read mode used for optionally
-            converting the image color space. Default: ``ImageReadMode.UNCHANGED``.
-            Other supported values are ``ImageReadMode.RGB`` and ``ImageReadMode.RGB_ALPHA``.
+        mode (str or ImageReadMode): The mode to convert the image to, e.g. "RGB".
+            Default is "UNCHANGED".  See :class:`~torchvision.io.ImageReadMode`
+            for available modes.
 
     Returns:
         Decoded image (Tensor[image_channels, image_height, image_width])
diff --git a/torchvision/io/video.py b/torchvision/io/video.py
@@ -64,6 +64,14 @@ def write_video(
     """
     Writes a 4d tensor in [T, H, W, C] format in a video file
 
+    .. warning::
+
+        In the near future, we intend to centralize PyTorch's video decoding
+        capabilities within the `torchcodec
+        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
+        try it out and share your feedback, as the torchvision video decoders
+        will eventually be deprecated.
+
     Args:
         filename (str): path where the video will be saved
         video_array (Tensor[T, H, W, C]): tensor containing the individual frames,
@@ -243,6 +251,14 @@ def read_video(
     """
     Reads a video from a file, returning both the video frames and the audio frames
 
+    .. warning::
+
+        In the near future, we intend to centralize PyTorch's video decoding
+        capabilities within the `torchcodec
+        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
+        try it out and share your feedback, as the torchvision video decoders
+        will eventually be deprecated.
+
     Args:
         filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts.
         start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional):
@@ -367,6 +383,14 @@ def read_video_timestamps(filename: str, pts_unit: str = "pts") -> Tuple[List[in
     """
     List the video frames timestamps.
 
+    .. warning::
+
+        In the near future, we intend to centralize PyTorch's video decoding
+        capabilities within the `torchcodec
+        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
+        try it out and share your feedback, as the torchvision video decoders
+        will eventually be deprecated.
+
     Note that the function decodes the whole video frame-by-frame.
 
     Args:
diff --git a/torchvision/io/video_reader.py b/torchvision/io/video_reader.py
@@ -52,6 +52,14 @@ class VideoReader:
     backends: video_reader, pyav, and cuda.
     Backends can be set via `torchvision.set_video_backend` function.
 
+    .. warning::
+
+        In the near future, we intend to centralize PyTorch's video decoding
+        capabilities within the `torchcodec
+        <https://github.com/pytorch/torchcodec>`_ project. We encourage you to
+        try it out and share your feedback, as the torchvision video decoders
+        will eventually be deprecated.
+
     .. betastatus:: VideoReader class
 
     Example: