Skip to content

Commit dac3180

Browse files
authored
Merge branch 'main' into please_dont_modify_this_branch_unless_you_are_just_merging_with_main__
2 parents 58254fb + f0ed07d commit dac3180

File tree

7 files changed

+184
-93
lines changed

7 files changed

+184
-93
lines changed

docs/source/io.rst

Lines changed: 61 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3,33 +3,46 @@ Decoding / Encoding images and videos
33

44
.. currentmodule:: torchvision.io
55

6-
The :mod:`torchvision.io` package provides functions for performing IO
7-
operations. They are currently specific to reading and writing images and
8-
videos.
6+
The :mod:`torchvision.io` module provides utilities for decoding and encoding
7+
images and videos.
98

10-
Images
11-
------
9+
Image Decoding
10+
--------------
1211

1312
Torchvision currently supports decoding JPEG, PNG, WEBP and GIF images. JPEG
1413
decoding can also be done on CUDA GPUs.
1514

16-
For encoding, JPEG (cpu and CUDA) and PNG are supported.
15+
The main entry point is the :func:`~torchvision.io.decode_image` function, which
16+
you can use as an alternative to ``PIL.Image.open()``. It will decode images
17+
straight into image Tensors, thus saving you the conversion and allowing you to
18+
run transforms/preproc natively on tensors.
19+
20+
.. code::
21+
22+
from torchvision.io import decode_image
23+
24+
img = decode_image("path_to_image", mode="RGB")
25+
img.dtype # torch.uint8
26+
27+
# Or
28+
raw_encoded_bytes = ... # read encoded bytes from your file system
29+
img = decode_image(raw_encoded_bytes, mode="RGB")
30+
31+
32+
:func:`~torchvision.io.decode_image` will automatically detect the image format,
33+
and call the corresponding decoder. You can also use the lower-level
34+
format-specific decoders which can be more powerful, e.g. if you want to
35+
encode/decode JPEGs on CUDA.
1736

1837
.. autosummary::
1938
:toctree: generated/
2039
:template: function.rst
2140

2241
decode_image
23-
encode_jpeg
2442
decode_jpeg
25-
write_jpeg
43+
encode_png
2644
decode_gif
2745
decode_webp
28-
encode_png
29-
decode_png
30-
write_png
31-
read_file
32-
write_file
3346

3447
.. autosummary::
3548
:toctree: generated/
@@ -41,14 +54,47 @@ Obsolete decoding function:
4154

4255
.. autosummary::
4356
:toctree: generated/
44-
:template: class.rst
57+
:template: function.rst
4558

4659
read_image
4760

61+
Image Encoding
62+
--------------
63+
64+
For encoding, JPEG (cpu and CUDA) and PNG are supported.
65+
66+
67+
.. autosummary::
68+
:toctree: generated/
69+
:template: function.rst
70+
71+
encode_jpeg
72+
write_jpeg
73+
encode_png
74+
write_png
75+
76+
IO operations
77+
-------------
78+
79+
.. autosummary::
80+
:toctree: generated/
81+
:template: function.rst
82+
83+
read_file
84+
write_file
4885

4986
Video
5087
-----
5188

89+
.. warning::
90+
91+
Torchvision supports video decoding through different APIs listed below,
92+
some of which are still in BETA stage. In the near future, we intend to
93+
centralize PyTorch's video decoding capabilities within the `torchcodec
94+
<https://github.com/pytorch/torchcodec>`_ project. We encourage you to try
95+
it out and share your feedback, as the torchvision video decoders will
96+
eventually be deprecated.
97+
5298
.. autosummary::
5399
:toctree: generated/
54100
:template: function.rst
@@ -58,45 +104,14 @@ Video
58104
write_video
59105

60106

61-
Fine-grained video API
62-
^^^^^^^^^^^^^^^^^^^^^^
107+
**Fine-grained video API**
63108

64109
In addition to the :mod:`read_video` function, we provide a high-performance
65110
lower-level API for more fine-grained control compared to the :mod:`read_video` function.
66111
It does all this whilst fully supporting torchscript.
67112

68-
.. betastatus:: fine-grained video API
69-
70113
.. autosummary::
71114
:toctree: generated/
72115
:template: class.rst
73116

74117
VideoReader
75-
76-
77-
Example of inspecting a video:
78-
79-
.. code:: python
80-
81-
import torchvision
82-
video_path = "path to a test video"
83-
# Constructor allocates memory and a threaded decoder
84-
# instance per video. At the moment it takes two arguments:
85-
# path to the video file, and a wanted stream.
86-
reader = torchvision.io.VideoReader(video_path, "video")
87-
88-
# The information about the video can be retrieved using the
89-
# `get_metadata()` method. It returns a dictionary for every stream, with
90-
# duration and other relevant metadata (often frame rate)
91-
reader_md = reader.get_metadata()
92-
93-
# metadata is structured as a dict of dicts with following structure
94-
# {"stream_type": {"attribute": [attribute per stream]}}
95-
#
96-
# following would print out the list of frame rates for every present video stream
97-
print(reader_md["video"]["fps"])
98-
99-
# we explicitly select the stream we would like to operate on. In
100-
# the constructor we select a default video stream, but
101-
# in practice, we can set whichever stream we would like
102-
video.set_current_stream("video:0")

setup.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
IS_ROCM = (torch.version.hip is not None) and (ROCM_HOME is not None)
4343
BUILD_CUDA_SOURCES = (torch.cuda.is_available() and ((CUDA_HOME is not None) or IS_ROCM)) or FORCE_CUDA
4444

45-
PACKAGE_NAME = "torchvision"
45+
package_name = os.getenv("TORCHVISION_PACKAGE_NAME", "torchvision")
4646

4747
print("Torchvision build configuration:")
4848
print(f"{FORCE_CUDA = }")
@@ -98,7 +98,7 @@ def get_dist(pkgname):
9898
except DistributionNotFound:
9999
return None
100100

101-
pytorch_dep = "torch"
101+
pytorch_dep = os.getenv("TORCH_PACKAGE_NAME", "torch")
102102
if os.getenv("PYTORCH_VERSION"):
103103
pytorch_dep += "==" + os.getenv("PYTORCH_VERSION")
104104

@@ -561,7 +561,7 @@ def run(self):
561561
version, sha = get_version()
562562
write_version_file(version, sha)
563563

564-
print(f"Building wheel {PACKAGE_NAME}-{version}")
564+
print(f"Building wheel {package_name}-{version}")
565565

566566
with open("README.md") as f:
567567
readme = f.read()
@@ -573,7 +573,7 @@ def run(self):
573573
]
574574

575575
setup(
576-
name=PACKAGE_NAME,
576+
name=package_name,
577577
version=version,
578578
author="PyTorch Core Team",
579579
author_email="[email protected]",
@@ -583,7 +583,7 @@ def run(self):
583583
long_description_content_type="text/markdown",
584584
license="BSD",
585585
packages=find_packages(exclude=("test",)),
586-
package_data={PACKAGE_NAME: ["*.dll", "*.dylib", "*.so", "prototype/datasets/_builtin/*.categories"]},
586+
package_data={package_name: ["*.dll", "*.dylib", "*.so", "prototype/datasets/_builtin/*.categories"]},
587587
zip_safe=False,
588588
install_requires=get_requirements(),
589589
extras_require={

test/test_transforms_v2.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6169,3 +6169,50 @@ def test_transform_sequence_len_error(self, quality):
61696169
def test_transform_invalid_quality_error(self, quality):
61706170
with pytest.raises(ValueError, match="quality must be an integer from 1 to 100"):
61716171
transforms.JPEG(quality=quality)
6172+
6173+
6174+
class TestUtils:
6175+
# TODO: Still need to test has_all, has_any, check_type and get_bounding_boxes
6176+
@pytest.mark.parametrize(
6177+
"make_input1", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask]
6178+
)
6179+
@pytest.mark.parametrize(
6180+
"make_input2", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask]
6181+
)
6182+
@pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw])
6183+
def test_query_size_and_query_chw(self, make_input1, make_input2, query):
6184+
size = (32, 64)
6185+
input1 = make_input1(size)
6186+
input2 = make_input2(size)
6187+
6188+
if query is transforms.query_chw and not any(
6189+
transforms.check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video))
6190+
for inpt in (input1, input2)
6191+
):
6192+
return
6193+
6194+
expected = size if query is transforms.query_size else ((3,) + size)
6195+
assert query([input1, input2]) == expected
6196+
6197+
@pytest.mark.parametrize(
6198+
"make_input1", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask]
6199+
)
6200+
@pytest.mark.parametrize(
6201+
"make_input2", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask]
6202+
)
6203+
@pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw])
6204+
def test_different_sizes(self, make_input1, make_input2, query):
6205+
input1 = make_input1((10, 10))
6206+
input2 = make_input2((20, 20))
6207+
if query is transforms.query_chw and not all(
6208+
transforms.check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video))
6209+
for inpt in (input1, input2)
6210+
):
6211+
return
6212+
with pytest.raises(ValueError, match="Found multiple"):
6213+
query([input1, input2])
6214+
6215+
@pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw])
6216+
def test_no_valid_input(self, query):
6217+
with pytest.raises(TypeError, match="No image"):
6218+
query(["blah"])

0 commit comments

Comments
 (0)