VideoAPI docs update (#2802)

bjuncek · Bruno Korbar · fmassa · web-flow · commit 2831f11abcb9 · 2020-10-13T17:22:07.000+02:00
* Video reader now returns dicts

* docs update

* Minor improvements

Co-authored-by: Bruno Korbar <bjuncek@Frazz.local>
Co-authored-by: Francisco Massa <fvsmassa@gmail.com>
diff --git a/docs/source/io.rst b/docs/source/io.rst
@@ -25,10 +25,10 @@ lower-level API for more fine-grained control compared to the :mod:`read_video`
 It does all this whilst fully supporting torchscript.
 
 .. autoclass:: VideoReader
-    :members: next, get_metadata, set_current_stream, seek
+    :members: __next__, get_metadata, set_current_stream, seek
 
 
-Example of usage:
+Example of inspecting a video:
 
 .. code:: python
 
@@ -50,6 +50,11 @@ Example of usage:
     # following would print out the list of frame rates for every present video stream
     print(reader_md["video"]["fps"])
 
+    # we explicitly select the stream we would like to operate on. In
+    # the constructor we select a default video stream, but
+    # in practice, we can set whichever stream we would like
+    reader.set_current_stream("video:0")
+
 
 Image
 -----
diff --git a/test/test_video.py b/test/test_video.py
@@ -244,11 +244,11 @@ def _template_read_video(video_object, s=0, e=None):
     video_frames = torch.empty(0)
     frames = []
     video_pts = []
-    for t, pts in itertools.takewhile(lambda x: x[1] <= e, video_object):
-        if pts < s:
+    for frame in itertools.takewhile(lambda x: x['pts'] <= e, video_object):
+        if frame['pts'] < s:
             continue
-        frames.append(t)
-        video_pts.append(pts)
+        frames.append(frame['data'])
+        video_pts.append(frame['pts'])
     if len(frames) > 0:
         video_frames = torch.stack(frames, 0)
 
@@ -257,11 +257,11 @@ def _template_read_video(video_object, s=0, e=None):
     audio_frames = torch.empty(0)
     frames = []
     audio_pts = []
-    for t, pts in itertools.takewhile(lambda x: x[1] <= e, video_object):
-        if pts < s:
+    for frame in itertools.takewhile(lambda x: x['pts'] <= e, video_object):
+        if frame['pts'] < s:
             continue
-        frames.append(t)
-        audio_pts.append(pts)
+        frames.append(frame['data'])
+        audio_pts.append(frame['pts'])
     if len(frames) > 0:
         audio_frames = torch.stack(frames, 0)
 
@@ -293,8 +293,8 @@ def test_read_video_tensor(self):
             # pass 2: decode all frames using new api
             reader = VideoReader(full_path, "video")
             frames = []
-            for t, _ in reader:
-                frames.append(t)
+            for frame in reader:
+                frames.append(frame['data'])
             new_api = torch.stack(frames, 0)
             self.assertEqual(tv_result.size(), new_api.size())
 
diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py
@@ -41,21 +41,48 @@ class VideoReader:
     container.
 
     Example:
-        The following examples creates :mod:`Video` object, seeks into 2s
+        The following example creates a :mod:`VideoReader` object, seeks into the 2s
         point, and returns a single frame::
                 import torchvision
                 video_path = "path_to_a_test_video"
 
                 reader = torchvision.io.VideoReader(video_path, "video")
                 reader.seek(2.0)
-                frame, timestamp = next(reader)
+                frame = next(reader)
+
+        :mod:`VideoReader` implements the iterable API, which makes it suitable for
+        use in conjunction with :mod:`itertools` for more advanced reading.
+        As such, we can use a :mod:`VideoReader` instance inside for loops::
+            reader.seek(2)
+            for frame in reader:
+                frames.append(frame['data'])
+            # additionally, `seek` implements a fluent API, so we can do
+            for frame in reader.seek(2):
+                frames.append(frame['data'])
+        With :mod:`itertools`, we can read all frames between 2 and 5 seconds with the
+        following code::
+            for frame in itertools.takewhile(lambda x: x['pts'] <= 5, reader.seek(2)):
+                frames.append(frame['data'])
+        and similarly, reading 10 frames after the 2s timestamp can be achieved
+        as follows::
+            for frame in itertools.islice(reader.seek(2), 10):
+                frames.append(frame['data'])
+
+    .. note::
+
+        Each stream descriptor consists of two parts: stream type (e.g. 'video') and
+        a unique stream id (which are determined by the video encoding).
+        In this way, if the video container contains multiple
+        streams of the same type, users can access the one they want.
+        If only stream type is passed, the decoder auto-detects the first stream of that type.
 
     Args:
 
         path (string): Path to the video file in supported format
 
-        stream (string, optional): descriptor of the required stream. Defaults to "video:0"
-            Currently available options include :mod:`['video', 'audio', 'cc', 'sub']`
+        stream (string, optional): descriptor of the required stream: the stream type, optionally
+            followed by the stream id, in the format ``{stream_type}:{stream_id}``. Defaults to ``"video"``.
+            Currently available options include ``['video', 'audio']``
     """
 
     def __init__(self, path, stream="video"):
@@ -67,13 +94,14 @@ def __next__(self):
         """Decodes and returns the next frame of the current stream
 
         Returns:
-            ([torch.Tensor, float]): list containing decoded frame and corresponding timestamp
+            (dict): a dictionary with fields ``data`` and ``pts``
+            containing decoded frame and corresponding timestamp
 
         """
         frame, pts = self._c.next()
         if frame.numel() == 0:
             raise StopIteration
-        return frame, pts
+        return {"data": frame, "pts": pts}
 
     def __iter__(self):
         return self
@@ -88,7 +116,7 @@ def seek(self, time_s: float):
             Current implementation is the so-called precise seek. This
             means following seek, call to :mod:`next()` will return the
             frame with the exact timestamp if it exists or
-            the first frame with timestamp larger than time_s.
+            the first frame with timestamp larger than ``time_s``.
         """
         self._c.seek(time_s)
         return self
@@ -106,8 +134,8 @@ def set_current_stream(self, stream: str):
         Explicitly define the stream we are operating on.
 
         Args:
-            stream (string): descriptor of the required stream. Defaults to "video:0"
-                Currently available stream types include :mod:`['video', 'audio', 'cc', 'sub']`.
+            stream (string): descriptor of the required stream, e.g. ``"video:0"``.
+                Currently available stream types include ``['video', 'audio']``.
                 Each descriptor consists of two parts: stream type (e.g. 'video') and
                 a unique stream id (which are determined by video encoding).
                 In this way, if the video contaner contains multiple