Use C++ decoding APIs in sampler

NicolasHug · NicolasHug · commit 2bce920bbfad · 2024-10-22T17:08:50.000+01:00
diff --git a/src/torchcodec/samplers/_common.py b/src/torchcodec/samplers/_common.py
@@ -1,8 +1,5 @@
 from typing import Callable, Union
 
-import torch
-from torchcodec import Frame, FrameBatch
-
 _LIST_OF_INT_OR_FLOAT = Union[list[int], list[float]]
 
 
@@ -42,22 +39,6 @@ def _error_policy(
 }
 
 
-def _chunk_list(lst, chunk_size):
-    # return list of sublists of length chunk_size
-    return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]
-
-
-def _to_framebatch(frames: list[Frame]) -> FrameBatch:
-    # IMPORTANT: see other IMPORTANT note in _decode_all_clips_indices and
-    # _decode_all_clips_timestamps
-    data = torch.stack([frame.data for frame in frames])
-    pts_seconds = torch.tensor([frame.pts_seconds for frame in frames])
-    duration_seconds = torch.tensor([frame.duration_seconds for frame in frames])
-    return FrameBatch(
-        data=data, pts_seconds=pts_seconds, duration_seconds=duration_seconds
-    )
-
-
 def _validate_common_params(*, decoder, num_frames_per_clip, policy):
     if len(decoder) < 1:
         raise ValueError(
diff --git a/src/torchcodec/samplers/_index_based.py b/src/torchcodec/samplers/_index_based.py
@@ -1,14 +1,13 @@
-from typing import List, Literal, Optional
+from typing import Literal, Optional
 
 import torch
 
-from torchcodec import Frame, FrameBatch
+from torchcodec import FrameBatch
 from torchcodec.decoders import VideoDecoder
+from torchcodec.decoders._core import get_frames_at_indices
 from torchcodec.samplers._common import (
-    _chunk_list,
     _POLICY_FUNCTION_TYPE,
     _POLICY_FUNCTIONS,
-    _to_framebatch,
     _validate_common_params,
 )
 
@@ -117,51 +116,6 @@ def _build_all_clips_indices(
     return all_clips_indices
 
 
-def _decode_all_clips_indices(
-    decoder: VideoDecoder, all_clips_indices: list[int], num_frames_per_clip: int
-) -> list[FrameBatch]:
-    # This takes the list of all the frames to decode (in arbitrary order),
-    # decode all the frames, and then packs them into clips of length
-    # num_frames_per_clip.
-    #
-    # To avoid backwards seeks (which are slow), we:
-    # - sort all the frame indices to be decoded
-    # - dedup them
-    # - decode all unique frames in sorted order
-    # - re-assemble the decoded frames back to their original order
-    #
-    # TODO: Write this in C++ so we can avoid the copies that happen in `_to_framebatch`
-
-    all_clips_indices_sorted, argsort = zip(
-        *sorted((frame_index, i) for (i, frame_index) in enumerate(all_clips_indices))
-    )
-    previous_decoded_frame = None
-    all_decoded_frames = [None] * len(all_clips_indices)
-    for i, j in enumerate(argsort):
-        frame_index = all_clips_indices_sorted[i]
-        if (
-            previous_decoded_frame is not None  # then we know i > 0
-            and frame_index == all_clips_indices_sorted[i - 1]
-        ):
-            # Avoid decoding the same frame twice.
-            # IMPORTANT: this is only correct because a copy of the frame will
-            # happen within `_to_framebatch` when we call torch.stack.
-            # If a copy isn't made, the same underlying memory will be used for
-            # the 2 consecutive frames. When we re-write this, we should make
-            # sure to explicitly copy the data.
-            decoded_frame = previous_decoded_frame
-        else:
-            decoded_frame = decoder.get_frame_at(index=frame_index)
-        previous_decoded_frame = decoded_frame
-        all_decoded_frames[j] = decoded_frame
-
-    all_clips: list[list[Frame]] = _chunk_list(
-        all_decoded_frames, chunk_size=num_frames_per_clip
-    )
-
-    return [_to_framebatch(clip) for clip in all_clips]
-
-
 def _generic_index_based_sampler(
     kind: Literal["random", "regular"],
     decoder: VideoDecoder,
@@ -174,7 +128,7 @@ def _generic_index_based_sampler(
     # Important note: sampling_range_end defines the upper bound of where a clip
     # can *start*, not where a clip can end.
     policy: Literal["repeat_last", "wrap", "error"],
-) -> List[FrameBatch]:
+) -> FrameBatch:
 
     _validate_common_params(
         decoder=decoder,
@@ -221,11 +175,27 @@ def _generic_index_based_sampler(
         num_frames_in_video=len(decoder),
         policy_fun=_POLICY_FUNCTIONS[policy],
     )
-    return _decode_all_clips_indices(
-        decoder,
-        all_clips_indices=all_clips_indices,
-        num_frames_per_clip=num_frames_per_clip,
+
+    frames, pts_seconds, duration_seconds = get_frames_at_indices(
+        decoder._decoder,
+        stream_index=decoder.stream_index,
+        frame_indices=all_clips_indices,
+        sort_indices=True,
+    )
+    last_3_dims = frames.shape[-3:]
+    out = FrameBatch(
+        data=frames.view(num_clips, num_frames_per_clip, *last_3_dims),
+        pts_seconds=pts_seconds.view(num_clips, num_frames_per_clip),
+        duration_seconds=duration_seconds.view(num_clips, num_frames_per_clip),
     )
+    return [
+        FrameBatch(
+            out.data[i],
+            out.pts_seconds[i],
+            out.duration_seconds[i],
+        )
+        for i in range(out.data.shape[0])
+    ]
 
 
 def clips_at_random_indices(
@@ -237,7 +207,7 @@ def clips_at_random_indices(
     sampling_range_start: int = 0,
     sampling_range_end: Optional[int] = None,  # interval is [start, end).
     policy: Literal["repeat_last", "wrap", "error"] = "repeat_last",
-) -> List[FrameBatch]:
+) -> FrameBatch:
     return _generic_index_based_sampler(
         kind="random",
         decoder=decoder,
@@ -259,7 +229,7 @@ def clips_at_regular_indices(
     sampling_range_start: int = 0,
     sampling_range_end: Optional[int] = None,  # interval is [start, end).
     policy: Literal["repeat_last", "wrap", "error"] = "repeat_last",
-) -> List[FrameBatch]:
+) -> FrameBatch:
 
     return _generic_index_based_sampler(
         kind="regular",
diff --git a/src/torchcodec/samplers/_time_based.py b/src/torchcodec/samplers/_time_based.py
@@ -2,13 +2,11 @@
 
 import torch
 
-from torchcodec import Frame, FrameBatch
-from torchcodec.decoders import VideoDecoder
+from torchcodec import FrameBatch
+from torchcodec.decoders._core import get_frames_at_ptss
 from torchcodec.samplers._common import (
-    _chunk_list,
     _POLICY_FUNCTION_TYPE,
     _POLICY_FUNCTIONS,
-    _to_framebatch,
     _validate_common_params,
 )
 
@@ -147,51 +145,6 @@ def _build_all_clips_timestamps(
     return all_clips_timestamps
 
 
-def _decode_all_clips_timestamps(
-    decoder: VideoDecoder, all_clips_timestamps: list[float], num_frames_per_clip: int
-) -> list[FrameBatch]:
-    # This is 99% the same as _decode_all_clips_indices. The only change is the
-    # call to .get_frame_displayed_at(pts) instead of .get_frame_at(idx)
-
-    all_clips_timestamps_sorted, argsort = zip(
-        *sorted(
-            (frame_index, i) for (i, frame_index) in enumerate(all_clips_timestamps)
-        )
-    )
-    previous_decoded_frame = None
-    all_decoded_frames = [None] * len(all_clips_timestamps)
-    for i, j in enumerate(argsort):
-        frame_pts_seconds = all_clips_timestamps_sorted[i]
-        if (
-            previous_decoded_frame is not None  # then we know i > 0
-            and frame_pts_seconds == all_clips_timestamps_sorted[i - 1]
-        ):
-            # Avoid decoding the same frame twice.
-            # Unfortunatly this is unlikely to lead to speed-up as-is: it's
-            # pretty unlikely that 2 pts will be the same since pts are float
-            # contiguous values. Theoretically the dedup can still happen, but
-            # it would be much more efficient to implement it at the frame index
-            # level. We should do that once we implement that in C++.
-            # See also https://github.com/pytorch/torchcodec/issues/256.
-            #
-            # IMPORTANT: this is only correct because a copy of the frame will
-            # happen within `_to_framebatch` when we call torch.stack.
-            # If a copy isn't made, the same underlying memory will be used for
-            # the 2 consecutive frames. When we re-write this, we should make
-            # sure to explicitly copy the data.
-            decoded_frame = previous_decoded_frame
-        else:
-            decoded_frame = decoder.get_frame_displayed_at(seconds=frame_pts_seconds)
-        previous_decoded_frame = decoded_frame
-        all_decoded_frames[j] = decoded_frame
-
-    all_clips: list[list[Frame]] = _chunk_list(
-        all_decoded_frames, chunk_size=num_frames_per_clip
-    )
-
-    return [_to_framebatch(clip) for clip in all_clips]
-
-
 def _generic_time_based_sampler(
     kind: Literal["random", "regular"],
     decoder,
@@ -204,7 +157,7 @@ def _generic_time_based_sampler(
     sampling_range_start: Optional[float],
     sampling_range_end: Optional[float],  # interval is [start, end).
     policy: str = "repeat_last",
-) -> List[FrameBatch]:
+) -> FrameBatch:
     # Note: *everywhere*, sampling_range_end denotes the upper bound of where a
     # clip can start. This is an *open* upper bound, i.e. we will make sure no
     # clip starts exactly at (or above) sampling_range_end.
@@ -246,6 +199,7 @@ def _generic_time_based_sampler(
             sampling_range_end,  # excluded
             seconds_between_clip_starts,
         )
+        num_clips = len(clip_start_seconds)
 
     all_clips_timestamps = _build_all_clips_timestamps(
         clip_start_seconds=clip_start_seconds,
@@ -255,11 +209,27 @@ def _generic_time_based_sampler(
         policy_fun=_POLICY_FUNCTIONS[policy],
     )
 
-    return _decode_all_clips_timestamps(
-        decoder,
-        all_clips_timestamps=all_clips_timestamps,
-        num_frames_per_clip=num_frames_per_clip,
+    frames, pts_seconds, duration_seconds = get_frames_at_ptss(
+        decoder._decoder,
+        stream_index=decoder.stream_index,
+        frame_ptss=all_clips_timestamps,
+        sort_ptss=True,
     )
+    last_3_dims = frames.shape[-3:]
+
+    out = FrameBatch(
+        data=frames.view(num_clips, num_frames_per_clip, *last_3_dims),
+        pts_seconds=pts_seconds.view(num_clips, num_frames_per_clip),
+        duration_seconds=duration_seconds.view(num_clips, num_frames_per_clip),
+    )
+    return [
+        FrameBatch(
+            out.data[i],
+            out.pts_seconds[i],
+            out.duration_seconds[i],
+        )
+        for i in range(out.data.shape[0])
+    ]
 
 
 def clips_at_random_timestamps(
@@ -272,7 +242,7 @@ def clips_at_random_timestamps(
     sampling_range_start: Optional[float] = None,
     sampling_range_end: Optional[float] = None,  # interval is [start, end).
     policy: str = "repeat_last",
-) -> List[FrameBatch]:
+) -> FrameBatch:
     return _generic_time_based_sampler(
         kind="random",
         decoder=decoder,
@@ -296,7 +266,7 @@ def clips_at_regular_timestamps(
     sampling_range_start: Optional[float] = None,
     sampling_range_end: Optional[float] = None,  # interval is [start, end).
     policy: str = "repeat_last",
-) -> List[FrameBatch]:
+) -> FrameBatch:
     return _generic_time_based_sampler(
         kind="regular",
         decoder=decoder,
diff --git a/test/samplers/test_samplers.py b/test/samplers/test_samplers.py
@@ -130,7 +130,7 @@ def test_time_based_sampler(sampler, seconds_between_frames):
     if sampler.func is clips_at_regular_timestamps:
         seconds_between_clip_starts = sampler.keywords["seconds_between_clip_starts"]
         expected_seconds_between_clip_starts = torch.tensor(
-            [seconds_between_clip_starts] * (len(clips) - 1), dtype=torch.float
+            [seconds_between_clip_starts] * (len(clips) - 1), dtype=torch.float64
         )
         _assert_regular_sampler(
             clips=clips,

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -130,7 +130,7 @@ def test_time_based_sampler(sampler, seconds_between_frames):` |
| `130` | `130` | `if sampler.func is clips_at_regular_timestamps:` |
| `131` | `131` | `seconds_between_clip_starts = sampler.keywords["seconds_between_clip_starts"]` |
| `132` | `132` | `expected_seconds_between_clip_starts = torch.tensor(` |
| `133` |  | `- [seconds_between_clip_starts] * (len(clips) - 1), dtype=torch.float` |
|  | `133` | `+ [seconds_between_clip_starts] * (len(clips) - 1), dtype=torch.float64` |
| `134` | `134` | `)` |
| `135` | `135` | `_assert_regular_sampler(` |
| `136` | `136` | `clips=clips,` |