Added proper tensor support for get_frames_played_at()

Molly Xu · Molly Xu · commit 6de86928000f · 2025-10-04T11:44:40.000-07:00
Summary:
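Modified get_frames_played_at in _video_decoder to pass tensor inputs straight through instead of converting them to lists, and updated the downstream functions (the get_frames_by_pts core op and SingleStreamDecoder::getFramesPlayedAt) to accept tensors natively. List inputs are still accepted: the get_frames_by_pts wrapper in ops.py converts them to a float64 tensor before dispatch.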

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
diff --git a/src/torchcodec/_core/SingleStreamDecoder.cpp b/src/torchcodec/_core/SingleStreamDecoder.cpp
@@ -753,7 +753,7 @@ FrameOutput SingleStreamDecoder::getFramePlayedAt(double seconds) {
 }
 
 FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
-    const std::vector<double>& timestamps) {
+    const torch::Tensor& timestamps) {
   validateActiveStream(AVMEDIA_TYPE_VIDEO);
 
   const auto& streamMetadata =
@@ -767,9 +767,13 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
   // avoid decoding that unique frame twice is to convert the input timestamps
   // to indices, and leverage the de-duplication logic of getFramesAtIndices.
 
-  std::vector<int64_t> frameIndices(timestamps.size());
-  for (size_t i = 0; i < timestamps.size(); ++i) {
-    auto frameSeconds = timestamps[i];
+  torch::Tensor frameIndices =
+      torch::empty({timestamps.numel()}, torch::kInt64);
+  auto frameIndicesAccessor = frameIndices.accessor<int64_t, 1>();
+  auto timestampsAccessor = timestamps.accessor<double, 1>();
+
+  for (int64_t i = 0; i < timestamps.numel(); ++i) {
+    auto frameSeconds = timestampsAccessor[i];
     TORCH_CHECK(
         frameSeconds >= minSeconds,
         "frame pts is " + std::to_string(frameSeconds) +
@@ -786,11 +790,10 @@ FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
               ".");
     }
 
-    frameIndices[i] = secondsToIndexLowerBound(frameSeconds);
+    frameIndicesAccessor[i] = secondsToIndexLowerBound(frameSeconds);
   }
 
-  // TODO: Support tensors natively instead of a vector to avoid a copy.
-  return getFramesAtIndices(torch::tensor(frameIndices));
+  return getFramesAtIndices(frameIndices);
 }
 
 FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(
diff --git a/src/torchcodec/_core/SingleStreamDecoder.h b/src/torchcodec/_core/SingleStreamDecoder.h
@@ -121,7 +121,7 @@ class SingleStreamDecoder {
   // seconds=5.999, etc.
   FrameOutput getFramePlayedAt(double seconds);
 
-  FrameBatchOutput getFramesPlayedAt(const std::vector<double>& timestamps);
+  FrameBatchOutput getFramesPlayedAt(const torch::Tensor& timestamps);
 
   // Returns frames within a given pts range. The range is defined by
   // [startSeconds, stopSeconds) with respect to the pts values for frames. The
diff --git a/src/torchcodec/_core/custom_ops.cpp b/src/torchcodec/_core/custom_ops.cpp
@@ -63,7 +63,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "get_frames_by_pts_in_range_audio(Tensor(a!) decoder, *, float start_seconds, float? stop_seconds) -> (Tensor, Tensor)");
   m.def(
-      "get_frames_by_pts(Tensor(a!) decoder, *, float[] timestamps) -> (Tensor, Tensor, Tensor)");
+      "get_frames_by_pts(Tensor(a!) decoder, *, Tensor timestamps) -> (Tensor, Tensor, Tensor)");
   m.def("_get_key_frame_indices(Tensor(a!) decoder) -> Tensor");
   m.def("get_json_metadata(Tensor(a!) decoder) -> str");
   m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
@@ -405,10 +405,9 @@ OpsFrameBatchOutput get_frames_in_range(
 // Return the frames at given ptss for a given stream
 OpsFrameBatchOutput get_frames_by_pts(
     at::Tensor& decoder,
-    at::ArrayRef<double> timestamps) {
+    const at::Tensor& timestamps) {
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  std::vector<double> timestampsVec(timestamps.begin(), timestamps.end());
-  auto result = videoDecoder->getFramesPlayedAt(timestampsVec);
+  auto result = videoDecoder->getFramesPlayedAt(timestamps);
   return makeOpsFrameBatchOutput(result);
 }
 
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -117,7 +117,7 @@ def load_torchcodec_shared_libraries():
 _get_frames_at_indices_tensor_input = (
     torch.ops.torchcodec_ns.get_frames_at_indices.default
 )
-get_frames_by_pts = torch.ops.torchcodec_ns.get_frames_by_pts.default
+_get_frames_by_pts_tensor_input = torch.ops.torchcodec_ns.get_frames_by_pts.default
 get_frames_in_range = torch.ops.torchcodec_ns.get_frames_in_range.default
 get_frames_by_pts_in_range = torch.ops.torchcodec_ns.get_frames_by_pts_in_range.default
 get_frames_by_pts_in_range_audio = (
@@ -212,6 +212,22 @@ def get_frames_at_indices(
     return _get_frames_at_indices_tensor_input(decoder, frame_indices=frame_indices)
 
 
+def get_frames_by_pts(
+    decoder: torch.Tensor, *, timestamps: Union[torch.Tensor, list[float]]
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if isinstance(timestamps, torch.Tensor):
+        # Ensure timestamps is the correct dtype (float64)
+        timestamps = timestamps.to(torch.float64)
+    else:
+        # Convert list to tensor for dispatch
+        try:
+            timestamps = torch.tensor(timestamps, dtype=torch.float64)
+        except (ValueError, TypeError):
+            # Leave timestamps unconverted; the op call below does the type validation
+            pass
+    return _get_frames_by_pts_tensor_input(decoder, timestamps=timestamps)
+
+
 # ==============================
 # Abstract impl for the operators. Needed by torch.compile.
 # ==============================
@@ -363,7 +379,7 @@ def get_frame_at_pts_abstract(
 def get_frames_by_pts_abstract(
     decoder: torch.Tensor,
     *,
-    timestamps: List[float],
+    timestamps: Union[torch.Tensor, List[float]],
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
diff --git a/src/torchcodec/decoders/_video_decoder.py b/src/torchcodec/decoders/_video_decoder.py
@@ -336,20 +336,17 @@ def get_frame_played_at(self, seconds: float) -> Frame:
             duration_seconds=duration_seconds.item(),
         )
 
-    def get_frames_played_at(self, seconds: list[float]) -> FrameBatch:
+    def get_frames_played_at(
+        self, seconds: Union[torch.Tensor, list[float]]
+    ) -> FrameBatch:
         """Return frames played at the given timestamps in seconds.
 
         Args:
-            seconds (list of float): The timestamps in seconds when the frames are played.
+            seconds (torch.Tensor or list of float): The timestamps in seconds when the frames are played.
 
         Returns:
             FrameBatch: The frames that are played at ``seconds``.
         """
-        if isinstance(seconds, torch.Tensor):
-            # TODO we should avoid converting tensors to lists and just let the
-            # core ops and C++ code natively accept tensors.  See
-            # https://github.com/pytorch/torchcodec/issues/879
-            seconds = seconds.to(torch.float).tolist()
 
         data, pts_seconds, duration_seconds = core.get_frames_by_pts(
             self._decoder, timestamps=seconds