Let core ops return 3D tensors

NicolasHug · NicolasHug · commit 915631d205bf · 2025-02-15T20:27:59.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -1402,10 +1402,6 @@ VideoDecoder::FrameBatchOutput ::FrameBatchOutput(
                            .dtype(torch::kFloat32)
                            .layout(torch::kStrided)
                            .device(torch::kCPU);
-  // Note that we allocate a 3D shape. We'll eventually return a 2D shape
-  // (numChannels, numSamples * numFrames) where each frame is concatenated
-  // along the 2nd dimension. Allocating tensors this way makes it much easier
-  // to use the same code paths for audio and video for batch APIs.
   data = torch::empty({numFrames, numChannels, numSamples}, tensorOptions);
 }
 
@@ -1438,15 +1434,7 @@ torch::Tensor allocateEmptyHWCTensor(
 torch::Tensor VideoDecoder::maybePermuteHWC2CHW(torch::Tensor& hwcTensor) {
   if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_AUDIO) {
     // TODO_CODE_QUALITY: Do something cleaner for handling audio
-    if (hwcTensor.dim() == 2) {
-      return hwcTensor;
-    }
-    auto shape = hwcTensor.sizes();
-    auto numFrames = shape[0];
-    auto numChannels = shape[1];
-    auto numSamples = shape[2];
-    return hwcTensor.permute({1, 0, 2}).reshape(
-        {numChannels, numSamples * numFrames});
+    return hwcTensor;
   }
   if (streamInfos_[activeStreamIndex_].videoStreamOptions.dimensionOrder ==
       "NHWC") {
diff --git a/test/decoders/test_video_decoder_ops.py b/test/decoders/test_video_decoder_ops.py
@@ -38,7 +38,6 @@
 
 from ..utils import (
     assert_frames_equal,
-    contiguous_to_stacked_audio_frames,
     cpu_and_cuda,
     NASA_AUDIO,
     NASA_VIDEO,
@@ -237,10 +236,6 @@ def test_get_frames_at_indices(self, test_ref, device):
         frames0and180, *_ = get_frames_at_indices(decoder, frame_indices=[0, 180])
         reference_frame0 = test_ref.get_frame_data_by_index(0)
         reference_frame180 = test_ref.get_frame_data_by_index(180)
-        if test_ref is NASA_AUDIO:
-            frames0and180 = contiguous_to_stacked_audio_frames(
-                frames0and180, num_frames=2
-            )
 
         assert_frames_equal(frames0and180[0], reference_frame0.to(device))
         assert_frames_equal(frames0and180[1], reference_frame180.to(device))
@@ -265,10 +260,6 @@ def test_get_frames_at_indices_unsorted_indices(self, test_ref, device):
             decoder,
             frame_indices=frame_indices,
         )
-        if test_ref is NASA_AUDIO:
-            frames = contiguous_to_stacked_audio_frames(
-                frames, num_frames=len(frame_indices)
-            )
         for frame, expected_frame in zip(frames, expected_frames):
             assert_frames_equal(frame, expected_frame)
 
diff --git a/test/utils.py b/test/utils.py
@@ -23,15 +23,6 @@ def cpu_and_cuda():
     return ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda))
 
 
-def contiguous_to_stacked_audio_frames(frames, *, num_frames):
-    # (num_channels, num_samples * num_frames) --> (num_frames, num_channels, num_samples)
-    # Shape conversion util for audio frame. This makes it easier to index
-    # individual frames so we can use the same code paths when checking equality
-    # of video frames and audio frames.
-    num_channels = frames.shape[0]
-    return frames.reshape(num_channels, num_frames, -1).permute(1, 0, 2)
-
-
 # For use with decoded data frames. On CPU Linux, we expect exact, bit-for-bit
 # equality. On CUDA Linux, we expect a small tolerance.
 # On other platforms (e.g. MacOS), we also allow a small tolerance. FFmpeg does
@@ -128,7 +119,11 @@ def get_frame_data_by_range(
         *,
         stream_index: Optional[int] = None,
     ) -> torch.Tensor:
-        raise NotImplementedError("Override in child classes")
+        tensors = [
+            self.get_frame_data_by_index(i, stream_index=stream_index)
+            for i in range(start, stop, step)
+        ]
+        return torch.stack(tensors)
 
     def get_pts_seconds_by_range(
         self,
@@ -202,20 +197,6 @@ def get_frame_data_by_index(
         )
         return torch.load(file_path, weights_only=True).permute(2, 0, 1)
 
-    def get_frame_data_by_range(
-        self,
-        start: int,
-        stop: int,
-        step: int = 1,
-        *,
-        stream_index: Optional[int] = None,
-    ) -> torch.Tensor:
-        tensors = [
-            self.get_frame_data_by_index(i, stream_index=stream_index)
-            for i in range(start, stop, step)
-        ]
-        return torch.stack(tensors)
-
     @property
     def width(self) -> int:
         return self.stream_infos[self.default_stream_index].width
@@ -356,24 +337,10 @@ def get_frame_data_by_index(
 
         return self._reference_frames[idx]
 
-    def get_frame_data_by_range(
-        self,
-        start: int,
-        stop: int,
-        step: int = 1,
-        *,
-        stream_index: Optional[int] = None,
-    ) -> torch.Tensor:
-        tensors = [
-            self.get_frame_data_by_index(i, stream_index=stream_index)
-            for i in range(start, stop, step)
-        ]
-        return torch.cat(tensors, dim=1)
-
-    # TODO: this shouldn't be named chw
+    # TODO: this shouldn't be named chw. Also, the shape values (2 channels, 1024 samples) are hard-coded.
     @property
     def empty_chw_tensor(self) -> torch.Tensor:
-        return torch.empty([2, 0], dtype=torch.float32)
+        return torch.empty([0, 2, 1024], dtype=torch.float32)
 
 
 NASA_AUDIO = TestAudio(