From 1e06ea57a439a061700b2649f952d8635983907a Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Mon, 20 Oct 2025 15:38:35 -0700 Subject: [PATCH 1/7] video encoder python file --- src/torchcodec/encoders/__init__.py | 1 + src/torchcodec/encoders/_video_encoder.py | 97 +++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 src/torchcodec/encoders/_video_encoder.py diff --git a/src/torchcodec/encoders/__init__.py b/src/torchcodec/encoders/__init__.py index 51f5942b3..cf78fe427 100644 --- a/src/torchcodec/encoders/__init__.py +++ b/src/torchcodec/encoders/__init__.py @@ -1 +1,2 @@ from ._audio_encoder import AudioEncoder # noqa +from ._video_encoder import VideoEncoder # noqa diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py new file mode 100644 index 000000000..fd354abeb --- /dev/null +++ b/src/torchcodec/encoders/_video_encoder.py @@ -0,0 +1,97 @@ +from pathlib import Path +from typing import Union + +import torch +from torch import Tensor + +from torchcodec import _core + + +class VideoEncoder: + """A video encoder. + + Args: + frames (``torch.Tensor``): The frames to encode. This must be a 4D + tensor of shape ``(N, C, H, W)`` where N is the number of frames, + C is 3 channels (RGB), H is height, and W is width. + A 3D tensor of shape ``(C, H, W)`` is also accepted as a single RGB frame. + Values must be uint8 in the range ``[0, 255]``. + frame_rate (int): The frame rate to use when encoding the + **input** ``frames``. 
+ """ + + def __init__(self, frames: Tensor, *, frame_rate: int): + torch._C._log_api_usage_once("torchcodec.encoders.VideoEncoder") + if not isinstance(frames, Tensor): + raise ValueError(f"Expected frames to be a Tensor, got {type(frames) = }.") + if frames.ndim == 3: + # make it 4D and assume single RGB frame, CHW -> NCHW + frames = torch.unsqueeze(frames, 0) + if frames.ndim != 4: + raise ValueError(f"Expected 3D or 4D frames, got {frames.shape = }.") + if frames.dtype != torch.uint8: + raise ValueError(f"Expected uint8 frames, got {frames.dtype = }.") + if frame_rate <= 0: + raise ValueError(f"{frame_rate = } must be > 0.") + + self._frames = frames + self._frame_rate = frame_rate + + def to_file( + self, + dest: Union[str, Path], + ) -> None: + """Encode frames into a file. + + Args: + dest (str or ``pathlib.Path``): The path to the output file, e.g. + ``video.mp4``. The extension of the file determines the video + format and container. + """ + _core.encode_video_to_file( + frames=self._frames, + frame_rate=self._frame_rate, + filename=str(dest), + ) + + def to_tensor( + self, + format: str, + ) -> Tensor: + """Encode frames into raw bytes, as a 1D uint8 Tensor. + + Args: + format (str): The format of the encoded frames, e.g. "mp4", "mov", + "mkv", "avi", "webm", "flv", or "gif" + + Returns: + Tensor: The raw encoded bytes as a 1D uint8 Tensor. + """ + return _core.encode_video_to_tensor( + frames=self._frames, + frame_rate=self._frame_rate, + format=format, + ) + + def to_file_like( + self, + file_like, + format: str, + ) -> None: + """Encode frames into a file-like object. + + Args: + file_like: A file-like object that supports ``write()`` and + ``seek()`` methods, such as io.BytesIO(), an open file in binary + write mode, etc. Methods must have the following signature: + ``write(data: bytes) -> int`` and ``seek(offset: int, whence: + int = 0) -> int``. + format (str): The format of the encoded frames, e.g. 
"mp4", "mov", + "mkv", "avi", "webm", "flv", or "gif". + """ + _core.encode_video_to_file_like( + frames=self._frames, + frame_rate=self._frame_rate, + format=format, + file_like=file_like, + ) From 1e7dc3468b06998207f695afefedd3a6cde9465c Mon Sep 17 00:00:00 2001 From: Daniel Flores Date: Mon, 20 Oct 2025 15:42:03 -0700 Subject: [PATCH 2/7] testing --- test/test_encoders.py | 116 +++++++++++++++++++++++++++++++++++++++++- test/test_ops.py | 68 ++----------------------- 2 files changed, 120 insertions(+), 64 deletions(-) diff --git a/test/test_encoders.py b/test/test_encoders.py index c5946654d..4f3c0cf76 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -11,7 +11,7 @@ import torch from torchcodec.decoders import AudioDecoder -from torchcodec.encoders import AudioEncoder +from torchcodec.encoders import AudioEncoder, VideoEncoder from .utils import ( assert_tensor_close_on_at_least, @@ -564,3 +564,117 @@ def write(self, data): RuntimeError, match="File like object must implement a seek method" ): encoder.to_file_like(NoSeekMethod(), format="wav") + + +class TestVideoEncoder: + @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) + def test_bad_input_parameterized(self, tmp_path, method): + if method == "to_file": + valid_params = dict(dest=str(tmp_path / "output.mp4")) + elif method == "to_tensor": + valid_params = dict(format="mp4") + elif method == "to_file_like": + valid_params = dict(file_like=io.BytesIO(), format="mp4") + else: + raise ValueError(f"Unknown method: {method}") + + with pytest.raises( + ValueError, match="Expected uint8 frames, got frames.dtype = torch.float32" + ): + encoder = VideoEncoder( + frames=torch.rand(5, 3, 64, 64), + frame_rate=30, + ) + getattr(encoder, method)(**valid_params) + + with pytest.raises( + ValueError, match=r"Expected 3D or 4D frames, got frames.shape = torch.Size" + ): + encoder = VideoEncoder( + frames=torch.zeros(10), + frame_rate=30, + ) + getattr(encoder, 
method)(**valid_params) + + with pytest.raises( + RuntimeError, match=r"frame must have 3 channels \(R, G, B\), got 2" + ): + encoder = VideoEncoder( + frames=torch.zeros((5, 2, 64, 64), dtype=torch.uint8), + frame_rate=30, + ) + getattr(encoder, method)(**valid_params) + + def test_bad_input(self, tmp_path): + encoder = VideoEncoder( + frames=torch.zeros((5, 3, 64, 64), dtype=torch.uint8), + frame_rate=30, + ) + + with pytest.raises( + RuntimeError, + match=r"Couldn't allocate AVFormatContext. The destination file is ./file.bad_extension, check the desired extension\?", + ): + encoder.to_file("./file.bad_extension") + + with pytest.raises( + RuntimeError, + match=r"avio_open failed. The destination file is ./bad/path.mp3, make sure it's a valid path\?", + ): + encoder.to_file("./bad/path.mp3") + + with pytest.raises( + RuntimeError, + match=r"Couldn't allocate AVFormatContext. Check the desired format\? Got format=bad_format", + ): + encoder.to_tensor(format="bad_format") + + @pytest.mark.parametrize("method", ("to_file", "to_tensor", "to_file_like")) + def test_contiguity(self, method, tmp_path): + # Ensure that 2 sets of video frames with the same pixel values are encoded + # in the same way, regardless of their memory layout. Here we encode 2 equal + # frame tensors, one is contiguous while the other is non-contiguous. 
+ + num_frames, channels, height, width = 5, 3, 64, 64 + contiguous_frames = ( + (torch.rand(num_frames, channels, height, width) * 255) + .to(torch.uint8) + .contiguous() + ) + assert contiguous_frames.is_contiguous() + + # Create non-contiguous frames by permuting, calling contiguous to update memory layout, + # then permuting back to the initial order + non_contiguous_frames = ( + contiguous_frames.permute(0, 3, 2, 1).contiguous().permute(0, 3, 2, 1) + ) + assert non_contiguous_frames.stride() != contiguous_frames.stride() + assert not non_contiguous_frames.is_contiguous() + + torch.testing.assert_close( + contiguous_frames, non_contiguous_frames, rtol=0, atol=0 + ) + + def encode_to_tensor(frames): + if method == "to_file": + dest = str(tmp_path / "output.mp4") + VideoEncoder(frames, frame_rate=30).to_file(dest=dest) + with open(dest, "rb") as f: + return torch.frombuffer(f.read(), dtype=torch.uint8) + elif method == "to_tensor": + return VideoEncoder(frames, frame_rate=30).to_tensor(format="mp4") + elif method == "to_file_like": + file_like = io.BytesIO() + VideoEncoder(frames, frame_rate=30).to_file_like( + file_like, format="mp4" + ) + return torch.frombuffer(file_like.getvalue(), dtype=torch.uint8) + else: + raise ValueError(f"Unknown method: {method}") + + encoded_from_contiguous = encode_to_tensor(contiguous_frames) + encoded_from_non_contiguous = encode_to_tensor(non_contiguous_frames) + + torch.testing.assert_close( + encoded_from_contiguous, encoded_from_non_contiguous, rtol=0, atol=0 + ) diff --git a/test/test_ops.py b/test/test_ops.py index 627829689..075929335 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1152,68 +1152,6 @@ def test_bad_input(self, tmp_path): class TestVideoEncoderOps: - # TODO-VideoEncoder: Test encoding against different memory layouts (ex. 
test_contiguity) - # TODO-VideoEncoder: Parametrize test after moving to test_encoders - def test_bad_input(self, tmp_path): - output_file = str(tmp_path / ".mp4") - - with pytest.raises( - RuntimeError, match="frames must have uint8 dtype, got float" - ): - encode_video_to_file( - frames=torch.rand((10, 3, 60, 60), dtype=torch.float), - frame_rate=10, - filename=output_file, - ) - - with pytest.raises( - RuntimeError, match=r"frames must have 4 dimensions \(N, C, H, W\), got 3" - ): - encode_video_to_file( - frames=torch.randint(high=1, size=(3, 60, 60), dtype=torch.uint8), - frame_rate=10, - filename=output_file, - ) - - with pytest.raises( - RuntimeError, match=r"frame must have 3 channels \(R, G, B\), got 2" - ): - encode_video_to_file( - frames=torch.randint(high=1, size=(10, 2, 60, 60), dtype=torch.uint8), - frame_rate=10, - filename=output_file, - ) - - with pytest.raises( - RuntimeError, - match=r"Couldn't allocate AVFormatContext. The destination file is ./file.bad_extension, check the desired extension\?", - ): - encode_video_to_file( - frames=torch.randint(high=255, size=(10, 3, 60, 60), dtype=torch.uint8), - frame_rate=10, - filename="./file.bad_extension", - ) - - with pytest.raises( - RuntimeError, - match=r"avio_open failed. The destination file is ./bad/path.mp3, make sure it's a valid path\?", - ): - encode_video_to_file( - frames=torch.randint(high=255, size=(10, 3, 60, 60), dtype=torch.uint8), - frame_rate=10, - filename="./bad/path.mp3", - ) - - with pytest.raises( - RuntimeError, - match=r"Couldn't allocate AVFormatContext. Check the desired format\? 
Got format=bad_format", - ): - encode_video_to_tensor( - frames=torch.randint(high=255, size=(10, 3, 60, 60), dtype=torch.uint8), - frame_rate=10, - format="bad_format", - ) - def decode(self, source=None) -> torch.Tensor: return VideoDecoder(source).get_frames_in_range(start=0, stop=60) @@ -1406,7 +1344,9 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format): ) def test_to_file_like_custom_file_object(self): - """Test with a custom file-like object that implements write and seek.""" + """Test to_file_like with a custom file-like object that implements write and seek.""" + if get_ffmpeg_major_version() == 6: + pytest.skip("Skipping round trip test for FFmpeg 6") class CustomFileObject: def __init__(self): @@ -1437,6 +1377,8 @@ def get_encoded_data(self): def test_to_file_like_real_file(self, tmp_path): """Test to_file_like with a real file opened in binary write mode.""" + if get_ffmpeg_major_version() == 6: + pytest.skip("Skipping round trip test for FFmpeg 6") source_frames = self.decode(TEST_SRC_2_720P.path).data file_path = tmp_path / "test_file_like.mp4" From cf7b75cc2a8d15def0fa41b0c991ffff47ddbfa0 Mon Sep 17 00:00:00 2001 From: Dan-Flores Date: Mon, 20 Oct 2025 16:39:32 -0700 Subject: [PATCH 3/7] delete contiguous todo --- src/torchcodec/_core/Encoder.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp index 1d9c2c089..4e5d6a604 100644 --- a/src/torchcodec/_core/Encoder.cpp +++ b/src/torchcodec/_core/Encoder.cpp @@ -531,7 +531,6 @@ torch::Tensor validateFrames(const torch::Tensor& frames) { frames.sizes()[1] == 3, "frame must have 3 channels (R, G, B), got ", frames.sizes()[1]); - // TODO-VideoEncoder: Investigate if non-contiguous frames can be accepted return frames.contiguous(); } From ee2285eb28294870d7e4f4a5186e1638d15e77b7 Mon Sep 17 00:00:00 2001 From: Dan-Flores Date: Mon, 27 Oct 2025 11:00:15 -0400 Subject: [PATCH 4/7] use randint suggestion, remove test skips --- 
test/test_encoders.py | 8 +++----- test/test_ops.py | 4 ---- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/test/test_encoders.py b/test/test_encoders.py index 4f3c0cf76..a73a26967 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -636,11 +636,9 @@ def test_contiguity(self, method, tmp_path): # frame tensors, one is contiguous while the other is non-contiguous. num_frames, channels, height, width = 5, 3, 64, 64 - contiguous_frames = ( - (torch.rand(num_frames, channels, height, width) * 255) - .to(torch.uint8) - .contiguous() - ) + contiguous_frames = torch.randint( + 0, 256, size=(num_frames, channels, height, width), dtype=torch.uint8 + ).contiguous() assert contiguous_frames.is_contiguous() # Create non-contiguous frames by permuting, calling contiguous to update memory layout, diff --git a/test/test_ops.py b/test/test_ops.py index 075929335..e798a7a2b 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1345,8 +1345,6 @@ def test_video_encoder_against_ffmpeg_cli(self, tmp_path, format): def test_to_file_like_custom_file_object(self): """Test to_file_like with a custom file-like object that implements write and seek.""" - if get_ffmpeg_major_version() == 6: - pytest.skip("Skipping round trip test for FFmpeg 6") class CustomFileObject: def __init__(self): @@ -1377,8 +1375,6 @@ def get_encoded_data(self): def test_to_file_like_real_file(self, tmp_path): """Test to_file_like with a real file opened in binary write mode.""" - if get_ffmpeg_major_version() == 6: - pytest.skip("Skipping round trip test for FFmpeg 6") source_frames = self.decode(TEST_SRC_2_720P.path).data file_path = tmp_path / "test_file_like.mp4" From d14deb8b122bdaf38a0ec3ec53d1f1757daa0cd2 Mon Sep 17 00:00:00 2001 From: Dan-Flores Date: Mon, 27 Oct 2025 15:41:26 -0400 Subject: [PATCH 5/7] assert contiguity with channels_last --- test/test_encoders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_encoders.py 
b/test/test_encoders.py index a73a26967..f901fbd22 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -641,13 +641,13 @@ def test_contiguity(self, method, tmp_path): ).contiguous() assert contiguous_frames.is_contiguous() - # Create non-contiguous frames by permuting, calling contiguous to update memory layout, - # then permuting back to the initial order + # Permute NCHW to NHWC, then update the memory layout, then permute back non_contiguous_frames = ( - contiguous_frames.permute(0, 3, 2, 1).contiguous().permute(0, 3, 2, 1) + contiguous_frames.permute(0, 2, 3, 1).contiguous().permute(0, 3, 1, 2) ) assert non_contiguous_frames.stride() != contiguous_frames.stride() assert not non_contiguous_frames.is_contiguous() + assert non_contiguous_frames.is_contiguous(memory_format=torch.channels_last) torch.testing.assert_close( contiguous_frames, non_contiguous_frames, rtol=0, atol=0 From 7ed7f21ac6dcd91324c33df4963ecca257a79a0b Mon Sep 17 00:00:00 2001 From: Dan-Flores Date: Wed, 29 Oct 2025 13:54:25 -0400 Subject: [PATCH 6/7] incorporate feedback --- src/torchcodec/encoders/_video_encoder.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/torchcodec/encoders/_video_encoder.py b/src/torchcodec/encoders/_video_encoder.py index fd354abeb..f6a725278 100644 --- a/src/torchcodec/encoders/_video_encoder.py +++ b/src/torchcodec/encoders/_video_encoder.py @@ -14,21 +14,16 @@ class VideoEncoder: frames (``torch.Tensor``): The frames to encode. This must be a 4D tensor of shape ``(N, C, H, W)`` where N is the number of frames, C is 3 channels (RGB), H is height, and W is width. - A 3D tensor of shape ``(C, H, W)`` is also accepted as a single RGB frame. Values must be uint8 in the range ``[0, 255]``. - frame_rate (int): The frame rate to use when encoding the - **input** ``frames``. + frame_rate (int): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate. 
""" def __init__(self, frames: Tensor, *, frame_rate: int): torch._C._log_api_usage_once("torchcodec.encoders.VideoEncoder") if not isinstance(frames, Tensor): raise ValueError(f"Expected frames to be a Tensor, got {type(frames) = }.") - if frames.ndim == 3: - # make it 4D and assume single RGB frame, CHW -> NCHW - frames = torch.unsqueeze(frames, 0) if frames.ndim != 4: - raise ValueError(f"Expected 3D or 4D frames, got {frames.shape = }.") + raise ValueError(f"Expected 4D frames, got {frames.shape = }.") if frames.dtype != torch.uint8: raise ValueError(f"Expected uint8 frames, got {frames.dtype = }.") if frame_rate <= 0: @@ -46,7 +41,7 @@ def to_file( Args: dest (str or ``pathlib.Path``): The path to the output file, e.g. ``video.mp4``. The extension of the file determines the video - format and container. + container format. """ _core.encode_video_to_file( frames=self._frames, @@ -61,7 +56,7 @@ def to_tensor( """Encode frames into raw bytes, as a 1D uint8 Tensor. Args: - format (str): The format of the encoded frames, e.g. "mp4", "mov", + format (str): The container format of the encoded frames, e.g. "mp4", "mov", "mkv", "avi", "webm", "flv", or "gif" Returns: @@ -86,7 +81,7 @@ def to_file_like( write mode, etc. Methods must have the following signature: ``write(data: bytes) -> int`` and ``seek(offset: int, whence: int = 0) -> int``. - format (str): The format of the encoded frames, e.g. "mp4", "mov", + format (str): The container format of the encoded frames, e.g. "mp4", "mov", "mkv", "avi", "webm", "flv", or "gif". 
""" _core.encode_video_to_file_like( From b10d80bf1be27d2842a3a8e0e19270cbbe5b7d13 Mon Sep 17 00:00:00 2001 From: Dan-Flores Date: Wed, 29 Oct 2025 14:25:50 -0400 Subject: [PATCH 7/7] update test_bad_input match text --- test/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_encoders.py b/test/test_encoders.py index f901fbd22..b7223c88a 100644 --- a/test/test_encoders.py +++ b/test/test_encoders.py @@ -588,7 +588,7 @@ def test_bad_input_parameterized(self, tmp_path, method): getattr(encoder, method)(**valid_params) with pytest.raises( - ValueError, match=r"Expected 3D or 4D frames, got frames.shape = torch.Size" + ValueError, match=r"Expected 4D frames, got frames.shape = torch.Size" ): encoder = VideoEncoder( frames=torch.zeros(10),