Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "torch/types.h"

extern "C" {
#include <libavutil/opt.h>
#include <libavutil/pixdesc.h>
}

Expand Down Expand Up @@ -568,6 +569,43 @@ AVPixelFormat validatePixelFormat(
}
TORCH_CHECK(false, errorMsg.str());
}

void validateDoubleOption(
const AVCodec& avCodec,
const char* optionName,
double value) {
if (!avCodec.priv_class) {
return;
}
const AVOption* option = av_opt_find2(
// Convert obj arg from const AVClass* const* to non-const void*
// First cast to remove const, then cast to void*
const_cast<void*>(static_cast<const void*>(&avCodec.priv_class)),
optionName,
nullptr,
0,
AV_OPT_SEARCH_FAKE_OBJ,
nullptr);
// If the option was not found, let FFmpeg handle it later
if (!option) {
return;
}
if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
TORCH_CHECK(
value >= option->min && value <= option->max,
optionName,
"=",
value,
" is out of valid range [",
option->min,
", ",
option->max,
"] for this codec. For more details, run 'ffmpeg -h encoder=",
avCodec.name,
"'");
}
}
} // namespace

VideoEncoder::~VideoEncoder() {
Expand Down Expand Up @@ -700,12 +738,17 @@ void VideoEncoder::initializeEncoder(
// Apply videoStreamOptions
AVDictionary* options = nullptr;
if (videoStreamOptions.crf.has_value()) {
validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
av_dict_set(
&options,
"crf",
std::to_string(videoStreamOptions.crf.value()).c_str(),
0);
}
if (videoStreamOptions.preset.has_value()) {
av_dict_set(
&options, "preset", videoStreamOptions.preset.value().c_str(), 0);
}
int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
av_dict_free(&options);

Expand Down
6 changes: 2 additions & 4 deletions src/torchcodec/_core/StreamOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,11 @@ struct VideoStreamOptions {
std::string_view deviceVariant = "ffmpeg";

// Encoding options
// TODO-VideoEncoder: Consider adding other optional fields here
// (bit rate, gop size, max b frames, preset)
std::optional<int> crf;

// Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
// If not specified, uses codec's default format.
std::optional<std::string> pixelFormat;
std::optional<double> crf;
std::optional<std::string> preset;
};

struct AudioStreamOptions {
Expand Down
24 changes: 15 additions & 9 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
m.def(
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
m.def(
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, int? crf=None) -> ()");
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
m.def(
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, int? crf=None) -> Tensor");
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
m.def(
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, int? crf=None) -> ()");
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
m.def(
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def(
Expand Down Expand Up @@ -603,11 +603,13 @@ void encode_video_to_file(
const at::Tensor& frames,
int64_t frame_rate,
std::string_view file_name,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<int64_t> crf = std::nullopt) {
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;
VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -620,12 +622,14 @@ at::Tensor encode_video_to_tensor(
const at::Tensor& frames,
int64_t frame_rate,
std::string_view format,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<int64_t> crf = std::nullopt) {
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;
return VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -640,8 +644,9 @@ void _encode_video_to_file_like(
int64_t frame_rate,
std::string_view format,
int64_t file_like_context,
std::optional<std::string> pixel_format = std::nullopt,
std::optional<int64_t> crf = std::nullopt) {
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
auto fileLikeContext =
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
TORCH_CHECK(
Expand All @@ -651,6 +656,7 @@ void _encode_video_to_file_like(
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

VideoEncoder encoder(
frames,
Expand Down
14 changes: 10 additions & 4 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,9 @@ def encode_video_to_file_like(
frame_rate: int,
format: str,
file_like: Union[io.RawIOBase, io.BufferedIOBase],
crf: Optional[int] = None,
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
preset: Optional[str] = None,
) -> None:
"""Encode video frames to a file-like object.

Expand All @@ -225,6 +226,7 @@ def encode_video_to_file_like(
file_like: File-like object that supports write() and seek() methods
crf: Optional constant rate factor for encoding quality
pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
"""
assert _pybind_ops is not None

Expand All @@ -235,6 +237,7 @@ def encode_video_to_file_like(
_pybind_ops.create_file_like_context(file_like, True), # True means for writing
pixel_format,
crf,
preset,
)


Expand Down Expand Up @@ -322,8 +325,9 @@ def encode_video_to_file_abstract(
frames: torch.Tensor,
frame_rate: int,
filename: str,
crf: Optional[int] = None,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
) -> None:
return

Expand All @@ -333,8 +337,9 @@ def encode_video_to_tensor_abstract(
frames: torch.Tensor,
frame_rate: int,
format: str,
crf: Optional[int] = None,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
) -> torch.Tensor:
return torch.empty([], dtype=torch.long)

Expand All @@ -345,8 +350,9 @@ def _encode_video_to_file_like_abstract(
frame_rate: int,
format: str,
file_like_context: int,
crf: Optional[int] = None,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
) -> None:
return

Expand Down
40 changes: 38 additions & 2 deletions src/torchcodec/encoders/_video_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def to_file(
dest: Union[str, Path],
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
) -> None:
"""Encode frames into a file.

Expand All @@ -46,36 +48,58 @@ def to_file(
container format.
pixel_format (str, optional): The pixel format for encoding (e.g.,
"yuv420p", "yuv444p"). If not specified, uses codec's default format.
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
preset (str or int, optional): Encoder option that controls the tradeoff between
encoding speed and compression. Valid values depend on the encoder (commonly
a string: "fast", "medium", "slow"). Defaults to None
(which will use encoder's default).
"""
preset = str(preset) if isinstance(preset, int) else preset
_core.encode_video_to_file(
frames=self._frames,
frame_rate=self._frame_rate,
filename=str(dest),
pixel_format=pixel_format,
crf=crf,
preset=preset,
)

def to_tensor(
self,
format: str,
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
) -> Tensor:
"""Encode frames into raw bytes, as a 1D uint8 Tensor.

Args:
format (str): The container format of the encoded frames, e.g. "mp4", "mov",
"mkv", "avi", "webm", "flv", or "gif"
"mkv", "avi", "webm", "flv", etc.
pixel_format (str, optional): The pixel format to encode frames into (e.g.,
"yuv420p", "yuv444p"). If not specified, uses codec's default format.
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
preset (str or int, optional): Encoder option that controls the tradeoff between
encoding speed and compression. Valid values depend on the encoder (commonly
a string: "fast", "medium", "slow"). Defaults to None
(which will use encoder's default).

Returns:
Tensor: The raw encoded bytes as 1D uint8 Tensor.
"""
preset_value = str(preset) if isinstance(preset, int) else preset
return _core.encode_video_to_tensor(
frames=self._frames,
frame_rate=self._frame_rate,
format=format,
pixel_format=pixel_format,
crf=crf,
preset=preset_value,
)

def to_file_like(
Expand All @@ -84,6 +108,8 @@ def to_file_like(
format: str,
*,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[Union[str, int]] = None,
) -> None:
"""Encode frames into a file-like object.

Expand All @@ -94,14 +120,24 @@ def to_file_like(
``write(data: bytes) -> int`` and ``seek(offset: int, whence:
int = 0) -> int``.
format (str): The container format of the encoded frames, e.g. "mp4", "mov",
"mkv", "avi", "webm", "flv", or "gif".
"mkv", "avi", "webm", "flv", etc.
pixel_format (str, optional): The pixel format for encoding (e.g.,
"yuv420p", "yuv444p"). If not specified, uses codec's default format.
crf (int or float, optional): Constant Rate Factor for encoding quality. Lower values
mean better quality. Valid range depends on the encoder (commonly 0-51).
Defaults to None (which will use encoder's default).
preset (str or int, optional): Encoder option that controls the tradeoff between
encoding speed and compression. Valid values depend on the encoder (commonly
a string: "fast", "medium", "slow"). Defaults to None
(which will use encoder's default).
"""
preset = str(preset) if isinstance(preset, int) else preset
_core.encode_video_to_file_like(
frames=self._frames,
frame_rate=self._frame_rate,
format=format,
file_like=file_like,
pixel_format=pixel_format,
crf=crf,
preset=preset,
)
Loading
Loading