Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 65 additions & 22 deletions src/torchcodec/_core/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,10 +570,10 @@ AVPixelFormat validatePixelFormat(
TORCH_CHECK(false, errorMsg.str());
}

void validateDoubleOption(
void tryToValidateCodecOption(
const AVCodec& avCodec,
const char* optionName,
double value) {
const std::string& value) {
if (!avCodec.priv_class) {
return;
}
Expand All @@ -586,24 +586,36 @@ void validateDoubleOption(
0,
AV_OPT_SEARCH_FAKE_OBJ,
nullptr);
// If the option was not found, let FFmpeg handle it later
// If option is not found we cannot validate it, let FFmpeg handle it
if (!option) {
return;
}
// Validate options defined as a numeric type
if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
TORCH_CHECK(
value >= option->min && value <= option->max,
optionName,
"=",
value,
" is out of valid range [",
option->min,
", ",
option->max,
"] for this codec. For more details, run 'ffmpeg -h encoder=",
avCodec.name,
"'");
try {
double numericValue = std::stod(value);
TORCH_CHECK(
numericValue >= option->min && numericValue <= option->max,
optionName,
"=",
numericValue,
" is out of valid range [",
option->min,
", ",
option->max,
"] for this codec. For more details, run 'ffmpeg -h encoder=",
avCodec.name,
"'");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is OK for now, I suspect min and max may not always be set on for all parameters, in which case we may error out when we shouldn't? We'll know if / when we get user reports about that. Let's keep it as-is for now and see if we need to revisit in the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only access min/max on numeric parameters, my expectation (hope) it is populated for those.

} catch (const std::invalid_argument& e) {
TORCH_CHECK(
false,
"Option ",
optionName,
" expects a numeric value but got '",
value,
"'");
}
}
}
} // namespace
Expand Down Expand Up @@ -685,6 +697,30 @@ VideoEncoder::VideoEncoder(
initializeEncoder(videoStreamOptions);
}

void VideoEncoder::sortCodecOptions(
const std::map<std::string, std::string>& codecOptions,
AVDictionary** codecDict,
AVDictionary** formatDict) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This interface is OK for now but we should consider changing it once we use RAII types for the dics (see my other follow-up suggestion).

// Search AVFormatContext's AVClass for options
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove this comment, it doesn't add much value. Let's also document what this function is doing: it takes some options as input and sorts them into codec options and format options, which are returned into two separate dicts.

const AVClass* formatClass = avformat_get_class();
for (const auto& [key, value] : codecOptions) {
const AVOption* fmtOpt = av_opt_find2(
&formatClass,
key.c_str(),
nullptr,
0,
AV_OPT_SEARCH_CHILDREN | AV_OPT_SEARCH_FAKE_OBJ,
nullptr);
if (fmtOpt) {
av_dict_set(formatDict, key.c_str(), value.c_str(), 0);
} else {
// Default to codec option (includes AVCodecContext + encoder-private)
// validateCodecOption(*avCodecContext_->codec, key.c_str(), value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove:

Suggested change
// validateCodecOption(*avCodecContext_->codec, key.c_str(), value);

av_dict_set(codecDict, key.c_str(), value.c_str(), 0);
}
}
}

void VideoEncoder::initializeEncoder(
const VideoStreamOptions& videoStreamOptions) {
const AVCodec* avCodec =
Expand Down Expand Up @@ -737,13 +773,19 @@ void VideoEncoder::initializeEncoder(

// Apply videoStreamOptions
AVDictionary* options = nullptr;
if (videoStreamOptions.codecOptions.has_value()) {
// Validate all codec options before setting them
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit, I think we can remove this comment, it doesn't add much value

for (const auto& [key, value] : videoStreamOptions.codecOptions.value()) {
tryToValidateCodecOption(*avCodec, key.c_str(), value);
}
sortCodecOptions(
videoStreamOptions.codecOptions.value(), &options, &formatOptions_);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's rename these variables so that it's clearer what they refer to:

Suggested change
videoStreamOptions.codecOptions.value(), &options, &formatOptions_);
videoStreamOptions.codecOptions.value(), &avCodecOptions, &avFormatOptions_);

}

if (videoStreamOptions.crf.has_value()) {
validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
av_dict_set(
&options,
"crf",
std::to_string(videoStreamOptions.crf.value()).c_str(),
0);
std::string crfValue = std::to_string(videoStreamOptions.crf.value());
tryToValidateCodecOption(*avCodec, "crf", crfValue);
av_dict_set(&options, "crf", crfValue.c_str(), 0);
}
if (videoStreamOptions.preset.has_value()) {
av_dict_set(
Expand Down Expand Up @@ -775,7 +817,8 @@ void VideoEncoder::encode() {
TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
encodeWasCalled_ = true;

int status = avformat_write_header(avFormatContext_.get(), nullptr);
int status = avformat_write_header(avFormatContext_.get(), &formatOptions_);
av_dict_free(&formatOptions_);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's free it in the destructor rather than here, it's slightly less surprising.

This makes me realize: we should avoid calling av_dict_free and define a unique_ptr type with RAII semantics on AVDictionary, like we do for all the other FFmpeg types.

Let's open an issue to follow-up on that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened #1053 to follow up on this, thanks for the suggestion

TORCH_CHECK(
status == AVSUCCESS,
"Error in avformat_write_header: ",
Expand Down
11 changes: 11 additions & 0 deletions src/torchcodec/_core/Encoder.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
#pragma once
#include <torch/types.h>
#include <map>
#include <string>
#include "AVIOContextHolder.h"
#include "FFMPEGCommon.h"
#include "StreamOptions.h"

extern "C" {
#include <libavutil/dict.h>
}

namespace facebook::torchcodec {
class AudioEncoder {
public:
Expand Down Expand Up @@ -154,6 +160,10 @@ class VideoEncoder {

private:
void initializeEncoder(const VideoStreamOptions& videoStreamOptions);
void sortCodecOptions(
const std::map<std::string, std::string>& codecOptions,
AVDictionary** codecDict,
AVDictionary** formatDict);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it could be a pure function in an anonymous namespace rather than a method?

UniqueAVFrame convertTensorToAVFrame(
const torch::Tensor& frame,
int frameIndex);
Expand All @@ -179,6 +189,7 @@ class VideoEncoder {
std::unique_ptr<AVIOContextHolder> avioContextHolder_;

bool encodeWasCalled_ = false;
AVDictionary* formatOptions_ = nullptr;
};

} // namespace facebook::torchcodec
2 changes: 2 additions & 0 deletions src/torchcodec/_core/StreamOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#pragma once

#include <torch/types.h>
#include <map>
#include <optional>
#include <string>
#include <string_view>
Expand Down Expand Up @@ -50,6 +51,7 @@ struct VideoStreamOptions {
std::optional<std::string> pixelFormat;
std::optional<double> crf;
std::optional<std::string> preset;
std::optional<std::map<std::string, std::string>> codecOptions;
};

struct AudioStreamOptions {
Expand Down
42 changes: 36 additions & 6 deletions src/torchcodec/_core/custom_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
m.def(
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
m.def(
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
m.def(
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> Tensor");
m.def(
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
m.def(
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
m.def(
Expand Down Expand Up @@ -158,6 +158,16 @@ std::string quoteValue(const std::string& value) {
return "\"" + value + "\"";
}

// Helper function to unflatten codec_options, alternating keys and values
std::map<std::string, std::string> unflattenCodecOptions(
const std::vector<std::string>& opts) {
std::map<std::string, std::string> optionsMap;
for (size_t i = 0; i < opts.size(); i += 2) {
optionsMap[opts[i]] = opts[i + 1];
}
return optionsMap;
}

std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
std::stringstream ss;
ss << "{\n";
Expand Down Expand Up @@ -605,11 +615,18 @@ void encode_video_to_file(
std::string_view file_name,
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> codec_options = std::nullopt) {
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

if (codec_options.has_value()) {
videoStreamOptions.codecOptions =
unflattenCodecOptions(codec_options.value());
}

VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -624,12 +641,19 @@ at::Tensor encode_video_to_tensor(
std::string_view format,
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> codec_options = std::nullopt) {
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
VideoStreamOptions videoStreamOptions;
videoStreamOptions.pixelFormat = pixel_format;
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

if (codec_options.has_value()) {
videoStreamOptions.codecOptions =
unflattenCodecOptions(codec_options.value());
}

return VideoEncoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand All @@ -646,7 +670,8 @@ void _encode_video_to_file_like(
int64_t file_like_context,
std::optional<std::string_view> pixel_format = std::nullopt,
std::optional<double> crf = std::nullopt,
std::optional<std::string_view> preset = std::nullopt) {
std::optional<std::string_view> preset = std::nullopt,
std::optional<std::vector<std::string>> codec_options = std::nullopt) {
auto fileLikeContext =
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
TORCH_CHECK(
Expand All @@ -658,6 +683,11 @@ void _encode_video_to_file_like(
videoStreamOptions.crf = crf;
videoStreamOptions.preset = preset;

if (codec_options.has_value()) {
videoStreamOptions.codecOptions =
unflattenCodecOptions(codec_options.value());
}

VideoEncoder encoder(
frames,
validateInt64ToInt(frame_rate, "frame_rate"),
Expand Down
12 changes: 9 additions & 3 deletions src/torchcodec/_core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ def encode_video_to_file_like(
crf: Optional[Union[int, float]] = None,
pixel_format: Optional[str] = None,
preset: Optional[str] = None,
codec_options: Optional[list[str]] = None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm starting to wonder if that's the right name for this parameter. IIUC, this can also be used to set the AVFormatContext parameters, right? So it's not just related to the codec itself?

Maybe this could be extra_options or something like that? CC @scotts @mollyxu

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, extra_options is probably better.

) -> None:
"""Encode video frames to a file-like object.

Expand All @@ -227,6 +228,7 @@ def encode_video_to_file_like(
crf: Optional constant rate factor for encoding quality
pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
codec_options: Optional list of codec options as flattened key-value pairs
"""
assert _pybind_ops is not None

Expand All @@ -238,6 +240,7 @@ def encode_video_to_file_like(
pixel_format,
crf,
preset,
codec_options,
)


Expand Down Expand Up @@ -326,8 +329,9 @@ def encode_video_to_file_abstract(
frame_rate: int,
filename: str,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
codec_options: Optional[list[str]] = None,
) -> None:
return

Expand All @@ -338,8 +342,9 @@ def encode_video_to_tensor_abstract(
frame_rate: int,
format: str,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
codec_options: Optional[list[str]] = None,
) -> torch.Tensor:
return torch.empty([], dtype=torch.long)

Expand All @@ -351,8 +356,9 @@ def _encode_video_to_file_like_abstract(
format: str,
file_like_context: int,
pixel_format: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
preset: Optional[str] = None,
crf: Optional[Union[int, float]] = None,
codec_options: Optional[list[str]] = None,
) -> None:
return

Expand Down
Loading
Loading