Skip to content

Commit 1d79594

Browse files
committed
add codec options, apply numeric error handling
1 parent e170212 commit 1d79594

File tree

7 files changed

+194
-32
lines changed

7 files changed

+194
-32
lines changed

src/torchcodec/_core/Encoder.cpp

Lines changed: 65 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -570,10 +570,10 @@ AVPixelFormat validatePixelFormat(
570570
TORCH_CHECK(false, errorMsg.str());
571571
}
572572

573-
void validateDoubleOption(
573+
void tryToValidateCodecOption(
574574
const AVCodec& avCodec,
575575
const char* optionName,
576-
double value) {
576+
const std::string& value) {
577577
if (!avCodec.priv_class) {
578578
return;
579579
}
@@ -586,24 +586,36 @@ void validateDoubleOption(
586586
0,
587587
AV_OPT_SEARCH_FAKE_OBJ,
588588
nullptr);
589-
// If the option was not found, let FFmpeg handle it later
589+
// If option is not found we cannot validate it, let FFmpeg handle it
590590
if (!option) {
591591
return;
592592
}
593+
// Validate options defined as a numeric type
593594
if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
594595
option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
595-
TORCH_CHECK(
596-
value >= option->min && value <= option->max,
597-
optionName,
598-
"=",
599-
value,
600-
" is out of valid range [",
601-
option->min,
602-
", ",
603-
option->max,
604-
"] for this codec. For more details, run 'ffmpeg -h encoder=",
605-
avCodec.name,
606-
"'");
596+
try {
597+
double numericValue = std::stod(value);
598+
TORCH_CHECK(
599+
numericValue >= option->min && numericValue <= option->max,
600+
optionName,
601+
"=",
602+
numericValue,
603+
" is out of valid range [",
604+
option->min,
605+
", ",
606+
option->max,
607+
"] for this codec. For more details, run 'ffmpeg -h encoder=",
608+
avCodec.name,
609+
"'");
610+
} catch (const std::invalid_argument& e) {
611+
TORCH_CHECK(
612+
false,
613+
"Option ",
614+
optionName,
615+
" expects a numeric value but got '",
616+
value,
617+
"'");
618+
}
607619
}
608620
}
609621
} // namespace
@@ -685,6 +697,30 @@ VideoEncoder::VideoEncoder(
685697
initializeEncoder(videoStreamOptions);
686698
}
687699

700+
void VideoEncoder::sortCodecOptions(
701+
const std::map<std::string, std::string>& codecOptions,
702+
AVDictionary** codecDict,
703+
AVDictionary** formatDict) {
704+
// Search AVFormatContext's AVClass for options
705+
const AVClass* formatClass = avformat_get_class();
706+
for (const auto& [key, value] : codecOptions) {
707+
const AVOption* fmtOpt = av_opt_find2(
708+
&formatClass,
709+
key.c_str(),
710+
nullptr,
711+
0,
712+
AV_OPT_SEARCH_CHILDREN | AV_OPT_SEARCH_FAKE_OBJ,
713+
nullptr);
714+
if (fmtOpt) {
715+
av_dict_set(formatDict, key.c_str(), value.c_str(), 0);
716+
} else {
717+
// Default to codec option (includes AVCodecContext + encoder-private)
718+
// validateCodecOption(*avCodecContext_->codec, key.c_str(), value);
719+
av_dict_set(codecDict, key.c_str(), value.c_str(), 0);
720+
}
721+
}
722+
}
723+
688724
void VideoEncoder::initializeEncoder(
689725
const VideoStreamOptions& videoStreamOptions) {
690726
const AVCodec* avCodec =
@@ -737,13 +773,19 @@ void VideoEncoder::initializeEncoder(
737773

738774
// Apply videoStreamOptions
739775
AVDictionary* options = nullptr;
776+
if (videoStreamOptions.codecOptions.has_value()) {
777+
// Validate all codec options before setting them
778+
for (const auto& [key, value] : videoStreamOptions.codecOptions.value()) {
779+
tryToValidateCodecOption(*avCodec, key.c_str(), value);
780+
}
781+
sortCodecOptions(
782+
videoStreamOptions.codecOptions.value(), &options, &formatOptions_);
783+
}
784+
740785
if (videoStreamOptions.crf.has_value()) {
741-
validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
742-
av_dict_set(
743-
&options,
744-
"crf",
745-
std::to_string(videoStreamOptions.crf.value()).c_str(),
746-
0);
786+
std::string crfValue = std::to_string(videoStreamOptions.crf.value());
787+
tryToValidateCodecOption(*avCodec, "crf", crfValue);
788+
av_dict_set(&options, "crf", crfValue.c_str(), 0);
747789
}
748790
if (videoStreamOptions.preset.has_value()) {
749791
av_dict_set(
@@ -775,7 +817,8 @@ void VideoEncoder::encode() {
775817
TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
776818
encodeWasCalled_ = true;
777819

778-
int status = avformat_write_header(avFormatContext_.get(), nullptr);
820+
int status = avformat_write_header(avFormatContext_.get(), &formatOptions_);
821+
av_dict_free(&formatOptions_);
779822
TORCH_CHECK(
780823
status == AVSUCCESS,
781824
"Error in avformat_write_header: ",

src/torchcodec/_core/Encoder.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
#pragma once
22
#include <torch/types.h>
3+
#include <map>
4+
#include <string>
35
#include "AVIOContextHolder.h"
46
#include "FFMPEGCommon.h"
57
#include "StreamOptions.h"
68

9+
extern "C" {
10+
#include <libavutil/dict.h>
11+
}
12+
713
namespace facebook::torchcodec {
814
class AudioEncoder {
915
public:
@@ -154,6 +160,10 @@ class VideoEncoder {
154160

155161
private:
156162
void initializeEncoder(const VideoStreamOptions& videoStreamOptions);
163+
void sortCodecOptions(
164+
const std::map<std::string, std::string>& codecOptions,
165+
AVDictionary** codecDict,
166+
AVDictionary** formatDict);
157167
UniqueAVFrame convertTensorToAVFrame(
158168
const torch::Tensor& frame,
159169
int frameIndex);
@@ -179,6 +189,7 @@ class VideoEncoder {
179189
std::unique_ptr<AVIOContextHolder> avioContextHolder_;
180190

181191
bool encodeWasCalled_ = false;
192+
AVDictionary* formatOptions_ = nullptr;
182193
};
183194

184195
} // namespace facebook::torchcodec

src/torchcodec/_core/StreamOptions.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#pragma once
88

99
#include <torch/types.h>
10+
#include <map>
1011
#include <optional>
1112
#include <string>
1213
#include <string_view>
@@ -50,6 +51,7 @@ struct VideoStreamOptions {
5051
std::optional<std::string> pixelFormat;
5152
std::optional<double> crf;
5253
std::optional<std::string> preset;
54+
std::optional<std::map<std::string, std::string>> codecOptions;
5355
};
5456

5557
struct AudioStreamOptions {

src/torchcodec/_core/custom_ops.cpp

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
3737
m.def(
3838
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
3939
m.def(
40-
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
40+
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
4141
m.def(
42-
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
42+
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> Tensor");
4343
m.def(
44-
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
44+
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None, str[]? codec_options=None) -> ()");
4545
m.def(
4646
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
4747
m.def(
@@ -158,6 +158,16 @@ std::string quoteValue(const std::string& value) {
158158
return "\"" + value + "\"";
159159
}
160160

161+
// Rebuilds a key -> value map from codec_options, which arrives flattened as
// alternating entries: [key0, value0, key1, value1, ...].
//
// If the same key appears more than once, the value from its last occurrence
// is kept. An empty list yields an empty map.
//
// Throws std::invalid_argument when the list has an odd number of entries,
// because the trailing key would have no value and reading opts[i + 1] would
// index past the end of the vector.
std::map<std::string, std::string> unflattenCodecOptions(
    const std::vector<std::string>& opts) {
  if (opts.size() % 2 != 0) {
    throw std::invalid_argument(
        "codec_options must be a flat list of alternating keys and values, "
        "but got " +
        std::to_string(opts.size()) + " entries; option '" + opts.back() +
        "' has no value");
  }

  std::map<std::string, std::string> optionsMap;
  for (size_t i = 0; i < opts.size(); i += 2) {
    optionsMap[opts[i]] = opts[i + 1];
  }
  return optionsMap;
}
170+
161171
std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
162172
std::stringstream ss;
163173
ss << "{\n";
@@ -605,11 +615,18 @@ void encode_video_to_file(
605615
std::string_view file_name,
606616
std::optional<std::string_view> pixel_format = std::nullopt,
607617
std::optional<double> crf = std::nullopt,
608-
std::optional<std::string_view> preset = std::nullopt) {
618+
std::optional<std::string_view> preset = std::nullopt,
619+
std::optional<std::vector<std::string>> codec_options = std::nullopt) {
609620
VideoStreamOptions videoStreamOptions;
610621
videoStreamOptions.pixelFormat = pixel_format;
611622
videoStreamOptions.crf = crf;
612623
videoStreamOptions.preset = preset;
624+
625+
if (codec_options.has_value()) {
626+
videoStreamOptions.codecOptions =
627+
unflattenCodecOptions(codec_options.value());
628+
}
629+
613630
VideoEncoder(
614631
frames,
615632
validateInt64ToInt(frame_rate, "frame_rate"),
@@ -624,12 +641,19 @@ at::Tensor encode_video_to_tensor(
624641
std::string_view format,
625642
std::optional<std::string_view> pixel_format = std::nullopt,
626643
std::optional<double> crf = std::nullopt,
627-
std::optional<std::string_view> preset = std::nullopt) {
644+
std::optional<std::string_view> preset = std::nullopt,
645+
std::optional<std::vector<std::string>> codec_options = std::nullopt) {
628646
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
629647
VideoStreamOptions videoStreamOptions;
630648
videoStreamOptions.pixelFormat = pixel_format;
631649
videoStreamOptions.crf = crf;
632650
videoStreamOptions.preset = preset;
651+
652+
if (codec_options.has_value()) {
653+
videoStreamOptions.codecOptions =
654+
unflattenCodecOptions(codec_options.value());
655+
}
656+
633657
return VideoEncoder(
634658
frames,
635659
validateInt64ToInt(frame_rate, "frame_rate"),
@@ -646,7 +670,8 @@ void _encode_video_to_file_like(
646670
int64_t file_like_context,
647671
std::optional<std::string_view> pixel_format = std::nullopt,
648672
std::optional<double> crf = std::nullopt,
649-
std::optional<std::string_view> preset = std::nullopt) {
673+
std::optional<std::string_view> preset = std::nullopt,
674+
std::optional<std::vector<std::string>> codec_options = std::nullopt) {
650675
auto fileLikeContext =
651676
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
652677
TORCH_CHECK(
@@ -658,6 +683,11 @@ void _encode_video_to_file_like(
658683
videoStreamOptions.crf = crf;
659684
videoStreamOptions.preset = preset;
660685

686+
if (codec_options.has_value()) {
687+
videoStreamOptions.codecOptions =
688+
unflattenCodecOptions(codec_options.value());
689+
}
690+
661691
VideoEncoder encoder(
662692
frames,
663693
validateInt64ToInt(frame_rate, "frame_rate"),

src/torchcodec/_core/ops.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def encode_video_to_file_like(
216216
crf: Optional[Union[int, float]] = None,
217217
pixel_format: Optional[str] = None,
218218
preset: Optional[str] = None,
219+
codec_options: Optional[list[str]] = None,
219220
) -> None:
220221
"""Encode video frames to a file-like object.
221222
@@ -227,6 +228,7 @@ def encode_video_to_file_like(
227228
crf: Optional constant rate factor for encoding quality
228229
pixel_format: Optional pixel format (e.g., "yuv420p", "yuv444p")
229230
preset: Optional encoder preset as string (e.g., "ultrafast", "medium")
231+
codec_options: Optional list of codec options as flattened key-value pairs
230232
"""
231233
assert _pybind_ops is not None
232234

@@ -238,6 +240,7 @@ def encode_video_to_file_like(
238240
pixel_format,
239241
crf,
240242
preset,
243+
codec_options,
241244
)
242245

243246

@@ -326,8 +329,9 @@ def encode_video_to_file_abstract(
326329
frame_rate: int,
327330
filename: str,
328331
pixel_format: Optional[str] = None,
329-
crf: Optional[Union[int, float]] = None,
330332
preset: Optional[str] = None,
333+
crf: Optional[Union[int, float]] = None,
334+
codec_options: Optional[list[str]] = None,
331335
) -> None:
332336
return
333337

@@ -338,8 +342,9 @@ def encode_video_to_tensor_abstract(
338342
frame_rate: int,
339343
format: str,
340344
pixel_format: Optional[str] = None,
341-
crf: Optional[Union[int, float]] = None,
342345
preset: Optional[str] = None,
346+
crf: Optional[Union[int, float]] = None,
347+
codec_options: Optional[list[str]] = None,
343348
) -> torch.Tensor:
344349
return torch.empty([], dtype=torch.long)
345350

@@ -351,8 +356,9 @@ def _encode_video_to_file_like_abstract(
351356
format: str,
352357
file_like_context: int,
353358
pixel_format: Optional[str] = None,
354-
crf: Optional[Union[int, float]] = None,
355359
preset: Optional[str] = None,
360+
crf: Optional[Union[int, float]] = None,
361+
codec_options: Optional[list[str]] = None,
356362
) -> None:
357363
return
358364

0 commit comments

Comments
 (0)