Skip to content

Commit e81e476

Browse files
author
pytorchbot
committed
2025-11-16 nightly release (c69064f)
1 parent 039ad71 commit e81e476

File tree

9 files changed

+396
-44
lines changed

9 files changed

+396
-44
lines changed

.github/workflows/docs.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ jobs:
8484
${CONDA_RUN} conda info
8585
${CONDA_RUN} nvidia-smi
8686
${CONDA_RUN} conda list
87+
echo LD_LIBRARY_PATH=$CONDA_PREFIX/lib:/usr/local/cuda/lib64/:${LD_LIBRARY_PATH} >> $GITHUB_ENV
8788
- name: Assert ffmpeg exists
8889
run: |
8990
${CONDA_RUN} ffmpeg -buildconf

.github/workflows/linux_cuda_wheel.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,13 @@ jobs:
9595
# We install conda packages at the start because otherwise conda may have conflicts with dependencies.
9696
# Note: xorg-libxau was added to fix a problem with ffmpeg 4. We should consider removing it.
9797
default-packages: "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }} conda-forge::xorg-libxau"
98-
- name: Check env
98+
- name: Check env, set LD_LIBRARY_PATH
9999
run: |
100100
${CONDA_RUN} env
101101
${CONDA_RUN} conda info
102102
${CONDA_RUN} nvidia-smi
103103
${CONDA_RUN} conda list
104+
echo LD_LIBRARY_PATH=$CONDA_PREFIX/lib:/usr/local/cuda/lib64/:${LD_LIBRARY_PATH} >> $GITHUB_ENV
104105
- name: Assert ffmpeg exists
105106
run: |
106107
${CONDA_RUN} ffmpeg -buildconf

src/torchcodec/_core/Encoder.cpp

Lines changed: 100 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -570,10 +570,10 @@ AVPixelFormat validatePixelFormat(
570570
TORCH_CHECK(false, errorMsg.str());
571571
}
572572

573-
void validateDoubleOption(
573+
void tryToValidateCodecOption(
574574
const AVCodec& avCodec,
575575
const char* optionName,
576-
double value) {
576+
const std::string& value) {
577577
if (!avCodec.priv_class) {
578578
return;
579579
}
@@ -586,24 +586,60 @@ void validateDoubleOption(
586586
0,
587587
AV_OPT_SEARCH_FAKE_OBJ,
588588
nullptr);
589-
// If the option was not found, let FFmpeg handle it later
589+
// If option is not found we cannot validate it, let FFmpeg handle it
590590
if (!option) {
591591
return;
592592
}
593+
// Validate if option is defined as a numeric type
593594
if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
594595
option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
595-
TORCH_CHECK(
596-
value >= option->min && value <= option->max,
597-
optionName,
598-
"=",
599-
value,
600-
" is out of valid range [",
601-
option->min,
602-
", ",
603-
option->max,
604-
"] for this codec. For more details, run 'ffmpeg -h encoder=",
605-
avCodec.name,
606-
"'");
596+
try {
597+
double numericValue = std::stod(value);
598+
TORCH_CHECK(
599+
numericValue >= option->min && numericValue <= option->max,
600+
optionName,
601+
"=",
602+
numericValue,
603+
" is out of valid range [",
604+
option->min,
605+
", ",
606+
option->max,
607+
"] for this codec. For more details, run 'ffmpeg -h encoder=",
608+
avCodec.name,
609+
"'");
610+
} catch (const std::invalid_argument& e) {
611+
TORCH_CHECK(
612+
false,
613+
"Option ",
614+
optionName,
615+
" expects a numeric value but got '",
616+
value,
617+
"'");
618+
}
619+
}
620+
}
621+
622+
void sortCodecOptions(
623+
const std::map<std::string, std::string>& extraOptions,
624+
AVDictionary** codecDict,
625+
AVDictionary** formatDict) {
626+
// Accepts a map of options as input, then sorts them into codec options and
627+
// format options. The sorted options are returned into two separate dicts.
628+
const AVClass* formatClass = avformat_get_class();
629+
for (const auto& [key, value] : extraOptions) {
630+
const AVOption* fmtOpt = av_opt_find2(
631+
&formatClass,
632+
key.c_str(),
633+
nullptr,
634+
0,
635+
AV_OPT_SEARCH_CHILDREN | AV_OPT_SEARCH_FAKE_OBJ,
636+
nullptr);
637+
if (fmtOpt) {
638+
av_dict_set(formatDict, key.c_str(), value.c_str(), 0);
639+
} else {
640+
// Default to codec option (includes AVCodecContext + encoder-private)
641+
av_dict_set(codecDict, key.c_str(), value.c_str(), 0);
642+
}
607643
}
608644
}
609645
} // namespace
@@ -621,6 +657,7 @@ VideoEncoder::~VideoEncoder() {
621657
avFormatContext_->pb = nullptr;
622658
}
623659
}
660+
av_dict_free(&avFormatOptions_);
624661
}
625662

626663
VideoEncoder::VideoEncoder(
@@ -687,9 +724,33 @@ VideoEncoder::VideoEncoder(
687724

688725
void VideoEncoder::initializeEncoder(
689726
const VideoStreamOptions& videoStreamOptions) {
690-
const AVCodec* avCodec =
691-
avcodec_find_encoder(avFormatContext_->oformat->video_codec);
692-
TORCH_CHECK(avCodec != nullptr, "Video codec not found");
727+
const AVCodec* avCodec = nullptr;
728+
// If codec arg is provided, find codec using logic similar to FFmpeg:
729+
// https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
730+
if (videoStreamOptions.codec.has_value()) {
731+
const std::string& codec = videoStreamOptions.codec.value();
732+
// Try to find codec by name ("libx264", "libsvtav1")
733+
avCodec = avcodec_find_encoder_by_name(codec.c_str());
734+
// Try to find by codec descriptor ("h264", "av1")
735+
if (!avCodec) {
736+
const AVCodecDescriptor* desc =
737+
avcodec_descriptor_get_by_name(codec.c_str());
738+
if (desc) {
739+
avCodec = avcodec_find_encoder(desc->id);
740+
}
741+
}
742+
TORCH_CHECK(
743+
avCodec != nullptr,
744+
"Video codec ",
745+
codec,
746+
" not found. To see available codecs, run: ffmpeg -encoders");
747+
} else {
748+
TORCH_CHECK(
749+
avFormatContext_->oformat != nullptr,
750+
"Output format is null, unable to find default codec.");
751+
avCodec = avcodec_find_encoder(avFormatContext_->oformat->video_codec);
752+
TORCH_CHECK(avCodec != nullptr, "Video codec not found");
753+
}
693754

694755
AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
695756
TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
@@ -736,21 +797,31 @@ void VideoEncoder::initializeEncoder(
736797
}
737798

738799
// Apply videoStreamOptions
739-
AVDictionary* options = nullptr;
800+
AVDictionary* avCodecOptions = nullptr;
801+
if (videoStreamOptions.extraOptions.has_value()) {
802+
for (const auto& [key, value] : videoStreamOptions.extraOptions.value()) {
803+
tryToValidateCodecOption(*avCodec, key.c_str(), value);
804+
}
805+
sortCodecOptions(
806+
videoStreamOptions.extraOptions.value(),
807+
&avCodecOptions,
808+
&avFormatOptions_);
809+
}
810+
740811
if (videoStreamOptions.crf.has_value()) {
741-
validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
742-
av_dict_set(
743-
&options,
744-
"crf",
745-
std::to_string(videoStreamOptions.crf.value()).c_str(),
746-
0);
812+
std::string crfValue = std::to_string(videoStreamOptions.crf.value());
813+
tryToValidateCodecOption(*avCodec, "crf", crfValue);
814+
av_dict_set(&avCodecOptions, "crf", crfValue.c_str(), 0);
747815
}
748816
if (videoStreamOptions.preset.has_value()) {
749817
av_dict_set(
750-
&options, "preset", videoStreamOptions.preset.value().c_str(), 0);
818+
&avCodecOptions,
819+
"preset",
820+
videoStreamOptions.preset.value().c_str(),
821+
0);
751822
}
752-
int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
753-
av_dict_free(&options);
823+
int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
824+
av_dict_free(&avCodecOptions);
754825

755826
TORCH_CHECK(
756827
status == AVSUCCESS,
@@ -775,7 +846,7 @@ void VideoEncoder::encode() {
775846
TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
776847
encodeWasCalled_ = true;
777848

778-
int status = avformat_write_header(avFormatContext_.get(), nullptr);
849+
int status = avformat_write_header(avFormatContext_.get(), &avFormatOptions_);
779850
TORCH_CHECK(
780851
status == AVSUCCESS,
781852
"Error in avformat_write_header: ",

src/torchcodec/_core/Encoder.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
#pragma once
22
#include <torch/types.h>
3+
#include <map>
4+
#include <string>
35
#include "AVIOContextHolder.h"
46
#include "FFMPEGCommon.h"
57
#include "StreamOptions.h"
68

9+
extern "C" {
10+
#include <libavutil/dict.h>
11+
}
12+
713
namespace facebook::torchcodec {
814
class AudioEncoder {
915
public:
@@ -179,6 +185,7 @@ class VideoEncoder {
179185
std::unique_ptr<AVIOContextHolder> avioContextHolder_;
180186

181187
bool encodeWasCalled_ = false;
188+
AVDictionary* avFormatOptions_ = nullptr;
182189
};
183190

184191
} // namespace facebook::torchcodec

src/torchcodec/_core/StreamOptions.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#pragma once
88

99
#include <torch/types.h>
10+
#include <map>
1011
#include <optional>
1112
#include <string>
1213
#include <string_view>
@@ -45,11 +46,13 @@ struct VideoStreamOptions {
4546
std::string_view deviceVariant = "ffmpeg";
4647

4748
// Encoding options
49+
std::optional<std::string> codec;
4850
// Optional pixel format for video encoding (e.g., "yuv420p", "yuv444p")
4951
// If not specified, uses codec's default format.
5052
std::optional<std::string> pixelFormat;
5153
std::optional<double> crf;
5254
std::optional<std::string> preset;
55+
std::optional<std::map<std::string, std::string>> extraOptions;
5356
};
5457

5558
struct AudioStreamOptions {

src/torchcodec/_core/custom_ops.cpp

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ TORCH_LIBRARY(torchcodec_ns, m) {
3737
m.def(
3838
"_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
3939
m.def(
40-
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
40+
"encode_video_to_file(Tensor frames, int frame_rate, str filename, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
4141
m.def(
42-
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? pixel_format=None, float? crf=None, str? preset=None) -> Tensor");
42+
"encode_video_to_tensor(Tensor frames, int frame_rate, str format, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> Tensor");
4343
m.def(
44-
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? pixel_format=None, float? crf=None, str? preset=None) -> ()");
44+
"_encode_video_to_file_like(Tensor frames, int frame_rate, str format, int file_like_context, str? codec=None, str? pixel_format=None, float? crf=None, str? preset=None, str[]? extra_options=None) -> ()");
4545
m.def(
4646
"create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
4747
m.def(
@@ -158,6 +158,16 @@ std::string quoteValue(const std::string& value) {
158158
return "\"" + value + "\"";
159159
}
160160

161+
// Helper function to unflatten extra_options, alternating keys and values
162+
std::map<std::string, std::string> unflattenExtraOptions(
163+
const std::vector<std::string>& opts) {
164+
std::map<std::string, std::string> optionsMap;
165+
for (size_t i = 0; i < opts.size(); i += 2) {
166+
optionsMap[opts[i]] = opts[i + 1];
167+
}
168+
return optionsMap;
169+
}
170+
161171
std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
162172
std::stringstream ss;
163173
ss << "{\n";
@@ -603,13 +613,22 @@ void encode_video_to_file(
603613
const at::Tensor& frames,
604614
int64_t frame_rate,
605615
std::string_view file_name,
616+
std::optional<std::string> codec = std::nullopt,
606617
std::optional<std::string_view> pixel_format = std::nullopt,
607618
std::optional<double> crf = std::nullopt,
608-
std::optional<std::string_view> preset = std::nullopt) {
619+
std::optional<std::string_view> preset = std::nullopt,
620+
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
609621
VideoStreamOptions videoStreamOptions;
622+
videoStreamOptions.codec = codec;
610623
videoStreamOptions.pixelFormat = pixel_format;
611624
videoStreamOptions.crf = crf;
612625
videoStreamOptions.preset = preset;
626+
627+
if (extra_options.has_value()) {
628+
videoStreamOptions.extraOptions =
629+
unflattenExtraOptions(extra_options.value());
630+
}
631+
613632
VideoEncoder(
614633
frames,
615634
validateInt64ToInt(frame_rate, "frame_rate"),
@@ -622,14 +641,23 @@ at::Tensor encode_video_to_tensor(
622641
const at::Tensor& frames,
623642
int64_t frame_rate,
624643
std::string_view format,
644+
std::optional<std::string> codec = std::nullopt,
625645
std::optional<std::string_view> pixel_format = std::nullopt,
626646
std::optional<double> crf = std::nullopt,
627-
std::optional<std::string_view> preset = std::nullopt) {
647+
std::optional<std::string_view> preset = std::nullopt,
648+
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
628649
auto avioContextHolder = std::make_unique<AVIOToTensorContext>();
629650
VideoStreamOptions videoStreamOptions;
651+
videoStreamOptions.codec = codec;
630652
videoStreamOptions.pixelFormat = pixel_format;
631653
videoStreamOptions.crf = crf;
632654
videoStreamOptions.preset = preset;
655+
656+
if (extra_options.has_value()) {
657+
videoStreamOptions.extraOptions =
658+
unflattenExtraOptions(extra_options.value());
659+
}
660+
633661
return VideoEncoder(
634662
frames,
635663
validateInt64ToInt(frame_rate, "frame_rate"),
@@ -644,20 +672,28 @@ void _encode_video_to_file_like(
644672
int64_t frame_rate,
645673
std::string_view format,
646674
int64_t file_like_context,
675+
std::optional<std::string> codec = std::nullopt,
647676
std::optional<std::string_view> pixel_format = std::nullopt,
648677
std::optional<double> crf = std::nullopt,
649-
std::optional<std::string_view> preset = std::nullopt) {
678+
std::optional<std::string_view> preset = std::nullopt,
679+
std::optional<std::vector<std::string>> extra_options = std::nullopt) {
650680
auto fileLikeContext =
651681
reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
652682
TORCH_CHECK(
653683
fileLikeContext != nullptr, "file_like_context must be a valid pointer");
654684
std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
655685

656686
VideoStreamOptions videoStreamOptions;
687+
videoStreamOptions.codec = codec;
657688
videoStreamOptions.pixelFormat = pixel_format;
658689
videoStreamOptions.crf = crf;
659690
videoStreamOptions.preset = preset;
660691

692+
if (extra_options.has_value()) {
693+
videoStreamOptions.extraOptions =
694+
unflattenExtraOptions(extra_options.value());
695+
}
696+
661697
VideoEncoder encoder(
662698
frames,
663699
validateInt64ToInt(frame_rate, "frame_rate"),

0 commit comments

Comments
 (0)