Skip to content

Commit af2e1ab

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into random_crop
2 parents a8a8cea + 408b373 commit af2e1ab

18 files changed

+456
-167
lines changed

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
[**Installation**](#installing-torchcodec) | [**Simple Example**](#using-torchcodec) | [**Detailed Example**](https://pytorch.org/torchcodec/stable/generated_examples/) | [**Documentation**](https://pytorch.org/torchcodec) | [**Contributing**](CONTRIBUTING.md) | [**License**](#license)
1+
[**Installation**](#installing-torchcodec) | [**Simple Example**](#using-torchcodec) | [**Detailed Example**](https://meta-pytorch.org/torchcodec/stable/generated_examples/) | [**Documentation**](https://meta-pytorch.org/torchcodec) | [**Contributing**](CONTRIBUTING.md) | [**License**](#license)
22

33
# TorchCodec
44

@@ -23,7 +23,7 @@ We achieve these capabilities through:
2323

2424
Here's a condensed summary of what you can do with TorchCodec. For more detailed
2525
examples, [check out our
26-
documentation](https://pytorch.org/torchcodec/stable/generated_examples/)!
26+
documentation](https://meta-pytorch.org/torchcodec/stable/generated_examples/)!
2727

2828
#### Decoding
2929

@@ -219,7 +219,7 @@ The bottom row is [promotional video from NASA](https://download.pytorch.org/tor
219219
that has a resolution of 960x540 at 29.7 fps and is 206 seconds long. Both videos were
220220
encoded with libx264 and yuv420p pixel format. All decoders, except for TorchVision, used FFmpeg 6.1.2. TorchVision used FFmpeg 4.2.2.
221221

222-
For TorchCodec, the "approx" label means that it was using [approximate mode](https://pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html)
222+
For TorchCodec, the "approx" label means that it was using [approximate mode](https://meta-pytorch.org/torchcodec/stable/generated_examples/decoding/approximate_mode.html)
223223
for seeking.
224224

225225
## Contributing

src/torchcodec/_core/Encoder.cpp

Lines changed: 73 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -570,10 +570,10 @@ AVPixelFormat validatePixelFormat(
570570
TORCH_CHECK(false, errorMsg.str());
571571
}
572572

573-
void validateDoubleOption(
573+
void tryToValidateCodecOption(
574574
const AVCodec& avCodec,
575575
const char* optionName,
576-
double value) {
576+
const std::string& value) {
577577
if (!avCodec.priv_class) {
578578
return;
579579
}
@@ -586,24 +586,60 @@ void validateDoubleOption(
586586
0,
587587
AV_OPT_SEARCH_FAKE_OBJ,
588588
nullptr);
589-
// If the option was not found, let FFmpeg handle it later
589+
// If option is not found we cannot validate it, let FFmpeg handle it
590590
if (!option) {
591591
return;
592592
}
593+
// Validate if option is defined as a numeric type
593594
if (option->type == AV_OPT_TYPE_INT || option->type == AV_OPT_TYPE_INT64 ||
594595
option->type == AV_OPT_TYPE_FLOAT || option->type == AV_OPT_TYPE_DOUBLE) {
595-
TORCH_CHECK(
596-
value >= option->min && value <= option->max,
597-
optionName,
598-
"=",
599-
value,
600-
" is out of valid range [",
601-
option->min,
602-
", ",
603-
option->max,
604-
"] for this codec. For more details, run 'ffmpeg -h encoder=",
605-
avCodec.name,
606-
"'");
596+
try {
597+
double numericValue = std::stod(value);
598+
TORCH_CHECK(
599+
numericValue >= option->min && numericValue <= option->max,
600+
optionName,
601+
"=",
602+
numericValue,
603+
" is out of valid range [",
604+
option->min,
605+
", ",
606+
option->max,
607+
"] for this codec. For more details, run 'ffmpeg -h encoder=",
608+
avCodec.name,
609+
"'");
610+
} catch (const std::invalid_argument&) {
611+
TORCH_CHECK(
612+
false,
613+
"Option ",
614+
optionName,
615+
" expects a numeric value but got '",
616+
value,
617+
"'");
618+
}
619+
}
620+
}
621+
622+
void sortCodecOptions(
623+
const std::map<std::string, std::string>& extraOptions,
624+
AVDictionary** codecDict,
625+
AVDictionary** formatDict) {
626+
// Accepts a map of options as input, then sorts them into codec options and
627+
// format options. The sorted options are returned into two separate dicts.
628+
const AVClass* formatClass = avformat_get_class();
629+
for (const auto& [key, value] : extraOptions) {
630+
const AVOption* fmtOpt = av_opt_find2(
631+
&formatClass,
632+
key.c_str(),
633+
nullptr,
634+
0,
635+
AV_OPT_SEARCH_CHILDREN | AV_OPT_SEARCH_FAKE_OBJ,
636+
nullptr);
637+
if (fmtOpt) {
638+
av_dict_set(formatDict, key.c_str(), value.c_str(), 0);
639+
} else {
640+
// Default to codec option (includes AVCodecContext + encoder-private)
641+
av_dict_set(codecDict, key.c_str(), value.c_str(), 0);
642+
}
607643
}
608644
}
609645
} // namespace
@@ -621,6 +657,7 @@ VideoEncoder::~VideoEncoder() {
621657
avFormatContext_->pb = nullptr;
622658
}
623659
}
660+
av_dict_free(&avFormatOptions_);
624661
}
625662

626663
VideoEncoder::VideoEncoder(
@@ -760,21 +797,31 @@ void VideoEncoder::initializeEncoder(
760797
}
761798

762799
// Apply videoStreamOptions
763-
AVDictionary* options = nullptr;
800+
AVDictionary* avCodecOptions = nullptr;
801+
if (videoStreamOptions.extraOptions.has_value()) {
802+
for (const auto& [key, value] : videoStreamOptions.extraOptions.value()) {
803+
tryToValidateCodecOption(*avCodec, key.c_str(), value);
804+
}
805+
sortCodecOptions(
806+
videoStreamOptions.extraOptions.value(),
807+
&avCodecOptions,
808+
&avFormatOptions_);
809+
}
810+
764811
if (videoStreamOptions.crf.has_value()) {
765-
validateDoubleOption(*avCodec, "crf", videoStreamOptions.crf.value());
766-
av_dict_set(
767-
&options,
768-
"crf",
769-
std::to_string(videoStreamOptions.crf.value()).c_str(),
770-
0);
812+
std::string crfValue = std::to_string(videoStreamOptions.crf.value());
813+
tryToValidateCodecOption(*avCodec, "crf", crfValue);
814+
av_dict_set(&avCodecOptions, "crf", crfValue.c_str(), 0);
771815
}
772816
if (videoStreamOptions.preset.has_value()) {
773817
av_dict_set(
774-
&options, "preset", videoStreamOptions.preset.value().c_str(), 0);
818+
&avCodecOptions,
819+
"preset",
820+
videoStreamOptions.preset.value().c_str(),
821+
0);
775822
}
776-
int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
777-
av_dict_free(&options);
823+
int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
824+
av_dict_free(&avCodecOptions);
778825

779826
TORCH_CHECK(
780827
status == AVSUCCESS,
@@ -799,7 +846,7 @@ void VideoEncoder::encode() {
799846
TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
800847
encodeWasCalled_ = true;
801848

802-
int status = avformat_write_header(avFormatContext_.get(), nullptr);
849+
int status = avformat_write_header(avFormatContext_.get(), &avFormatOptions_);
803850
TORCH_CHECK(
804851
status == AVSUCCESS,
805852
"Error in avformat_write_header: ",

src/torchcodec/_core/Encoder.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
#pragma once
22
#include <torch/types.h>
3+
#include <map>
4+
#include <string>
35
#include "AVIOContextHolder.h"
46
#include "FFMPEGCommon.h"
57
#include "StreamOptions.h"
68

9+
extern "C" {
10+
#include <libavutil/dict.h>
11+
}
12+
713
namespace facebook::torchcodec {
814
class AudioEncoder {
915
public:
@@ -179,6 +185,7 @@ class VideoEncoder {
179185
std::unique_ptr<AVIOContextHolder> avioContextHolder_;
180186

181187
bool encodeWasCalled_ = false;
188+
AVDictionary* avFormatOptions_ = nullptr;
182189
};
183190

184191
} // namespace facebook::torchcodec

src/torchcodec/_core/FFMPEGCommon.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,16 @@ int getNumChannels(const SharedAVCodecContext& avCodecContext) {
158158
#endif
159159
}
160160

161+
int getNumChannels(const AVCodecParameters* codecpar) {
162+
TORCH_CHECK(codecpar != nullptr, "codecpar is null")
163+
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
164+
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
165+
return codecpar->ch_layout.nb_channels;
166+
#else
167+
return codecpar->channels;
168+
#endif
169+
}
170+
161171
void setDefaultChannelLayout(
162172
UniqueAVCodecContext& avCodecContext,
163173
int numChannels) {

src/torchcodec/_core/FFMPEGCommon.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec);
180180

181181
int getNumChannels(const UniqueAVFrame& avFrame);
182182
int getNumChannels(const SharedAVCodecContext& avCodecContext);
183+
int getNumChannels(const AVCodecParameters* codecpar);
183184

184185
void setDefaultChannelLayout(
185186
UniqueAVCodecContext& avCodecContext,

src/torchcodec/_core/Metadata.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ std::optional<double> StreamMetadata::getDurationSeconds(
2929
return static_cast<double>(numFramesFromHeader.value()) /
3030
averageFpsFromHeader.value();
3131
}
32+
if (durationSecondsFromContainer.has_value()) {
33+
return durationSecondsFromContainer.value();
34+
}
3235
return std::nullopt;
3336
default:
3437
TORCH_CHECK(false, "Unknown SeekMode");
@@ -80,13 +83,13 @@ std::optional<int64_t> StreamMetadata::getNumFrames(SeekMode seekMode) const {
8083
numFramesFromContent.has_value(), "Missing numFramesFromContent");
8184
return numFramesFromContent.value();
8285
case SeekMode::approximate: {
86+
auto durationSeconds = getDurationSeconds(seekMode);
8387
if (numFramesFromHeader.has_value()) {
8488
return numFramesFromHeader.value();
8589
}
86-
if (averageFpsFromHeader.has_value() &&
87-
durationSecondsFromHeader.has_value()) {
90+
if (averageFpsFromHeader.has_value() && durationSeconds.has_value()) {
8891
return static_cast<int64_t>(
89-
averageFpsFromHeader.value() * durationSecondsFromHeader.value());
92+
averageFpsFromHeader.value() * durationSeconds.value());
9093
}
9194
return std::nullopt;
9295
}

src/torchcodec/_core/Metadata.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ enum class SeekMode { exact, approximate, custom_frame_mappings };
2323
struct StreamMetadata {
2424
// Common (video and audio) fields derived from the AVStream.
2525
int streamIndex;
26+
2627
// See this link for what various values are available:
2728
// https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
2829
AVMediaType mediaType;
30+
2931
std::optional<AVCodecID> codecId;
3032
std::optional<std::string> codecName;
3133
std::optional<double> durationSecondsFromHeader;
@@ -35,17 +37,22 @@ struct StreamMetadata {
3537
std::optional<double> averageFpsFromHeader;
3638
std::optional<double> bitRate;
3739

40+
// Used as fallback in approximate mode when stream duration is unavailable.
41+
std::optional<double> durationSecondsFromContainer;
42+
3843
// More accurate duration, obtained by scanning the file.
3944
// These presentation timestamps are in time base.
4045
std::optional<int64_t> beginStreamPtsFromContent;
4146
std::optional<int64_t> endStreamPtsFromContent;
47+
4248
// These presentation timestamps are in seconds.
4349
std::optional<double> beginStreamPtsSecondsFromContent;
4450
std::optional<double> endStreamPtsSecondsFromContent;
51+
4552
// This can be useful for index-based seeking.
4653
std::optional<int64_t> numFramesFromContent;
4754

48-
// Video-only fields derived from the AVCodecContext.
55+
// Video-only fields
4956
std::optional<int> width;
5057
std::optional<int> height;
5158
std::optional<AVRational> sampleAspectRatio;
@@ -67,13 +74,17 @@ struct ContainerMetadata {
6774
std::vector<StreamMetadata> allStreamMetadata;
6875
int numAudioStreams = 0;
6976
int numVideoStreams = 0;
77+
7078
// Note that this is the container-level duration, which is usually the max
7179
// of all stream durations available in the container.
7280
std::optional<double> durationSecondsFromHeader;
81+
7382
// Total bit rate at the container level, in bit/s.
7483
std::optional<double> bitRate;
84+
7585
// If set, this is the index to the default audio stream.
7686
std::optional<int> bestAudioStreamIndex;
87+
7788
// If set, this is the index to the default video stream.
7889
std::optional<int> bestVideoStreamIndex;
7990
};

0 commit comments

Comments
 (0)