Skip to content

Commit 9b14d17

Browse files
committed
Deal with custom frame mapping
1 parent 7832834 commit 9b14d17

File tree

2 files changed

+53
-19
lines changed

2 files changed

+53
-19
lines changed

src/torchcodec/_core/Metadata.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ enum class SeekMode { exact, approximate, custom_frame_mappings };
2323
struct StreamMetadata {
2424
// Common (video and audio) fields derived from the AVStream.
2525
int streamIndex;
26+
2627
// See this link for what various values are available:
2728
// https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
2829
AVMediaType mediaType;
30+
2931
std::optional<AVCodecID> codecId;
3032
std::optional<std::string> codecName;
3133
std::optional<double> durationSecondsFromHeader;
@@ -39,13 +41,15 @@ struct StreamMetadata {
3941
// These presentation timestamps are in time base.
4042
std::optional<int64_t> beginStreamPtsFromContent;
4143
std::optional<int64_t> endStreamPtsFromContent;
44+
4245
// These presentation timestamps are in seconds.
4346
std::optional<double> beginStreamPtsSecondsFromContent;
4447
std::optional<double> endStreamPtsSecondsFromContent;
48+
4549
// This can be useful for index-based seeking.
4650
std::optional<int64_t> numFramesFromContent;
4751

48-
// Video-only fields derived from the AVCodecContext.
52+
// Video-only fields
4953
std::optional<int> width;
5054
std::optional<int> height;
5155
std::optional<AVRational> sampleAspectRatio;
@@ -67,13 +71,17 @@ struct ContainerMetadata {
6771
std::vector<StreamMetadata> allStreamMetadata;
6872
int numAudioStreams = 0;
6973
int numVideoStreams = 0;
74+
7075
// Note that this is the container-level duration, which is usually the max
7176
// of all stream durations available in the container.
7277
std::optional<double> durationSecondsFromHeader;
78+
7379
// Total bit rate at the container level, in bits per second (bit/s).
7480
std::optional<double> bitRate;
81+
7582
// If set, this is the index to the default audio stream.
7683
std::optional<int> bestAudioStreamIndex;
84+
7785
// If set, this is the index to the default video stream.
7886
std::optional<int> bestVideoStreamIndex;
7987
};

src/torchcodec/_core/custom_ops.cpp

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,34 @@ SeekMode seekModeFromString(std::string_view seekMode) {
198198
}
199199
}
200200

201+
// Writes the stream metadata values that are obtained through the seek-mode
// dependent getters of StreamMetadata into `map`, as strings.
//
// Keys written: "durationSeconds", "numFrames", "beginStreamSeconds",
// "endStreamSeconds" and "averageFps". "beginStreamSeconds" is always
// written; the other keys are only written when the corresponding getter
// returns a value.
//
// `seekMode` is forwarded unchanged to every getter, so the caller decides
// which mode the values are resolved with.
void writeFallbackBasedMetadata(
202+
std::map<std::string, std::string>& map,
203+
const StreamMetadata& streamMetadata,
204+
SeekMode seekMode) {
205+
auto durationSeconds = streamMetadata.getDurationSeconds(seekMode);
206+
if (durationSeconds.has_value()) {
207+
map["durationSeconds"] = std::to_string(durationSeconds.value());
208+
}
209+
210+
auto numFrames = streamMetadata.getNumFrames(seekMode);
211+
if (numFrames.has_value()) {
212+
map["numFrames"] = std::to_string(numFrames.value());
213+
}
214+
215+
// Unlike the other getters, this one yields a plain double rather than an
// optional, so the key is written unconditionally.
// NOTE(review): std::to_string on a double uses fixed notation with six
// digits after the decimal point (before C++26) - confirm that precision is
// sufficient for consumers of this map.
double beginStreamSeconds = streamMetadata.getBeginStreamSeconds(seekMode);
216+
map["beginStreamSeconds"] = std::to_string(beginStreamSeconds);
217+
218+
auto endStreamSeconds = streamMetadata.getEndStreamSeconds(seekMode);
219+
if (endStreamSeconds.has_value()) {
220+
map["endStreamSeconds"] = std::to_string(endStreamSeconds.value());
221+
}
222+
223+
auto averageFps = streamMetadata.getAverageFps(seekMode);
224+
if (averageFps.has_value()) {
225+
map["averageFps"] = std::to_string(averageFps.value());
226+
}
227+
}
228+
201229
int checkedToPositiveInt(const std::string& str) {
202230
int ret = 0;
203231
try {
@@ -917,30 +945,28 @@ std::string get_stream_json_metadata(
917945
// In approximate mode: content-based metadata does not exist for any stream.
918946
// In custom_frame_mappings: content-based metadata exists only for the active
919947
// stream.
948+
//
920949
// Our fallback logic assumes content-based metadata is available.
921950
// It is available for decoding on the active stream, but would break
922951
// when getting metadata from non-active streams.
923952
if ((seekMode != SeekMode::custom_frame_mappings) ||
924953
(seekMode == SeekMode::custom_frame_mappings &&
925954
stream_index == activeStreamIndex)) {
926-
if (streamMetadata.getDurationSeconds(seekMode).has_value()) {
927-
map["durationSeconds"] =
928-
std::to_string(streamMetadata.getDurationSeconds(seekMode).value());
929-
}
930-
if (streamMetadata.getNumFrames(seekMode).has_value()) {
931-
map["numFrames"] =
932-
std::to_string(streamMetadata.getNumFrames(seekMode).value());
933-
}
934-
map["beginStreamSeconds"] =
935-
std::to_string(streamMetadata.getBeginStreamSeconds(seekMode));
936-
if (streamMetadata.getEndStreamSeconds(seekMode).has_value()) {
937-
map["endStreamSeconds"] =
938-
std::to_string(streamMetadata.getEndStreamSeconds(seekMode).value());
939-
}
940-
if (streamMetadata.getAverageFps(seekMode).has_value()) {
941-
map["averageFps"] =
942-
std::to_string(streamMetadata.getAverageFps(seekMode).value());
943-
}
955+
writeFallbackBasedMetadata(map, streamMetadata, seekMode);
956+
} else if (seekMode == SeekMode::custom_frame_mappings) {
957+
// If this is not the active stream, then we don't have content-based
958+
// metadata for custom frame mappings. In that case, we want the same
959+
// behavior as we would get with approximate mode. Encoding this behavior in
960+
// the fallback logic itself is tricky and not worth it for this corner
961+
// case, so we hardcode approximate mode instead.
962+
//
963+
// TODO: This hacky behavior is only necessary because the custom frame
964+
// mapping is supplied in SingleStreamDecoder::addVideoStream() rather
965+
// than in the constructor. And it's supplied to addVideoStream() and
966+
// not the constructor because we need to know the stream index. If we
967+
// can encode the relevant stream indices into custom frame mappings
968+
// itself, then we can put it in the constructor.
969+
writeFallbackBasedMetadata(map, streamMetadata, SeekMode::approximate);
944970
}
945971

946972
return mapToJson(map);

0 commit comments

Comments
 (0)