Skip to content

Commit cf781ef

Browse files
committed
Merge branch 'main' of github.com:pytorch/torchcodec into align
2 parents 7a63a54 + b1719e7 commit cf781ef

File tree

3 files changed

+27
-41
lines changed

3 files changed

+27
-41
lines changed

.github/workflows/build_ffmpeg.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,9 @@ jobs:
2929
matrix:
3030
ffmpeg-version: ["4.4.4", "5.1.4", "6.1.1", "7.0.1"]
3131
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
32+
permissions:
33+
id-token: write
34+
contents: read
3235
with:
3336
job-name: Build
3437
upload-artifact: ffmpeg-lgpl

src/torchcodec/decoders/_core/VideoDecoder.cpp

Lines changed: 22 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -562,13 +562,14 @@ void VideoDecoder::scanFileAndUpdateMetadataAndIndex() {
562562
if (packet->flags & AV_PKT_FLAG_DISCARD) {
563563
continue;
564564
}
565-
auto& stream = containerMetadata_.streams[streamIndex];
566-
stream.minPtsFromScan =
567-
std::min(stream.minPtsFromScan.value_or(INT64_MAX), packet->pts);
568-
stream.maxPtsFromScan = std::max(
569-
stream.maxPtsFromScan.value_or(INT64_MIN),
565+
auto& streamMetadata = containerMetadata_.streams[streamIndex];
566+
streamMetadata.minPtsFromScan = std::min(
567+
streamMetadata.minPtsFromScan.value_or(INT64_MAX), packet->pts);
568+
streamMetadata.maxPtsFromScan = std::max(
569+
streamMetadata.maxPtsFromScan.value_or(INT64_MIN),
570570
packet->pts + packet->duration);
571-
stream.numFramesFromScan = stream.numFramesFromScan.value_or(0) + 1;
571+
streamMetadata.numFramesFromScan =
572+
streamMetadata.numFramesFromScan.value_or(0) + 1;
572573

573574
FrameInfo frameInfo;
574575
frameInfo.pts = packet->pts;
@@ -578,16 +579,17 @@ void VideoDecoder::scanFileAndUpdateMetadataAndIndex() {
578579
}
579580
streams_[streamIndex].allFrames.push_back(frameInfo);
580581
}
581-
for (size_t i = 0; i < containerMetadata_.streams.size(); ++i) {
582-
auto& streamMetadata = containerMetadata_.streams[i];
583-
auto stream = formatContext_->streams[i];
582+
for (size_t streamIndex = 0; streamIndex < containerMetadata_.streams.size();
583+
++streamIndex) {
584+
auto& streamMetadata = containerMetadata_.streams[streamIndex];
585+
auto avStream = formatContext_->streams[streamIndex];
584586
if (streamMetadata.minPtsFromScan.has_value()) {
585587
streamMetadata.minPtsSecondsFromScan =
586-
*streamMetadata.minPtsFromScan * av_q2d(stream->time_base);
588+
*streamMetadata.minPtsFromScan * av_q2d(avStream->time_base);
587589
}
588590
if (streamMetadata.maxPtsFromScan.has_value()) {
589591
streamMetadata.maxPtsSecondsFromScan =
590-
*streamMetadata.maxPtsFromScan * av_q2d(stream->time_base);
592+
*streamMetadata.maxPtsFromScan * av_q2d(avStream->time_base);
591593
}
592594
}
593595
int ffmepgStatus =
@@ -597,23 +599,23 @@ void VideoDecoder::scanFileAndUpdateMetadataAndIndex() {
597599
"Could not seek file to pts=0: " +
598600
getFFMPEGErrorStringFromErrorCode(ffmepgStatus));
599601
}
600-
for (auto& [streamIndex, stream] : streams_) {
602+
for (auto& [streamIndex, streamInfo] : streams_) {
601603
std::sort(
602-
stream.keyFrames.begin(),
603-
stream.keyFrames.end(),
604+
streamInfo.keyFrames.begin(),
605+
streamInfo.keyFrames.end(),
604606
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
605607
return frameInfo1.pts < frameInfo2.pts;
606608
});
607609
std::sort(
608-
stream.allFrames.begin(),
609-
stream.allFrames.end(),
610+
streamInfo.allFrames.begin(),
611+
streamInfo.allFrames.end(),
610612
[](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
611613
return frameInfo1.pts < frameInfo2.pts;
612614
});
613615

614-
for (size_t i = 0; i < stream.allFrames.size(); ++i) {
615-
if (i + 1 < stream.allFrames.size()) {
616-
stream.allFrames[i].nextPts = stream.allFrames[i + 1].pts;
616+
for (size_t i = 0; i < streamInfo.allFrames.size(); ++i) {
617+
if (i + 1 < streamInfo.allFrames.size()) {
618+
streamInfo.allFrames[i].nextPts = streamInfo.allFrames[i + 1].pts;
617619
}
618620
}
619621
}
@@ -869,6 +871,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
869871
AVFrame* frame = rawOutput.frame.get();
870872
output.streamIndex = streamIndex;
871873
auto& streamInfo = streams_[streamIndex];
874+
TORCH_CHECK(streamInfo.stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO);
872875
output.ptsSeconds =
873876
ptsToSeconds(frame->pts, formatContext_->streams[streamIndex]->time_base);
874877
output.durationSeconds = ptsToSeconds(

src/torchcodec/decoders/_core/VideoDecoder.h

Lines changed: 2 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -160,33 +160,13 @@ class VideoDecoder {
160160
// Calling getNextFrameOutputNoDemuxInternal() will return the first frame at
161161
// or after this position.
162162
void setCursorPtsInSeconds(double seconds);
163-
// This is an internal structure that is used to store the decoded output
164-
// from decoding a frame through color conversion. Example usage is:
165-
//
166-
// RawDecodedOutput rawOutput = getDecodedOutputWithFilter();
167-
// // Now allocate a single tensor or a batch tensor.
168-
// torch::Tensor userOutput = torch::empty(...);
169-
// // Now fill in `data` and `size`.
170-
// rawOutput.data = userOutput.data_ptr();
171-
// // Now run the color conversion.
172-
// convertFrameToBufferUsingSwsScale(rawOutput);
173-
//
174-
// This structure ensures we always keep the streamIndex and frame together
175-
// with the data output. Note that AVFrame itself doesn't retain the
176-
// streamIndex.
163+
// This structure ensures we always keep the streamIndex and AVFrame together.
164+
// Note that AVFrame itself doesn't retain the streamIndex.
177165
struct RawDecodedOutput {
178166
// The actual decoded output as a unique pointer to an AVFrame.
179167
UniqueAVFrame frame;
180168
// The stream index of the decoded frame.
181169
int streamIndex;
182-
// This is an unowned pointer that we copy the frame data to after color
183-
// conversion.
184-
// For a single tensor this points to the start of data_ptr. For a batch
185-
// tensor it may point to the middle of the allocated batch tensor.
186-
void* data = nullptr;
187-
// We carry around the size to ensure we don't stomp on memory while doing
188-
// color conversion.
189-
size_t size = 0;
190170
};
191171
struct DecodedOutput {
192172
// The actual decoded output as a Tensor.

0 commit comments

Comments
 (0)