diff --git a/src/torchcodec/decoders/_core/CudaDevice.cpp b/src/torchcodec/decoders/_core/CudaDevice.cpp index 7c3964cca..887c8fadc 100644 --- a/src/torchcodec/decoders/_core/CudaDevice.cpp +++ b/src/torchcodec/decoders/_core/CudaDevice.cpp @@ -224,6 +224,9 @@ void convertAVFrameToDecodedOutputOnCuda( auto start = std::chrono::high_resolution_clock::now(); + // TODO height and width info of output tensor comes from the metadata, which + // may not be accurate. How do we make sure we won't corrupt memory if the + // allocated tensor is too short/large? NppStatus status = nppiNV12ToRGB_8u_P2C3R( input, src->linesize[0], diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index 096275e86..b68d66803 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -880,7 +880,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput( // speed-up when swscale is used. With swscale, we can tell ffmpeg to place the // decoded frame directly into `preAllocatedtensor.data_ptr()`. We haven't yet // found a way to do that with filtegraph. -// TODO: Figure out whether that's possilbe! +// TODO: Figure out whether that's possible! // Dimension order of the preAllocatedOutputTensor must be HWC, regardless of // `dimension_order` parameter. It's up to callers to re-shape it if needed. 
void VideoDecoder::convertAVFrameToDecodedOutputOnCPU( @@ -890,41 +890,68 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU( int streamIndex = rawOutput.streamIndex; AVFrame* frame = rawOutput.frame.get(); auto& streamInfo = streams_[streamIndex]; - torch::Tensor tensor; + + auto frameDims = + getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, *frame); + int expectedOutputHeight = frameDims.height; + int expectedOutputWidth = frameDims.width; + + if (preAllocatedOutputTensor.has_value()) { + auto shape = preAllocatedOutputTensor.value().sizes(); + TORCH_CHECK( + (shape.size() == 3) && (shape[0] == expectedOutputHeight) && + (shape[1] == expectedOutputWidth) && (shape[2] == 3), + "Expected pre-allocated tensor of shape ", + expectedOutputHeight, + "x", + expectedOutputWidth, + "x3, got ", + shape); + } + + torch::Tensor outputTensor; if (output.streamType == AVMEDIA_TYPE_VIDEO) { if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) { - auto frameDims = - getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, *frame); - int height = frameDims.height; - int width = frameDims.width; - if (preAllocatedOutputTensor.has_value()) { - tensor = preAllocatedOutputTensor.value(); - auto shape = tensor.sizes(); - TORCH_CHECK( - (shape.size() == 3) && (shape[0] == height) && - (shape[1] == width) && (shape[2] == 3), - "Expected tensor of shape ", - height, - "x", - width, - "x3, got ", - shape); - } else { - tensor = allocateEmptyHWCTensor(height, width, torch::kCPU); - } - rawOutput.data = tensor.data_ptr(); - convertFrameToBufferUsingSwsScale(rawOutput); - - output.frame = tensor; + outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor( + expectedOutputHeight, expectedOutputWidth, torch::kCPU)); + + int resultHeight = + convertFrameToBufferUsingSwsScale(streamIndex, frame, outputTensor); + // If this check failed, it would mean that the frame wasn't reshaped to + // the expected height. 
+ // TODO: Can we do the same check for width? + TORCH_CHECK( + resultHeight == expectedOutputHeight, + "resultHeight != expectedOutputHeight: ", + resultHeight, + " != ", + expectedOutputHeight); + + output.frame = outputTensor; } else if ( streamInfo.colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) { - tensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame); + outputTensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame); + + // Similarly to above, if this check fails it means the frame wasn't + // reshaped to its expected dimensions by filtergraph. + auto shape = outputTensor.sizes(); + TORCH_CHECK( + (shape.size() == 3) && (shape[0] == expectedOutputHeight) && + (shape[1] == expectedOutputWidth) && (shape[2] == 3), + "Expected output tensor of shape ", + expectedOutputHeight, + "x", + expectedOutputWidth, + "x3, got ", + shape); if (preAllocatedOutputTensor.has_value()) { - preAllocatedOutputTensor.value().copy_(tensor); + // We have already validated that preAllocatedOutputTensor and + // outputTensor have the same shape. 
+ preAllocatedOutputTensor.value().copy_(outputTensor); output.frame = preAllocatedOutputTensor.value(); } else { - output.frame = tensor; + output.frame = outputTensor; } } else { throw std::runtime_error( @@ -1303,24 +1330,23 @@ double VideoDecoder::getPtsSecondsForFrame( return ptsToSeconds(stream.allFrames[frameIndex].pts, stream.timeBase); } -void VideoDecoder::convertFrameToBufferUsingSwsScale( - RawDecodedOutput& rawOutput) { - AVFrame* frame = rawOutput.frame.get(); - int streamIndex = rawOutput.streamIndex; +int VideoDecoder::convertFrameToBufferUsingSwsScale( + int streamIndex, + const AVFrame* frame, + torch::Tensor& outputTensor) { enum AVPixelFormat frameFormat = static_cast<AVPixelFormat>(frame->format); StreamInfo& activeStream = streams_[streamIndex]; - auto frameDims = - getHeightAndWidthFromOptionsOrAVFrame(activeStream.options, *frame); - int outputHeight = frameDims.height; - int outputWidth = frameDims.width; + + int expectedOutputHeight = outputTensor.sizes()[0]; + int expectedOutputWidth = outputTensor.sizes()[1]; if (activeStream.swsContext.get() == nullptr) { SwsContext* swsContext = sws_getContext( frame->width, frame->height, frameFormat, - outputWidth, - outputHeight, + expectedOutputWidth, + expectedOutputHeight, AV_PIX_FMT_RGB24, SWS_BILINEAR, nullptr, @@ -1352,8 +1378,8 @@ void VideoDecoder::convertFrameToBufferUsingSwsScale( } SwsContext* swsContext = activeStream.swsContext.get(); uint8_t* pointers[4] = { - static_cast<uint8_t*>(rawOutput.data), nullptr, nullptr, nullptr}; - int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0}; + outputTensor.data_ptr<uint8_t>(), nullptr, nullptr, nullptr}; + int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0}; int resultHeight = sws_scale( swsContext, frame->data, @@ -1362,9 +1388,7 @@ void VideoDecoder::convertFrameToBufferUsingSwsScale( frame->height, pointers, linesizes); - TORCH_CHECK( - outputHeight == resultHeight, - "outputHeight(" + std::to_string(resultHeight) + ") != resultHeight"); + return resultHeight; } torch::Tensor 
VideoDecoder::convertFrameToTensorUsingFilterGraph( @@ -1379,8 +1403,7 @@ torch::Tensor VideoDecoder::convertFrameToTensorUsingFilterGraph( ffmpegStatus = av_buffersink_get_frame(filterState.sinkContext, filteredFrame.get()); TORCH_CHECK_EQ(filteredFrame->format, AV_PIX_FMT_RGB24); - auto frameDims = getHeightAndWidthFromOptionsOrAVFrame( - streams_[streamIndex].options, *filteredFrame.get()); + auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredFrame.get()); int height = frameDims.height; int width = frameDims.width; std::vector<int64_t> shape = {height, width, 3}; @@ -1406,6 +1429,10 @@ VideoDecoder::~VideoDecoder() { } } +FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) { + return FrameDims(resizedAVFrame.height, resizedAVFrame.width); +} + FrameDims getHeightAndWidthFromOptionsOrMetadata( const VideoDecoder::VideoStreamDecoderOptions& options, const VideoDecoder::StreamMetadata& metadata) { diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index f944c01a9..37e789256 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -383,7 +383,10 @@ class VideoDecoder { torch::Tensor convertFrameToTensorUsingFilterGraph( int streamIndex, const AVFrame* frame); - void convertFrameToBufferUsingSwsScale(RawDecodedOutput& rawOutput); + int convertFrameToBufferUsingSwsScale( + int streamIndex, + const AVFrame* frame, + torch::Tensor& outputTensor); DecodedOutput convertAVFrameToDecodedOutput( RawDecodedOutput& rawOutput, std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt); @@ -426,30 +429,32 @@ class VideoDecoder { // MaybePermuteHWC2CHW(). // // Also, importantly, the way we figure out the the height and width of the -// output frame varies and depends on the decoding entry-point: -// - In all cases, if the user requested specific height and width from the -// options, we honor that. Otherwise we fall into one of the categories below. 
-// - In Batch decoding APIs (e.g. getFramesAtIndices), we get height and width -// from the stream metadata, which itself got its value from the CodecContext, -// when the stream was added. -// - In single frames APIs: -// - On CPU we get height and width from the AVFrame. -// - On GPU, we get height and width from the metadata (same as batch APIs) -// -// These 2 strategies are encapsulated within -// getHeightAndWidthFromOptionsOrMetadata() and -// getHeightAndWidthFromOptionsOrAVFrame(). The reason they exist is to make it -// very obvious which logic is used in which place, and they allow for `git -// grep`ing. +// output frame tensor varies, and depends on the decoding entry-point. In +// *decreasing order of accuracy*, we use the following sources for determining +// height and width: +// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the +// AVFrame, *post*-resizing. This is only used for single-frame decoding APIs, +// on CPU, with filtergraph. +// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from +// the user-specified options if they exist, or the height and width of the +// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within +// our code or within FFmpeg code, this should be exactly the same as +// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame +// decoding APIs, on CPU, with swscale. +// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from +// the user-specified options if they exist, or the height and width from the +// stream metadata, which itself got its value from the CodecContext, when the +// stream was added. This is used by batch decoding APIs, or by GPU-APIs (both +// batch and single-frames). // -// The source of truth for height and width really is the AVFrame: it's the -// decoded ouptut from FFmpeg. The info from the metadata (i.e. from the -// CodecContext) may not be as accurate. 
However, the AVFrame is only -available late in the call stack, when the frame is decoded, while the CodecContext is -// available early when a stream is added. This is why we use the CodecContext -// for pre-allocating batched output tensors (we could pre-allocate those only -// once we decode the first frame to get the info frame the AVFrame, but that's -// a more complex logic). +// The source of truth for height and width really is the (resized) AVFrame: +// it's the decoded output from FFmpeg. The info from the metadata (i.e. from +// the CodecContext) may not be as accurate. However, the AVFrame is only +// available late in the call stack, when the frame is decoded, while the +// CodecContext is available early when a stream is added. This is why we use +// the CodecContext for pre-allocating batched output tensors (we could +// pre-allocate those only once we decode the first frame to get the info from +// the AVFrame, but that's a more complex logic). // // Because the sources for height and width may disagree, we may end up with // conflicts: e.g. if we pre-allocate a batch output tensor based on the @@ -463,6 +468,10 @@ struct FrameDims { FrameDims(int h, int w) : height(h), width(w) {} }; +// There's nothing preventing you from calling this on a non-resized frame, but +// please don't. +FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame); + FrameDims getHeightAndWidthFromOptionsOrMetadata( const VideoDecoder::VideoStreamDecoderOptions& options, const VideoDecoder::StreamMetadata& metadata);