Validation of allocated output tensor shapes #339
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -880,7 +880,7 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput( | |
| // speed-up when swscale is used. With swscale, we can tell ffmpeg to place the | ||
| // decoded frame directly into `preAllocatedtensor.data_ptr()`. We haven't yet | ||
| // found a way to do that with filtegraph. | ||
| // TODO: Figure out whether that's possilbe! | ||
| // TODO: Figure out whether that's possible! | ||
| // Dimension order of the preAllocatedOutputTensor must be HWC, regardless of | ||
| // `dimension_order` parameter. It's up to callers to re-shape it if needed. | ||
| void VideoDecoder::convertAVFrameToDecodedOutputOnCPU( | ||
|
|
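Since the contract above is that the pre-allocated tensor is always HWC, a caller that wants CHW output re-shapes after the fact. Below is a minimal sketch of that permutation; the function name is hypothetical (in the repo this role is played by MaybePermuteHWC2CHW, mentioned further down):

```cpp
#include <torch/torch.h>

// Hypothetical illustration: convert a decoded HWC frame to CHW.
torch::Tensor hwcToCHW(const torch::Tensor& frame) {
  TORCH_CHECK(frame.dim() == 3 && frame.size(2) == 3, "expected HxWx3 tensor");
  // permute({2, 0, 1}) maps (H, W, C) -> (C, H, W); it returns a view,
  // so call .contiguous() if a densely laid-out tensor is needed.
  return frame.permute({2, 0, 1});
}
```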
@@ -890,41 +890,68 @@ void VideoDecoder::convertAVFrameToDecodedOutputOnCPU( | |
| int streamIndex = rawOutput.streamIndex; | ||
| AVFrame* frame = rawOutput.frame.get(); | ||
| auto& streamInfo = streams_[streamIndex]; | ||
| torch::Tensor tensor; | ||
|
|
||
| auto frameDims = | ||
| getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, *frame); | ||
| int expectedOutputHeight = frameDims.height; | ||
| int expectedOutputWidth = frameDims.width; | ||
|
|
||
| if (preAllocatedOutputTensor.has_value()) { | ||
| auto shape = preAllocatedOutputTensor.value().sizes(); | ||
| TORCH_CHECK( | ||
| (shape.size() == 3) && (shape[0] == expectedOutputHeight) && | ||
| (shape[1] == expectedOutputWidth) && (shape[2] == 3), | ||
| "Expected pre-allocated tensor of shape ", | ||
|
Contributor
You should also be able to do |
||
| expectedOutputHeight, | ||
| "x", | ||
| expectedOutputWidth, | ||
| "x3, got ", | ||
| shape); | ||
| } | ||
|
Contributor
Author
The check above validates the shape of the pre-allocated output tensor. It was previously only done for swscale, but it should also be done for filtergraph, which is why I moved it up. |
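Since the same HWC shape check now runs for both the pre-allocated tensor and filtergraph's output, it could plausibly be factored into a small helper. This is a hypothetical sketch, not code from the PR:

```cpp
#include <torch/torch.h>

// Hypothetical helper: validate that a tensor has shape HxWx3.
void validateHWCShape(
    const torch::Tensor& tensor,
    int expectedHeight,
    int expectedWidth) {
  auto shape = tensor.sizes();
  TORCH_CHECK(
      (shape.size() == 3) && (shape[0] == expectedHeight) &&
          (shape[1] == expectedWidth) && (shape[2] == 3),
      "Expected tensor of shape ",
      expectedHeight,
      "x",
      expectedWidth,
      "x3, got ",
      shape);
}
```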
||
|
|
||
| torch::Tensor outputTensor; | ||
| if (output.streamType == AVMEDIA_TYPE_VIDEO) { | ||
| if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) { | ||
| auto frameDims = | ||
| getHeightAndWidthFromOptionsOrAVFrame(streamInfo.options, *frame); | ||
| int height = frameDims.height; | ||
| int width = frameDims.width; | ||
| if (preAllocatedOutputTensor.has_value()) { | ||
| tensor = preAllocatedOutputTensor.value(); | ||
| auto shape = tensor.sizes(); | ||
| TORCH_CHECK( | ||
| (shape.size() == 3) && (shape[0] == height) && | ||
| (shape[1] == width) && (shape[2] == 3), | ||
| "Expected tensor of shape ", | ||
| height, | ||
| "x", | ||
| width, | ||
| "x3, got ", | ||
| shape); | ||
| } else { | ||
| tensor = allocateEmptyHWCTensor(height, width, torch::kCPU); | ||
| } | ||
| rawOutput.data = tensor.data_ptr<uint8_t>(); | ||
| convertFrameToBufferUsingSwsScale(rawOutput); | ||
|
|
||
| output.frame = tensor; | ||
| outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor( | ||
| expectedOutputHeight, expectedOutputWidth, torch::kCPU)); | ||
|
|
||
| int resultHeight = | ||
| convertFrameToBufferUsingSwsScale(streamIndex, frame, outputTensor); | ||
| // If this check failed, it would mean that the frame wasn't reshaped to | ||
| // the expected height. | ||
| // TODO: Can we do the same check for width? | ||
| TORCH_CHECK( | ||
| resultHeight == expectedOutputHeight, | ||
| "resultHeight != expectedOutputHeight: ", | ||
| resultHeight, | ||
| " != ", | ||
| expectedOutputHeight); | ||
|
|
||
| output.frame = outputTensor; | ||
| } else if ( | ||
| streamInfo.colorConversionLibrary == | ||
| ColorConversionLibrary::FILTERGRAPH) { | ||
| tensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame); | ||
| outputTensor = convertFrameToTensorUsingFilterGraph(streamIndex, frame); | ||
|
|
||
| // Similarly to above, if this check fails it means the frame wasn't | ||
| // reshaped to its expected dimensions by filtergraph. | ||
| auto shape = outputTensor.sizes(); | ||
| TORCH_CHECK( | ||
| (shape.size() == 3) && (shape[0] == expectedOutputHeight) && | ||
| (shape[1] == expectedOutputWidth) && (shape[2] == 3), | ||
| "Expected output tensor of shape ", | ||
| expectedOutputHeight, | ||
| "x", | ||
| expectedOutputWidth, | ||
| "x3, got ", | ||
| shape); | ||
| if (preAllocatedOutputTensor.has_value()) { | ||
| preAllocatedOutputTensor.value().copy_(tensor); | ||
| // We have already validated that preAllocatedOutputTensor and | ||
| // outputTensor have the same shape. | ||
| preAllocatedOutputTensor.value().copy_(outputTensor); | ||
| output.frame = preAllocatedOutputTensor.value(); | ||
| } else { | ||
| output.frame = tensor; | ||
| output.frame = outputTensor; | ||
| } | ||
| } else { | ||
| throw std::runtime_error( | ||
|
|
@@ -1303,24 +1330,23 @@ double VideoDecoder::getPtsSecondsForFrame( | |
| return ptsToSeconds(stream.allFrames[frameIndex].pts, stream.timeBase); | ||
| } | ||
|
|
||
| void VideoDecoder::convertFrameToBufferUsingSwsScale( | ||
| RawDecodedOutput& rawOutput) { | ||
| AVFrame* frame = rawOutput.frame.get(); | ||
| int streamIndex = rawOutput.streamIndex; | ||
| int VideoDecoder::convertFrameToBufferUsingSwsScale( | ||
| int streamIndex, | ||
| const AVFrame* frame, | ||
| torch::Tensor& outputTensor) { | ||
|
Contributor
Author
I have updated the signature of `convertFrameToBufferUsingSwsScale` so that it is closer to that of `convertFrameToTensorUsingFilterGraph`. Their signatures being so different previously was a constant source of confusion for me. I realize that this change seemingly goes against our TODO of folding the pre-allocated tensor within
Contributor
Why is this function now returning the height?
Contributor
Author
So that we can check against it and have the swscale check next to the filtergraph check. It makes the logic of both libraries more symmetric. |
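To make the returned-height design concrete, here is a self-contained sketch of the call-site pattern; the callable stands in for convertFrameToBufferUsingSwsScale, and none of this is the PR's literal code:

```cpp
#include <torch/torch.h>

// The conversion routine reports the height it actually wrote, and the
// caller validates it against the expected height, mirroring the shape
// check performed on filtergraph's output tensor.
template <typename ConvertFn>
void convertAndCheckHeight(
    ConvertFn&& convert,
    torch::Tensor& outputTensor,
    int expectedOutputHeight) {
  int resultHeight = convert(outputTensor);
  TORCH_CHECK(
      resultHeight == expectedOutputHeight,
      "resultHeight != expectedOutputHeight: ",
      resultHeight,
      " != ",
      expectedOutputHeight);
}
```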
||
| enum AVPixelFormat frameFormat = | ||
| static_cast<enum AVPixelFormat>(frame->format); | ||
| StreamInfo& activeStream = streams_[streamIndex]; | ||
| auto frameDims = | ||
| getHeightAndWidthFromOptionsOrAVFrame(activeStream.options, *frame); | ||
| int outputHeight = frameDims.height; | ||
| int outputWidth = frameDims.width; | ||
|
|
||
| int expectedOutputHeight = outputTensor.sizes()[0]; | ||
| int expectedOutputWidth = outputTensor.sizes()[1]; | ||
| if (activeStream.swsContext.get() == nullptr) { | ||
| SwsContext* swsContext = sws_getContext( | ||
| frame->width, | ||
| frame->height, | ||
| frameFormat, | ||
| outputWidth, | ||
| outputHeight, | ||
| expectedOutputWidth, | ||
| expectedOutputHeight, | ||
| AV_PIX_FMT_RGB24, | ||
| SWS_BILINEAR, | ||
| nullptr, | ||
|
|
@@ -1352,8 +1378,8 @@ void VideoDecoder::convertFrameToBufferUsingSwsScale( | |
| } | ||
| SwsContext* swsContext = activeStream.swsContext.get(); | ||
| uint8_t* pointers[4] = { | ||
| static_cast<uint8_t*>(rawOutput.data), nullptr, nullptr, nullptr}; | ||
| int linesizes[4] = {outputWidth * 3, 0, 0, 0}; | ||
| outputTensor.data_ptr<uint8_t>(), nullptr, nullptr, nullptr}; | ||
| int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0}; | ||
| int resultHeight = sws_scale( | ||
| swsContext, | ||
| frame->data, | ||
|
|
@@ -1362,9 +1388,7 @@ void VideoDecoder::convertFrameToBufferUsingSwsScale( | |
| frame->height, | ||
| pointers, | ||
| linesizes); | ||
| TORCH_CHECK( | ||
| outputHeight == resultHeight, | ||
| "outputHeight(" + std::to_string(resultHeight) + ") != resultHeight"); | ||
| return resultHeight; | ||
| } | ||
|
|
||
| torch::Tensor VideoDecoder::convertFrameToTensorUsingFilterGraph( | ||
|
|
@@ -1379,8 +1403,7 @@ torch::Tensor VideoDecoder::convertFrameToTensorUsingFilterGraph( | |
| ffmpegStatus = | ||
| av_buffersink_get_frame(filterState.sinkContext, filteredFrame.get()); | ||
| TORCH_CHECK_EQ(filteredFrame->format, AV_PIX_FMT_RGB24); | ||
| auto frameDims = getHeightAndWidthFromOptionsOrAVFrame( | ||
| streams_[streamIndex].options, *filteredFrame.get()); | ||
| auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredFrame.get()); | ||
|
Contributor
Author
I have reverted this logic to what it previously was before #332. I had some doubt in #332 (comment) already, and now I think it's clear that this is the right logic for finding the height and width. I have updated the big note below with my updated understanding. (I know it's a bit complicated. But the complexity isn't caused by this PR!) |
||
| int height = frameDims.height; | ||
| int width = frameDims.width; | ||
| std::vector<int64_t> shape = {height, width, 3}; | ||
|
|
@@ -1406,6 +1429,10 @@ VideoDecoder::~VideoDecoder() { | |
| } | ||
| } | ||
|
|
||
| FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) { | ||
|
Contributor
Read comment below. I get the motivation for grepping through the code. I don't think this is amazing, but it should help us get to a cleaner place, and it does make explicit how convoluted things are. |
||
| return FrameDims(resizedAVFrame.height, resizedAVFrame.width); | ||
| } | ||
|
|
||
| FrameDims getHeightAndWidthFromOptionsOrMetadata( | ||
| const VideoDecoder::VideoStreamDecoderOptions& options, | ||
| const VideoDecoder::StreamMetadata& metadata) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -383,7 +383,10 @@ class VideoDecoder { | |
| torch::Tensor convertFrameToTensorUsingFilterGraph( | ||
| int streamIndex, | ||
| const AVFrame* frame); | ||
| void convertFrameToBufferUsingSwsScale(RawDecodedOutput& rawOutput); | ||
| int convertFrameToBufferUsingSwsScale( | ||
| int streamIndex, | ||
| const AVFrame* frame, | ||
| torch::Tensor& outputTensor); | ||
| DecodedOutput convertAVFrameToDecodedOutput( | ||
| RawDecodedOutput& rawOutput, | ||
| std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt); | ||
|
|
@@ -426,30 +429,32 @@ class VideoDecoder { | |
| // MaybePermuteHWC2CHW(). | ||
| // | ||
| // Also, importantly, the way we figure out the height and width of the | ||
| // output frame varies and depends on the decoding entry-point: | ||
| // - In all cases, if the user requested specific height and width from the | ||
| // options, we honor that. Otherwise we fall into one of the categories below. | ||
| // - In Batch decoding APIs (e.g. getFramesAtIndices), we get height and width | ||
| // from the stream metadata, which itself got its value from the CodecContext, | ||
| // when the stream was added. | ||
| // - In single frames APIs: | ||
| // - On CPU we get height and width from the AVFrame. | ||
| // - On GPU, we get height and width from the metadata (same as batch APIs) | ||
| // | ||
| // These 2 strategies are encapsulated within | ||
| // getHeightAndWidthFromOptionsOrMetadata() and | ||
| // getHeightAndWidthFromOptionsOrAVFrame(). The reason they exist is to make it | ||
| // very obvious which logic is used in which place, and they allow for `git | ||
| // grep`ing. | ||
| // output frame tensor varies, and depends on the decoding entry-point. In | ||
| // *decreasing order of accuracy*, we use the following sources for determining | ||
| // height and width: | ||
| // - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the | ||
| // AVframe, *post*-resizing. This is only used for single-frame decoding APIs, | ||
| // on CPU, with filtergraph. | ||
| // - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from | ||
| // the user-specified options if they exist, or the height and width of the | ||
| // AVFrame *before* it is resized. In theory, i.e. if there are no bugs within | ||
| // our code or within FFmpeg code, this should be exactly the same as | ||
| // getHeightAndWidthFromResizedAVFrame(). This is used by single-frame | ||
| // decoding APIs, on CPU, with swscale. | ||
| // - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from | ||
| // the user-specified options if they exist, or the height and width from the | ||
| // stream metadata, which itself got its value from the CodecContext, when the | ||
| // stream was added. This is used by batch decoding APIs, or by GPU-APIs (both | ||
| // batch and single-frames). | ||
| // | ||
| // The source of truth for height and width really is the AVFrame: it's the | ||
| // decoded ouptut from FFmpeg. The info from the metadata (i.e. from the | ||
| // CodecContext) may not be as accurate. However, the AVFrame is only available | ||
| // late in the call stack, when the frame is decoded, while the CodecContext is | ||
| // available early when a stream is added. This is why we use the CodecContext | ||
| // for pre-allocating batched output tensors (we could pre-allocate those only | ||
| // once we decode the first frame to get the info frame the AVFrame, but that's | ||
| // a more complex logic). | ||
| // The source of truth for height and width really is the (resized) AVFrame: | ||
|
Contributor
The decoded output from ffmpeg isn't resized. The resize happens later. |
||
| // it's the decoded output from FFmpeg. The info from the metadata (i.e. from | ||
| // the CodecContext) may not be as accurate. However, the AVFrame is only | ||
| // available late in the call stack, when the frame is decoded, while the | ||
| // CodecContext is available early when a stream is added. This is why we use | ||
| // the CodecContext for pre-allocating batched output tensors (we could | ||
| // pre-allocate those only once we decode the first frame to get the info from | ||
| // the AVFrame, but that's more complex logic). | ||
| // | ||
| // Because the sources for height and width may disagree, we may end up with | ||
| // conflicts: e.g. if we pre-allocate a batch output tensor based on the | ||
|
|
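To summarize the fallback logic the comment above describes, here is a minimal sketch of the options-or-AVFrame strategy, with simplified stand-in types (the real code uses VideoDecoder::VideoStreamDecoderOptions and the FrameDims struct declared below):

```cpp
#include <optional>

extern "C" {
#include <libavutil/frame.h>
}

// Simplified stand-ins for the repo's actual types.
struct FrameDims {
  int height;
  int width;
  FrameDims(int h, int w) : height(h), width(w) {}
};

struct Options {
  std::optional<int> height;
  std::optional<int> width;
};

// Options-or-AVFrame: user-requested dimensions win; otherwise fall back
// to the (pre-resize) AVFrame's own dimensions.
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
    const Options& options,
    const AVFrame& frame) {
  return FrameDims(
      options.height.value_or(frame.height),
      options.width.value_or(frame.width));
}
```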
@@ -463,6 +468,10 @@ struct FrameDims { | |
| FrameDims(int h, int w) : height(h), width(w) {} | ||
| }; | ||
|
|
||
| // There's nothing preventing you from calling this on a non-resized frame, but | ||
| // please don't. | ||
| FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame); | ||
|
Contributor
getHeightAndWidth can just be called getFrameDims. |
||
|
|
||
| FrameDims getHeightAndWidthFromOptionsOrMetadata( | ||
| const VideoDecoder::VideoStreamDecoderOptions& options, | ||
| const VideoDecoder::StreamMetadata& metadata); | ||
|
|
||
@ahmadsharif1 any idea on how to do this?

You can look at rawOutput.frame's dimensions and use those, right?

Are you suggesting to get height and width from the AVFrame instead of from the metadata, then? I don't mind, but that's a change of logic. Do you know why it wasn't done like that before?

Discussed offline, I'll submit a follow-up to do that.
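For reference, a purely illustrative sketch of the follow-up discussed above, i.e. preferring the decoded AVFrame's own dimensions over the stream metadata whenever a frame is available (all names here are hypothetical):

```cpp
extern "C" {
#include <libavutil/frame.h>
}

// Simplified stand-in for the repo's FrameDims struct.
struct FrameDims {
  int height;
  int width;
  FrameDims(int h, int w) : height(h), width(w) {}
};

// Prefer the decoded frame's own dimensions; fall back to the values
// recorded in the stream metadata (which come from the CodecContext)
// when no frame has been decoded yet.
FrameDims getDims(
    const AVFrame* maybeDecodedFrame,
    int metadataHeight,
    int metadataWidth) {
  if (maybeDecodedFrame != nullptr) {
    return FrameDims(maybeDecodedFrame->height, maybeDecodedFrame->width);
  }
  return FrameDims(metadataHeight, metadataWidth);
}
```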