Pass pre-allocated tensors in batch APIs to avoid copies #266
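At a high level, the batch APIs used to decode each frame into a freshly allocated tensor and then copy it into the batch output; with this change they pass a pre-allocated slice of the batch output down the call chain so the frame is written into it directly. A minimal standalone sketch of that pattern (illustrative only; `fakeDecodeInto` and the sizes are made-up stand-ins, not the PR's code):

```cpp
#include <torch/torch.h>

// Stand-in for the decoder's color-conversion step, which fills an
// HWC uint8 tensor in place.
void fakeDecodeInto(torch::Tensor& dst) {
  dst.fill_(42);
}

int main() {
  const int64_t numFrames = 4, height = 2, width = 3;
  // The batch output is allocated up front, as the batch APIs already do.
  auto frames = torch::empty({numFrames, height, width, 3}, torch::kUInt8);

  for (int64_t f = 0; f < numFrames; ++f) {
    // Pass a view of the batch output down, so the decode writes into it
    // directly instead of producing a temporary that must be copied over.
    auto preAllocatedOutputTensor = frames[f];
    fakeDecodeInto(preAllocatedOutputTensor);
  }
  return 0;
}
```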
Changes from 6 commits
Changes to the `VideoDecoder` implementation:
```diff
@@ -846,7 +846,8 @@ VideoDecoder::RawDecodedOutput VideoDecoder::getDecodedOutputWithFilter(
 }
 
 VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
-    VideoDecoder::RawDecodedOutput& rawOutput) {
+    VideoDecoder::RawDecodedOutput& rawOutput,
+    torch::Tensor& preAllocatedOutputTensor) {
   // Convert the frame to tensor.
   DecodedOutput output;
   int streamIndex = rawOutput.streamIndex;
```
```diff
@@ -861,8 +862,10 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
   output.durationSeconds = ptsToSeconds(
       getDuration(frame), formatContext_->streams[streamIndex]->time_base);
   if (streamInfo.options.device.type() == torch::kCPU) {
-    convertAVFrameToDecodedOutputOnCPU(rawOutput, output);
+    convertAVFrameToDecodedOutputOnCPU(
+        rawOutput, output, preAllocatedOutputTensor);
   } else if (streamInfo.options.device.type() == torch::kCUDA) {
+    // TODO: handle pre-allocated output tensor
```
**Contributor (author):** This PR is a no-op for CUDA devices. I'm leaving out CUDA pre-allocation because this is strongly tied to #189 and can be treated separately.
```diff
     convertAVFrameToDecodedOutputOnCuda(
         streamInfo.options.device,
         streamInfo.options,
```
```diff
@@ -878,16 +881,24 @@ VideoDecoder::DecodedOutput VideoDecoder::convertAVFrameToDecodedOutput(
 
 void VideoDecoder::convertAVFrameToDecodedOutputOnCPU(
     VideoDecoder::RawDecodedOutput& rawOutput,
-    DecodedOutput& output) {
+    DecodedOutput& output,
+    torch::Tensor& preAllocatedOutputTensor) {
   int streamIndex = rawOutput.streamIndex;
   AVFrame* frame = rawOutput.frame.get();
   auto& streamInfo = streams_[streamIndex];
   if (output.streamType == AVMEDIA_TYPE_VIDEO) {
     if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
-      int width = streamInfo.options.width.value_or(frame->width);
-      int height = streamInfo.options.height.value_or(frame->height);
-      torch::Tensor tensor = torch::empty(
-          {height, width, 3}, torch::TensorOptions().dtype({torch::kUInt8}));
+      torch::Tensor tensor;
+      if (preAllocatedOutputTensor.numel() != 0) {
+        // TODO: check shape of preAllocatedOutputTensor?
```
**Contributor:** I think we should TORCH_CHECK for height, width, shape, etc. here.
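For illustration, the kind of checks being suggested might look roughly like the helper below. This is a hypothetical sketch, not code from the PR, and it assumes the caller is expected to pass a uint8 HWC tensor of the decoder's output size:

```cpp
#include <torch/torch.h>

// Hypothetical helper sketching the suggested TORCH_CHECKs for the
// pre-allocated output tensor, assuming a (height, width, 3) uint8 layout.
void checkPreAllocatedTensor(
    const torch::Tensor& preAllocatedOutputTensor,
    int64_t height,
    int64_t width) {
  TORCH_CHECK(
      preAllocatedOutputTensor.dim() == 3 &&
          preAllocatedOutputTensor.size(0) == height &&
          preAllocatedOutputTensor.size(1) == width &&
          preAllocatedOutputTensor.size(2) == 3,
      "preAllocatedOutputTensor must have shape (height, width, 3)");
  TORCH_CHECK(
      preAllocatedOutputTensor.scalar_type() == torch::kUInt8,
      "preAllocatedOutputTensor must be a uint8 tensor");
}
```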
**Contributor (author):** I gave this a try thinking it would be a simple assert like `shape[-3:] == (H, W, 3)`, but it turns out it's not as simple. Some tensors come in as HWC while others come in as CHW, because of how the pre-allocated batched tensors are allocated. It made me realize that everything works, but it's pretty magical. I want to fix this as an immediate follow-up if that's OK. I gave it a try here, but it's not trivial and it might be preferable not to overcomplexify this PR.
**Contributor:** I'm in favor of @NicolasHug's suggestion. The logic he points out is legacy from way back when, and it wasn't necessarily thought through in terms of long-term maintenance and code health. Always doing it one way, and then permuting as needed on the way out, sounds easier and cleaner.
**Contributor:** Sounds good to me.
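As a rough sketch of the "always allocate one way, then permute on the way out" idea agreed on above (illustrative only; the sizes and the `wantNCHW` flag are made-up stand-ins for the decoder's dimension-order option, and none of this is the PR's code):

```cpp
#include <torch/torch.h>

int main() {
  const int64_t numFrames = 4, height = 270, width = 480;

  // Always allocate the batch as NHWC, so every per-frame slice handed to
  // the decoding step has the same HWC layout.
  auto frames = torch::empty({numFrames, height, width, 3}, torch::kUInt8);

  for (int64_t f = 0; f < numFrames; ++f) {
    auto slice = frames[f]; // HWC view, decoded into directly
    slice.fill_(0);         // stand-in for the actual decode / color conversion
  }

  // Permute once, on the way out, only if NCHW output was requested.
  const bool wantNCHW = true; // stand-in for a dimension-order option
  if (wantNCHW) {
    frames = frames.permute({0, 3, 1, 2}); // NHWC -> NCHW (a view, no copy)
  }
  return 0;
}
```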
```diff
+        tensor = preAllocatedOutputTensor;
+      } else {
+        int width = streamInfo.options.width.value_or(frame->width);
+        int height = streamInfo.options.height.value_or(frame->height);
+        tensor = torch::empty(
+            {height, width, 3}, torch::TensorOptions().dtype({torch::kUInt8}));
+      }
 
       rawOutput.data = tensor.data_ptr<uint8_t>();
       convertFrameToBufferUsingSwsScale(rawOutput);
```
```diff
@@ -945,7 +956,8 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameDisplayedAtTimestampNoDemux(
         return seconds >= frameStartTime && seconds < frameEndTime;
       });
   // Convert the frame to tensor.
-  return convertAVFrameToDecodedOutput(rawOutput);
+  auto preAllocatedOutputTensor = torch::empty({0});
+  return convertAVFrameToDecodedOutput(rawOutput, preAllocatedOutputTensor);
 }
 
 void VideoDecoder::validateUserProvidedStreamIndex(uint64_t streamIndex) {
```
```diff
@@ -980,7 +992,8 @@ void VideoDecoder::validateFrameIndex(
 
 VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
     int streamIndex,
-    int64_t frameIndex) {
+    int64_t frameIndex,
+    torch::Tensor& preAllocatedOutputTensor) {
   validateUserProvidedStreamIndex(streamIndex);
   validateScannedAllStreams("getFrameAtIndex");
```
```diff
@@ -989,7 +1002,7 @@ VideoDecoder::DecodedOutput VideoDecoder::getFrameAtIndex(
   int64_t pts = stream.allFrames[frameIndex].pts;
   setCursorPtsInSeconds(ptsToSeconds(pts, stream.timeBase));
-  return getNextDecodedOutputNoDemux();
+  return getNextDecodedOutputNoDemux(preAllocatedOutputTensor);
 }
 
 VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesAtIndices(
```
```diff
@@ -1061,8 +1074,9 @@ VideoDecoder::BatchDecodedOutput VideoDecoder::getFramesInRange(
   BatchDecodedOutput output(numOutputFrames, options, streamMetadata);
 
   for (int64_t i = start, f = 0; i < stop; i += step, ++f) {
-    DecodedOutput singleOut = getFrameAtIndex(streamIndex, i);
-    output.frames[f] = singleOut.frame;
+    auto preAllocatedOutputTensor = output.frames[f];
+    DecodedOutput singleOut =
+        getFrameAtIndex(streamIndex, i, preAllocatedOutputTensor);
     output.ptsSeconds[f] = singleOut.ptsSeconds;
     output.durationSeconds[f] = singleOut.durationSeconds;
   }
```
```diff
@@ -1154,8 +1168,9 @@ VideoDecoder::getFramesDisplayedByTimestampInRange(
   int64_t numFrames = stopFrameIndex - startFrameIndex;
   BatchDecodedOutput output(numFrames, options, streamMetadata);
   for (int64_t i = startFrameIndex, f = 0; i < stopFrameIndex; ++i, ++f) {
-    DecodedOutput singleOut = getFrameAtIndex(streamIndex, i);
-    output.frames[f] = singleOut.frame;
+    auto preAllocatedOutputTensor = output.frames[f];
+    DecodedOutput singleOut =
+        getFrameAtIndex(streamIndex, i, preAllocatedOutputTensor);
     output.ptsSeconds[f] = singleOut.ptsSeconds;
     output.durationSeconds[f] = singleOut.durationSeconds;
   }
```
```diff
@@ -1174,8 +1189,13 @@ VideoDecoder::RawDecodedOutput VideoDecoder::getNextRawDecodedOutputNoDemux() {
 }
 
 VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux() {
+  auto preAllocatedOutputTensor = torch::empty({0});
+  return VideoDecoder::getNextDecodedOutputNoDemux(preAllocatedOutputTensor);
+}
+VideoDecoder::DecodedOutput VideoDecoder::getNextDecodedOutputNoDemux(
+    torch::Tensor& preAllocatedOutputTensor) {
   auto rawOutput = getNextRawDecodedOutputNoDemux();
-  return convertAVFrameToDecodedOutput(rawOutput);
+  return convertAVFrameToDecodedOutput(rawOutput, preAllocatedOutputTensor);
 }
 
 void VideoDecoder::setCursorPtsInSeconds(double seconds) {
```
The corresponding declarations in the `VideoDecoder` class header are updated to match:
```diff
@@ -215,14 +215,19 @@ class VideoDecoder {
   // Decodes the frame where the current cursor position is. It also advances
   // the cursor to the next frame.
   DecodedOutput getNextDecodedOutputNoDemux();
+  DecodedOutput getNextDecodedOutputNoDemux(
+      torch::Tensor& preAllocatedOutputTensor);
 
   // Decodes the first frame in any added stream that is visible at a given
   // timestamp. Frames in the video have a presentation timestamp and a
   // duration. For example, if a frame has presentation timestamp of 5.0s and a
   // duration of 1.0s, it will be visible in the timestamp range [5.0, 6.0).
   // i.e. it will be returned when this function is called with seconds=5.0 or
   // seconds=5.999, etc.
   DecodedOutput getFrameDisplayedAtTimestampNoDemux(double seconds);
-  DecodedOutput getFrameAtIndex(int streamIndex, int64_t frameIndex);
+  DecodedOutput getFrameAtIndex(
+      int streamIndex,
+      int64_t frameIndex,
+      torch::Tensor& preAllocatedOutputTensor);
   struct BatchDecodedOutput {
     torch::Tensor frames;
     torch::Tensor ptsSeconds;
```
```diff
@@ -363,10 +368,13 @@
       int streamIndex,
       const AVFrame* frame);
   void convertFrameToBufferUsingSwsScale(RawDecodedOutput& rawOutput);
-  DecodedOutput convertAVFrameToDecodedOutput(RawDecodedOutput& rawOutput);
+  DecodedOutput convertAVFrameToDecodedOutput(
+      RawDecodedOutput& rawOutput,
+      torch::Tensor& preAllocatedOutputTensor);
   void convertAVFrameToDecodedOutputOnCPU(
       RawDecodedOutput& rawOutput,
-      DecodedOutput& output);
+      DecodedOutput& output,
+      torch::Tensor& preAllocatedOutputTensor);
 
   DecoderOptions options_;
   ContainerMetadata containerMetadata_;
```