Skip to content

Commit d072832

Browse files
committed
Fix output surface un-mapping
1 parent 52a5347 commit d072832

File tree

2 files changed

+36
-19
lines changed

2 files changed

+36
-19
lines changed

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
108108
" vs supported:",
109109
caps.nMaxMBCount);
110110

111-
// Decoder creation parameters, taken from DALI
111+
// Decoder creation parameters, most are taken from DALI
112112
CUVIDDECODECREATEINFO decoderParams = {};
113113
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
114114
decoderParams.ChromaFormat = videoFormat->chroma_format;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
124124
decoderParams.ulTargetWidth =
125125
videoFormat->display_area.right - videoFormat->display_area.left;
126126
decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
127-
decoderParams.ulNumOutputSurfaces = 2;
127+
// We should only ever need 1 output surface, since we process frames
128+
// sequentially, and we always unmap the previous frame before mapping a new
129+
// one.
130+
// TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
131+
// benchmark to see if this makes any difference.
132+
decoderParams.ulNumOutputSurfaces = 1;
128133
decoderParams.display_area.left = videoFormat->display_area.left;
129134
decoderParams.display_area.right = videoFormat->display_area.right;
130135
decoderParams.display_area.top = videoFormat->display_area.top;
@@ -173,6 +178,7 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
173178
// What happens to those decode surfaces that haven't yet been mapped is
174179
// unclear.
175180
flush();
181+
unmapPreviousFrame();
176182
NVDECCache::getCache(device_.index())
177183
.returnDecoder(&videoFormat_, std::move(decoder_));
178184
}
@@ -430,6 +436,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
430436
// packets, or to stop if EOF was already sent.
431437
return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
432438
}
439+
433440
CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
434441
readyFrames_.pop();
435442

@@ -448,34 +455,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
448455
// to "map" it to an "output surface" before we can use its data. This is a
449456
// blocking calls that waits until the frame is fully decoded and ready to be
450457
// used.
458+
// When a frame is mapped to an output surface, it needs to be unmapped
459+
// eventually, so that the decoder can re-use the output surface. Failing to
460+
// unmap will cause map to eventually fail. DALI unmaps frames almost
461+
// immediately after mapping them: they do the color-conversion in-between,
462+
// which involves a copy of the data, so that works.
463+
// We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
464+
// we unmap here: just before mapping a new frame. At that point we know that
465+
// the previously-mapped frame is no longer needed: it was either
466+
// color-converted (with a copy), or that's a frame that was discarded in
467+
// SingleStreamDecoder. Either way, the underlying output surface can be
468+
// safely re-used.
469+
unmapPreviousFrame();
451470
CUresult result = cuvidMapVideoFrame(
452471
*decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
453-
454472
if (result != CUDA_SUCCESS) {
455473
return AVERROR_EXTERNAL;
456474
}
475+
previouslyMappedFrame_ = framePtr;
457476

458477
avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
459478

460-
// Unmap the frame so that the decoder can reuse its corresponding output
461-
// surface. Whether this is blocking is unclear?
462-
cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
463-
// TODONVDEC P0: Get clarity on this:
464-
// We assume that the framePtr is still valid after unmapping. That framePtr
465-
// is now part of the avFrame, which we'll return to the caller, and the
466-
// caller will immediately use it for color-conversion, at which point a copy
467-
// happens. After the copy, it doesn't matter whether framePtr is still valid.
468-
// And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
469-
// the copy is made, so there should be no risk of overwriting the data before
470-
// the copy.
471-
// Buuuut yeah, we need get more clarity on what actually happens, and on
472-
// what's needed. IIUC DALI makes the color-conversion copy immediately after
473-
// cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
474-
// in between. So maybe we should do the same.
475-
476479
return AVSUCCESS;
477480
}
478481

482+
void BetaCudaDeviceInterface::unmapPreviousFrame() {
483+
if (previouslyMappedFrame_ == 0) {
484+
return;
485+
}
486+
CUresult result =
487+
cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
488+
TORCH_CHECK(
489+
result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
490+
previouslyMappedFrame_ = 0;
491+
}
492+
479493
UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
480494
CUdeviceptr framePtr,
481495
unsigned int pitch,

src/torchcodec/_core/BetaCudaDeviceInterface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
7070
const AVCodecParameters* codecPar,
7171
const UniqueDecodingAVFormatContext& avFormatCtx);
7272

73+
CUdeviceptr previouslyMappedFrame_ = 0;
74+
void unmapPreviousFrame();
75+
7376
UniqueAVFrame convertCudaFrameToAVFrame(
7477
CUdeviceptr framePtr,
7578
unsigned int pitch,

0 commit comments

Comments
 (0)