@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
108108 " vs supported:" ,
109109 caps.nMaxMBCount );
110110
111- // Decoder creation parameters, taken from DALI
111+ // Decoder creation parameters, most are taken from DALI
112112 CUVIDDECODECREATEINFO decoderParams = {};
113113 decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8 ;
114114 decoderParams.ChromaFormat = videoFormat->chroma_format ;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
124124 decoderParams.ulTargetWidth =
125125 videoFormat->display_area .right - videoFormat->display_area .left ;
126126 decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces ;
127- decoderParams.ulNumOutputSurfaces = 2 ;
127+ // We should only ever need 1 output surface, since we process frames
128+ // sequentially, and we always unmap the previous frame before mapping a new
129+ // one.
130+ // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
131+ // benchmark to see if this makes any difference.
132+ decoderParams.ulNumOutputSurfaces = 1 ;
128133 decoderParams.display_area .left = videoFormat->display_area .left ;
129134 decoderParams.display_area .right = videoFormat->display_area .right ;
130135 decoderParams.display_area .top = videoFormat->display_area .top ;
@@ -173,6 +178,7 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
173178 // What happens to those decode surfaces that haven't yet been mapped is
174179 // unclear.
175180 flush ();
181+ unmapPreviousFrame ();
176182 NVDECCache::getCache (device_.index ())
177183 .returnDecoder (&videoFormat_, std::move (decoder_));
178184 }
@@ -430,6 +436,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
430436 // packets, or to stop if EOF was already sent.
431437 return eofSent_ ? AVERROR_EOF : AVERROR (EAGAIN);
432438 }
439+
433440 CUVIDPARSERDISPINFO dispInfo = readyFrames_.front ();
434441 readyFrames_.pop ();
435442
@@ -448,34 +455,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
448455 // to "map" it to an "output surface" before we can use its data. This is a
449456 // blocking calls that waits until the frame is fully decoded and ready to be
450457 // used.
458+ // When a frame is mapped to an output surface, it needs to be unmapped
459+ // eventually, so that the decoder can re-use the output surface. Failing to
460+ // unmap will cause map to eventually fail. DALI unmaps frames almost
461+ // immediately after mapping them: they do the color-conversion in-between,
462+ // which involves a copy of the data, so that works.
463+ // We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
464+ // we unmap here: just before mapping a new frame. At that point we know that
465+ // the previously-mapped frame is no longer needed: it was either
466+ // color-converted (with a copy), or that's a frame that was discarded in
467+ // SingleStreamDecoder. Either way, the underlying output surface can be
468+ // safely re-used.
469+ unmapPreviousFrame ();
451470 CUresult result = cuvidMapVideoFrame (
452471 *decoder_.get (), dispInfo.picture_index , &framePtr, &pitch, &procParams);
453-
454472 if (result != CUDA_SUCCESS) {
455473 return AVERROR_EXTERNAL;
456474 }
475+ previouslyMappedFrame_ = framePtr;
457476
458477 avFrame = convertCudaFrameToAVFrame (framePtr, pitch, dispInfo);
459478
460- // Unmap the frame so that the decoder can reuse its corresponding output
461- // surface. Whether this is blocking is unclear?
462- cuvidUnmapVideoFrame (*decoder_.get (), framePtr);
463- // TODONVDEC P0: Get clarity on this:
464- // We assume that the framePtr is still valid after unmapping. That framePtr
465- // is now part of the avFrame, which we'll return to the caller, and the
466- // caller will immediately use it for color-conversion, at which point a copy
467- // happens. After the copy, it doesn't matter whether framePtr is still valid.
468- // And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
469- // the copy is made, so there should be no risk of overwriting the data before
470- // the copy.
471- // Buuuut yeah, we need get more clarity on what actually happens, and on
472- // what's needed. IIUC DALI makes the color-conversion copy immediately after
473- // cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
474- // in between. So maybe we should do the same.
475-
476479 return AVSUCCESS;
477480}
478481
482+ void BetaCudaDeviceInterface::unmapPreviousFrame () {
483+ if (previouslyMappedFrame_ == 0 ) {
484+ return ;
485+ }
486+ CUresult result =
487+ cuvidUnmapVideoFrame (*decoder_.get (), previouslyMappedFrame_);
488+ TORCH_CHECK (
489+ result == CUDA_SUCCESS, " Failed to unmap previous frame: " , result);
490+ previouslyMappedFrame_ = 0 ;
491+ }
492+
479493UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame (
480494 CUdeviceptr framePtr,
481495 unsigned int pitch,
0 commit comments