Fix output surface un-mapping

NicolasHug · NicolasHug · commit d07283224ee9 · 2025-10-04T12:40:52.000+01:00
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
       " vs supported:",
       caps.nMaxMBCount);
 
-  // Decoder creation parameters, taken from DALI
+  // Decoder creation parameters, most are taken from DALI
   CUVIDDECODECREATEINFO decoderParams = {};
   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
   decoderParams.ChromaFormat = videoFormat->chroma_format;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   decoderParams.ulTargetWidth =
       videoFormat->display_area.right - videoFormat->display_area.left;
   decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
-  decoderParams.ulNumOutputSurfaces = 2;
+  // We should only ever need 1 output surface, since we process frames
+  // sequentially, and we always unmap the previous frame before mapping a new
+  // one.
+  // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
+  // benchmark to see if this makes any difference.
+  decoderParams.ulNumOutputSurfaces = 1;
   decoderParams.display_area.left = videoFormat->display_area.left;
   decoderParams.display_area.right = videoFormat->display_area.right;
   decoderParams.display_area.top = videoFormat->display_area.top;
@@ -173,6 +178,7 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
     // What happens to those decode surfaces that haven't yet been mapped is
     // unclear.
     flush();
+    unmapPreviousFrame();
     NVDECCache::getCache(device_.index())
         .returnDecoder(&videoFormat_, std::move(decoder_));
   }
@@ -430,6 +436,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
     // packets, or to stop if EOF was already sent.
     return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
   }
+
   CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
   readyFrames_.pop();
 
@@ -448,34 +455,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   // to "map" it to an "output surface" before we can use its data. This is a
   // blocking calls that waits until the frame is fully decoded and ready to be
   // used.
+  // When a frame is mapped to an output surface, it needs to be unmapped
+  // eventually, so that the decoder can re-use the output surface. Failing to
+  // unmap will cause map to eventually fail. DALI unmaps frames almost
+  // immediately  after mapping them: they do the color-conversion in-between,
+  // which involves a copy of the data, so that works.
+  // We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
+  // we unmap here: just before mapping a new frame. At that point we know that
+  // the previously-mapped frame is no longer needed: it was either
+  // color-converted (with a copy), or that's a frame that was discarded in
+  // SingleStreamDecoder. Either way, the underlying output surface can be
+  // safely re-used.
+  unmapPreviousFrame();
   CUresult result = cuvidMapVideoFrame(
       *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
-
   if (result != CUDA_SUCCESS) {
     return AVERROR_EXTERNAL;
   }
+  previouslyMappedFrame_ = framePtr;
 
   avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
 
-  // Unmap the frame so that the decoder can reuse its corresponding output
-  // surface. Whether this is blocking is unclear?
-  cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
-  // TODONVDEC P0: Get clarity on this:
-  // We assume that the framePtr is still valid after unmapping. That framePtr
-  // is now part of the avFrame, which we'll return to the caller, and the
-  // caller will immediately use it for color-conversion, at which point a copy
-  // happens. After the copy, it doesn't matter whether framePtr is still valid.
-  // And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
-  // the copy is made, so there should be no risk of overwriting the data before
-  // the copy.
-  // Buuuut yeah, we need get more clarity on what actually happens, and on
-  // what's needed. IIUC DALI makes the color-conversion copy immediately after
-  // cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
-  // in between. So maybe we should do the same.
-
   return AVSUCCESS;
 }
 
+void BetaCudaDeviceInterface::unmapPreviousFrame() {
+  if (previouslyMappedFrame_ == 0) {
+    return;
+  }
+  CUresult result =
+      cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
+  TORCH_CHECK(
+      result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
+  previouslyMappedFrame_ = 0;
+}
+
 UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
     CUdeviceptr framePtr,
     unsigned int pitch,
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.h b/src/torchcodec/_core/BetaCudaDeviceInterface.h
@@ -70,6 +70,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       const AVCodecParameters* codecPar,
       const UniqueDecodingAVFormatContext& avFormatCtx);
 
+  CUdeviceptr previouslyMappedFrame_ = 0;
+  void unmapPreviousFrame();
+
   UniqueAVFrame convertCudaFrameToAVFrame(
       CUdeviceptr framePtr,
       unsigned int pitch,