@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
108
108
" vs supported:" ,
109
109
caps.nMaxMBCount );
110
110
111
- // Decoder creation parameters, taken from DALI
111
+ // Decoder creation parameters, most are taken from DALI
112
112
CUVIDDECODECREATEINFO decoderParams = {};
113
113
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8 ;
114
114
decoderParams.ChromaFormat = videoFormat->chroma_format ;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
124
124
decoderParams.ulTargetWidth =
125
125
videoFormat->display_area .right - videoFormat->display_area .left ;
126
126
decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces ;
127
- decoderParams.ulNumOutputSurfaces = 2 ;
127
+ // We should only ever need 1 output surface, since we process frames
128
+ // sequentially, and we always unmap the previous frame before mapping a new
129
+ // one.
130
+ // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
131
+ // benchmark to see if this makes any difference.
132
+ decoderParams.ulNumOutputSurfaces = 1 ;
128
133
decoderParams.display_area .left = videoFormat->display_area .left ;
129
134
decoderParams.display_area .right = videoFormat->display_area .right ;
130
135
decoderParams.display_area .top = videoFormat->display_area .top ;
@@ -166,10 +171,14 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
166
171
}
167
172
168
173
BetaCudaDeviceInterface::~BetaCudaDeviceInterface () {
169
- // TODONVDEC P0: we probably need to free the frames that have been decoded by
170
- // NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?
171
-
172
174
if (decoder_) {
175
+ // DALI doesn't seem to do any particular cleanup of the decoder before
176
+ // sending it to the cache, so we probably don't need to do anything either.
177
+ // Just to be safe, we flush.
178
+ // What happens to those decode surfaces that haven't yet been mapped is
179
+ // unclear.
180
+ flush ();
181
+ unmapPreviousFrame ();
173
182
NVDECCache::getCache (device_.index ())
174
183
.returnDecoder (&videoFormat_, std::move (decoder_));
175
184
}
@@ -320,7 +329,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
320
329
decoder_ = NVDECCache::getCache (device_.index ()).getDecoder (videoFormat);
321
330
322
331
if (!decoder_) {
323
- // TODONVDEC P0 : consider re-configuring an existing decoder instead of
332
+ // TODONVDEC P2 : consider re-configuring an existing decoder instead of
324
333
// re-creating one. See docs, see DALI.
325
334
decoder_ = createDecoder (videoFormat);
326
335
}
@@ -341,13 +350,20 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
341
350
packet.get () && packet->data && packet->size > 0 ,
342
351
" sendPacket received an empty packet, this is unexpected, please report." );
343
352
344
- applyBSF (packet);
353
+ // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
354
+ // the original one if no BSF is needed. This new filtered packet must be
355
+ // allocated outside of applyBSF: if it were allocated inside applyBSF, it
356
+ // would be destroyed at the end of the function, leaving us with a dangling
357
+ // reference.
358
+ AutoAVPacket filteredAutoPacket;
359
+ ReferenceAVPacket filteredPacket (filteredAutoPacket);
360
+ ReferenceAVPacket& packetToSend = applyBSF (packet, filteredPacket);
345
361
346
362
CUVIDSOURCEDATAPACKET cuvidPacket = {};
347
- cuvidPacket.payload = packet ->data ;
348
- cuvidPacket.payload_size = packet ->size ;
363
+ cuvidPacket.payload = packetToSend ->data ;
364
+ cuvidPacket.payload_size = packetToSend ->size ;
349
365
cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
350
- cuvidPacket.timestamp = packet ->pts ;
366
+ cuvidPacket.timestamp = packetToSend ->pts ;
351
367
352
368
return sendCuvidPacket (cuvidPacket);
353
369
}
@@ -366,9 +382,11 @@ int BetaCudaDeviceInterface::sendCuvidPacket(
366
382
return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
367
383
}
368
384
369
- void BetaCudaDeviceInterface::applyBSF (ReferenceAVPacket& packet) {
385
+ ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF (
386
+ ReferenceAVPacket& packet,
387
+ ReferenceAVPacket& filteredPacket) {
370
388
if (!bitstreamFilter_) {
371
- return ;
389
+ return packet ;
372
390
}
373
391
374
392
int retVal = av_bsf_send_packet (bitstreamFilter_.get (), packet.get ());
@@ -377,41 +395,26 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
377
395
" Failed to send packet to bitstream filter: " ,
378
396
getFFMPEGErrorStringFromErrorCode (retVal));
379
397
380
- // Create a temporary packet to receive the filtered data
381
398
// TODO P1: the docs mention there can theoretically be multiple output
382
399
// packets for a single input, i.e. we may need to call av_bsf_receive_packet
383
400
// more than once. We should figure out whether that applies to the BSF we're
384
401
// using.
385
- AutoAVPacket filteredAutoPacket;
386
- ReferenceAVPacket filteredPacket (filteredAutoPacket);
387
402
retVal = av_bsf_receive_packet (bitstreamFilter_.get (), filteredPacket.get ());
388
403
TORCH_CHECK (
389
404
retVal >= AVSUCCESS,
390
405
" Failed to receive packet from bitstream filter: " ,
391
406
getFFMPEGErrorStringFromErrorCode (retVal));
392
407
393
- // Free the original packet's data which isn't needed anymore, and move the
394
- // fields of the filtered packet into the original packet. The filtered packet
395
- // fields are re-set by av_packet_move_ref, so when it goes out of scope and
396
- // gets destructed, it's not going to affect the original packet.
397
- packet.reset (filteredPacket);
398
- // TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
399
- // applyBSF return a new packet, and maybe that new packet needs to be a field
400
- // on the interface to avoid complex lifetime issues.
408
+ return filteredPacket;
401
409
}
402
410
403
411
// Parser triggers this callback within cuvidParseVideoData when a frame is
404
412
// ready to be decoded, i.e. the parser received all the necessary packets for a
405
413
// given frame. It means we can send that frame to be decoded by the hardware
406
414
// NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
407
415
int BetaCudaDeviceInterface::frameReadyForDecoding (CUVIDPICPARAMS* picParams) {
408
- if (isFlushing_) {
409
- return 0 ;
410
- }
411
-
412
416
TORCH_CHECK (picParams != nullptr , " Invalid picture parameters" );
413
417
TORCH_CHECK (decoder_, " Decoder not initialized before picture decode" );
414
-
415
418
// Send frame to be decoded by NVDEC - non-blocking call.
416
419
CUresult result = cuvidDecodePicture (*decoder_.get (), picParams);
417
420
@@ -432,6 +435,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
432
435
// packets, or to stop if EOF was already sent.
433
436
return eofSent_ ? AVERROR_EOF : AVERROR (EAGAIN);
434
437
}
438
+
435
439
CUVIDPARSERDISPINFO dispInfo = readyFrames_.front ();
436
440
readyFrames_.pop ();
437
441
@@ -450,34 +454,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
450
454
// to "map" it to an "output surface" before we can use its data. This is a
451
455
// blocking calls that waits until the frame is fully decoded and ready to be
452
456
// used.
457
+ // When a frame is mapped to an output surface, it needs to be unmapped
458
+ // eventually, so that the decoder can re-use the output surface. Failing to
459
+ // unmap will cause map to eventually fail. DALI unmaps frames almost
460
+ // immediately after mapping them: they do the color-conversion in-between,
461
+ // which involves a copy of the data, so that works.
462
+ // We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
463
+ // we unmap here: just before mapping a new frame. At that point we know that
464
+ // the previously-mapped frame is no longer needed: it was either
465
+ // color-converted (with a copy), or that's a frame that was discarded in
466
+ // SingleStreamDecoder. Either way, the underlying output surface can be
467
+ // safely re-used.
468
+ unmapPreviousFrame ();
453
469
CUresult result = cuvidMapVideoFrame (
454
470
*decoder_.get (), dispInfo.picture_index , &framePtr, &pitch, &procParams);
455
-
456
471
if (result != CUDA_SUCCESS) {
457
472
return AVERROR_EXTERNAL;
458
473
}
474
+ previouslyMappedFrame_ = framePtr;
459
475
460
476
avFrame = convertCudaFrameToAVFrame (framePtr, pitch, dispInfo);
461
477
462
- // Unmap the frame so that the decoder can reuse its corresponding output
463
- // surface. Whether this is blocking is unclear?
464
- cuvidUnmapVideoFrame (*decoder_.get (), framePtr);
465
- // TODONVDEC P0: Get clarity on this:
466
- // We assume that the framePtr is still valid after unmapping. That framePtr
467
- // is now part of the avFrame, which we'll return to the caller, and the
468
- // caller will immediately use it for color-conversion, at which point a copy
469
- // happens. After the copy, it doesn't matter whether framePtr is still valid.
470
- // And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
471
- // the copy is made, so there should be no risk of overwriting the data before
472
- // the copy.
473
- // Buuuut yeah, we need get more clarity on what actually happens, and on
474
- // what's needed. IIUC DALI makes the color-conversion copy immediately after
475
- // cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
476
- // in between. So maybe we should do the same.
477
-
478
478
return AVSUCCESS;
479
479
}
480
480
481
+ void BetaCudaDeviceInterface::unmapPreviousFrame () {
482
+ if (previouslyMappedFrame_ == 0 ) {
483
+ return ;
484
+ }
485
+ CUresult result =
486
+ cuvidUnmapVideoFrame (*decoder_.get (), previouslyMappedFrame_);
487
+ TORCH_CHECK (
488
+ result == CUDA_SUCCESS, " Failed to unmap previous frame: " , result);
489
+ previouslyMappedFrame_ = 0 ;
490
+ }
491
+
481
492
UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame (
482
493
CUdeviceptr framePtr,
483
494
unsigned int pitch,
@@ -554,16 +565,17 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
554
565
}
555
566
556
567
void BetaCudaDeviceInterface::flush () {
557
- isFlushing_ = true ;
558
-
568
+ // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
569
+ // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
570
+ // don't say whether this should be an empty packet, or whether it should be a
571
+ // flag on the next non-empty packet. It doesn't matter: neither work :)
572
+ // Sending an EOF packet, however, does work. So we do that. And we re-set the
573
+ // eofSent_ flag to false because that's not a true EOF notification.
559
574
sendEOFPacket ();
560
-
561
- isFlushing_ = false ;
575
+ eofSent_ = false ;
562
576
563
577
std::queue<CUVIDPARSERDISPINFO> emptyQueue;
564
578
std::swap (readyFrames_, emptyQueue);
565
-
566
- eofSent_ = false ;
567
579
}
568
580
569
581
void BetaCudaDeviceInterface::convertAVFrameToFrameOutput (
0 commit comments