Skip to content

Commit 3fa5271

Browse files
authored
BETA CUDA interface: Address P0 TODOs and cleanups (#928)
1 parent fbd16f6 commit 3fa5271

File tree

4 files changed

+72
-65
lines changed

4 files changed

+72
-65
lines changed

src/torchcodec/_core/BetaCudaDeviceInterface.cpp

Lines changed: 63 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
108108
" vs supported:",
109109
caps.nMaxMBCount);
110110

111-
// Decoder creation parameters, taken from DALI
111+
// Decoder creation parameters, most are taken from DALI
112112
CUVIDDECODECREATEINFO decoderParams = {};
113113
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
114114
decoderParams.ChromaFormat = videoFormat->chroma_format;
@@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
124124
decoderParams.ulTargetWidth =
125125
videoFormat->display_area.right - videoFormat->display_area.left;
126126
decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
127-
decoderParams.ulNumOutputSurfaces = 2;
127+
// We should only ever need 1 output surface, since we process frames
128+
// sequentially, and we always unmap the previous frame before mapping a new
129+
// one.
130+
// TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
131+
// benchmark to see if this makes any difference.
132+
decoderParams.ulNumOutputSurfaces = 1;
128133
decoderParams.display_area.left = videoFormat->display_area.left;
129134
decoderParams.display_area.right = videoFormat->display_area.right;
130135
decoderParams.display_area.top = videoFormat->display_area.top;
@@ -166,10 +171,14 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
166171
}
167172

168173
BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
169-
// TODONVDEC P0: we probably need to free the frames that have been decoded by
170-
// NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?
171-
172174
if (decoder_) {
175+
// DALI doesn't seem to do any particular cleanup of the decoder before
176+
// sending it to the cache, so we probably don't need to do anything either.
177+
// Just to be safe, we flush.
178+
// What happens to those decode surfaces that haven't yet been mapped is
179+
// unclear.
180+
flush();
181+
unmapPreviousFrame();
173182
NVDECCache::getCache(device_.index())
174183
.returnDecoder(&videoFormat_, std::move(decoder_));
175184
}
@@ -320,7 +329,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
320329
decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);
321330

322331
if (!decoder_) {
323-
// TODONVDEC P0: consider re-configuring an existing decoder instead of
332+
// TODONVDEC P2: consider re-configuring an existing decoder instead of
324333
// re-creating one. See docs, see DALI.
325334
decoder_ = createDecoder(videoFormat);
326335
}
@@ -341,13 +350,20 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
341350
packet.get() && packet->data && packet->size > 0,
342351
"sendPacket received an empty packet, this is unexpected, please report.");
343352

344-
applyBSF(packet);
353+
// Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
354+
// the original one if no BSF is needed. This new filtered packet must be
355+
// allocated outside of applyBSF: if it were allocated inside applyBSF, it
356+
// would be destroyed at the end of the function, leaving us with a dangling
357+
// reference.
358+
AutoAVPacket filteredAutoPacket;
359+
ReferenceAVPacket filteredPacket(filteredAutoPacket);
360+
ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);
345361

346362
CUVIDSOURCEDATAPACKET cuvidPacket = {};
347-
cuvidPacket.payload = packet->data;
348-
cuvidPacket.payload_size = packet->size;
363+
cuvidPacket.payload = packetToSend->data;
364+
cuvidPacket.payload_size = packetToSend->size;
349365
cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
350-
cuvidPacket.timestamp = packet->pts;
366+
cuvidPacket.timestamp = packetToSend->pts;
351367

352368
return sendCuvidPacket(cuvidPacket);
353369
}
@@ -366,9 +382,11 @@ int BetaCudaDeviceInterface::sendCuvidPacket(
366382
return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
367383
}
368384

369-
void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
385+
ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
386+
ReferenceAVPacket& packet,
387+
ReferenceAVPacket& filteredPacket) {
370388
if (!bitstreamFilter_) {
371-
return;
389+
return packet;
372390
}
373391

374392
int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
@@ -377,41 +395,26 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
377395
"Failed to send packet to bitstream filter: ",
378396
getFFMPEGErrorStringFromErrorCode(retVal));
379397

380-
// Create a temporary packet to receive the filtered data
381398
// TODO P1: the docs mention there can theoretically be multiple output
382399
// packets for a single input, i.e. we may need to call av_bsf_receive_packet
383400
// more than once. We should figure out whether that applies to the BSF we're
384401
// using.
385-
AutoAVPacket filteredAutoPacket;
386-
ReferenceAVPacket filteredPacket(filteredAutoPacket);
387402
retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
388403
TORCH_CHECK(
389404
retVal >= AVSUCCESS,
390405
"Failed to receive packet from bitstream filter: ",
391406
getFFMPEGErrorStringFromErrorCode(retVal));
392407

393-
// Free the original packet's data which isn't needed anymore, and move the
394-
// fields of the filtered packet into the original packet. The filtered packet
395-
// fields are re-set by av_packet_move_ref, so when it goes out of scope and
396-
// gets destructed, it's not going to affect the original packet.
397-
packet.reset(filteredPacket);
398-
// TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
399-
// applyBSF return a new packet, and maybe that new packet needs to be a field
400-
// on the interface to avoid complex lifetime issues.
408+
return filteredPacket;
401409
}
402410

403411
// Parser triggers this callback within cuvidParseVideoData when a frame is
404412
// ready to be decoded, i.e. the parser received all the necessary packets for a
405413
// given frame. It means we can send that frame to be decoded by the hardware
406414
// NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
407415
int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
408-
if (isFlushing_) {
409-
return 0;
410-
}
411-
412416
TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
413417
TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
414-
415418
// Send frame to be decoded by NVDEC - non-blocking call.
416419
CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
417420

@@ -432,6 +435,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
432435
// packets, or to stop if EOF was already sent.
433436
return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
434437
}
438+
435439
CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
436440
readyFrames_.pop();
437441

@@ -450,34 +454,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
450454
// to "map" it to an "output surface" before we can use its data. This is a
451455
// blocking calls that waits until the frame is fully decoded and ready to be
452456
// used.
457+
// When a frame is mapped to an output surface, it needs to be unmapped
458+
// eventually, so that the decoder can re-use the output surface. Failing to
459+
// unmap will cause map to eventually fail. DALI unmaps frames almost
460+
// immediately after mapping them: they do the color-conversion in-between,
461+
// which involves a copy of the data, so that works.
462+
// We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
463+
// we unmap here: just before mapping a new frame. At that point we know that
464+
// the previously-mapped frame is no longer needed: it was either
465+
// color-converted (with a copy), or that's a frame that was discarded in
466+
// SingleStreamDecoder. Either way, the underlying output surface can be
467+
// safely re-used.
468+
unmapPreviousFrame();
453469
CUresult result = cuvidMapVideoFrame(
454470
*decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
455-
456471
if (result != CUDA_SUCCESS) {
457472
return AVERROR_EXTERNAL;
458473
}
474+
previouslyMappedFrame_ = framePtr;
459475

460476
avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
461477

462-
// Unmap the frame so that the decoder can reuse its corresponding output
463-
// surface. Whether this is blocking is unclear?
464-
cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
465-
// TODONVDEC P0: Get clarity on this:
466-
// We assume that the framePtr is still valid after unmapping. That framePtr
467-
// is now part of the avFrame, which we'll return to the caller, and the
468-
// caller will immediately use it for color-conversion, at which point a copy
469-
// happens. After the copy, it doesn't matter whether framePtr is still valid.
470-
// And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
471-
// the copy is made, so there should be no risk of overwriting the data before
472-
// the copy.
473-
// Buuuut yeah, we need get more clarity on what actually happens, and on
474-
// what's needed. IIUC DALI makes the color-conversion copy immediately after
475-
// cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
476-
// in between. So maybe we should do the same.
477-
478478
return AVSUCCESS;
479479
}
480480

481+
void BetaCudaDeviceInterface::unmapPreviousFrame() {
482+
if (previouslyMappedFrame_ == 0) {
483+
return;
484+
}
485+
CUresult result =
486+
cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
487+
TORCH_CHECK(
488+
result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
489+
previouslyMappedFrame_ = 0;
490+
}
491+
481492
UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
482493
CUdeviceptr framePtr,
483494
unsigned int pitch,
@@ -554,16 +565,17 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
554565
}
555566

556567
void BetaCudaDeviceInterface::flush() {
557-
isFlushing_ = true;
558-
568+
// The NVCUVID docs mention that after seeking, i.e. when flush() is called,
569+
// we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
570+
// don't say whether this should be an empty packet, or whether it should be a
571+
// flag on the next non-empty packet. It doesn't matter: neither work :)
572+
// Sending an EOF packet, however, does work. So we do that. And we re-set the
573+
// eofSent_ flag to false because that's not a true EOF notification.
559574
sendEOFPacket();
560-
561-
isFlushing_ = false;
575+
eofSent_ = false;
562576

563577
std::queue<CUVIDPARSERDISPINFO> emptyQueue;
564578
std::swap(readyFrames_, emptyQueue);
565-
566-
eofSent_ = false;
567579
}
568580

569581
void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(

src/torchcodec/_core/BetaCudaDeviceInterface.h

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,18 @@ class BetaCudaDeviceInterface : public DeviceInterface {
6363

6464
private:
6565
int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);
66-
// Apply bitstream filter, modifies packet in-place
67-
void applyBSF(ReferenceAVPacket& packet);
66+
6867
void initializeBSF(
6968
const AVCodecParameters* codecPar,
7069
const UniqueDecodingAVFormatContext& avFormatCtx);
70+
// Apply bitstream filter, returns filtered packet or original if no filter
71+
// needed.
72+
ReferenceAVPacket& applyBSF(
73+
ReferenceAVPacket& packet,
74+
ReferenceAVPacket& filteredPacket);
75+
76+
CUdeviceptr previouslyMappedFrame_ = 0;
77+
void unmapPreviousFrame();
7178

7279
UniqueAVFrame convertCudaFrameToAVFrame(
7380
CUdeviceptr framePtr,
@@ -82,10 +89,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
8289

8390
bool eofSent_ = false;
8491

85-
// Flush flag to prevent decode operations during flush (like DALI's
86-
// isFlushing_)
87-
bool isFlushing_ = false;
88-
8992
AVRational timeBase_ = {0, 1};
9093
AVRational frameRateAvgFromFFmpeg_ = {0, 1};
9194

src/torchcodec/_core/FFMPEGCommon.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,6 @@ AVPacket* ReferenceAVPacket::operator->() {
3333
return avPacket_;
3434
}
3535

36-
void ReferenceAVPacket::reset(ReferenceAVPacket& other) {
37-
if (this != &other) {
38-
av_packet_unref(avPacket_);
39-
av_packet_move_ref(avPacket_, other.avPacket_);
40-
}
41-
}
42-
4336
AVCodecOnlyUseForCallingAVFindBestStream
4437
makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
4538
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)

src/torchcodec/_core/FFMPEGCommon.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,6 @@ class ReferenceAVPacket {
135135
~ReferenceAVPacket();
136136
AVPacket* get();
137137
AVPacket* operator->();
138-
void reset(ReferenceAVPacket& other);
139138
};
140139

141140
// av_find_best_stream is not const-correct before commit:

0 commit comments

Comments
 (0)