Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
78ab058
Let's just commit 3k loc in a single commit
NicolasHug Sep 25, 2025
b45decc
Fixes
NicolasHug Sep 26, 2025
316f218
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Sep 30, 2025
d0192ec
GetCache -> getCache
NicolasHug Sep 30, 2025
515deb5
Make UniqueCUvideodecoder a pointer on CUvideodecoder, not void
NicolasHug Sep 30, 2025
13fad10
Make device and device_variant have a default instead of being std::o…
NicolasHug Sep 30, 2025
eb8de72
Remove old registerDeviceInterface
NicolasHug Sep 30, 2025
4f7a4fb
Call std::memset
NicolasHug Sep 30, 2025
dcf3124
remove unnecessary cuda_runtime.h include, update cmake accordingly
NicolasHug Sep 30, 2025
0ad7370
abstract frameBuffer_ into a FrameBuffer class
NicolasHug Sep 30, 2025
aad142e
Cleanup BSF logic
NicolasHug Sep 30, 2025
2592888
Return int in callback instead of unsigned char
NicolasHug Sep 30, 2025
b5fe9bc
define width and height as unsigned int
NicolasHug Sep 30, 2025
5605c90
Rework frame ordering and pts matching
NicolasHug Oct 1, 2025
7494259
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 1, 2025
560b376
Fix cuda context initialization
NicolasHug Oct 1, 2025
88196c5
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 1, 2025
2a78b84
Renaming
NicolasHug Oct 1, 2025
5d194e5
Comment
NicolasHug Oct 1, 2025
d1e51b3
Merge branch 'main' of github.com:pytorch/torchcodec into aeaenjfjanef
NicolasHug Oct 2, 2025
f9c7297
Skip equality check on ffmepg 4
NicolasHug Oct 2, 2025
b7bbfb2
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 2, 2025
390fd7c
Refac, simplify
NicolasHug Oct 2, 2025
f55dcc0
Update comment
NicolasHug Oct 2, 2025
7e4dd10
Define constant, add TODO for AVRational
NicolasHug Oct 2, 2025
f614846
Use uint32_t types
NicolasHug Oct 2, 2025
aa6e253
Create packet.reset() and add P0 TODO
NicolasHug Oct 2, 2025
186eaa4
Add TODO
NicolasHug Oct 2, 2025
1cb4890
Merge branch 'aeaenjfjanef' into nvdec-rework-frame-ordering
NicolasHug Oct 2, 2025
c5b32a4
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-rewor…
NicolasHug Oct 2, 2025
70873bf
lint
NicolasHug Oct 2, 2025
12c75e7
Add h265 support
NicolasHug Oct 2, 2025
7ea3ca9
Add h265 support
NicolasHug Oct 2, 2025
8ad66ce
Add AV1 support
NicolasHug Oct 3, 2025
f8f0402
Add BETA CUDA interface to built-in tests
NicolasHug Oct 3, 2025
bc55810
Refactor EOF packet logic
NicolasHug Oct 3, 2025
121a038
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-tests
NicolasHug Oct 4, 2025
204970e
Fix merge?
NicolasHug Oct 4, 2025
993d510
Merge branch 'nvdec-tests' into nvdec-send-eof
NicolasHug Oct 4, 2025
52a5347
Flushing cleanups, add comments
NicolasHug Oct 4, 2025
d072832
Fix output surface un-mapping
NicolasHug Oct 4, 2025
be8a481
Address BSF uglyness - kinda
NicolasHug Oct 4, 2025
cd1aa1e
Merge branch 'main' of github.com:pytorch/torchcodec into nvdec-cleanups
NicolasHug Oct 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 63 additions & 51 deletions src/torchcodec/_core/BetaCudaDeviceInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
" vs supported:",
caps.nMaxMBCount);

// Decoder creation parameters, taken from DALI
// Decoder creation parameters, most are taken from DALI
CUVIDDECODECREATEINFO decoderParams = {};
decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
decoderParams.ChromaFormat = videoFormat->chroma_format;
Expand All @@ -124,7 +124,12 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
decoderParams.ulTargetWidth =
videoFormat->display_area.right - videoFormat->display_area.left;
decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
decoderParams.ulNumOutputSurfaces = 2;
// We should only ever need 1 output surface, since we process frames
// sequentially, and we always unmap the previous frame before mapping a new
// one.
// TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
// benchmark to see if this makes any difference.
decoderParams.ulNumOutputSurfaces = 1;
decoderParams.display_area.left = videoFormat->display_area.left;
decoderParams.display_area.right = videoFormat->display_area.right;
decoderParams.display_area.top = videoFormat->display_area.top;
Expand Down Expand Up @@ -166,10 +171,14 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
}

BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
// TODONVDEC P0: we probably need to free the frames that have been decoded by
// NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?

if (decoder_) {
// DALI doesn't seem to do any particular cleanup of the decoder before
// sending it to the cache, so we probably don't need to do anything either.
// Just to be safe, we flush.
// What happens to those decode surfaces that haven't yet been mapped is
// unclear.
flush();
unmapPreviousFrame();
NVDECCache::getCache(device_.index())
.returnDecoder(&videoFormat_, std::move(decoder_));
}
Expand Down Expand Up @@ -320,7 +329,7 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);

if (!decoder_) {
// TODONVDEC P0: consider re-configuring an existing decoder instead of
// TODONVDEC P2: consider re-configuring an existing decoder instead of
// re-creating one. See docs, see DALI.
decoder_ = createDecoder(videoFormat);
}
Expand All @@ -341,13 +350,20 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
packet.get() && packet->data && packet->size > 0,
"sendPacket received an empty packet, this is unexpected, please report.");

applyBSF(packet);
// Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
// the original one if no BSF is needed. This new filtered packet must be
// allocated outside of applyBSF: if it were allocated inside applyBSF, it
// would be destroyed at the end of the function, leaving us with a dangling
// reference.
AutoAVPacket filteredAutoPacket;
ReferenceAVPacket filteredPacket(filteredAutoPacket);
ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);

CUVIDSOURCEDATAPACKET cuvidPacket = {};
cuvidPacket.payload = packet->data;
cuvidPacket.payload_size = packet->size;
cuvidPacket.payload = packetToSend->data;
cuvidPacket.payload_size = packetToSend->size;
cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
cuvidPacket.timestamp = packet->pts;
cuvidPacket.timestamp = packetToSend->pts;

return sendCuvidPacket(cuvidPacket);
}
Expand All @@ -366,9 +382,11 @@ int BetaCudaDeviceInterface::sendCuvidPacket(
return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
}

void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
ReferenceAVPacket& packet,
ReferenceAVPacket& filteredPacket) {
if (!bitstreamFilter_) {
return;
return packet;
}

int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
Expand All @@ -377,41 +395,26 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
"Failed to send packet to bitstream filter: ",
getFFMPEGErrorStringFromErrorCode(retVal));

// Create a temporary packet to receive the filtered data
// TODO P1: the docs mention there can theoretically be multiple output
// packets for a single input, i.e. we may need to call av_bsf_receive_packet
// more than once. We should figure out whether that applies to the BSF we're
// using.
AutoAVPacket filteredAutoPacket;
ReferenceAVPacket filteredPacket(filteredAutoPacket);
retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
TORCH_CHECK(
retVal >= AVSUCCESS,
"Failed to receive packet from bitstream filter: ",
getFFMPEGErrorStringFromErrorCode(retVal));

// Free the original packet's data which isn't needed anymore, and move the
// fields of the filtered packet into the original packet. The filtered packet
// fields are re-set by av_packet_move_ref, so when it goes out of scope and
// gets destructed, it's not going to affect the original packet.
packet.reset(filteredPacket);
// TODONVDEC P0: consider cleaner ways to do this. Maybe we should let
// applyBSF return a new packet, and maybe that new packet needs to be a field
// on the interface to avoid complex lifetime issues.
return filteredPacket;
}

// Parser triggers this callback within cuvidParseVideoData when a frame is
// ready to be decoded, i.e. the parser received all the necessary packets for a
// given frame. It means we can send that frame to be decoded by the hardware
// NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
if (isFlushing_) {
return 0;
}

TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");

// Send frame to be decoded by NVDEC - non-blocking call.
CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);

Expand All @@ -432,6 +435,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
// packets, or to stop if EOF was already sent.
return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
}

CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
readyFrames_.pop();

Expand All @@ -450,34 +454,41 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
// to "map" it to an "output surface" before we can use its data. This is a
// blocking call that waits until the frame is fully decoded and ready to be
// used.
// When a frame is mapped to an output surface, it needs to be unmapped
// eventually, so that the decoder can re-use the output surface. Failing to
// unmap will cause map to eventually fail. DALI unmaps frames almost
// immediately after mapping them: they do the color-conversion in-between,
// which involves a copy of the data, so that works.
// We, OTOH, will do the color-conversion later, outside of receiveFrame(). So
// we unmap here: just before mapping a new frame. At that point we know that
// the previously-mapped frame is no longer needed: it was either
// color-converted (with a copy), or that's a frame that was discarded in
// SingleStreamDecoder. Either way, the underlying output surface can be
// safely re-used.
unmapPreviousFrame();
CUresult result = cuvidMapVideoFrame(
*decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);

if (result != CUDA_SUCCESS) {
return AVERROR_EXTERNAL;
}
previouslyMappedFrame_ = framePtr;

avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);

// Unmap the frame so that the decoder can reuse its corresponding output
// surface. Whether this is blocking is unclear?
cuvidUnmapVideoFrame(*decoder_.get(), framePtr);
// TODONVDEC P0: Get clarity on this:
// We assume that the framePtr is still valid after unmapping. That framePtr
// is now part of the avFrame, which we'll return to the caller, and the
// caller will immediately use it for color-conversion, at which point a copy
// happens. After the copy, it doesn't matter whether framePtr is still valid.
// And we'll return to this function (and to cuvidUnmapVideoFrame()) *after*
// the copy is made, so there should be no risk of overwriting the data before
// the copy.
// But yeah, we need to get more clarity on what actually happens, and on
// what's needed. IIUC DALI makes the color-conversion copy immediately after
// cuvidMapVideoFrame() and *before* cuvidUnmapVideoFrame() with a synchronize
// in between. So maybe we should do the same.

return AVSUCCESS;
}

void BetaCudaDeviceInterface::unmapPreviousFrame() {
if (previouslyMappedFrame_ == 0) {
return;
}
CUresult result =
cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
TORCH_CHECK(
result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
previouslyMappedFrame_ = 0;
}

UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
CUdeviceptr framePtr,
unsigned int pitch,
Expand Down Expand Up @@ -554,16 +565,17 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
}

void BetaCudaDeviceInterface::flush() {
isFlushing_ = true;

// The NVCUVID docs mention that after seeking, i.e. when flush() is called,
// we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
// don't say whether this should be an empty packet, or whether it should be a
// flag on the next non-empty packet. It doesn't matter: neither works :)
// Sending an EOF packet, however, does work. So we do that. And we re-set the
// eofSent_ flag to false because that's not a true EOF notification.
sendEOFPacket();

isFlushing_ = false;
eofSent_ = false;

std::queue<CUVIDPARSERDISPINFO> emptyQueue;
std::swap(readyFrames_, emptyQueue);

eofSent_ = false;
}

void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
Expand Down
15 changes: 9 additions & 6 deletions src/torchcodec/_core/BetaCudaDeviceInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,18 @@ class BetaCudaDeviceInterface : public DeviceInterface {

private:
int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);
// Apply bitstream filter, modifies packet in-place
void applyBSF(ReferenceAVPacket& packet);

void initializeBSF(
const AVCodecParameters* codecPar,
const UniqueDecodingAVFormatContext& avFormatCtx);
// Apply bitstream filter, returns filtered packet or original if no filter
// needed.
ReferenceAVPacket& applyBSF(
ReferenceAVPacket& packet,
ReferenceAVPacket& filteredPacket);

CUdeviceptr previouslyMappedFrame_ = 0;
void unmapPreviousFrame();

UniqueAVFrame convertCudaFrameToAVFrame(
CUdeviceptr framePtr,
Expand All @@ -82,10 +89,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {

bool eofSent_ = false;

// Flush flag to prevent decode operations during flush (like DALI's
// isFlushing_)
bool isFlushing_ = false;

AVRational timeBase_ = {0, 1};
AVRational frameRateAvgFromFFmpeg_ = {0, 1};

Expand Down
7 changes: 0 additions & 7 deletions src/torchcodec/_core/FFMPEGCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,6 @@ AVPacket* ReferenceAVPacket::operator->() {
return avPacket_;
}

// Replaces the contents of the packet referenced by this wrapper with the
// contents of the packet referenced by `other`.
//
// The data currently held by this packet is released with av_packet_unref(),
// then every field of `other`'s packet is moved over with
// av_packet_move_ref(), which also resets `other`'s packet to a blank state.
// After the call, `other` no longer holds the payload.
//
// This is a no-op when both wrappers refer to the same underlying AVPacket.
void ReferenceAVPacket::reset(ReferenceAVPacket& other) {
  // Guard on the underlying AVPacket, not on the wrapper's address: two
  // distinct wrappers may point at the same packet, and un-ref'ing it before
  // moving it onto itself would silently drop the payload. This also covers
  // the plain self-reset case (this == &other).
  if (avPacket_ == other.avPacket_) {
    return;
  }
  av_packet_unref(avPacket_);
  av_packet_move_ref(avPacket_, other.avPacket_);
}

AVCodecOnlyUseForCallingAVFindBestStream
makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
Expand Down
1 change: 0 additions & 1 deletion src/torchcodec/_core/FFMPEGCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ class ReferenceAVPacket {
~ReferenceAVPacket();
AVPacket* get();
AVPacket* operator->();
void reset(ReferenceAVPacket& other);
};

// av_find_best_stream is not const-correct before commit:
Expand Down
Loading