meta-pytorch · NicolasHug · Oct 4, 2025 · Sep 25, 2025 · Sep 26, 2025 · Sep 30, 2025
diff --git a/src/torchcodec/_core/BetaCudaDeviceInterface.cpp b/src/torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -41,10 +41,17 @@ pfnSequenceCallback(void* pUserData, CUVIDEOFORMAT* videoFormat) {
 }
 
 static int CUDAAPI
-pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* pPicParams) {
+pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* picParams) {
   BetaCudaDeviceInterface* decoder =
       static_cast<BetaCudaDeviceInterface*>(pUserData);
-  return decoder->frameReadyForDecoding(pPicParams);
+  return decoder->frameReadyForDecoding(picParams);
+}
+
+static int CUDAAPI
+pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
+  // pUserData is the interface instance registered as parserParams.pUserData.
+  auto* self = static_cast<BetaCudaDeviceInterface*>(pUserData);
+  return self->frameReadyInDisplayOrder(dispInfo);
 }
 
 static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
@@ -131,6 +138,24 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
 }
 
+// Translates an FFmpeg codec id into the matching NVDEC codec enum. Any codec
+// without a mapping below is rejected through TORCH_CHECK.
+cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
+  if (codecId == AV_CODEC_ID_H264) {
+    return cudaVideoCodec_H264;
+  }
+  if (codecId == AV_CODEC_ID_HEVC) {
+    return cudaVideoCodec_HEVC;
+  }
+  // TODONVDEC P0: support more codecs
+  // AV_CODEC_ID_AV1 -> cudaVideoCodec_AV1
+  // AV_CODEC_ID_MPEG4 -> cudaVideoCodec_MPEG4
+  // AV_CODEC_ID_VP8 -> cudaVideoCodec_VP8
+  // AV_CODEC_ID_VP9 -> cudaVideoCodec_VP9
+  // AV_CODEC_ID_MJPEG -> cudaVideoCodec_JPEG
+  TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
+}
+
 } // namespace
 
 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -142,7 +167,7 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
 
 BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   // TODONVDEC P0: we probably need to free the frames that have been decoded by
-  // NVDEC but not yet "mapped" - i.e. those that are still in frameBuffer_?
+  // NVDEC but not yet "mapped" - i.e. those that are still in readyFrames_?
 
   if (decoder_) {
     NVDECCache::getCache(device_.index())
@@ -156,29 +181,62 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   }
 }
 
-void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
-  torch::Tensor dummyTensorForCudaInitialization = torch::empty(
-      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
+void BetaCudaDeviceInterface::initializeBSF(
+    const AVCodecParameters* codecPar,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
+  // Setup bit stream filters (BSF):
+  // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
+  // This is only needed for some formats, like H264 or HEVC.
 
-  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
-  timeBase_ = avStream->time_base;
+  TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
+  TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
+  TORCH_CHECK(
+      avFormatCtx->iformat != nullptr,
+      "AVFormatContext->iformat cannot be null");
+  std::string filterName;
+
+  // Matching logic is taken from DALI
+  switch (codecPar->codec_id) {
+    case AV_CODEC_ID_H264: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
+
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
+        filterName = "h264_mp4toannexb";
+      }
+      break;
+    }
 
-  const AVCodecParameters* codecpar = avStream->codecpar;
-  TORCH_CHECK(codecpar != nullptr, "CodecParameters cannot be null");
+    case AV_CODEC_ID_HEVC: {
+      const std::string formatName = avFormatCtx->iformat->long_name
+          ? avFormatCtx->iformat->long_name
+          : "";
 
-  TORCH_CHECK(
-      // TODONVDEC P0 support more
-      avStream->codecpar->codec_id == AV_CODEC_ID_H264,
-      "Can only do H264 for now");
+      if (formatName == "QuickTime / MOV" ||
+          formatName == "FLV (Flash Video)" ||
+          formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
+        filterName = "hevc_mp4toannexb";
+      }
+      break;
+    }
 
-  // Setup bit stream filters (BSF):
-  // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
-  // This is only needed for some formats, like H264 or HEVC.  TODONVDEC P1: For
-  // now we apply BSF unconditionally, but it should be optional  and dependent
-  // on codec and container.
-  const AVBitStreamFilter* avBSF = av_bsf_get_by_name("h264_mp4toannexb");
+    default:
+      // No bitstream filter needed for other codecs
+      // TODONVDEC P1 MPEG4 will need one!
+      break;
+  }
+
+  if (filterName.empty()) {
+    // Only initialize BSF if we actually need one
+    return;
+  }
+
+  const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
   TORCH_CHECK(
-      avBSF != nullptr, "Failed to find h264_mp4toannexb bitstream filter");
+      avBSF != nullptr, "Failed to find bitstream filter: ", filterName);
 
   AVBSFContext* avBSFContext = nullptr;
   int retVal = av_bsf_alloc(avBSF, &avBSFContext);
@@ -189,7 +247,7 @@ void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
 
   bitstreamFilter_.reset(avBSFContext);
 
-  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecpar);
+  retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
   TORCH_CHECK(
       retVal >= AVSUCCESS,
       "Failed to copy codec parameters: ",
@@ -200,18 +258,33 @@ void BetaCudaDeviceInterface::initializeInterface(AVStream* avStream) {
       retVal == AVSUCCESS,
       "Failed to initialize bitstream filter: ",
       getFFMPEGErrorStringFromErrorCode(retVal));
+}
+
+void BetaCudaDeviceInterface::initializeInterface(
+    const AVStream* avStream,
+    const UniqueDecodingAVFormatContext& avFormatCtx) {
+  torch::Tensor dummyTensorForCudaInitialization = torch::empty(
+      {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
+
+  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
+  timeBase_ = avStream->time_base;
+
+  const AVCodecParameters* codecPar = avStream->codecpar;
+  TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
+
+  initializeBSF(codecPar, avFormatCtx);
 
   // Create parser. Default values that aren't obvious are taken from DALI.
   CUVIDPARSERPARAMS parserParams = {};
-  parserParams.CodecType = cudaVideoCodec_H264;
+  parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
   parserParams.ulMaxNumDecodeSurfaces = 8;
   parserParams.ulMaxDisplayDelay = 0;
   // Callback setup, all are triggered by the parser within a call
   // to cuvidParseVideoData
   parserParams.pUserData = this;
   parserParams.pfnSequenceCallback = pfnSequenceCallback;
   parserParams.pfnDecodePicture = pfnDecodePictureCallback;
-  parserParams.pfnDisplayPicture = nullptr;
+  parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
 
   CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
   TORCH_CHECK(
@@ -267,10 +340,6 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
     cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
     cuvidPacket.timestamp = packet->pts;
 
-    // Like DALI: store packet PTS in queue to later assign to frames as they
-    // come out
-    packetsPtsQueue.push(packet->pts);
-
   } else {
     // End of stream packet
     cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
@@ -322,70 +391,38 @@ void BetaCudaDeviceInterface::applyBSF(ReferenceAVPacket& packet) {
 // ready to be decoded, i.e. the parser received all the necessary packets for a
 // given frame. It means we can send that frame to be decoded by the hardware
 // NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
-int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* pPicParams) {
+int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
   if (isFlushing_) {
     return 0;
   }
 
-  TORCH_CHECK(pPicParams != nullptr, "Invalid picture parameters");
+  TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
   TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
 
   // Send frame to be decoded by NVDEC - non-blocking call.
-  CUresult result = cuvidDecodePicture(*decoder_.get(), pPicParams);
-  if (result != CUDA_SUCCESS) {
-    return 0; // Yes, you're reading that right, 0 mean error.
-  }
+  CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
 
-  // The frame was sent to be decoded on the NVDEC hardware. Now we store some
-  // relevant info into our frame buffer so that we can retrieve the decoded
-  // frame later when receiveFrame() is called.
-  // Importantly we need to 'guess' the PTS of that frame. The heuristic we use
-  // (like in DALI) is that the frames are ready to be decoded in the same order
-  // as the packets were sent to the parser. So we assign the PTS of the frame
-  // by popping the PTS of the oldest packet in our packetsPtsQueue (note:
-  // oldest doesn't necessarily mean lowest PTS!).
+  // Yes, you're reading that right, 0 means error, 1 means success
+  return (result == CUDA_SUCCESS);
+}
 
-  TORCH_CHECK(
-      // TODONVDEC P0 the queue may be empty, handle that.
-      !packetsPtsQueue.empty(),
-      "PTS queue is empty when decoding a frame");
-  int64_t guessedPts = packetsPtsQueue.front();
-  packetsPtsQueue.pop();
-
-  // Field values taken from DALI
-  CUVIDPARSERDISPINFO dispInfo = {};
-  dispInfo.picture_index = pPicParams->CurrPicIdx;
-  dispInfo.progressive_frame = !pPicParams->field_pic_flag;
-  dispInfo.top_field_first = pPicParams->bottom_field_flag ^ 1;
-  dispInfo.repeat_first_field = 0;
-  dispInfo.timestamp = guessedPts;
-
-  FrameBuffer::Slot* slot = frameBuffer_.findEmptySlot();
-  slot->dispInfo = dispInfo;
-  slot->guessedPts = guessedPts;
-  slot->occupied = true;
-
-  return 1;
+int BetaCudaDeviceInterface::frameReadyInDisplayOrder(
+    CUVIDPARSERDISPINFO* dispInfo) {
+  readyFrames_.push(*dispInfo);
+  return 1; // 1 signals success to the parser, 0 would signal an error
 }
 
-// Moral equivalent of avcodec_receive_frame(). Here, we look for a decoded
-// frame with the exact desired PTS in our frame buffer. This logic is only
-// valid in exact seek_mode, for now.
-int BetaCudaDeviceInterface::receiveFrame(
-    UniqueAVFrame& avFrame,
-    int64_t desiredPts) {
-  FrameBuffer::Slot* slot = frameBuffer_.findFrameWithExactPts(desiredPts);
-  if (slot == nullptr) {
+// Moral equivalent of avcodec_receive_frame().
+int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
+  if (readyFrames_.empty()) {
     // No frame found, instruct caller to try again later after sending more
     // packets.
     return AVERROR(EAGAIN);
   }
-
-  slot->occupied = false;
-  slot->guessedPts = -1;
+  CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
+  readyFrames_.pop();
 
   CUVIDPROCPARAMS procParams = {};
-  CUVIDPARSERDISPINFO dispInfo = slot->dispInfo;
   procParams.progressive_frame = dispInfo.progressive_frame;
   procParams.top_field_first = dispInfo.top_field_first;
   procParams.unpaired_field = dispInfo.repeat_first_field < 0;
@@ -445,7 +482,7 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
   avFrame->width = width;
   avFrame->height = height;
   avFrame->format = AV_PIX_FMT_CUDA;
-  avFrame->pts = dispInfo.timestamp; // == guessedPts
+  avFrame->pts = dispInfo.timestamp;
 
   // TODONVDEC P0: Zero division error!!!
   // TODONVDEC P0: Move AVRational arithmetic to FFMPEGCommon, and put the
@@ -511,13 +548,8 @@ void BetaCudaDeviceInterface::flush() {
 
   isFlushing_ = false;
 
-  for (auto& slot : frameBuffer_) {
-    slot.occupied = false;
-    slot.guessedPts = -1;
-  }
-
-  std::queue<int64_t> empty;
-  packetsPtsQueue.swap(empty);
+  std::queue<CUVIDPARSERDISPINFO> emptyQueue;
+  std::swap(readyFrames_, emptyQueue);
 
   eofSent_ = false;
 }
@@ -551,26 +583,4 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
       preAllocatedOutputTensor);
 }
 
-BetaCudaDeviceInterface::FrameBuffer::Slot*
-BetaCudaDeviceInterface::FrameBuffer::findEmptySlot() {
-  for (auto& slot : frameBuffer_) {
-    if (!slot.occupied) {
-      return &slot;
-    }
-  }
-  frameBuffer_.emplace_back();
-  return &frameBuffer_.back();
-}
-
-BetaCudaDeviceInterface::FrameBuffer::Slot*
-BetaCudaDeviceInterface::FrameBuffer::findFrameWithExactPts(
-    int64_t desiredPts) {
-  for (auto& slot : frameBuffer_) {
-    if (slot.occupied && slot.guessedPts == desiredPts) {
-      return &slot;
-    }
-  }
-  return nullptr;
-}
-
 } // namespace facebook::torchcodec