From 524ad7b740b2a53bf7a29a4a3e32a796f32e54b2 Mon Sep 17 00:00:00 2001 From: eopXD Date: Sun, 2 Nov 2025 23:24:40 -0800 Subject: [PATCH] [#8813][fix] Add missing event for block onboard for the kv cache transfer manager Authored-by: @josephrocca Co-authored-by: eopXD Signed-off-by: eopXD --- .../batch_manager/kvCacheManager.h | 22 +++++++++++++++++++ .../batch_manager/kvCacheManager.cpp | 7 ++++++ .../batch_manager/kvCacheTransferManager.cpp | 3 +++ 3 files changed, 32 insertions(+) diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 0d8f7aa0e13..19f7c8a3c6c 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -325,6 +325,26 @@ class KVCacheBlock size_t getHash() const; + //! \brief Set onboard event to track asynchronous block transfer completion. + //! \param event CUDA event to associate with this block (moved into the block) + void setPendingOnboardEvent(runtime::CudaEvent&& event) + { + mPendingOnboardEvent = std::move(event); + } + + //! \brief Get the pending onboard event if one exists. + //! \return Pointer to the pending event, or nullptr if no event is pending + runtime::CudaEvent const* getPendingOnboardEvent() const + { + return mPendingOnboardEvent ? &mPendingOnboardEvent.value() : nullptr; + } + + //! 
\brief Clear the pending onboard event void clearPendingOnboardEvent() { mPendingOnboardEvent.reset(); } + private: // Linear ID of block independent of pool IdType mBlockId; @@ -365,6 +385,8 @@ class KVCacheBlock std::optional mExpirationTime; // Hash for the event manager size_t mHash; + // Possible pending event to onboard the block + std::optional<runtime::CudaEvent> mPendingOnboardEvent; }; class GenerationRequest diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index b0b7b494fa6..776df1c64ee 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -1483,6 +1483,13 @@ void WindowBlockManager::addSequence( void WindowBlockManager::addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx) { + if (auto const* onboardEvent = block->getPendingOnboardEvent()) + { + // Make sure block is onboarded before used + mBufferManager.getStream().wait(*onboardEvent); + block->clearPendingOnboardEvent(); + } + auto const requestId = sequence.getRequestId(); block->incRefCount(); if (sequence.getCacheBlockIds(mWindowSize).at(beamIdx).size() == 0) diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp index fd5758a8368..4ed1db086b0 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp @@ -224,6 +224,9 @@ void KVCacheTransferManager::onboard(BlockPtr const& offloadBlock, BlockPtr cons mOnboardManager.getStream().wait(mPendingOffloads[offloadBlock->getBlockId()]); } copyBlock(offloadBlock, block, pools, false, numTokensToCopy, mode, directory); + tr::CudaEvent onboardEvent; + mOnboardManager.getStream().record(onboardEvent); + block->setPendingOnboardEvent(std::move(onboardEvent)); } void KVCacheTransferManager::offload(BlockPtr const& block, BlockPtr const& offloadBlock,