feat: KV events for sliding window attention (NVIDIA#5580)

jthomson04 · web-flow · commit 1b588f8390c1 · 2025-07-05T06:05:20.000+08:00
Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h
@@ -44,13 +44,13 @@ class KVCacheEventManager
     KVCacheEventManager(KVCacheEventManager&& other) = delete;
     KVCacheEventManager& operator=(KVCacheEventManager&& other) = delete;
 
-    void enqueueCreatedEvent(std::vector<SizeType32> const& numBlocksPerCacheLevel);
+    void enqueueCreatedEvent(std::vector<SizeType32> const& numBlocksPerCacheLevel, SizeType32 windowSize);
 
-    void enqueueStoredEvent(std::vector<BlockPtr> const& blocks);
+    void enqueueStoredEvent(std::vector<BlockPtr> const& blocks, SizeType32 windowSize);
 
-    void enqueueRemovedEvent(BlockPtr const& block);
+    void enqueueRemovedEvent(BlockPtr const& block, SizeType32 windowSize);
 
-    void enqueueUpdatedEvent(executor::KVCacheUpdatedData const& data);
+    void enqueueUpdatedEvent(executor::KVCacheUpdatedData const& data, SizeType32 windowSize);
 
     // Get events in mEvents. If there are no events, wait for a maximum of `timeout` milliseconds.
     std::deque<executor::KVCacheEvent> getEvents(std::optional<std::chrono::milliseconds> timeout);
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
@@ -1709,12 +1709,14 @@ using KVCacheEventData = std::variant<KVCacheCreatedData, KVCacheStoredData, KVC
 struct KVCacheEvent
 {
 
-    KVCacheEvent(IdType eventId, KVCacheEventData data);
+    KVCacheEvent(IdType eventId, KVCacheEventData data, SizeType32 windowSize);
 
     /// @brief The unique id of this event
     IdType eventId;
     /// @brief The data corresponding to this event
     KVCacheEventData data;
+    /// @brief The sliding window size
+    SizeType32 windowSize;
 };
 
 /// @brief Exposes a limited set of KV cache manager functionalities
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheEventManager.cpp
@@ -42,12 +42,13 @@ KVCacheEventManager::~KVCacheEventManager()
     mWorkerThread.join();
 }
 
-void KVCacheEventManager::enqueueCreatedEvent(std::vector<SizeType32> const& numBlocksPerCacheLevel)
+void KVCacheEventManager::enqueueCreatedEvent(
+    std::vector<SizeType32> const& numBlocksPerCacheLevel, SizeType32 windowSize)
 {
-    enqueueEvent({mEventId++, tle::KVCacheCreatedData{numBlocksPerCacheLevel}});
+    enqueueEvent({mEventId++, tle::KVCacheCreatedData{numBlocksPerCacheLevel}, windowSize});
 }
 
-void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks)
+void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks, SizeType32 windowSize)
 {
     if (blocks.empty())
     {
@@ -67,24 +68,26 @@ void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks
             block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority());
     }
 
-    enqueueEvent({mEventId++, data});
+    enqueueEvent({mEventId++, data, windowSize});
 }
 
-void KVCacheEventManager::enqueueRemovedEvent(BlockPtr const& block)
+void KVCacheEventManager::enqueueRemovedEvent(BlockPtr const& block, SizeType32 windowSize)
 {
-    if (!mEventQueue.empty() && std::holds_alternative<tle::KVCacheRemovedData>(mEventQueue.back().data))
+    // We can only batch the removed block events if the same sliding window size is used.
+    if (!mEventQueue.empty() && mEventQueue.back().windowSize == windowSize
+        && std::holds_alternative<tle::KVCacheRemovedData>(mEventQueue.back().data))
     {
         std::get<tle::KVCacheRemovedData>(mEventQueue.back().data).blockHashes.push_back(block->getHash());
     }
     else
     {
-        enqueueEvent({mEventId++, tle::KVCacheRemovedData{{block->getHash()}}});
+        enqueueEvent({mEventId++, tle::KVCacheRemovedData{{block->getHash()}}, windowSize});
     }
 }
 
-void KVCacheEventManager::enqueueUpdatedEvent(tle::KVCacheUpdatedData const& data)
+void KVCacheEventManager::enqueueUpdatedEvent(tle::KVCacheUpdatedData const& data, SizeType32 windowSize)
 {
-    enqueueEvent({mEventId++, data});
+    enqueueEvent({mEventId++, data, windowSize});
 }
 
 void KVCacheEventManager::enqueueEvent(tle::KVCacheEvent&& event)
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -552,7 +552,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
         mAllBlocksById, {blocksInPrimaryPool, blocksInSecondaryPool}, secondaryOffloadMinPriority);
     if (mEventManager)
     {
-        mEventManager->enqueueCreatedEvent({blocksInPrimaryPool, blocksInSecondaryPool});
+        mEventManager->enqueueCreatedEvent({blocksInPrimaryPool, blocksInSecondaryPool}, mWindowSize);
     }
 }
 
@@ -741,7 +741,7 @@ void WindowBlockManager::freeChildren(
     // Free block
     if (mEventManager && blockInRadixTree(block))
     {
-        mEventManager->enqueueRemovedEvent(block);
+        mEventManager->enqueueRemovedEvent(block, mWindowSize);
     }
 
     claimLeafBlock(block, priority, durationMs);
@@ -776,7 +776,8 @@ BlockPtr WindowBlockManager::getFreeBlock(
         if (mEventManager && blockInRadixTree(block))
         {
             mEventManager->enqueueUpdatedEvent(
-                tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel));
+                tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel),
+                mWindowSize);
         }
         mEvictionPolicy->releaseBlock(block); // append offload block to mFreeSecondaryBlocks queue
         block = offloadBlock;
@@ -881,7 +882,8 @@ void WindowBlockManager::onboardBlock(BlockPtr const& offloadBlock)
         if (mEventManager)
         {
             mEventManager->enqueueUpdatedEvent(
-                tle::KVCacheUpdatedData(offloadBlock->getHash()).cacheLevelUpdated(kSecondaryLevel, kPrimaryLevel));
+                tle::KVCacheUpdatedData(offloadBlock->getHash()).cacheLevelUpdated(kSecondaryLevel, kPrimaryLevel),
+                mWindowSize);
         }
         mEvictionPolicy->releaseBlock(block); // append block to offload queue
                                               // offloadBlock is now in primary memory pool
@@ -908,7 +910,8 @@ void WindowBlockManager::offloadBlock(BlockPtr const& block)
         if (mEventManager && blockInRadixTree(block))
         {
             mEventManager->enqueueUpdatedEvent(
-                tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel));
+                tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel),
+                mWindowSize);
         }
         mEvictionPolicy->releaseBlock(offloadBlock); // append offloadBlock to mFreePrimaryBlocks queue
                                                      // block is now in secondary memory
@@ -980,7 +983,8 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
             {
                 mEventManager->enqueueUpdatedEvent(
                     tle::KVCacheUpdatedData(matchingBlock->getHash())
-                        .priorityUpdated(matchingBlock->getPriority(), *perBlockRetentions[bi].retentionPriority));
+                        .priorityUpdated(matchingBlock->getPriority(), *perBlockRetentions[bi].retentionPriority),
+                    mWindowSize);
             }
             if (partialMatch)
             {
@@ -1275,7 +1279,7 @@ void WindowBlockManager::storeBlocks(
     }
     if (mEventManager)
     {
-        mEventManager->enqueueStoredEvent(storedBlocks);
+        mEventManager->enqueueStoredEvent(storedBlocks, mWindowSize);
     }
 }
 
diff --git a/cpp/tensorrt_llm/executor/executor.cpp b/cpp/tensorrt_llm/executor/executor.cpp
@@ -132,9 +132,10 @@ std::optional<std::shared_ptr<KVCacheEventManager>> Executor::getKVCacheEventMan
     return mImpl->getKVCacheEventManager();
 }
 
-KVCacheEvent::KVCacheEvent(size_t eventId, KVCacheEventData data)
+KVCacheEvent::KVCacheEvent(size_t eventId, KVCacheEventData data, SizeType32 windowSize)
     : eventId{eventId}
     , data{std::move(data)}
+    , windowSize{windowSize}
 {
 }
 
diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
@@ -239,7 +239,8 @@ void initBindings(pybind11::module_& m)
 
     py::class_<tle::KVCacheEvent>(executor_kv_cache, "KVCacheEvent")
         .def_readonly("event_id", &tle::KVCacheEvent::eventId)
-        .def_readonly("data", &tle::KVCacheEvent::data);
+        .def_readonly("data", &tle::KVCacheEvent::data)
+        .def_readonly("window_size", &tle::KVCacheEvent::windowSize);
 
     py::class_<tle::KVCacheEventManager, std::shared_ptr<tle::KVCacheEventManager>>(
         executor_kv_cache, "KVCacheEventManager")
diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
@@ -3401,6 +3401,62 @@ TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamBlocking)
     EXPECT_TRUE(std::holds_alternative<tle::KVCacheStoredData>(events.front().data));
 }
 
+TEST_F(KVCacheManagerTest, KVCacheManagerEventStreamWindowSize)
+{
+    auto constexpr numLayers = 2;
+    auto constexpr numHeads = 6;
+    auto constexpr sizePerHead = 16;
+    auto constexpr tokensPerBlock = 4;
+    auto constexpr maxBlocksPerSeq = 4;
+    auto constexpr maxNumSequences = 8;
+    auto blocksInPool = std::vector<SizeType32>{8, 2};
+    auto blocksInSlidingWindowPool = std::vector<SizeType32>{4, 2};
+    auto constexpr onboardBlocks = true;
+    auto constexpr dtype = nvinfer1::DataType::kHALF;
+    auto const stream = std::make_shared<tr::CudaStream>();
+
+    auto constexpr beamWidth = 1;
+    SizeType32 constexpr maxNewTokens{0};
+    tr::SamplingConfig const samplingConfig{beamWidth};
+    bool constexpr isStreaming{false};
+
+    auto const maxAttentionWindow = tokensPerBlock * maxBlocksPerSeq;
+    auto const slidingWindow = tokensPerBlock * (maxBlocksPerSeq - 1);
+
+    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPool[0], blocksInPool[1]}},
+        {slidingWindow, {blocksInSlidingWindowPool[0], blocksInSlidingWindowPool[1]}}};
+
+    KVCacheManager kvCacheManager(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
+        beamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow, slidingWindow}, std::nullopt, dtype, 0,
+        stream, std::nullopt, true, onboardBlocks, CacheType::kSELF, std::nullopt,
+        std::make_unique<tlk::KVCacheEventManager>(1024));
+    kvCacheManager.allocatePools(false);
+
+    auto events = getEvents(kvCacheManager);
+
+    EXPECT_EQ(events.size(), 2);
+
+    EXPECT_EQ(events.front().windowSize, slidingWindow);
+    EXPECT_EQ(std::get<tle::KVCacheCreatedData>(events.front().data).numBlocksPerCacheLevel, blocksInSlidingWindowPool);
+
+    EXPECT_EQ(events.back().windowSize, maxAttentionWindow);
+    EXPECT_EQ(std::get<tle::KVCacheCreatedData>(events.back().data).numBlocksPerCacheLevel, blocksInPool);
+
+    auto inputTokens0 = std::make_shared<VecTokens>(VecTokens{0, 1, 2, 3, 4, 5, 6, 7});
+    auto llmRequest0 = std::make_shared<LlmRequest>(0, 0, inputTokens0, samplingConfig, true);
+    kvCacheManager.addSequence(0, inputTokens0->size(), beamWidth, llmRequest0);
+    kvCacheManager.storeContextBlocks(*llmRequest0);
+
+    events = getEvents(kvCacheManager);
+
+    EXPECT_EQ(events.size(), 2);
+    EXPECT_EQ(events.front().windowSize, slidingWindow);
+    EXPECT_TRUE(std::holds_alternative<tle::KVCacheStoredData>(events.front().data));
+
+    EXPECT_EQ(events.back().windowSize, maxAttentionWindow);
+    EXPECT_TRUE(std::holds_alternative<tle::KVCacheStoredData>(events.back().data));
+}
+
 TEST_F(KVCacheManagerTest, KVCacheTransferManagerConcurrencyTest)
 {
     auto const blockSize = 16384;

Original file line number	Diff line number	Diff line change
`@@ -42,12 +42,13 @@ KVCacheEventManager::~KVCacheEventManager()`
`42`	`42`	`mWorkerThread.join();`
`43`	`43`	`}`
`44`	`44`
`45`		`-void KVCacheEventManager::enqueueCreatedEvent(std::vector<SizeType32> const& numBlocksPerCacheLevel)`
	`45`	`+void KVCacheEventManager::enqueueCreatedEvent(`
	`46`	`+ std::vector<SizeType32> const& numBlocksPerCacheLevel, SizeType32 windowSize)`
`46`	`47`	`{`
`47`		`- enqueueEvent({mEventId++, tle::KVCacheCreatedData{numBlocksPerCacheLevel}});`
	`48`	`+ enqueueEvent({mEventId++, tle::KVCacheCreatedData{numBlocksPerCacheLevel}, windowSize});`
`48`	`49`	`}`
`49`	`50`
`50`		`-void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks)`
	`51`	`+void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks, SizeType32 windowSize)`
`51`	`52`	`{`
`52`	`53`	`if (blocks.empty())`
`53`	`54`	`{`
`@@ -67,24 +68,26 @@ void KVCacheEventManager::enqueueStoredEvent(std::vector<BlockPtr> const& blocks`
`67`	`68`	`block->isPrimary() ? kPrimaryLevel : kSecondaryLevel, block->getPriority());`
`68`	`69`	`}`
`69`	`70`
`70`		`- enqueueEvent({mEventId++, data});`
	`71`	`+ enqueueEvent({mEventId++, data, windowSize});`
`71`	`72`	`}`
`72`	`73`
`73`		`-void KVCacheEventManager::enqueueRemovedEvent(BlockPtr const& block)`
	`74`	`+void KVCacheEventManager::enqueueRemovedEvent(BlockPtr const& block, SizeType32 windowSize)`
`74`	`75`	`{`
`75`		`- if (!mEventQueue.empty() && std::holds_alternative<tle::KVCacheRemovedData>(mEventQueue.back().data))`
	`76`	`+ // We can only batch the removed block events if the same sliding window size is used.`
	`77`	`+ if (!mEventQueue.empty() && mEventQueue.back().windowSize == windowSize`
	`78`	`+ && std::holds_alternative<tle::KVCacheRemovedData>(mEventQueue.back().data))`
`76`	`79`	`{`
`77`	`80`	`std::get<tle::KVCacheRemovedData>(mEventQueue.back().data).blockHashes.push_back(block->getHash());`
`78`	`81`	`}`
`79`	`82`	`else`
`80`	`83`	`{`
`81`		`- enqueueEvent({mEventId++, tle::KVCacheRemovedData{{block->getHash()}}});`
	`84`	`+ enqueueEvent({mEventId++, tle::KVCacheRemovedData{{block->getHash()}}, windowSize});`
`82`	`85`	`}`
`83`	`86`	`}`
`84`	`87`
`85`		`-void KVCacheEventManager::enqueueUpdatedEvent(tle::KVCacheUpdatedData const& data)`
	`88`	`+void KVCacheEventManager::enqueueUpdatedEvent(tle::KVCacheUpdatedData const& data, SizeType32 windowSize)`
`86`	`89`	`{`
`87`		`- enqueueEvent({mEventId++, data});`
	`90`	`+ enqueueEvent({mEventId++, data, windowSize});`
`88`	`91`	`}`
`89`	`92`
`90`	`93`	`void KVCacheEventManager::enqueueEvent(tle::KVCacheEvent&& event)`
Original file line number	Diff line number	Diff line change
`@@ -552,7 +552,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind`
`552`	`552`	`mAllBlocksById, {blocksInPrimaryPool, blocksInSecondaryPool}, secondaryOffloadMinPriority);`
`553`	`553`	`if (mEventManager)`
`554`	`554`	`{`
`555`		`- mEventManager->enqueueCreatedEvent({blocksInPrimaryPool, blocksInSecondaryPool});`
	`555`	`+ mEventManager->enqueueCreatedEvent({blocksInPrimaryPool, blocksInSecondaryPool}, mWindowSize);`
`556`	`556`	`}`
`557`	`557`	`}`
`558`	`558`
`@@ -741,7 +741,7 @@ void WindowBlockManager::freeChildren(`
`741`	`741`	`// Free block`
`742`	`742`	`if (mEventManager && blockInRadixTree(block))`
`743`	`743`	`{`
`744`		`- mEventManager->enqueueRemovedEvent(block);`
	`744`	`+ mEventManager->enqueueRemovedEvent(block, mWindowSize);`
`745`	`745`	`}`
`746`	`746`
`747`	`747`	`claimLeafBlock(block, priority, durationMs);`
`@@ -776,7 +776,8 @@ BlockPtr WindowBlockManager::getFreeBlock(`
`776`	`776`	`if (mEventManager && blockInRadixTree(block))`
`777`	`777`	`{`
`778`	`778`	`mEventManager->enqueueUpdatedEvent(`
`779`		`- tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel));`
	`779`	`+ tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel),`
	`780`	`+ mWindowSize);`
`780`	`781`	`}`
`781`	`782`	`mEvictionPolicy->releaseBlock(block); // append offload block to mFreeSecondaryBlocks queue`
`782`	`783`	`block = offloadBlock;`
`@@ -881,7 +882,8 @@ void WindowBlockManager::onboardBlock(BlockPtr const& offloadBlock)`
`881`	`882`	`if (mEventManager)`
`882`	`883`	`{`
`883`	`884`	`mEventManager->enqueueUpdatedEvent(`
`884`		`- tle::KVCacheUpdatedData(offloadBlock->getHash()).cacheLevelUpdated(kSecondaryLevel, kPrimaryLevel));`
	`885`	`+ tle::KVCacheUpdatedData(offloadBlock->getHash()).cacheLevelUpdated(kSecondaryLevel, kPrimaryLevel),`
	`886`	`+ mWindowSize);`
`885`	`887`	`}`
`886`	`888`	`mEvictionPolicy->releaseBlock(block); // append block to offload queue`
`887`	`889`	`// offloadBlock is now in primary memory pool`
`@@ -908,7 +910,8 @@ void WindowBlockManager::offloadBlock(BlockPtr const& block)`
`908`	`910`	`if (mEventManager && blockInRadixTree(block))`
`909`	`911`	`{`
`910`	`912`	`mEventManager->enqueueUpdatedEvent(`
`911`		`- tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel));`
	`913`	`+ tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel),`
	`914`	`+ mWindowSize);`
`912`	`915`	`}`
`913`	`916`	`mEvictionPolicy->releaseBlock(offloadBlock); // append offloadBlock to mFreePrimaryBlocks queue`
`914`	`917`	`// block is now in secondary memory`
`@@ -980,7 +983,8 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&`
`980`	`983`	`{`
`981`	`984`	`mEventManager->enqueueUpdatedEvent(`
`982`	`985`	`tle::KVCacheUpdatedData(matchingBlock->getHash())`
`983`		`- .priorityUpdated(matchingBlock->getPriority(), *perBlockRetentions[bi].retentionPriority));`
	`986`	`+ .priorityUpdated(matchingBlock->getPriority(), *perBlockRetentions[bi].retentionPriority),`
	`987`	`+ mWindowSize);`
`984`	`988`	`}`
`985`	`989`	`if (partialMatch)`
`986`	`990`	`{`
`@@ -1275,7 +1279,7 @@ void WindowBlockManager::storeBlocks(`
`1275`	`1279`	`}`
`1276`	`1280`	`if (mEventManager)`
`1277`	`1281`	`{`
`1278`		`- mEventManager->enqueueStoredEvent(storedBlocks);`
	`1282`	`+ mEventManager->enqueueStoredEvent(storedBlocks, mWindowSize);`
`1279`	`1283`	`}`
`1280`	`1284`	`}`
`1281`	`1285`
Original file line number	Diff line number	Diff line change
`@@ -132,9 +132,10 @@ std::optional<std::shared_ptr<KVCacheEventManager>> Executor::getKVCacheEventMan`
`132`	`132`	`return mImpl->getKVCacheEventManager();`
`133`	`133`	`}`
`134`	`134`
`135`		`-KVCacheEvent::KVCacheEvent(size_t eventId, KVCacheEventData data)`
	`135`	`+KVCacheEvent::KVCacheEvent(size_t eventId, KVCacheEventData data, SizeType32 windowSize)`
`136`	`136`	`: eventId{eventId}`
`137`	`137`	`, data{std::move(data)}`
	`138`	`+ , windowSize{windowSize}`
`138`	`139`	`{`
`139`	`140`	`}`
`140`	`141`