[TRTLLM-7731][feat] Avoid over-allocation of KV cache for transmission in disagg with CP (#8145)

brb-nv · web-flow · commit d798d6697616 · 2025-10-31T17:32:39.000-07:00
Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
@@ -42,21 +42,22 @@
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
-BlockRange getBlockRangeForSending(
-    BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, BlockKey const& lastBlockKey, int32_t indexFromEnd)
+BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
+    BlockKey const& lastBlockKey, int32_t indexFromEnd, bool recvSideHasCP)
 {
     auto poolNum = cacheManager->getBlockManager().getNumPools();
-    if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0)
+    // Note: When recv side has CP, the requested seqLen is lesser than seqLen on the sender side as seqLen is
+    // distributed among CP ranks. So, we transfer all blocks from send side.
+    if (poolNum > 1 || !cacheManager->isEnableBlockReuse() || lastBlockKey.uniqueTokens.size() == 0 || recvSideHasCP)
     {
         // disable reuse path, and vwsa don't support reuse.
         bool needSendAllForWindow = common::getEnvKVCacheTransferAllBlocksForWindow();
 
         auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId);
-        // auto inputLen = llmRequest.getPromptLen();
 
         auto const& windowsMetadata = cacheManager->getBlockManager().getWindowSizesMetadata();
 
-        if ((windowsMetadata.size() == 1 || needSendAllForWindow))
+        if (windowsMetadata.size() == 1 || needSendAllForWindow || recvSideHasCP)
         {
             return blockRange;
         }
@@ -85,10 +86,11 @@ BlockRange getBlockRangeForSending(
 }
 
 BlockRange getBlockRangeForReceiving(
-    BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse)
+    BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse, bool recvSideHasCP)
 {
     auto poolNum = cacheManager->getBlockManager().getNumPools();
-    if (poolNum == 1 && srcEnableBlockReuse)
+    // Note: When recv side has CP, we request all blocks from send side right now.
+    if (poolNum == 1 && srcEnableBlockReuse && !recvSideHasCP)
     {
         // Build from all block ids, then slice off the reused blocks so we only transfer newly allocated ones.
         auto windowSize = cacheManager->getBlockManager().getWindowSizesMetadata().begin()->first;
@@ -121,9 +123,8 @@ BlockRange getBlockRangeForReceiving(
     }
 
     auto const& windowsMetadata = cacheManager->getBlockManager().getWindowSizesMetadata();
-    if (windowsMetadata.size() == 1 || common::getEnvKVCacheTransferAllBlocksForWindow())
+    if (windowsMetadata.size() == 1 || common::getEnvKVCacheTransferAllBlocksForWindow() || recvSideHasCP)
     {
-
         return BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId);
     }
     auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId);
diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.h b/cpp/tensorrt_llm/batch_manager/cacheFormatter.h
@@ -43,7 +43,7 @@ class TransferSession;
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
-    BlockKey const& lastBlockKey, SizeType32 indexFromEnd);
+    BlockKey const& lastBlockKey, SizeType32 indexFromEnd, bool recvSideHasCP = false);
 
 using DataContext = tensorrt_llm::executor::kv_cache::DataContext;
 using Connection = tensorrt_llm::executor::kv_cache::Connection;
@@ -52,8 +52,8 @@ using BaseKVCacheManager = kv_cache_manager::BaseKVCacheManager;
 using CacheTransBufferManager = kv_cache_manager::CacheTransBufferManager;
 using BlockRange = kv_cache_manager::BlockRange;
 
-BlockRange getBlockRangeForReceiving(
-    BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse);
+BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,
+    bool srcEnableBlockReuse, bool recvSideHasCP = false);
 
 // Used to support the cache transmission with different layouts and different protocols.
 class BaseCacheFormatter
diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp
@@ -37,31 +37,6 @@
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
-int getBlockNumAccountingForCP(int cpRank, int cpSize, int numTotalBlocks, bool strict)
-{
-    TLLM_CHECK(cpRank >= 0 && cpRank < cpSize);
-    if (cpSize == 1)
-    {
-        return numTotalBlocks;
-    }
-    // NOTE: Non-strict mode may over-allocate blocks when numTotalBlocks is not divisible by cpSize.
-    // This is a known limitation and will be addressed in a future MR.
-    if (!strict)
-    {
-        // Simple ceiling division.
-        return (numTotalBlocks + cpSize - 1) / cpSize;
-    }
-    // In strict mode, blocks are distributed among CP ranks in a round-robin fashion as evenly as possible.
-    // When the number of blocks is not divisible by cpSize, the remainder shall be distributed evenly among
-    // lowest-indexed CP ranks (let's call them overflow ranks).
-    int numBlocksCurrRank = numTotalBlocks / cpSize;
-    if (numTotalBlocks % cpSize > cpRank)
-    {
-        numBlocksCurrRank++;
-    }
-    return numBlocksCurrRank;
-}
-
 // some context rank in connection
 std::vector<size_t> MLACacheFormatter::pickRecvConnections(
     size_t numConnections, CacheState const& selfConfig, SizeType32 selfIdx, CacheState const& destConfig) const
@@ -145,7 +120,8 @@ void MLACacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& ses
     int blockNum = 0;
     std::vector<runtime::ITensor::SharedPtr> inputKvCacheBlocks;
     auto const numPools = mCacheManager->getBlockManager().getNumPools();
-    auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest, lastBlockKey, indexFromEnd);
+    bool const recvSideHasCP = destConfig.getParallelConfig().mContextParallelism > 1;
+    auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest, lastBlockKey, indexFromEnd, recvSideHasCP);
     auto const& windowSizes = blockRange.getWindowSizes();
     TLLM_CHECK_WITH_INFO(
         static_cast<int>(windowSizes.size()) == numPools, "window sizes should be the same as numPools");
@@ -204,7 +180,7 @@ void MLACacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& ses
                 auto const idx = cpDomainIdx * pPDomainSize + ppDomainIdx;
                 // Note: contextCP is always 1. So, cpDomainSize == genCPSize and cpDomainIdx == genCPRank.
                 auto const peerBlockNum
-                    = getBlockNumAccountingForCP(cpDomainIdx, cPDomainSize, blockNum, /*strict=*/false);
+                    = executor::kv_cache::getBlockNumAccountingForCP(cpDomainIdx, cPDomainSize, blockNum);
                 bufferSizeForTarget[idx] = blockSizePerLayer * peerAttentionLayerNum * peerBlockNum;
             }
         }
@@ -346,7 +322,9 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& s
     auto const& connections = session.getConnections();
     auto& bufferManager = session.getBufferManager();
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
-    auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest, destConfig.getEnableBlockReuse());
+    bool const recvSideHasCP = selfConfig.getParallelConfig().mContextParallelism > 1;
+    auto blockRange
+        = getBlockRangeForReceiving(mCacheManager, llmRequest, destConfig.getEnableBlockReuse(), recvSideHasCP);
     std::vector<runtime::ITensor::SharedPtr> recvBufferTmps;
     std::vector<runtime::ITensor::SharedPtr> outputBuffers;
     auto const numPools = mCacheManager->getBlockManager().getNumPools();
diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h
@@ -22,24 +22,6 @@
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
-/**
- * @brief Calculate the number of blocks allocated to a specific Context Parallelism (CP) rank.
- *
- * This function determines how many blocks should be allocated to a given CP rank when
- * distributing a total number of blocks across multiple CP ranks. It supports two distribution
- * modes: strict and non-strict.
- *
- * @param cpRank The rank (index) of the current CP process. Must be in range [0, cpSize).
- * @param cpSize The total number of CP ranks/processes in the parallel group.
- * @param numTotalBlocks The total number of blocks to be distributed across all CP ranks.
- * @param strict Flag controlling the distribution strategy:
- *               - true: Use strict round-robin distribution with exact allocation
- *               - false: Use ceiling division which may over-allocate
- *
- * @return The number of blocks allocated to the specified CP rank.
- */
-int getBlockNumAccountingForCP(int cpRank, int cpSize, int numTotalBlocks, bool strict);
-
 // Simple cache block copy. Because it does not involve data splitting or merging, it performs best when the
 // parallel topology is completely identical, making it the preferred method.
 class MLACacheFormatter final : public BaseCacheFormatter
diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp
@@ -278,6 +278,12 @@ bool getEnvUseNixlKvCache()
     return useNixlKvCache;
 }
 
+bool getEnvUseRoundRobinBlockDistForCP()
+{
+    static bool const useRoundRobinBlockDistForCP = getBoolEnv("TRTLLM_USE_ROUND_ROBIN_BLOCK_DIST_FOR_CP");
+    return useRoundRobinBlockDistForCP;
+}
+
 std::string getEnvUCXInterface()
 {
     static std::once_flag flag;
diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h
@@ -82,6 +82,8 @@ bool getEnvUseUCXKvCache();
 bool getEnvUseMPIKvCache();
 bool getEnvUseNixlKvCache();
 
+bool getEnvUseRoundRobinBlockDistForCP();
+
 std::string getEnvUCXInterface();
 
 std::string getEnvNixlInterface();
diff --git a/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu b/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu
diff --git a/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h b/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h
diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp

Original file line number	Diff line number	Diff line change
`@@ -42,21 +42,22 @@`
`42`	`42`	`namespace tensorrt_llm::batch_manager::kv_cache_manager`
`43`	`43`	`{`
`44`	`44`
`45`		`-BlockRange getBlockRangeForSending(`
`46`		`- BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, BlockKey const& lastBlockKey, int32_t indexFromEnd)`
	`45`	`+BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest,`
	`46`	`+ BlockKey const& lastBlockKey, int32_t indexFromEnd, bool recvSideHasCP)`
`47`	`47`	`{`
`48`	`48`	`auto poolNum = cacheManager->getBlockManager().getNumPools();`
`49`		`- if (poolNum > 1 \|\| !cacheManager->isEnableBlockReuse() \|\| lastBlockKey.uniqueTokens.size() == 0)`
	`49`	`+ // Note: When recv side has CP, the requested seqLen is lesser than seqLen on the sender side as seqLen is`
	`50`	`+ // distributed among CP ranks. So, we transfer all blocks from send side.`
	`51`	`+ if (poolNum > 1 \|\| !cacheManager->isEnableBlockReuse() \|\| lastBlockKey.uniqueTokens.size() == 0 \|\| recvSideHasCP)`
`50`	`52`	`{`
`51`	`53`	`// disable reuse path, and vwsa don't support reuse.`
`52`	`54`	`bool needSendAllForWindow = common::getEnvKVCacheTransferAllBlocksForWindow();`
`53`	`55`
`54`	`56`	`auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId);`
`55`		`- // auto inputLen = llmRequest.getPromptLen();`
`56`	`57`
`57`	`58`	`auto const& windowsMetadata = cacheManager->getBlockManager().getWindowSizesMetadata();`
`58`	`59`
`59`		`- if ((windowsMetadata.size() == 1 \|\| needSendAllForWindow))`
	`60`	`+ if (windowsMetadata.size() == 1 \|\| needSendAllForWindow \|\| recvSideHasCP)`
`60`	`61`	`{`
`61`	`62`	`return blockRange;`
`62`	`63`	`}`
`@@ -85,10 +86,11 @@ BlockRange getBlockRangeForSending(`
`85`	`86`	`}`
`86`	`87`
`87`	`88`	`BlockRange getBlockRangeForReceiving(`
`88`		`- BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse)`
	`89`	`+ BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest, bool srcEnableBlockReuse, bool recvSideHasCP)`
`89`	`90`	`{`
`90`	`91`	`auto poolNum = cacheManager->getBlockManager().getNumPools();`
`91`		`- if (poolNum == 1 && srcEnableBlockReuse)`
	`92`	`+ // Note: When recv side has CP, we request all blocks from send side right now.`
	`93`	`+ if (poolNum == 1 && srcEnableBlockReuse && !recvSideHasCP)`
`92`	`94`	`{`
`93`	`95`	`// Build from all block ids, then slice off the reused blocks so we only transfer newly allocated ones.`
`94`	`96`	`auto windowSize = cacheManager->getBlockManager().getWindowSizesMetadata().begin()->first;`
`@@ -121,9 +123,8 @@ BlockRange getBlockRangeForReceiving(`
`121`	`123`	`}`
`122`	`124`
`123`	`125`	`auto const& windowsMetadata = cacheManager->getBlockManager().getWindowSizesMetadata();`
`124`		`- if (windowsMetadata.size() == 1 \|\| common::getEnvKVCacheTransferAllBlocksForWindow())`
	`126`	`+ if (windowsMetadata.size() == 1 \|\| common::getEnvKVCacheTransferAllBlocksForWindow() \|\| recvSideHasCP)`
`125`	`127`	`{`
`126`		`-`
`127`	`128`	`return BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId);`
`128`	`129`	`}`
`129`	`130`	`auto blockRange = BlockRange::fromAllBlockIds(*cacheManager, llmRequest.mRequestId);`
Original file line number	Diff line number	Diff line change
`@@ -278,6 +278,12 @@ bool getEnvUseNixlKvCache()`
`278`	`278`	`return useNixlKvCache;`
`279`	`279`	`}`
`280`	`280`
	`281`	`+bool getEnvUseRoundRobinBlockDistForCP()`
	`282`	`+{`
	`283`	`+ static bool const useRoundRobinBlockDistForCP = getBoolEnv("TRTLLM_USE_ROUND_ROBIN_BLOCK_DIST_FOR_CP");`
	`284`	`+ return useRoundRobinBlockDistForCP;`
	`285`	`+}`
	`286`	`+`
`281`	`287`	`std::string getEnvUCXInterface()`
`282`	`288`	`{`
`283`	`289`	`static std::once_flag flag;`