chore: rename IOFormatter to BaseCacheFormatter (NVIDIA#5068)

zhengd-nv · web-flow · commit ee44fa00f863 · 2025-06-12T10:50:14.000+08:00
Signed-off-by: Zheng Duan <200704041+zhengd-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
@@ -16,6 +16,7 @@
  */
 
 #include "cacheFormatter.h"
+#include "mlaCacheFormatter.h"
 
 #include "tensorrt_llm/batch_manager/contextProgress.h"
 #include "tensorrt_llm/batch_manager/kvCacheUtils.h"
@@ -751,4 +752,15 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
     }
     return true;
 }
+
+std::unique_ptr<BaseCacheFormatter> createCacheFormatter(
+    BaseKVCacheManager* cacheManager, CacheTransBufferManager* cacheTransBufferManager, bool isMLA)
+{
+    if (isMLA)
+    {
+        return std::make_unique<MLACacheFormatter>(cacheManager, cacheTransBufferManager);
+    }
+    return std::make_unique<CacheFormatter>(cacheManager, cacheTransBufferManager);
+}
+
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.h b/cpp/tensorrt_llm/batch_manager/cacheFormatter.h
@@ -17,12 +17,13 @@
 
 #pragma once
 
+#include "cacheTransBuffer.h"
 #include "dataTransceiver.h"
-#include "tensorrt_llm/batch_manager/cacheTransBuffer.h"
 #include "tensorrt_llm/batch_manager/kvCacheManager.h"
 #include "tensorrt_llm/batch_manager/kvCacheUtils.h"
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/executor/cacheCommunicator.h"
 #include "tensorrt_llm/executor/cache_transmission/cacheConcatenate.h"
 #include "tensorrt_llm/executor/dataTransceiverState.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
@@ -60,13 +61,54 @@ BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest
 
 BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest);
 
-// Simple cache block copy. Because it does not involve data splitting or merging, it performs best when the
-// parallel topology is completely identical, making it the preferred method.
-class CacheFormatter final : public IOFormatter
+// Used to support the cache transmission with different layouts and different protocols.
+class BaseCacheFormatter
 {
 public:
+    using SizeType32 = tensorrt_llm::runtime::SizeType32;
     using CacheState = executor::kv_cache::CacheState;
 
+    virtual void formatOutput(LlmRequest const& llmRequest,
+        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
+        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
+        = 0;
+
+    virtual void formatInput(LlmRequest const& llmRequest,
+        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
+        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
+        = 0;
+
+    /// @brief Determine whether the sender is applicable to the source and target.
+    /// @param selfConfig Source data arrangement.
+    /// @param destConfig Target data arrangement.
+    /// @return Whether the sender is applicable to the source and target.
+    [[nodiscard]] virtual bool inquireSupport(CacheState const& selfConfig, CacheState const& destConfig) const = 0;
+
+    /// @brief Obtain the indices of the counterparts that need to be actually communicated with.
+    /// @param selfConfig Source data arrangement.
+    /// @param selfIdx The sequential index of the current executor process within the entire parallel group.
+    /// @param destConfig Target data arrangement.
+    /// @return The indices of the counterparts.
+    [[nodiscard]] virtual std::vector<SizeType32> getCounterparts(
+        CacheState const& selfConfig, SizeType32 selfIdx, CacheState const& destConfig) const
+        = 0;
+
+    [[nodiscard]] virtual BaseKVCacheManager* getCacheManager() const noexcept = 0;
+
+    [[nodiscard]] virtual std::vector<executor::kv_cache::Connection const*> pickRecvConnections(
+        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
+        SizeType32 selfIdx, CacheState const& destConfig) const
+        = 0;
+
+    /// @brief Destructor.
+    virtual ~BaseCacheFormatter() = default;
+};
+
+// Simple cache block copy. Because it does not involve data splitting or merging, it performs best when the
+// parallel topology is completely identical, making it the preferred method.
+class CacheFormatter final : public BaseCacheFormatter
+{
+public:
     CacheFormatter(BaseKVCacheManager* cacheManager, CacheTransBufferManager* cacheTransBufferManager)
         : mCacheManager{cacheManager}
         , mCacheTransBufferManager{cacheTransBufferManager}
@@ -91,7 +133,7 @@ class CacheFormatter final : public IOFormatter
         return executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx).mIRanks;
     }
 
-    BaseKVCacheManager* getCacheManager() const noexcept
+    [[nodiscard]] BaseKVCacheManager* getCacheManager() const noexcept override
     {
         return mCacheManager;
     }
@@ -102,11 +144,12 @@ class CacheFormatter final : public IOFormatter
         SizeType32 selfIdx, CacheState const& destConfig) const override;
 
 private:
-    BaseKVCacheManager* mCacheManager{};
-
+    BaseKVCacheManager* mCacheManager;
     CacheTransBufferManager* mCacheTransBufferManager;
-
     KvCacheMeasureHelper kvCacheMeasureHelper{common::getEnvKVCacheTransferOutputPath()};
 };
 
+std::unique_ptr<BaseCacheFormatter> createCacheFormatter(
+    BaseKVCacheManager* cacheManager, CacheTransBufferManager* cacheTransBufferManager, bool isMLA = false);
+
 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
@@ -179,13 +179,8 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
         }
 
         using tensorrt_llm::batch_manager::kv_cache_manager::MLACacheFormatter;
-        auto makeFormatter = [cacheManager, isMLA, this]() -> std::unique_ptr<IOFormatter>
-        {
-            return isMLA ? std::unique_ptr<IOFormatter>(
-                       std::make_unique<MLACacheFormatter>(cacheManager, this->mCacheTransBufferManager.get()))
-                         : std::unique_ptr<IOFormatter>(
-                             std::make_unique<CacheFormatter>(cacheManager, this->mCacheTransBufferManager.get()));
-        };
+        auto makeFormatter = [cacheManager, isMLA, this]()
+        { return createCacheFormatter(cacheManager, mCacheTransBufferManager.get(), isMLA); };
 
         mDataResponder = std::make_unique<DataResponder>(
             std::make_unique<DataSenderImpl>(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter()));
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h
@@ -34,47 +34,6 @@
 namespace tensorrt_llm::batch_manager
 {
 
-// Used to support the data transmission with different layouts and different protocols.
-class IOFormatter
-{
-public:
-    using SizeType32 = tensorrt_llm::runtime::SizeType32;
-    using CacheState = executor::kv_cache::CacheState;
-
-    virtual void formatOutput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
-        = 0;
-
-    virtual void formatInput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
-        = 0;
-
-    /// @brief Determine whether the sender is applicable to the source and target.
-    /// @param selfConfig Source data arrangement.
-    /// @param destConfig Target data arrangement.
-    /// @return Whether the sender is applicable to the source and target.
-    [[nodiscard]] virtual bool inquireSupport(CacheState const& selfConfig, CacheState const& destConfig) const = 0;
-
-    /// @brief Obtain the indies of the counterparts that need to be actually communicated with.
-    /// @param selfConfig Source data arrangement.
-    /// @param selfIdx The sequential index of the current executor process within the entire parallel group.
-    /// @param destConfig Target data arrangement.
-    /// @return The indies of the counterparts.
-    [[nodiscard]] virtual std::vector<SizeType32> getCounterparts(
-        CacheState const& selfConfig, SizeType32 selfIdx, CacheState const& destConfig) const
-        = 0;
-
-    [[nodiscard]] virtual std::vector<executor::kv_cache::Connection const*> pickRecvConnections(
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig) const
-        = 0;
-
-    /// @brief Destructor.
-    virtual ~IOFormatter() = default;
-};
-
 // Used to store the information that needs to be sent to the context executor to ensure the generation
 // executor smoothly receives the data.
 class RequestInfo
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp
@@ -15,11 +15,8 @@
  * limitations under the License.
  */
 
-#include "tensorrt_llm/batch_manager/dataTransceiverImpl.h"
-#include "tensorrt_llm/batch_manager/cacheFormatter.h"
-#include "tensorrt_llm/batch_manager/dataTransceiverImpl.h"
-#include "tensorrt_llm/batch_manager/kvCacheUtils.h"
-#include "tensorrt_llm/batch_manager/mlaCacheFormatter.h"
+#include "dataTransceiverImpl.h"
+
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/executor/cache_transmission/agent_utils/connection.h"
 #include "tensorrt_llm/runtime/utils/mpiUtils.h"
@@ -28,7 +25,7 @@ namespace tensorrt_llm::batch_manager
 {
 
 DataSenderImpl::DataSenderImpl(executor::kv_cache::ConnectionManager* manager,
-    executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr<IOFormatter> formatter)
+    executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter)
     : mManager{manager}
     , mSelfState{std::move(selfCacheState), executor::kv_cache::CommState{manager->getCommState()}}
     , mFormatter(std::move(formatter))
@@ -133,7 +130,7 @@ void DataSenderImpl::release(LlmRequest::RequestIdType requestId)
 }
 
 DataReceiverImpl::DataReceiverImpl(executor::kv_cache::ConnectionManager* manager,
-    executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr<IOFormatter> formatter)
+    executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter)
     : mManager{manager}
     , mSelfState{std::move(selfCacheState), executor::kv_cache::CommState{manager->getCommState()}}
     , mFormatter(std::move(formatter))
@@ -156,23 +153,10 @@ void DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest)
 
     if (!common::getEnvDisableSelectiveCacheTransfer())
     {
-        // TODO: remove IOFormatter and make CacheFormatter new base class
-        auto* cacheFormatter = dynamic_cast<kv_cache_manager::CacheFormatter const*>(mFormatter.get());
-        auto* mlaCacheFormatter = dynamic_cast<kv_cache_manager::MLACacheFormatter const*>(mFormatter.get());
-        if (cacheFormatter != nullptr)
-        {
-            auto* cacheManager = cacheFormatter->getCacheManager();
-            auto blockRange
-                = kv_cache_manager::BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId);
-            requestInfo = RequestInfo(requestId, blockRange.getBlockHashes(), mSelfState);
-        }
-        else if (mlaCacheFormatter != nullptr)
-        {
-            auto* cacheManager = mlaCacheFormatter->getCacheManager();
-            auto blockRange
-                = kv_cache_manager::BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId);
-            requestInfo = RequestInfo(requestId, blockRange.getBlockHashes(), mSelfState);
-        }
+        auto* cacheManager = mFormatter->getCacheManager();
+        auto blockRange
+            = kv_cache_manager::BlockRange::fromNewlyAllocatedBlockIds(*cacheManager, llmRequest.mRequestId);
+        requestInfo = RequestInfo(requestId, blockRange.getBlockHashes(), mSelfState);
     }
 
     auto* agentConnectionManager = dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager);
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h
@@ -17,10 +17,8 @@
 
 #pragma once
 
-#include "tensorrt_llm/batch_manager/cacheTransBuffer.h"
-#include "tensorrt_llm/batch_manager/dataTransceiver.h"
-#include "tensorrt_llm/common/envUtils.h"
-#include "tensorrt_llm/executor/cache_transmission/cacheConcatenate.h"
+#include "cacheFormatter.h"
+#include "dataTransceiver.h"
 
 namespace tensorrt_llm::batch_manager
 {
@@ -37,6 +35,8 @@ struct TransceiverTag
     static constexpr int32_t kINFO_TAG{32};
 };
 
+using BaseCacheFormatter = kv_cache_manager::BaseCacheFormatter;
+
 class DataSenderImpl : public DataSender, public TransceiverTag
 {
 public:
@@ -45,7 +45,7 @@ class DataSenderImpl : public DataSender, public TransceiverTag
         = std::vector<std::pair<executor::kv_cache::Connection const*, executor::DataTransceiverState>>;
 
     DataSenderImpl(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState,
-        SizeType32 selfIndex, std::unique_ptr<IOFormatter> formatter);
+        SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter);
 
     [[nodiscard]] RequestInfo recvRequestInfo() override;
 
@@ -63,7 +63,7 @@ class DataSenderImpl : public DataSender, public TransceiverTag
     executor::kv_cache::ConnectionManager* mManager;
     std::map<LlmRequest::RequestIdType, RequestMapInfo> mRequestToComms;
     executor::DataTransceiverState mSelfState;
-    std::unique_ptr<IOFormatter> mFormatter;
+    std::unique_ptr<BaseCacheFormatter> mFormatter;
     std::mutex mMtxForMap;
     runtime::BufferManager mBufferManager;
 };
@@ -74,7 +74,7 @@ class DataReceiverImpl : public DataReceiver, public TransceiverTag
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
 
     DataReceiverImpl(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState,
-        SizeType32 selfIndex, std::unique_ptr<IOFormatter> formatter);
+        SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter);
 
     void sendRequestInfo(LlmRequest const& llmRequest) override;
 
@@ -99,7 +99,7 @@ class DataReceiverImpl : public DataReceiver, public TransceiverTag
 
     executor::kv_cache::ConnectionManager* mManager;
     executor::DataTransceiverState mSelfState;
-    std::unique_ptr<IOFormatter> mFormatter;
+    std::unique_ptr<BaseCacheFormatter> mFormatter;
     std::unordered_map<std::string, std::unique_ptr<ReceiveCacheResource>> mProcessToResources;
     std::mutex mProcessIoResouceMutex;
 };
diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h
@@ -17,31 +17,16 @@
 
 #pragma once
 
-#include "dataTransceiver.h"
-#include "tensorrt_llm/batch_manager/cacheTransBuffer.h"
-#include "tensorrt_llm/batch_manager/kvCacheManager.h"
-#include "tensorrt_llm/batch_manager/kvCacheUtils.h"
-#include "tensorrt_llm/common/logger.h"
-#include "tensorrt_llm/executor/cache_transmission/cacheConcatenate.h"
-#include "tensorrt_llm/executor/dataTransceiverState.h"
-#include "tensorrt_llm/runtime/bufferManager.h"
-#include "tensorrt_llm/runtime/iTensor.h"
-#include <NvInferRuntimeBase.h>
-#include <condition_variable>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
+#include "cacheFormatter.h"
 
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
 // Simple cache block copy. Because it does not involve data splitting or merging, it performs best when the
 // parallel topology is completely identical, making it the preferred method.
-class MLACacheFormatter final : public IOFormatter
+class MLACacheFormatter final : public BaseCacheFormatter
 {
 public:
-    using CacheState = executor::kv_cache::CacheState;
-
     MLACacheFormatter(BaseKVCacheManager* cacheManager, CacheTransBufferManager* cacheTransBufferManager)
         : mCacheManager{cacheManager}
         , mCacheTransBufferManager{cacheTransBufferManager}
@@ -66,7 +51,7 @@ class MLACacheFormatter final : public IOFormatter
         return executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx).mIRanks;
     }
 
-    [[nodiscard]] BaseKVCacheManager* getCacheManager() const
+    [[nodiscard]] BaseKVCacheManager* getCacheManager() const noexcept override
     {
         return mCacheManager;
     }
@@ -77,7 +62,7 @@ class MLACacheFormatter final : public IOFormatter
         SizeType32 selfIdx, CacheState const& destConfig) const override;
 
 private:
-    BaseKVCacheManager* mCacheManager{};
+    BaseKVCacheManager* mCacheManager;
     CacheTransBufferManager* mCacheTransBufferManager;
 };
 
diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/batch_manager/cacheTransceiverTest.cpp
@@ -734,13 +734,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam<AsymmetricTestPara
                 mConnectionManager = std::make_unique<texec::kv_cache::MpiConnectionManager>(mComm);
             }
 
-            auto makeFormatter = [this]()
-            {
-                return mIsMLA ? std::unique_ptr<IOFormatter>(
-                           std::make_unique<MLACacheFormatter>(mManager.get(), mCacheTransBufferManager.get()))
-                              : std::unique_ptr<IOFormatter>(
-                                  std::make_unique<CacheFormatter>(mManager.get(), mCacheTransBufferManager.get()));
-            };
+            auto makeFormatter
+                = [this]() { return createCacheFormatter(mManager.get(), mCacheTransBufferManager.get(), mIsMLA); };
 
             if (mIsContext)
             {

Original file line number	Diff line number	Diff line change
`@@ -734,13 +734,8 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam<AsymmetricTestPara`
`734`	`734`	`mConnectionManager = std::make_unique<texec::kv_cache::MpiConnectionManager>(mComm);`
`735`	`735`	`}`
`736`	`736`
`737`		`- auto makeFormatter = [this]()`
`738`		`- {`
`739`		`- return mIsMLA ? std::unique_ptr<IOFormatter>(`
`740`		`- std::make_unique<MLACacheFormatter>(mManager.get(), mCacheTransBufferManager.get()))`
`741`		`- : std::unique_ptr<IOFormatter>(`
`742`		`- std::make_unique<CacheFormatter>(mManager.get(), mCacheTransBufferManager.get()));`
`743`		`- };`
	`737`	`+ auto makeFormatter`
	`738`	`+ = [this]() { return createCacheFormatter(mManager.get(), mCacheTransBufferManager.get(), mIsMLA); };`
`744`	`739`
`745`	`740`	`if (mIsContext)`
`746`	`741`	`{`