Skip to content

Commit 1dc49b2

Browse files
authored
[https://nvbugs/5322131][feat] Multi-LoRA serving with CUDA Graph (#8279)
Signed-off-by: Jiayu Chang <jiayuc@nvidia.com>
1 parent cdb9ffd commit 1dc49b2

File tree

25 files changed

+2766
-172
lines changed

25 files changed

+2766
-172
lines changed

cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,10 @@ class BasePeftCacheManager
5757
public:
5858
using LlmRequestPtr = std::shared_ptr<LlmRequest>;
5959
using RequestVector = std::vector<LlmRequestPtr>;
60-
using PeftTable = std::map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;
60+
using PeftTable = std::unordered_map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;
61+
using TaskPeftTable = std::unordered_map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;
62+
using TaskIdToReqIds = std::unordered_map<uint64_t, std::vector<uint64_t>>;
63+
using EnsureBatchTaskResult = std::tuple<TaskPeftTable, TaskIdToReqIds>;
6164

6265
virtual ~BasePeftCacheManager() = default;
6366

@@ -99,6 +102,8 @@ class BasePeftCacheManager
99102
class PeftCacheManager : public BasePeftCacheManager
100103
{
101104
public:
105+
using EnsureBatchTaskResult = BasePeftCacheManager::EnsureBatchTaskResult;
106+
102107
PeftCacheManager(PeftCacheManagerConfig const& config, runtime::ModelConfig const& modelConfig,
103108
runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
104109

@@ -109,12 +114,17 @@ class PeftCacheManager : public BasePeftCacheManager
109114
PeftTable ensureBatch(RequestVector const& contextRequests, RequestVector const& generationRequests,
110115
bool resetGpuCache = false) override;
111116

117+
EnsureBatchTaskResult ensureBatchMapTaskId(
118+
RequestVector const& contextRequests, RequestVector const& generationRequests, bool resetGpuCache = false);
119+
112120
[[nodiscard]] bool isTaskCached(uint64_t taskId) const;
113121

114122
[[nodiscard]] bool isTaskDone(uint64_t taskId) const;
115123

116124
[[nodiscard]] bool isTaskDoneDevice(uint64_t taskId) const;
117125

126+
[[nodiscard]] bool isTaskCachedDevice(uint64_t const taskId) const;
127+
118128
void resetDeviceCache() override;
119129

120130
void markRequestDone(LlmRequest const& llmReq, bool pause = false) override;
@@ -159,7 +169,7 @@ class PeftCacheManager : public BasePeftCacheManager
159169
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> mTaskIdToReqIds;
160170
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> mTaskIdToPausedReqIds;
161171

162-
std::tuple<std::map<uint64_t, std::future<void>>, std::map<uint64_t, std::vector<uint64_t>>> getTaskMaps(
172+
std::tuple<std::unordered_map<uint64_t, std::future<void>>, TaskIdToReqIds> getTaskMaps(
163173
RequestVector const& contextRequests, RequestVector const& generationRequests);
164174

165175
runtime::ModelConfig mModelConfig;

cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -373,11 +373,11 @@ void PeftCacheManager::addRequestPeft(std::shared_ptr<LlmRequest> llmRequest, bo
373373
TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
374374
}
375375

376-
std::tuple<std::map<uint64_t, std::future<void>>, std::map<uint64_t, std::vector<uint64_t>>>
376+
std::tuple<std::unordered_map<uint64_t, std::future<void>>, BasePeftCacheManager::TaskIdToReqIds>
377377
PeftCacheManager::getTaskMaps(RequestVector const& contextRequests, RequestVector const& generationRequests)
378378
{
379-
std::map<uint64_t, std::vector<uint64_t>> taskIdToReqIds;
380-
std::map<uint64_t, std::future<void>> taskIdToFuture;
379+
TaskIdToReqIds taskIdToReqIds;
380+
std::unordered_map<uint64_t, std::future<void>> taskIdToFuture;
381381
std::lock_guard<std::mutex> futuresLock(mPutFuturesMutex);
382382
for (auto const& requests : {contextRequests, generationRequests})
383383
{
@@ -415,7 +415,7 @@ PeftCacheManager::getTaskMaps(RequestVector const& contextRequests, RequestVecto
415415
return {std::move(taskIdToFuture), taskIdToReqIds};
416416
}
417417

418-
PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
418+
PeftCacheManager::EnsureBatchTaskResult PeftCacheManager::ensureBatchMapTaskId(
419419
RequestVector const& contextRequests, RequestVector const& generationRequests, bool resetGpuCache)
420420
{
421421
TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
@@ -426,7 +426,7 @@ PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
426426
auto [taskIdToFuture_, taskIdToReqIds] = getTaskMaps(contextRequests, generationRequests);
427427
auto taskIdToFuture = std::move(taskIdToFuture_); // captured structured bindings are a C++20 extension
428428

429-
std::map<uint64_t, std::future<std::vector<runtime::LoraCache::TaskLayerModuleConfig>>> ensureFutures;
429+
std::unordered_map<uint64_t, std::future<std::vector<runtime::LoraCache::TaskLayerModuleConfig>>> ensureFutures;
430430
for (auto& [taskId, taskFuture] : taskIdToFuture)
431431
{
432432
auto fn = [&taskIdToFuture, taskId = taskId, this]() -> std::vector<runtime::LoraCache::TaskLayerModuleConfig>
@@ -457,18 +457,31 @@ PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
457457
ensureFutures.try_emplace(taskId, std::move(f));
458458
}
459459

460-
PeftTable peftTable{};
460+
TaskPeftTable peftTable{};
461461
for (auto const& [taskId, reqIds] : taskIdToReqIds)
462462
{
463463
auto&& f = ensureFutures.at(taskId);
464464
auto const values = f.get();
465-
for (auto const& reqId : reqIds)
465+
peftTable.try_emplace(taskId, values);
466+
}
467+
TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
468+
return {std::move(peftTable), std::move(taskIdToReqIds)};
469+
}
470+
471+
/// @brief Builds the per-request PEFT table for a batch.
///
/// Delegates to ensureBatchMapTaskId() to obtain the per-task LoRA
/// layer/module configs together with the task-id -> request-ids mapping,
/// then fans each task-level entry out to every request that uses that task,
/// so callers receive a table keyed by request id.
///
/// @param contextRequests context-phase requests of the batch
/// @param generationRequests generation-phase requests of the batch
/// @param resetGpuCache forwarded to ensureBatchMapTaskId()
/// @return map from request id to its task's TaskLayerModuleConfig list
PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
    RequestVector const& contextRequests, RequestVector const& generationRequests, bool resetGpuCache)
{
    auto [perTaskConfigs, taskToRequests] = ensureBatchMapTaskId(contextRequests, generationRequests, resetGpuCache);

    PeftTable perRequestConfigs{};
    for (auto const& [taskId, configs] : perTaskConfigs)
    {
        // Expand the task-level entry to every request sharing this LoRA task.
        for (auto const requestId : taskToRequests.at(taskId))
        {
            perRequestConfigs.try_emplace(requestId, configs);
        }
    }
    return perRequestConfigs;
}
473486

474487
bool PeftCacheManager::isTaskCached(uint64_t taskId) const
@@ -486,6 +499,11 @@ bool PeftCacheManager::isTaskDoneDevice(uint64_t taskId) const
486499
return mDeviceLoraCache->isDone(taskId);
487500
}
488501

502+
/// @brief Returns whether the given LoRA task is present in the device (GPU) cache.
/// Presence check only (mDeviceLoraCache->has); presumably weaker than
/// isTaskDoneDevice(), which queries isDone() — confirm semantics in LoraCache.
bool PeftCacheManager::isTaskCachedDevice(uint64_t const taskId) const
{
    return mDeviceLoraCache->has(taskId);
}
506+
489507
void PeftCacheManager::updateTaskState(uint64_t taskId, uint64_t reqId, bool terminate, bool pause)
490508
{
491509
if (!terminate)
@@ -645,3 +663,5 @@ SizeType32 NoOpPeftCacheManager::determineNumPages(std::shared_ptr<LlmRequest> l
645663
return 0;
646664
}
647665
} // namespace tensorrt_llm::batch_manager
666+
667+
// TODO: merge C++ LoRA caching status with Py Slot manager

0 commit comments

Comments (0)