
Commit 6703df9

Merge branch 'main' into user/williamj/use-flashinfer-mamba-kernel

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>

2 parents: 4ff479f + 9beb971

104 files changed: +5377 −1313 lines

ATTRIBUTIONS-Python.md
Lines changed: 1 addition & 1 deletion

@@ -5261,7 +5261,7 @@ For more information, please refer to <http://unlicense.org>
 - `Tracker`: https://github.com/tox-dev/py-filelock/issues
 
 
-## flashinfer-python (0.6.0)
+## flashinfer-python (0.6.1)
 
 ### Licenses
 License: `Apache-2.0`

README.md
Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ This branch is a prototype and not stable for production use. PRs are not accept
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-13.1.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![torch](https://img.shields.io/badge/torch-2.9.1-green)](https://pytorch.org)
-[![version](https://img.shields.io/badge/release-1.3.0rc0-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.3.0rc1-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
 
 [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)

cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h
Lines changed: 12 additions & 2 deletions

@@ -57,7 +57,10 @@ class BasePeftCacheManager
 public:
     using LlmRequestPtr = std::shared_ptr<LlmRequest>;
     using RequestVector = std::vector<LlmRequestPtr>;
-    using PeftTable = std::map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;
+    using PeftTable = std::unordered_map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;
+    using TaskPeftTable = std::unordered_map<uint64_t, std::vector<runtime::LoraCache::TaskLayerModuleConfig>>;
+    using TaskIdToReqIds = std::unordered_map<uint64_t, std::vector<uint64_t>>;
+    using EnsureBatchTaskResult = std::tuple<TaskPeftTable, TaskIdToReqIds>;
 
     virtual ~BasePeftCacheManager() = default;
 
@@ -99,6 +102,8 @@ class BasePeftCacheManager
 class PeftCacheManager : public BasePeftCacheManager
 {
 public:
+    using EnsureBatchTaskResult = BasePeftCacheManager::EnsureBatchTaskResult;
+
     PeftCacheManager(PeftCacheManagerConfig const& config, runtime::ModelConfig const& modelConfig,
         runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
 
@@ -109,12 +114,17 @@ class PeftCacheManager : public BasePeftCacheManager
     PeftTable ensureBatch(RequestVector const& contextRequests, RequestVector const& generationRequests,
         bool resetGpuCache = false) override;
 
+    EnsureBatchTaskResult ensureBatchMapTaskId(
+        RequestVector const& contextRequests, RequestVector const& generationRequests, bool resetGpuCache = false);
+
     [[nodiscard]] bool isTaskCached(uint64_t taskId) const;
 
     [[nodiscard]] bool isTaskDone(uint64_t taskId) const;
 
     [[nodiscard]] bool isTaskDoneDevice(uint64_t taskId) const;
 
+    [[nodiscard]] bool isTaskCachedDevice(uint64_t const taskId) const;
+
     void resetDeviceCache() override;
 
     void markRequestDone(LlmRequest const& llmReq, bool pause = false) override;
@@ -159,7 +169,7 @@ class PeftCacheManager : public BasePeftCacheManager
     std::unordered_map<uint64_t, std::unordered_set<uint64_t>> mTaskIdToReqIds;
     std::unordered_map<uint64_t, std::unordered_set<uint64_t>> mTaskIdToPausedReqIds;
 
-    std::tuple<std::map<uint64_t, std::future<void>>, std::map<uint64_t, std::vector<uint64_t>>> getTaskMaps(
+    std::tuple<std::unordered_map<uint64_t, std::future<void>>, TaskIdToReqIds> getTaskMaps(
         RequestVector const& contextRequests, RequestVector const& generationRequests);
 
     runtime::ModelConfig mModelConfig;
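The header's containers move from std::map to std::unordered_map: task IDs need fast lookup, not ordered iteration, so the hashed container's average O(1) access is the better fit. Below is a minimal sketch of how the new EnsureBatchTaskResult tuple is consumed via structured bindings; ToyConfig is a hypothetical stand-in for runtime::LoraCache::TaskLayerModuleConfig, whose definition is not part of this diff.

    #include <cstdint>
    #include <tuple>
    #include <unordered_map>
    #include <vector>

    // Hypothetical stand-in for runtime::LoraCache::TaskLayerModuleConfig.
    struct ToyConfig
    {
        int layerId;
    };

    using TaskPeftTable = std::unordered_map<uint64_t, std::vector<ToyConfig>>;
    using TaskIdToReqIds = std::unordered_map<uint64_t, std::vector<uint64_t>>;
    using EnsureBatchTaskResult = std::tuple<TaskPeftTable, TaskIdToReqIds>;

    int main()
    {
        // One LoRA task (id 7) with two layer configs, shared by requests 100 and 101.
        EnsureBatchTaskResult result{{{7, {{0}, {1}}}}, {{7, {100, 101}}}};
        auto const& [taskTable, taskToReqs] = result;
        return taskTable.at(7).size() == 2 && taskToReqs.at(7).size() == 2 ? 0 : 1;
    }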

cpp/include/tensorrt_llm/executor/executor.h
Lines changed: 5 additions & 1 deletion

@@ -684,6 +684,7 @@ class Request
     /// finish reason. The request may exceed this time slightly, but at most by 1 forward pass (in pipeline parallelism
     /// that may involve multiple micro-batches). A request can be timed-out before ever being scheduled.
     /// @param cacheSaltID Salt ID for KV cache blocks to limit the kv cache reuse to the requests with the same string.
+    /// @param disaggRequestId Disaggregated request ID.
     Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming = false,
         SamplingConfig const& samplingConfig = SamplingConfig(), OutputConfig const& outputConfig = OutputConfig(),
         std::optional<SizeType32> const& endId = std::nullopt, std::optional<SizeType32> const& padId = std::nullopt,
@@ -711,7 +712,8 @@ class Request
         std::optional<GuidedDecodingParams> guidedDecodingParams = std::nullopt,
         std::optional<SizeType32> languageAdapterUid = std::nullopt,
         std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
-        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt);
+        std::optional<CacheSaltIDType> cacheSaltID = std::nullopt,
+        std::optional<IdType> disaggRequestId = std::nullopt);
 
     /// @brief This logits postprocessor name will dispatch to the batched logits postprocessor
     static auto constexpr kBatchedPostProcessorName = "batched";
@@ -761,6 +763,7 @@ class Request
     [[nodiscard]] std::optional<MillisecondsType> getAllottedTimeMs() const;
     [[nodiscard]] std::optional<CacheSaltIDType> getCacheSaltID() const;
     [[nodiscard]] std::optional<std::vector<std::string>> getAdditionalOutputNames() const;
+    [[nodiscard]] std::optional<IdType> getDisaggRequestId() const;
 
     void setStreaming(bool streaming);
     void setSamplingConfig(SamplingConfig const& config);
@@ -796,6 +799,7 @@ class Request
     void setLanguageAdapterUid(SizeType32 languageAdapterUid);
     void setAllottedTimeMs(MillisecondsType allottedTimeMs);
     void setCacheSaltID(CacheSaltIDType cacheSaltID);
+    void setDisaggRequestId(IdType disaggRequestId);
 
 private:
     friend class Serialization;
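For callers, the new field is reachable both through the (long) constructor's trailing parameter and through the setter. A minimal usage sketch, assuming only the declarations visible in this diff; the token ids and the id value are illustrative:

    #include "tensorrt_llm/executor/executor.h"

    namespace tle = tensorrt_llm::executor;

    void tagAsDisaggregated(tle::IdType globalId)
    {
        // All constructor parameters after maxTokens are defaulted, so the
        // setter is the simplest way to attach the disaggregated id.
        tle::Request req({1, 2, 3}, /*maxTokens=*/16);
        req.setDisaggRequestId(globalId);
        // getDisaggRequestId() now yields a populated std::optional<IdType>.
    }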

cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp
Lines changed: 30 additions & 10 deletions

@@ -373,11 +373,11 @@ void PeftCacheManager::addRequestPeft(std::shared_ptr<LlmRequest> llmRequest, bo
     TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
-std::tuple<std::map<uint64_t, std::future<void>>, std::map<uint64_t, std::vector<uint64_t>>>
+std::tuple<std::unordered_map<uint64_t, std::future<void>>, BasePeftCacheManager::TaskIdToReqIds>
 PeftCacheManager::getTaskMaps(RequestVector const& contextRequests, RequestVector const& generationRequests)
 {
-    std::map<uint64_t, std::vector<uint64_t>> taskIdToReqIds;
-    std::map<uint64_t, std::future<void>> taskIdToFuture;
+    TaskIdToReqIds taskIdToReqIds;
+    std::unordered_map<uint64_t, std::future<void>> taskIdToFuture;
     std::lock_guard<std::mutex> futuresLock(mPutFuturesMutex);
     for (auto const& requests : {contextRequests, generationRequests})
     {
@@ -415,7 +415,7 @@ PeftCacheManager::getTaskMaps(RequestVector const& contextRequests, RequestVecto
     return {std::move(taskIdToFuture), taskIdToReqIds};
 }
 
-PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
+PeftCacheManager::EnsureBatchTaskResult PeftCacheManager::ensureBatchMapTaskId(
     RequestVector const& contextRequests, RequestVector const& generationRequests, bool resetGpuCache)
 {
     TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
@@ -426,7 +426,7 @@ PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
     auto [taskIdToFuture_, taskIdToReqIds] = getTaskMaps(contextRequests, generationRequests);
     auto taskIdToFuture = std::move(taskIdToFuture_); // captured structured bindings are a C++20 extension
 
-    std::map<uint64_t, std::future<std::vector<runtime::LoraCache::TaskLayerModuleConfig>>> ensureFutures;
+    std::unordered_map<uint64_t, std::future<std::vector<runtime::LoraCache::TaskLayerModuleConfig>>> ensureFutures;
     for (auto& [taskId, taskFuture] : taskIdToFuture)
     {
         auto fn = [&taskIdToFuture, taskId = taskId, this]() -> std::vector<runtime::LoraCache::TaskLayerModuleConfig>
@@ -457,18 +457,31 @@ PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
         ensureFutures.try_emplace(taskId, std::move(f));
     }
 
-    PeftTable peftTable{};
+    TaskPeftTable peftTable{};
     for (auto const& [taskId, reqIds] : taskIdToReqIds)
     {
         auto&& f = ensureFutures.at(taskId);
         auto const values = f.get();
-        for (auto const& reqId : reqIds)
+        peftTable.try_emplace(taskId, values);
+    }
+    TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+    return {std::move(peftTable), std::move(taskIdToReqIds)};
+}
+
+PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
+    RequestVector const& contextRequests, RequestVector const& generationRequests, bool resetGpuCache)
+{
+    auto [taskTable, taskIdToReqIds] = ensureBatchMapTaskId(contextRequests, generationRequests, resetGpuCache);
+    PeftTable requestTable{};
+    for (auto const& [taskId, values] : taskTable)
+    {
+        auto const& reqIds = taskIdToReqIds.at(taskId);
+        for (auto const reqId : reqIds)
         {
-            peftTable.try_emplace(reqId, values);
+            requestTable.try_emplace(reqId, values);
         }
     }
-    TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
-    return peftTable;
+    return requestTable;
 }
 
 bool PeftCacheManager::isTaskCached(uint64_t taskId) const
@@ -486,6 +499,11 @@ bool PeftCacheManager::isTaskDoneDevice(uint64_t taskId) const
     return mDeviceLoraCache->isDone(taskId);
 }
 
+bool PeftCacheManager::isTaskCachedDevice(uint64_t const taskId) const
+{
+    return mDeviceLoraCache->has(taskId);
+}
+
 void PeftCacheManager::updateTaskState(uint64_t taskId, uint64_t reqId, bool terminate, bool pause)
 {
     if (!terminate)
@@ -645,3 +663,5 @@ SizeType32 NoOpPeftCacheManager::determineNumPages(std::shared_ptr<LlmRequest> l
     return 0;
 }
 } // namespace tensorrt_llm::batch_manager
+
+// TODO: merge C++ LoRA caching status with Py Slot manager
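The refactor splits the old ensureBatch() into two stages: ensureBatchMapTaskId() resolves each LoRA task once and returns a task-keyed table plus the task-to-request mapping, and ensureBatch() fans that out into the request-keyed PeftTable. A self-contained sketch of the fan-out stage with toy types (the real value type is std::vector<runtime::LoraCache::TaskLayerModuleConfig>):

    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    using Values = std::vector<int>; // toy stand-in for the per-task layer configs

    // Mirror of the new ensureBatch() fan-out: every request mapped to a task
    // receives a copy of that task's values, so each task is resolved only once
    // even when many requests share it.
    std::unordered_map<uint64_t, Values> expandToRequests(
        std::unordered_map<uint64_t, Values> const& taskTable,
        std::unordered_map<uint64_t, std::vector<uint64_t>> const& taskIdToReqIds)
    {
        std::unordered_map<uint64_t, Values> requestTable;
        for (auto const& [taskId, values] : taskTable)
        {
            for (auto const reqId : taskIdToReqIds.at(taskId))
            {
                requestTable.try_emplace(reqId, values);
            }
        }
        return requestTable;
    }

Keeping the task-keyed view available lets callers that track tasks (rather than requests) skip the expansion entirely.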

cpp/tensorrt_llm/executor/executorImpl.cpp
Lines changed: 5 additions & 6 deletions

@@ -907,7 +907,7 @@ std::vector<IdType> Executor::Impl::enqueueRequests(common::ArrayView<Request co
     auto now = std::chrono::steady_clock::now();
     for (auto const& req : requests)
     {
-        ids.emplace_back(generateReqId());
+        ids.emplace_back(generateReqId(req));
         TLLM_LOG_DEBUG("Enqueue new request with id %d", ids.back());
 
         std::vector<IdType> childReqIds;
@@ -917,7 +917,7 @@ std::vector<IdType> Executor::Impl::enqueueRequests(common::ArrayView<Request co
         childReqIds.reserve(numChildRequests);
         for (int childId = 0; childId < numChildRequests; childId++)
         {
-            childReqIds.emplace_back(generateReqId());
+            childReqIds.emplace_back(generateLocalReqId());
             TLLM_LOG_DEBUG("Add new child request with id %d", childReqIds.back());
         }
     }
@@ -1319,7 +1319,7 @@ std::vector<RequestWithId> Executor::Impl::getLeaderNewReqWithIds(
         return reqWithIds;
     }
 
-    if (mQueuedRequests.front().id == mTerminateReqId)
+    if (mQueuedRequests.front().id == kTerminateReqId)
     {
         reqWithIds.emplace_back(std::move(mQueuedRequests.front()));
         mQueuedRequests.pop_front();
@@ -1468,7 +1468,7 @@ std::tuple<Executor::Impl::RequestList, double> Executor::Impl::fetchNewRequests
     double newActiveRequestsQueueLatencyMS{0.};
     for (auto& reqWithId : reqWithIds)
     {
-        if (reqWithId.id == mTerminateReqId)
+        if (reqWithId.id == kTerminateReqId)
         {
             mShutdown = true;
             mResponsesCv.notify_all();
@@ -2357,7 +2357,6 @@ void Executor::Impl::executionLoop()
             }
         }
     }
-
     if (!activeRequests.empty())
     {
         forwardAsync(activeRequests);
@@ -2411,7 +2410,7 @@ void Executor::Impl::enqueueTerminateRequest()
     {
         std::scoped_lock<std::mutex> lck(mQueuedReqMtx);
         Request dummyReq({1}, 1);
-        RequestWithId reqWithId{std::move(dummyReq), mTerminateReqId};
+        RequestWithId reqWithId{std::move(dummyReq), kTerminateReqId};
         mQueuedRequests.emplace_back(reqWithId);
     }
     mQueuedReqCv.notify_one();
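Beyond threading the request through generateReqId(req), the rename of mTerminateReqId to kTerminateReqId reflects that it is a compile-time constant, not member state. The value 0 acts as a sentinel: enqueueTerminateRequest() pushes a dummy request carrying it through the normal queue, and the consumer shuts down on seeing it. A minimal sketch of that pattern, independent of the executor types:

    #include <cstdint>
    #include <deque>
    #include <iostream>

    using IdType = std::uint64_t;
    static constexpr IdType kTerminateReqId = 0; // sentinel, never a real request id

    int main()
    {
        // The terminate "request" travels through the same queue as real work,
        // so the consumer needs no separate shutdown channel.
        std::deque<IdType> queue{42, 43, kTerminateReqId};
        while (!queue.empty())
        {
            IdType const id = queue.front();
            queue.pop_front();
            if (id == kTerminateReqId)
            {
                std::cout << "shutdown\n";
                break;
            }
            std::cout << "process " << id << "\n";
        }
        return 0;
    }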

cpp/tensorrt_llm/executor/executorImpl.h
Lines changed: 17 additions & 3 deletions

@@ -178,9 +178,20 @@ class Executor::Impl
     void initializeLogitsPostProcessorBatched(LogitsPostProcessorConfig const& logitsProcConfig);
 
-    IdType generateReqId()
+    IdType generateReqId(Request const& request)
     {
-        return (mLastReqId++ % UINT64_MAX);
+        // If the request has a disaggregated request id, prefer it.
+        if (request.getDisaggRequestId().has_value() && request.getDisaggRequestId().value() > kMaxLocalReqId)
+        {
+            return request.getDisaggRequestId().value();
+        }
+        // Otherwise, generate a local request id in range [1, kMaxLocalReqId).
+        return generateLocalReqId();
+    }
+
+    IdType generateLocalReqId()
+    {
+        return (mLastReqId++ % kMaxLocalReqId);
     }
 
     std::vector<RequestWithId> getLeaderNewReqWithIds(
@@ -315,7 +326,10 @@ class Executor::Impl
 
     IdType mLastReqId = 1;
 
-    static constexpr IdType mTerminateReqId = 0;
+    static constexpr IdType kTerminateReqId = 0;
+    // Request id > kMaxLocalReqId is reserved for disaggregated requests.
+    // This max ID is also in Python side.
+    static constexpr IdType kMaxLocalReqId = 1ULL << 42U;
 
     BatchingType mBatchingType;
     bool mIsSchedulerMaxUtilization;
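The id space is now partitioned at 2^42: generateLocalReqId() wraps below kMaxLocalReqId, while ids above the boundary are reserved for disaggregated requests that arrive with an id already assigned (the Python side must mirror the same constant). A small sketch of the boundary logic; isDisaggregatedId is a hypothetical helper mirroring the check inside generateReqId():

    #include <cassert>
    #include <cstdint>

    using IdType = std::uint64_t;
    static constexpr IdType kMaxLocalReqId = 1ULL << 42U;

    // Locally generated ids always wrap below the boundary...
    inline IdType nextLocalId(IdType& last)
    {
        return last++ % kMaxLocalReqId;
    }

    // ...so any id above it can be recognized as disaggregated (hypothetical helper).
    inline bool isDisaggregatedId(IdType id)
    {
        return id > kMaxLocalReqId;
    }

    int main()
    {
        IdType last = 1;
        assert(!isDisaggregatedId(nextLocalId(last)));
        assert(isDisaggregatedId((1ULL << 50U) + 7));
        return 0;
    }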

cpp/tensorrt_llm/executor/request.cpp
Lines changed: 19 additions & 8 deletions

@@ -40,7 +40,8 @@ Request::Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming,
     std::optional<SizeType32> encoderOutputLength, std::optional<Tensor> crossAttentionMask,
     SizeType32 numReturnSequences, std::optional<EagleConfig> eagleConfig, std::optional<Tensor> skipCrossAttnBlocks,
     std::optional<GuidedDecodingParams> guidedDecodingParams, std::optional<SizeType32> languageAdapterUid,
-    std::optional<MillisecondsType> allottedTimeMs, std::optional<CacheSaltIDType> cacheSaltID)
+    std::optional<MillisecondsType> allottedTimeMs, std::optional<CacheSaltIDType> cacheSaltID,
+    std::optional<IdType> disaggRequestId)
     : mImpl(std::make_unique<Impl>(std::move(inputTokenIds), maxTokens, streaming, samplingConfig, outputConfig, endId,
         padId, std::move(positionIds), std::move(badWords), std::move(stopWords), std::move(embeddingBias),
         std::move(externalDraftTokensConfig), std::move(pTuningConfig), std::move(multimodalInput),
@@ -49,7 +50,7 @@ Request::Request(VecTokens inputTokenIds, SizeType32 maxTokens, bool streaming,
         std::move(encoderInputTokenIds), clientId, returnAllGeneratedTokens, priority, type,
         std::move(contextPhaseParams), std::move(encoderInputFeatures), encoderOutputLength, crossAttentionMask,
         numReturnSequences, eagleConfig, skipCrossAttnBlocks, std::move(guidedDecodingParams), languageAdapterUid,
-        allottedTimeMs, cacheSaltID))
+        allottedTimeMs, cacheSaltID, disaggRequestId))
 {
 }
 
@@ -253,6 +254,11 @@ std::optional<CacheSaltIDType> Request::getCacheSaltID() const
     return mImpl->getCacheSaltID();
 }
 
+std::optional<IdType> Request::getDisaggRequestId() const
+{
+    return mImpl->getDisaggRequestId();
+}
+
 void Request::setStreaming(bool streaming)
 {
     mImpl->setStreaming(streaming);
@@ -310,12 +316,12 @@ void Request::setPromptTuningConfig(PromptTuningConfig const& pTuningConfig)
 
 void Request::setMultimodalEmbedding(Tensor const& multimodalEmbedding)
 {
-    return mImpl->setMultimodalEmbedding(multimodalEmbedding);
+    mImpl->setMultimodalEmbedding(multimodalEmbedding);
 }
 
 void Request::setMultimodalInput(MultimodalInput const& multimodalInput)
 {
-    return mImpl->setMultimodalInput(multimodalInput);
+    mImpl->setMultimodalInput(multimodalInput);
 }
 
 void Request::setMropeConfig(MropeConfig const& mRopeConfig)
@@ -400,7 +406,7 @@ void Request::setEagleConfig(std::optional<EagleConfig> const& eagleConfig)
 
 void Request::setSkipCrossAttnBlocks(Tensor skipCrossAttnBlocks)
 {
-    return mImpl->setSkipCrossAttnBlocks(skipCrossAttnBlocks);
+    mImpl->setSkipCrossAttnBlocks(skipCrossAttnBlocks);
 }
 
 void Request::setGuidedDecodingParams(GuidedDecodingParams const& guidedDecoding
@@ -410,16 +416,21 @@ void Request::setGuidedDecodingParams(GuidedDecodingParams const& guidedDecoding
 void Request::setAllottedTimeMs(MillisecondsType allottedTimeMs)
 {
-    return mImpl->setAllottedTimeMs(allottedTimeMs);
+    mImpl->setAllottedTimeMs(allottedTimeMs);
 }
 
 void Request::setLanguageAdapterUid(SizeType32 languageAdapterUid)
 {
-    return mImpl->setLanguageAdapterUid(languageAdapterUid);
+    mImpl->setLanguageAdapterUid(languageAdapterUid);
 }
 
 void Request::setCacheSaltID(CacheSaltIDType cacheSaltID)
 {
-    return mImpl->setCacheSaltID(cacheSaltID);
+    mImpl->setCacheSaltID(cacheSaltID);
+}
+
+void Request::setDisaggRequestId(IdType disaggRequestId)
+{
+    mImpl->setDisaggRequestId(disaggRequestId);
 }
 } // namespace tensorrt_llm::executor
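Alongside the disaggRequestId plumbing, the diff drops the stray `return` in front of the void mImpl setter calls; returning the result of a void expression compiles, but wrongly suggests a value is produced. A minimal sketch of the pimpl forwarding involved, under the assumption (not shown in the diff) that Impl simply stores the optional id:

    #include <cstdint>
    #include <memory>
    #include <optional>

    using IdType = std::uint64_t;

    // Toy version of the Request pimpl: public accessors forward to a hidden
    // Impl that owns the state.
    class ToyRequest
    {
    public:
        ToyRequest()
            : mImpl(std::make_unique<Impl>())
        {
        }

        void setDisaggRequestId(IdType id)
        {
            mImpl->mDisaggRequestId = id; // void call: no `return` in front
        }

        std::optional<IdType> getDisaggRequestId() const
        {
            return mImpl->mDisaggRequestId;
        }

    private:
        struct Impl
        {
            std::optional<IdType> mDisaggRequestId;
        };

        std::unique_ptr<Impl> mImpl;
    };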
