Skip to content

Commit 1d61b14

Browse files
committed
This is a combination of 16 commits.
Squashed commit messages (all Signed-off-by: Jiayu Chang <jiayuc@nvidia.com>):

1. Implement CUDA Graph-compatible multi LoRAs
2. Refactor CUDA Graph LoRA integration to support precomputed leading dimensions:
   - Updated `cuda_graph_grouped_gemm` and `cuda_graph_splitk_grouped_gemm` functions to accept leading dimension pointers for A, B, C, and D matrices.
   - Modified `LoraImpl` to retrieve and pass leading dimension pointers during GEMM operations.
   - Enhanced `CudaGraphLoraParams` to manage leading dimensions for each layer and module.
   - Adjusted `CudaGraphLoraManager` to initialize parameters based on actual layer configurations from the PEFT table.
   - Improved handling of layer-specific parameters to ensure compatibility with CUDA Graph operations.
   This refactor aims to optimize performance by leveraging precomputed leading dimensions, reducing overhead during GEMM execution.
3. Bug fixes
4. Move input prep to graph
5. Fix bug in adapter size
6. Pass all but `test_llama_7b_lora_config_overrides_peft_cache_config` on L40s. (Graph seems to capture code outside of the captured function?)
7. Pass all tests
8. Sync slot manager with C++
9. Update kernel alignment selection
10. Fix kernel workspace sizes
11. memcpy: use pinned memory; remove assert in slot manager eviction
12. Add fused param-fill kernel
13. Disable torch NVTX emit
14. Disable init manager without CUDA graph
15. Update CI
16. Moved files
1 parent 2695d70 commit 1d61b14

File tree

24 files changed

+3343
-137
lines changed

24 files changed

+3343
-137
lines changed

cpp/include/tensorrt_llm/batch_manager/peftCacheManager.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ class PeftCacheManager : public BasePeftCacheManager
115115

116116
[[nodiscard]] bool isTaskDoneDevice(uint64_t taskId) const;
117117

118+
[[nodiscard]] bool isTaskCachedDevice(uint64_t const taskId) const;
119+
118120
void resetDeviceCache() override;
119121

120122
void markRequestDone(LlmRequest const& llmReq, bool pause = false) override;

cpp/tensorrt_llm/batch_manager/peftCacheManager.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -462,10 +462,8 @@ PeftCacheManager::PeftTable PeftCacheManager::ensureBatch(
462462
{
463463
auto&& f = ensureFutures.at(taskId);
464464
auto const values = f.get();
465-
for (auto const& reqId : reqIds)
466-
{
467-
peftTable.try_emplace(reqId, values);
468-
}
465+
// Map task_id to layer-module-configs instead of request_id to layer-module-configs
466+
peftTable.try_emplace(taskId, values);
469467
}
470468
TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
471469
return peftTable;
@@ -486,6 +484,11 @@ bool PeftCacheManager::isTaskDoneDevice(uint64_t taskId) const
486484
return mDeviceLoraCache->isDone(taskId);
487485
}
488486

487+
bool PeftCacheManager::isTaskCachedDevice(uint64_t const taskId) const
488+
{
489+
return mDeviceLoraCache->has(taskId);
490+
}
491+
489492
void PeftCacheManager::updateTaskState(uint64_t taskId, uint64_t reqId, bool terminate, bool pause)
490493
{
491494
if (!terminate)
@@ -645,3 +648,5 @@ SizeType32 NoOpPeftCacheManager::determineNumPages(std::shared_ptr<LlmRequest> l
645648
return 0;
646649
}
647650
} // namespace tensorrt_llm::batch_manager
651+
652+
// TODO: merge C++ LoRA caching status with Py Slot manager

cpp/tensorrt_llm/kernels/cuda_graph_grouped_gemm.cu

Lines changed: 382 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include "cutlass/gemm_coord.h"
20+
#include <NvInferRuntime.h>
21+
#include <cuda_runtime.h>
22+
23+
namespace tensorrt_llm
24+
{
25+
namespace kernels
26+
{
27+
28+
/**
29+
* @brief CUDA Graph compatible wrapper for grouped GEMM operations.
30+
*
31+
* This function accepts GPU pointers directly without any workspace for parameters,
32+
* making it fully compatible with CUDA Graph capture and replay.
33+
*
34+
* @param problem_sizes_ptr GPU pointer to array of cutlass::gemm::GemmCoord
35+
* @param problem_count Number of GEMM problems
36+
* @param ptrA_gpu GPU pointer to array of A matrix pointers
37+
* @param ptrB_gpu GPU pointer to array of B matrix pointers
38+
* @param ptrC_gpu GPU pointer to array of C matrix pointers (can be nullptr)
39+
* @param ptrD_gpu GPU pointer to array of D matrix pointers
40+
* @param isLoraIn Whether this is for LoRA input transformation
41+
* @param dataType Data type of the matrices
42+
* @param minKN Minimum K*N value for kernel selection
43+
* @param stream CUDA stream
44+
*/
45+
void cuda_graph_grouped_gemm(cutlass::gemm::GemmCoord* problem_sizes_ptr, int problem_count, void** ptrA_gpu,
46+
void** ptrB_gpu, void** ptrC_gpu, void** ptrD_gpu, int64_t* lda_gpu, int64_t* ldb_gpu, int64_t* ldc_gpu,
47+
int64_t* ldd_gpu, bool isLoraIn, nvinfer1::DataType dataType, int minKN,
48+
cutlass::gemm::GemmCoord* host_max_problem_sizes_ptr, cudaStream_t stream);
49+
50+
/**
51+
* @brief CUDA Graph compatible wrapper for split-K grouped GEMM operations.
52+
*
53+
* Similar to cuda_graph_grouped_gemm but uses split-K algorithm for better
54+
* performance with certain problem sizes. No parameter workspace needed.
55+
*/
56+
void cuda_graph_splitk_grouped_gemm(cutlass::gemm::GemmCoord* problem_sizes_ptr, int problem_count, void** ptrA_gpu,
57+
void** ptrB_gpu, void** ptrC_gpu, void** ptrD_gpu, int64_t* lda_gpu, int64_t* ldb_gpu, int64_t* ldc_gpu,
58+
int64_t* ldd_gpu, bool isLoraIn, nvinfer1::DataType dataType, int splitKSlices, int minKN,
59+
cutlass::gemm::GemmCoord* host_max_problem_sizes_ptr, int64_t* splitk_offsets_gpu, cudaStream_t stream);
60+
61+
} // namespace kernels
62+
} // namespace tensorrt_llm

cpp/tensorrt_llm/kernels/lora/lora.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@ int LoraImpl::run(int64_t numTokens, int64_t numReqs, void const* input, int32_t
296296
+ (loraModuleIdx * numTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
297297

298298
auto const N2 = mOutHiddenSizes[loraModuleIdx];
299-
cutlass::gemm::GemmCoord problem_2(M, N2, N);
299+
cutlass::gemm::GemmCoord problem_2(M, N2, N); // token_num, module_output_size, lora_rank
300300
problem_sizes_2.push_back(problem_2);
301301
ptrA_2.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
302302
+ (loraModuleIdx * numTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));

0 commit comments

Comments (0)