niukuo
diff --git a/‎.github/workflows/blossom-ci.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/blossom-ci.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.gitmodules‎
Lines changed: 7 additions & 7 deletions b/‎.gitmodules‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/cpp/disaggServerBenchmark.cpp‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/cpp/disaggServerBenchmark.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/cpp/gptManagerBenchmark.cpp‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/cpp/gptManagerBenchmark.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/GptManager.h‎
Lines changed: 1 addition & 0 deletions b/‎cpp/include/tensorrt_llm/batch_manager/GptManager.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h‎
Lines changed: 12 additions & 5 deletions b/‎cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h‎
Lines changed: 12 additions & 5 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h‎
Lines changed: 8 additions & 1 deletion b/‎cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/handleGenerationLogits.h‎
Lines changed: 3 additions & 3 deletions b/‎cpp/include/tensorrt_llm/batch_manager/handleGenerationLogits.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 27 additions & 2 deletions b/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 27 additions & 2 deletions
@@ -25,7 +25,7 @@ on:
             required: true
           test_result:
             description: 'test result'
-            required: false
+            required: true
           test_results_url:
             description: 'test results url'
             required: true
@@ -38,7 +38,7 @@ jobs:
 
     # This job only runs for pull request comments
     if: |
-         startsWith( github.event.comment.body, '/bot' ) && contains('["niukuo", "tburt-nv"]', github.actor)
+         startsWith( github.event.comment.body, '/bot' ) && contains('["chzblych", "tburt-nv", "niukuo"]', github.actor)
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
@@ -81,7 +81,7 @@ jobs:
           CI_SERVER: ${{ secrets.CI_SERVER }}
           REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-  Upload-Test:
+  Upload-results:
     name: Upload test results
     runs-on: linux-amd64-cpu4
     if: github.event_name == 'workflow_dispatch'
 
@@ -1,22 +1,22 @@
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
-	url = https://github.com/NVIDIA/cutlass.git
+	url = https://gitlab-master.nvidia.com/ftp/GitHubSync/cutlass-mirror.git
 [submodule "3rdparty/json"]
 	path = 3rdparty/json
-	url = https://github.com/nlohmann/json.git
+	url = https://gitlab-master.nvidia.com/ftp/GitHubSync/json-mirror.git
 [submodule "3rdparty/cxxopts"]
 	path = 3rdparty/cxxopts
-	url = https://github.com/jarro2783/cxxopts
+	url = https://gitlab-master.nvidia.com/ftp/GitHubSync/cxxopts-mirror.git
 	branch = v3.1.1
 [submodule "3rdparty/NVTX"]
 	path = 3rdparty/NVTX
-	url = https://github.com/NVIDIA/NVTX.git
+	url = https://gitlab-master.nvidia.com/ftp/GitHubSync/NVTX-mirror.git
 [submodule "3rdparty/ucxx"]
 	path = 3rdparty/ucxx
-	url = https://github.com/rapidsai/ucxx.git
+	url = https://gitlab-master.nvidia.com/ftp/GitHubSync/ucxx.git
 [submodule "3rdparty/pybind11"]
 	path = 3rdparty/pybind11
-	url = https://github.com/pybind/pybind11.git
+	url = https://gitlab-master.nvidia.com/ftp/GitHubSync/pybind11.git
 [submodule "3rdparty/xgrammar"]
 	path = 3rdparty/xgrammar
-	url = https://github.com/mlc-ai/xgrammar.git
+	url = https://gitlab-master.nvidia.com/ftp/GitHubSync/xgrammar.git
@@ -21,7 +21,7 @@ TensorRT-LLM
 * [2025/02/25] 🌟 DeepSeek-R1 performance now optimized for Blackwell [➡️ link](https://huggingface.co/nvidia/DeepSeek-R1-FP4)
 <div align="center">
 <img src="docs/source/media/r1-perf.jpeg" width="75%">
-  
+
   <sub><sup>HGX B200 (8 GPUs) vs HGX H200 (8 GPUs) vs 2 x HGX H100 (normalized to 8 GPUs for comparison). Input tokens not included in TPS calculations. TensorRT-LLM Version:  0.18.0.dev2025021800 (pre-release) used for Feb measurements, SGLang used for Jan measurements. Hopper numbers in FP8. B200 numbers in FP4. Max concurrency use case. ISL/OSL: 1K/1K.</sup></sub>
 <div align="left">
 
 
@@ -527,6 +527,7 @@ texec::Request makeExecutorContextRequest(Sample const& sample, SizeType32 const
             lookaheadConfig, // lookaheadConfig
             std::nullopt,    // kvCacheRetentionConfig
             std::nullopt,    // logitsPostProcessorName
+            std::nullopt,    // logitsPostProcessor
             encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
     request.setRequestType(tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_ONLY);
     return request;
 
@@ -833,6 +833,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
         lookaheadConfig, // lookaheadConfig
         std::nullopt,    // kvCacheRetentionConfig
         std::nullopt,    // logitsPostProcessorName
+        std::nullopt,    // logitsPostProcessor
         encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
 }
 
 
@@ -94,6 +94,7 @@ class [[deprecated("Use the executor API instead.")]] GptManager
     [[nodiscard]] SizeType32 getMaxSequenceLen() const;
     [[nodiscard]] SizeType32 getMaxNumSequences() const;
     [[nodiscard]] SizeType32 getMaxDraftLen() const;
+    [[nodiscard]] SizeType32 getVocabSizePadded() const;
 
     void validateLlmRequest(
         LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;
 
@@ -41,7 +41,9 @@ class CacheTransceiverFactory
 public:
     static std::unique_ptr<BaseCacheTransceiver> createCacheTransceiver(
         kv_cache_manager::BaseKVCacheManager* cacheManager, runtime::ModelConfig const& modelConfig,
-        runtime::WorldConfig const& worldConfig);
+        runtime::WorldConfig const& worldConfig,
+        executor::kv_cache::CacheState::AttentionType attentionType
+        = executor::kv_cache::CacheState::AttentionType::kDEFAULT);
 };
 
 class BaseCacheTransceiver
@@ -75,14 +77,18 @@ class CacheTransceiver : public BaseCacheTransceiver
 
     CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
         executor::kv_cache::CacheState::ModelConfig const& cacheStateModelCfg, runtime::WorldConfig const& worldConfig,
-        nvinfer1::DataType dataType);
+        nvinfer1::DataType dataType,
+        executor::kv_cache::CacheState::AttentionType attentionType
+        = executor::kv_cache::CacheState::AttentionType::kDEFAULT);
 
     CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
         std::vector<SizeType32> numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        runtime::WorldConfig const& worldConfig, nvinfer1::DataType dataType)
+        runtime::WorldConfig const& worldConfig, nvinfer1::DataType dataType,
+        executor::kv_cache::CacheState::AttentionType attentionType
+        = executor::kv_cache::CacheState::AttentionType::kDEFAULT)
         : CacheTransceiver(cacheManager, commType,
             executor::kv_cache::CacheState::ModelConfig{numKvHeadsPerLayer, sizePerHead, tokensPerBlock}, worldConfig,
-            dataType)
+            dataType, attentionType)
     {
     }
 
@@ -113,7 +119,8 @@ class CacheTransceiver : public BaseCacheTransceiver
     std::map<LlmRequest*, std::future<void>> mResponderFutures;
     std::vector<std::pair<LlmRequest*, std::future<void>>> mRequesterFutures;
     mpi::MpiComm const *mMpiGroupComm{}, *mMpiWorldComm{};
-    std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm;
+    std::shared_ptr<mpi::MpiComm> mMpiGroupTensorParaComm, mMpiGroupPipeParaComm, mMpiGroupDataComm,
+        mMpiGroupTPInDPComm;
     executor::kv_cache::CommState const* mCommState;
     std::unique_ptr<executor::kv_cache::CacheState> mCacheState;
     std::unique_ptr<executor::kv_cache::ConnectionManager> mManager;
 
@@ -41,11 +41,18 @@ class DecoderInputBuffers
     using SizeType32 = runtime::SizeType32;
     using TensorPtr = runtime::ITensor::SharedPtr;
 
-    explicit DecoderInputBuffers(SizeType32 maxBatchSize, SizeType32 maxTokensPerEngineStep);
+    explicit DecoderInputBuffers(
+        SizeType32 maxBatchSize, SizeType32 maxTokensPerEngineStep, runtime::BufferManager const& manager);
 
+    // buffers for setup
     TensorPtr setupBatchSlots;
     TensorPtr inputsIds;
 
+    // buffers for forward
+    TensorPtr forwardBatchSlotsRequestOrder;
+    TensorPtr forwardBatchSlotsRequestOrderDevice;
+    TensorPtr fillValues;
+    TensorPtr fillValuesDevice;
     TensorPtr forwardBatchSlots;
 };
 
 
@@ -41,9 +41,9 @@ class HandleGenerationLogits : Algorithm
 
     HandleGenerationLogits() = default;
 
-    void operator()(tr::SizeType32 logitsIndex, RequestVector const& contextRequests,
-        RequestVector const& generationRequests, RuntimeBuffers const& genRuntimeBuffers,
-        DecoderBuffers& decoderBuffers, tr::ModelConfig const& modelConfig, runtime::TllmRuntime const& runtime) const;
+    void operator()(tr::SizeType32 logitsIndex, RequestVector const& generationRequests,
+        RuntimeBuffers& genRuntimeBuffers, DecoderBuffers& decoderBuffers, tr::ModelConfig const& modelConfig,
+        runtime::TllmRuntime const& runtime) const;
 };
 
 } // namespace tensorrt_llm::batch_manager
@@ -491,11 +491,14 @@ class BlockManager
 
     void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx);
 
+    //! \brief Get the ids of all newly allocated (not reused) blocks for the sequence.
+    std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(GenerationRequest const& sequence) const;
+
     //! \brief Release blocks of the sequence. Store blocks for reuse if llmRequest is provided.
     void releaseBlocks(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt);
 
     //! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks
-    void schedulingReleaseBlocks(GenerationRequest& sequence);
+    void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
 
     //! \brief Release last block in the sequence
     void releaseLastBlock(GenerationRequest& sequence);
@@ -658,6 +661,11 @@ class BlockManager
 
     [[nodiscard]] static bool blockInRadixTree(BlockPtr const& block);
 
+    [[nodiscard]] bool isEnableHashKey() const
+    {
+        return mEnableHashKey;
+    }
+
 private:
     //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
     void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -849,6 +857,7 @@ class BaseKVCacheManager
     virtual void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) = 0;
 
     [[nodiscard]] virtual GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const = 0;
+    [[nodiscard]] virtual GenerationRequest& getSequence(LlmRequest::RequestIdType requestId) = 0;
 
     [[nodiscard]] virtual bool isCrossKv() const = 0;
 
@@ -872,6 +881,10 @@ class BaseKVCacheManager
         std::vector<LlmRequest::RequestIdType> const& requestIds) const
         = 0;
 
+    [[nodiscard]] virtual std::vector<KVCacheBlock::IdType> getNewlyAllocatedBlockIds(
+        LlmRequest::RequestIdType requestId) const
+        = 0;
+
     [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
     [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
 
@@ -904,6 +917,8 @@ class BaseKVCacheManager
     /// @param outputLength The number of output tokens in each sequence in the batch.
     /// @return SizeType32 A number of sequences per batch.
     [[nodiscard]] virtual SizeType32 getMaxCapacityBatchSize(SizeType32 inputLength, SizeType32 outputLength) const = 0;
+
+    [[nodiscard]] virtual CacheType getCacheType() const = 0;
 };
 
 class KVCacheManager : public BaseKVCacheManager
@@ -935,7 +950,7 @@ class KVCacheManager : public BaseKVCacheManager
         SizeType32 sinkTokenLength, CudaStreamPtr stream, std::optional<SizeType32> maxSequenceLength,
         bool enableBlockReuse = true, bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
-        std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
+        std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);
 
     KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
         SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
@@ -1100,12 +1115,18 @@ class KVCacheManager : public BaseKVCacheManager
     void rewindKVCache(LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override;
 
     [[nodiscard]] GenerationRequest const& getSequence(LlmRequest::RequestIdType requestId) const override;
+    [[nodiscard]] GenerationRequest& getSequence(LlmRequest::RequestIdType requestId) override;
 
     [[nodiscard]] bool isCrossKv() const override
     {
         return mBlockManager.getCacheType() == CacheType::kCROSS;
     }
 
+    [[nodiscard]] CacheType getCacheType() const override
+    {
+        return mBlockManager.getCacheType();
+    }
+
     //! \brief Find first new block that must be allocated for context phase and return its concatenated token vector.
     //! \details Only full blocks are considered.
     [[nodiscard]] std::optional<BlockKey> findNewContextBlock(
@@ -1148,6 +1169,8 @@ class KVCacheManager : public BaseKVCacheManager
     std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
         std::vector<LlmRequest::RequestIdType> const& requestIds) const override;
 
+    std::vector<SizeType32> getNewlyAllocatedBlockIds(LlmRequest::RequestIdType requestId) const override;
+
     runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override;
 
     SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const override
@@ -1219,6 +1242,8 @@ class KVCacheManager : public BaseKVCacheManager
     bool mEnableHashKey;
     // Whether to use one more block for each sequence
     bool mUseOneMoreBlock;
+    // Mutex to protect access to mSequences
+    mutable std::mutex mSequencesMtx;
     // buffers for static tensors, will be created after allocating pools
     runtime::ITensor::SharedPtr mBlockPoolPointers;
     runtime::ITensor::SharedPtr mLayerToPoolMapping;
Original file line number	Diff line number	Diff line change
`@@ -833,6 +833,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW`
`833`	`833`	`lookaheadConfig, // lookaheadConfig`
`834`	`834`	`std::nullopt, // kvCacheRetentionConfig`
`835`	`835`	`std::nullopt, // logitsPostProcessorName`
	`836`	`+ std::nullopt, // logitsPostProcessor`
`836`	`837`	`encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);`
`837`	`838`	`}`
`838`	`839`