Skip to content

Commit 8da6d6b

Browse files
Merge branch 'main' into gk/improved_sharding_heuristics
2 parents 54e65da + 3e4f238 commit 8da6d6b

File tree

74 files changed

+2139
-840
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

74 files changed

+2139
-840
lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,10 @@ class BlockRange
183183
auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
184184
mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
185185
}
186+
if (cacheManager.isEnableIndexerKCache())
187+
{
188+
mIndexerKCachePool = cacheManager.getIndexerKCachePool();
189+
}
186190
}
187191

188192
BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId)

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,7 @@ class CacheReceiver::Impl
806806

807807
RequestInfo requestInfo(requestId, mSelfState);
808808

809-
if (mFormatter->getCacheManager()->getBlockManager().getNumPools() == 1)
809+
if (!mFormatter->getCacheManager()->getBlockManager().isVariableWindow())
810810
{
811811
auto* cacheManager = mFormatter->getCacheManager();
812812
auto beam = 0;

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -876,14 +876,7 @@ void WindowBlockManager::allocatePools(bool useUvm)
876876
}
877877

878878
nvinfer1::Dims cacheShape;
879-
if (pool.containsIndexerKCache)
880-
{
881-
cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, blockSize});
882-
}
883-
else
884-
{
885-
cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
886-
}
879+
cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
887880

888881
TLLM_LOG_DEBUG("[%s] Allocating primary pool with %d blocks for %d layers with %d kv heads", mLogPrefix.c_str(),
889882
mNumPrimaryBlocks, pool.numLayers, pool.numKvHeads);

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
435435
WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
436436
static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);
437437

438-
auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
438+
auto const quant_params
439+
= getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales, base_activation_type);
439440
kernels::MoeMinLatencyParams min_latency_params{};
440441

441442
// TODO: support lora in the future
@@ -613,7 +614,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
613614
WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
614615
static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);
615616

616-
auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
617+
auto const quant_params
618+
= getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales, base_activation_type);
617619

618620
// TODO: support lora in the future
619621
::tensorrt_llm::kernels::LoraParams lora_params{};
@@ -859,8 +861,10 @@ class FusedMoeRunner : public torch::CustomClassHolder
859861
}
860862

861863
kernels::QuantParams getQuantParams(int64_t const num_experts_on_rank, int64_t const hidden_size,
862-
int64_t const inter_size, torch::optional<c10::ArrayRef<torch::Tensor>> const& quant_scales) const
864+
int64_t const inter_size, torch::optional<c10::ArrayRef<torch::Tensor>> const& quant_scales,
865+
ActivationType base_activation_type) const
863866
{
867+
int expand_ratio = isGatedActivation(base_activation_type) ? 2 : 1;
864868
if (isFp8Quant())
865869
{
866870
TORCH_CHECK(quant_scales.has_value(), "Expecting quant scales for fp8 quantization");
@@ -925,12 +929,12 @@ class FusedMoeRunner : public torch::CustomClassHolder
925929
&& fc1_weight_block.sizes()[1]
926930
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
927931
inter_size, TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX)
928-
* 2
932+
* expand_ratio
929933
&& fc1_weight_block.sizes()[2] * FP8_PER_INT32
930934
* TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize
931935
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
932936
hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX),
933-
"fc1 weight block size must be (num_experts_on_rank, inter_size * 2, hidden_size // 4 // "
937+
"fc1 weight block size must be (num_experts_on_rank, inter_size * expand_ratio, hidden_size // 4 // "
934938
"block_scale_vector_size)");
935939
TORCH_CHECK(fc1_global.sizes()[0] == num_experts_on_rank, "fc1 global size must be (num_experts_on_rank,)");
936940
TORCH_CHECK(fc2_act_global.dim() == 0 || fc2_act_global.sizes()[0] == num_experts_on_rank,
@@ -978,12 +982,12 @@ class FusedMoeRunner : public torch::CustomClassHolder
978982
&& fc1_weight_block.sizes()[1]
979983
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
980984
inter_size, TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX)
981-
* 2
985+
* expand_ratio
982986
&& fc1_weight_block.sizes()[2] * FP8_PER_INT32
983987
* TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize
984988
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
985989
hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX),
986-
"fc1 weight block size must be (num_experts_on_rank, inter_size * 2, hidden_size // 4 // "
990+
"fc1 weight block size must be (num_experts_on_rank, inter_size * expand_ratio, hidden_size // 4 // "
987991
"block_scale_vector_size)");
988992
TORCH_CHECK(fc1_global.sizes()[0] == num_experts_on_rank, "fc1 global size must be (num_experts_on_rank,)");
989993
TORCH_CHECK(fc2_weight_block.sizes()[0] == num_experts_on_rank
@@ -1044,12 +1048,12 @@ class FusedMoeRunner : public torch::CustomClassHolder
10441048
&& fc1_weight_block.sizes()[1]
10451049
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
10461050
inter_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentNVFP4)
1047-
* 2
1051+
* expand_ratio
10481052
&& fc1_weight_block.sizes()[2] * FP8_PER_INT32
10491053
* TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
10501054
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
10511055
hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentNVFP4),
1052-
"fc1 weight block size must be (num_experts_on_rank, inter_size * 2, hidden_size // 4 // "
1056+
"fc1 weight block size must be (num_experts_on_rank, inter_size * expand_ratio, hidden_size // 4 // "
10531057
"block_scale_vector_size)");
10541058
TORCH_CHECK(fc1_global.sizes()[0] == num_experts_on_rank, "fc1 global size must be (num_experts_on_rank,)");
10551059
TORCH_CHECK(fc2_act_global.dim() == 0 || fc2_act_global.sizes()[0] == num_experts_on_rank,

docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ P99 E2EL (ms): 1643.44
162162
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
163163

164164
$$
165-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
165+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
166166
$$
167167

168168
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -172,7 +172,7 @@ $$
172172
$$
173173

174174
$$
175-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
175+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
176176
$$
177177

178178
#### End-to-End (E2E) Latency
@@ -182,14 +182,14 @@ $$
182182
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
183183

184184
$$
185-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
185+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
186186
$$
187187

188188
#### Tokens Per Second (TPS) or Output Token Throughput
189189
* how many output tokens the system generates each second.
190190

191191
$$
192-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
192+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
193193
$$
194194

195195
### Request Time Breakdown

docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ P99 E2EL (ms): [result]
400400
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
401401
402402
$$
403-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
403+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
404404
$$
405405
406406
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -410,7 +410,7 @@ $$
410410
$$
411411
412412
$$
413-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
413+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
414414
$$
415415
416416
#### End-to-End (E2E) Latency
@@ -420,12 +420,12 @@ $$
420420
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
421421
422422
$$
423-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
423+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
424424
$$
425425
426426
#### Tokens Per Second (TPS) or Output Token Throughput
427427
* how many output tokens the system generates each second.
428428
429429
$$
430-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
430+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
431431
$$

docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ P99 E2EL (ms): [result]
350350
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
351351

352352
$$
353-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
353+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
354354
$$
355355

356356
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -360,7 +360,7 @@ $$
360360
$$
361361

362362
$$
363-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
363+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
364364
$$
365365

366366
#### End-to-End (E2E) Latency
@@ -370,12 +370,12 @@ $$
370370
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
371371

372372
$$
373-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
373+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
374374
$$
375375

376376
#### Tokens Per Second (TPS) or Output Token Throughput
377377
* how many output tokens the system generates each second.
378378

379379
$$
380-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
380+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
381381
$$

docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ P99 E2EL (ms): [result]
355355
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
356356

357357
$$
358-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
358+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
359359
$$
360360

361361
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -365,7 +365,7 @@ $$
365365
$$
366366

367367
$$
368-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
368+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
369369
$$
370370

371371
#### End-to-End (E2E) Latency
@@ -375,12 +375,12 @@ $$
375375
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
376376

377377
$$
378-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
378+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
379379
$$
380380

381381
#### Tokens Per Second (TPS) or Output Token Throughput
382382
* how many output tokens the system generates each second.
383383

384384
$$
385-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
385+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
386386
$$

docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ P99 E2EL (ms): [result]
347347
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
348348

349349
$$
350-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
350+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
351351
$$
352352

353353
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -357,7 +357,7 @@ $$
357357
$$
358358

359359
$$
360-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
360+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
361361
$$
362362

363363
#### End-to-End (E2E) Latency
@@ -367,12 +367,12 @@ $$
367367
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
368368

369369
$$
370-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
370+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
371371
$$
372372

373373
#### Tokens Per Second (TPS) or Output Token Throughput
374374
* how many output tokens the system generates each second.
375375

376376
$$
377-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
377+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
378378
$$

examples/models/core/deepseek_v3/README.md

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -881,12 +881,3 @@ python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --enable_chunked_pref
881881
- **GPU Memory:** Adjust `--max_batch_size` and `--max_num_tokens` if you encounter out-of-memory errors.
882882
- **Logs:** Check `/workspace/trt_bench.log` for detailed performance information and troubleshooting messages.
883883
- **Configuration Files:** Verify that the configuration files are correctly formatted to avoid runtime issues.
884-
885-
## Known Issues
886-
- Support for KV Cache Reuse and Chunked Prefill in DeepSeek-V3.2-Exp is currently under development. When running `quickstart_advanced.py`, please include `--disable_kv_cache_reuse` to disable KV Cache Reuse. When using `trtllm-eval`/`trtllm-serve`/`trtllm-bench`, please include the following configuration in the extra llm_api options:
887-
```
888-
kv_cache_config:
889-
enable_block_reuse: false
890-
tokens_per_block: 64
891-
enable_chunked_prefill: false
892-
```

0 commit comments

Comments (0)