Skip to content

Commit 8da6d6b

Browse files
Merge branch 'main' into gk/improved_sharding_heuristics
2 parents 54e65da + 3e4f238 commit 8da6d6b

File tree

74 files changed

+2139
-840
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

74 files changed

+2139
-840
lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,10 @@ class BlockRange
183183
auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
184184
mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
185185
}
186+
if (cacheManager.isEnableIndexerKCache())
187+
{
188+
mIndexerKCachePool = cacheManager.getIndexerKCachePool();
189+
}
186190
}
187191

188192
BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId)

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,7 @@ class CacheReceiver::Impl
806806

807807
RequestInfo requestInfo(requestId, mSelfState);
808808

809-
if (mFormatter->getCacheManager()->getBlockManager().getNumPools() == 1)
809+
if (!mFormatter->getCacheManager()->getBlockManager().isVariableWindow())
810810
{
811811
auto* cacheManager = mFormatter->getCacheManager();
812812
auto beam = 0;

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -876,14 +876,7 @@ void WindowBlockManager::allocatePools(bool useUvm)
876876
}
877877

878878
nvinfer1::Dims cacheShape;
879-
if (pool.containsIndexerKCache)
880-
{
881-
cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, blockSize});
882-
}
883-
else
884-
{
885-
cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
886-
}
879+
cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
887880

888881
TLLM_LOG_DEBUG("[%s] Allocating primary pool with %d blocks for %d layers with %d kv heads", mLogPrefix.c_str(),
889882
mNumPrimaryBlocks, pool.numLayers, pool.numKvHeads);

cpp/tensorrt_llm/thop/moeOp.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
435435
WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
436436
static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);
437437

438-
auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
438+
auto const quant_params
439+
= getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales, base_activation_type);
439440
kernels::MoeMinLatencyParams min_latency_params{};
440441

441442
// TODO: support lora in the future
@@ -613,7 +614,8 @@ class FusedMoeRunner : public torch::CustomClassHolder
613614
WorkspaceInfo const& workspace_info = getWorkspaceInfo(num_rows, hidden_size, inter_size, num_experts_total,
614615
static_cast<int>(experts_per_token), base_activation_type, parallelism_config, min_latency_mode, stream);
615616

616-
auto const quant_params = getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
617+
auto const quant_params
618+
= getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales, base_activation_type);
617619

618620
// TODO: support lora in the future
619621
::tensorrt_llm::kernels::LoraParams lora_params{};
@@ -859,8 +861,10 @@ class FusedMoeRunner : public torch::CustomClassHolder
859861
}
860862

861863
kernels::QuantParams getQuantParams(int64_t const num_experts_on_rank, int64_t const hidden_size,
862-
int64_t const inter_size, torch::optional<c10::ArrayRef<torch::Tensor>> const& quant_scales) const
864+
int64_t const inter_size, torch::optional<c10::ArrayRef<torch::Tensor>> const& quant_scales,
865+
ActivationType base_activation_type) const
863866
{
867+
int expand_ratio = isGatedActivation(base_activation_type) ? 2 : 1;
864868
if (isFp8Quant())
865869
{
866870
TORCH_CHECK(quant_scales.has_value(), "Expecting quant scales for fp8 quantization");
@@ -925,12 +929,12 @@ class FusedMoeRunner : public torch::CustomClassHolder
925929
&& fc1_weight_block.sizes()[1]
926930
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
927931
inter_size, TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX)
928-
* 2
932+
* expand_ratio
929933
&& fc1_weight_block.sizes()[2] * FP8_PER_INT32
930934
* TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize
931935
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
932936
hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX),
933-
"fc1 weight block size must be (num_experts_on_rank, inter_size * 2, hidden_size // 4 // "
937+
"fc1 weight block size must be (num_experts_on_rank, inter_size * expand_ratio, hidden_size // 4 // "
934938
"block_scale_vector_size)");
935939
TORCH_CHECK(fc1_global.sizes()[0] == num_experts_on_rank, "fc1 global size must be (num_experts_on_rank,)");
936940
TORCH_CHECK(fc2_act_global.dim() == 0 || fc2_act_global.sizes()[0] == num_experts_on_rank,
@@ -978,12 +982,12 @@ class FusedMoeRunner : public torch::CustomClassHolder
978982
&& fc1_weight_block.sizes()[1]
979983
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
980984
inter_size, TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX)
981-
* 2
985+
* expand_ratio
982986
&& fc1_weight_block.sizes()[2] * FP8_PER_INT32
983987
* TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize
984988
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
985989
hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX),
986-
"fc1 weight block size must be (num_experts_on_rank, inter_size * 2, hidden_size // 4 // "
990+
"fc1 weight block size must be (num_experts_on_rank, inter_size * expand_ratio, hidden_size // 4 // "
987991
"block_scale_vector_size)");
988992
TORCH_CHECK(fc1_global.sizes()[0] == num_experts_on_rank, "fc1 global size must be (num_experts_on_rank,)");
989993
TORCH_CHECK(fc2_weight_block.sizes()[0] == num_experts_on_rank
@@ -1044,12 +1048,12 @@ class FusedMoeRunner : public torch::CustomClassHolder
10441048
&& fc1_weight_block.sizes()[1]
10451049
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
10461050
inter_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentNVFP4)
1047-
* 2
1051+
* expand_ratio
10481052
&& fc1_weight_block.sizes()[2] * FP8_PER_INT32
10491053
* TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
10501054
== TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
10511055
hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentNVFP4),
1052-
"fc1 weight block size must be (num_experts_on_rank, inter_size * 2, hidden_size // 4 // "
1056+
"fc1 weight block size must be (num_experts_on_rank, inter_size * expand_ratio, hidden_size // 4 // "
10531057
"block_scale_vector_size)");
10541058
TORCH_CHECK(fc1_global.sizes()[0] == num_experts_on_rank, "fc1 global size must be (num_experts_on_rank,)");
10551059
TORCH_CHECK(fc2_act_global.dim() == 0 || fc2_act_global.sizes()[0] == num_experts_on_rank,

docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ P99 E2EL (ms): 1643.44
162162
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
163163

164164
$$
165-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
165+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
166166
$$
167167

168168
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -172,7 +172,7 @@ $$
172172
$$
173173

174174
$$
175-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
175+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
176176
$$
177177

178178
#### End-to-End (E2E) Latency
@@ -182,14 +182,14 @@ $$
182182
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
183183

184184
$$
185-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
185+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
186186
$$
187187

188188
#### Tokens Per Second (TPS) or Output Token Throughput
189189
* how many output tokens the system generates each second.
190190

191191
$$
192-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
192+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
193193
$$
194194

195195
### Request Time Breakdown

docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ P99 E2EL (ms): [result]
400400
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
401401
402402
$$
403-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
403+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
404404
$$
405405
406406
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -410,7 +410,7 @@ $$
410410
$$
411411
412412
$$
413-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
413+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
414414
$$
415415
416416
#### End-to-End (E2E) Latency
@@ -420,12 +420,12 @@ $$
420420
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
421421
422422
$$
423-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
423+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
424424
$$
425425
426426
#### Tokens Per Second (TPS) or Output Token Throughput
427427
* how many output tokens the system generates each second.
428428
429429
$$
430-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
430+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
431431
$$

docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ P99 E2EL (ms): [result]
350350
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
351351

352352
$$
353-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
353+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
354354
$$
355355

356356
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -360,7 +360,7 @@ $$
360360
$$
361361

362362
$$
363-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
363+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
364364
$$
365365

366366
#### End-to-End (E2E) Latency
@@ -370,12 +370,12 @@ $$
370370
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
371371

372372
$$
373-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
373+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
374374
$$
375375

376376
#### Tokens Per Second (TPS) or Output Token Throughput
377377
* how many output tokens the system generates each second.
378378

379379
$$
380-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
380+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
381381
$$

docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ P99 E2EL (ms): [result]
355355
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
356356

357357
$$
358-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
358+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
359359
$$
360360

361361
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -365,7 +365,7 @@ $$
365365
$$
366366

367367
$$
368-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
368+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
369369
$$
370370

371371
#### End-to-End (E2E) Latency
@@ -375,12 +375,12 @@ $$
375375
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
376376

377377
$$
378-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
378+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
379379
$$
380380

381381
#### Tokens Per Second (TPS) or Output Token Throughput
382382
* how many output tokens the system generates each second.
383383

384384
$$
385-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
385+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
386386
$$

docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ P99 E2EL (ms): [result]
347347
For a single request, ITLs are the time intervals between tokens, while TPOT is the average of those intervals:
348348

349349
$$
350-
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{#Output Tokens} - 1}
350+
\text{TPOT (1 request)} = \text{Avg(ITL)} = \frac{\text{E2E latency} - \text{TTFT}}{\text{Num Output Tokens} - 1}
351351
$$
352352

353353
Across different requests, **average TPOT** is the mean of each request's TPOT (all requests weighted equally), while **average ITL** is token-weighted (all tokens weighted equally):
@@ -357,7 +357,7 @@ $$
357357
$$
358358

359359
$$
360-
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{#Output Tokens across requests}}
360+
\text{Avg ITL (N requests)} = \frac{\text{Sum of all ITLs across requests}}{\text{Num Output Tokens across requests}}
361361
$$
362362

363363
#### End-to-End (E2E) Latency
@@ -367,12 +367,12 @@ $$
367367
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
368368

369369
$$
370-
\text{Total TPS} = \frac{\text{#Input Tokens}+\text{#Output Tokens}}{T_{last} - T_{first}}
370+
\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}}
371371
$$
372372

373373
#### Tokens Per Second (TPS) or Output Token Throughput
374374
* how many output tokens the system generates each second.
375375

376376
$$
377-
\text{TPS} = \frac{\text{#Output Tokens}}{T_{last} - T_{first}}
377+
\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}}
378378
$$

examples/models/core/deepseek_v3/README.md

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -881,12 +881,3 @@ python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --enable_chunked_pref
881881
- **GPU Memory:** Adjust `--max_batch_size` and `--max_num_tokens` if you encounter out-of-memory errors.
882882
- **Logs:** Check `/workspace/trt_bench.log` for detailed performance information and troubleshooting messages.
883883
- **Configuration Files:** Verify that the configuration files are correctly formatted to avoid runtime issues.
884-
885-
## Known Issues
886-
- Support for KV Cache Reuse and Chunked Prefill in DeepSeek-V3.2-Exp is currently under development. When running `quickstart_advanced.py`, please include `--disable_kv_cache_reuse` to disable KV Cache Reuse. When using `trtllm-eval`/`trtllm-serve`/`trtllm-bench`, please include the following configuration in the extra llm_api options:
887-
```
888-
kv_cache_config:
889-
enable_block_reuse: false
890-
tokens_per_block: 64
891-
enable_chunked_prefill: false
892-
```

0 commit comments

Comments (0)