
Commit 92d90fa

[None][feat] Expose enable_trt_overlap in Triton backend, bringing 1.05x OTPS (#10018)
Signed-off-by: Jhao-Ting Chen <[email protected]>
1 parent 0027a01 commit 92d90fa

File tree: 5 files changed, +14 -11 lines changed

tests/integration/defs/triton_server/test_triton_llm.py
Lines changed: 2 additions & 2 deletions

@@ -38,8 +38,8 @@ def stop_triton_server():
 @pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
                          ["max_utilization", "guaranteed_no_evict"])
 @pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
-@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
-                         ids=["disableTrtOverlap"])
+@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False", "True"],
+                         ids=["disableTrtOverlap", "enableTrtOverlap"])
 @pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
 @pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
                          ids=["enableDecoupleMode", "disableDecoupleMode"])

tests/integration/test_lists/qa/llm_triton_integration.txt
Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-True-tensorrt_llm_bls]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-enableTrtOverlap--guaranteed_no_evict---1-1-1-True-ensemble]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-ensemble]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]

tests/integration/test_lists/test-db/l0_a30.yml
Lines changed: 1 addition & 0 deletions

@@ -218,6 +218,7 @@ l0_a30:
 - triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e]
 - triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
 - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
+- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-enableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
 - triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
 - triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
 - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]

triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
Lines changed: 7 additions & 7 deletions

@@ -686,13 +686,13 @@ parameters: {
     string_value: "${kv_cache_onboard_blocks}"
   }
 }
-# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
-# parameters: {
-#   key: "enable_trt_overlap"
-#   value: {
-#     string_value: "${enable_trt_overlap}"
-#   }
-# }
+# enable_trt_overlap is an experimental feature used with CUDA Graph Mode.
+parameters: {
+  key: "enable_trt_overlap"
+  value: {
+    string_value: "${enable_trt_overlap}"
+  }
+}
 parameters: {
   key: "exclude_input_in_output"
   value: {
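The ${enable_trt_overlap} token is a template placeholder that gets filled in when the model repository is prepared; the backend repo ships a fill_template.py helper for this step. A minimal Python sketch of that substitution, assuming an illustrative model-repo path:

    from string import Template

    # Illustrative stand-in for the repo's template-filling step; the
    # config path here is an assumption, not a fixed location.
    path = "triton_model_repo/tensorrt_llm/config.pbtxt"
    with open(path) as f:
        config = Template(f.read())

    # safe_substitute fills only the named placeholder and leaves every
    # other ${...} token in the file untouched for later filling.
    with open(path, "w") as f:
        f.write(config.safe_substitute(enable_trt_overlap="True"))

Per the new comment in the hunk, enable_trt_overlap is experimental and meant to be used with CUDA Graph mode, so "False" remains the conservative default.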

triton_backend/inflight_batcher_llm/src/model_instance_state.cc
Lines changed: 3 additions & 2 deletions

@@ -491,10 +491,10 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams()
             + std::to_string(requestStatsMaxIterations));
     }

+    bool enableTrtOverlap = false;
     try
     {
-        model_state_->GetParameter<bool>("enable_trt_overlap");
-        TLLM_LOG_WARNING("enable_trt_overlap is deprecated and will be ignored");
+        enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
     }
     catch (std::exception const& e)
     {

@@ -698,6 +698,7 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams()
         maxQueueSize, extendedRuntimePerfKnobConfig,
         /*DebugConfig*/ std::nullopt, recvPollPeriodMs};
     execConfig.setSpecDecConfig(specDecConfig);
+    execConfig.setEnableTrtOverlap(enableTrtOverlap);
     execConfig.setCacheTransceiverConfig(tle::CacheTransceiverConfig(tle::CacheTransceiverConfig::BackendType::MPI));
     if (guidedConfig.has_value())
     {
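Net behavior of the C++ change: the backend now parses enable_trt_overlap with a fallback instead of warning that it is deprecated. If the parameter is missing or unparsable, the existing catch block runs and the value stays false; otherwise the parsed boolean is forwarded to the executor through setEnableTrtOverlap. A rough Python mirror of just that fallback semantic (the function and dict names are illustrative, not the backend's API):

    # Hypothetical mirror of the C++ parse-with-fallback logic above.
    def get_enable_trt_overlap(params: dict) -> bool:
        try:
            # Parameters arrive as strings from config.pbtxt, e.g. "True".
            return params["enable_trt_overlap"].strip().lower() == "true"
        except (KeyError, AttributeError):
            # Mirrors the catch block: missing or malformed means False.
            return False

    assert get_enable_trt_overlap({}) is False
    assert get_enable_trt_overlap({"enable_trt_overlap": "True"}) is True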
