
Commit 26b953e

[nvbugs/5309940] Add support for input output token counts (NVIDIA#5445)
Signed-off-by: Iman Tabrizian <[email protected]>
1 parent 5437075 commit 26b953e

11 files changed: +402, -10 lines


tests/integration/defs/triton_server/test_triton_llm.py

Lines changed: 163 additions & 2 deletions
@@ -519,8 +519,7 @@ def test_llama_v2_70b_ifb(
     if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
         pytest.skip("Skipping.")
 
-    llm_backend_repo_root = os.path.join(os.environ["LLM_ROOT"],
-                                         "triton_backend")
+    llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]
     # Build Engine
     ENGINE_PATH = prepare_llama_v2_70b_engine("ifb",
                                               tensorrt_llm_llama_example_root,
@@ -3708,3 +3707,165 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
 
     print_info("DEBUG:: run_cmd: python3 " + " ".join(run_cmd))
     venv_check_call(llm_backend_venv, run_cmd)
+
+
+@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
+@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
+@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
+@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
+@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
+@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
+@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
+@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
+@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
+@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
+                         ids=["disableTrtOverlap"])
+@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
+@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
+                         ids=["enableDecoupleMode", "disableDecoupleMode"])
+@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
+@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
+@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
+@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
+@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
+@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
+@pytest.mark.parametrize("DECODING_MODE", [""])
+@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
+@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
+@pytest.mark.parametrize("TOKEN_COUNT_TEST",
+                         ["input_only", "output_only", "both"])
+@pytest.mark.parametrize("BACKEND", ["tensorrtllm", "python"])
+def test_tiny_llama_ifb_token_counts(
+        E2E_MODEL_NAME,
+        MAX_TOKENS_IN_KV_CACHE,
+        MAX_ATTENTION_WINDOW_SIZE,
+        BATCH_SCHEDULER_POLICY,
+        KV_CACHE_FREE_GPU_MEM_FRACTION,
+        ENABLE_TRT_OVERLAP,
+        BATCHING_STRATEGY,
+        DECOUPLED_MODE,
+        TRITON_MAX_BATCH_SIZE,
+        MAX_QUEUE_DELAY_MICROSECONDS,
+        MAX_BEAM_WIDTH,
+        ENABLE_KV_CACHE_REUSE,
+        NORMALIZE_LOG_PROBS,
+        ENABLE_CHUNKED_CONTEXT,
+        GPU_DEVICE_IDS,
+        DECODING_MODE,
+        PREPROCESSING_INSTANCE_COUNT,
+        POSTPROCESSING_INSTANCE_COUNT,
+        ACCUMULATE_TOKEN,
+        BLS_INSTANCE_COUNT,
+        EXCLUDE_INPUT_IN_OUTPUT,
+        TOKEN_COUNT_TEST,
+        BACKEND,
+        inflight_batcher_llm_client_root,
+        tensorrt_llm_llama_example_root,
+        tiny_llama_model_root,
+        llm_backend_venv,
+):
+    """Test that the TRT-LLM inflight batcher backend can return input and output token counts."""
+    if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
+        pytest.skip("Skipping. V1 doesn't support max_utilization.")
+
+    if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
+        pytest.skip("Skipping.")
+
+    llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]
+    # Build engine
+    ENGINE_PATH, _ = prepare_tiny_llama_1b_engine(
+        type="ifb",
+        tensorrt_llm_llama_example_root=tensorrt_llm_llama_example_root,
+        tiny_llama_model_root=tiny_llama_model_root,
+        tensorrt_llm_example_root=None,
+    )
+    # Prepare model repo
+    new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
+    prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
+
+    # Modify config.pbtxt
+    TOKENIZER_PATH = tiny_llama_model_root
+    modify_ib_config_pbtxt(new_model_repo,
+                           ENGINE_PATH,
+                           TOKENIZER_PATH,
+                           llm_backend_repo_root,
+                           DECOUPLED_MODE,
+                           MAX_TOKENS_IN_KV_CACHE,
+                           MAX_ATTENTION_WINDOW_SIZE,
+                           BATCH_SCHEDULER_POLICY,
+                           BATCHING_STRATEGY,
+                           KV_CACHE_FREE_GPU_MEM_FRACTION,
+                           EXCLUDE_INPUT_IN_OUTPUT,
+                           ENABLE_TRT_OVERLAP,
+                           TRITON_MAX_BATCH_SIZE,
+                           MAX_QUEUE_DELAY_MICROSECONDS,
+                           MAX_BEAM_WIDTH,
+                           ENABLE_KV_CACHE_REUSE,
+                           NORMALIZE_LOG_PROBS,
+                           ENABLE_CHUNKED_CONTEXT,
+                           GPU_DEVICE_IDS,
+                           DECODING_MODE,
+                           PREPROCESSING_INSTANCE_COUNT,
+                           POSTPROCESSING_INSTANCE_COUNT,
+                           ACCUMULATE_TOKEN,
+                           BLS_INSTANCE_COUNT,
+                           TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm",
+                           TENSORRT_LLM_DRAFT_MODEL_NAME="",
+                           BACKEND=BACKEND)
+
+    # Launch Triton Server
+    launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
+                                    "launch_triton_server.py")
+    check_call(
+        f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
+        shell=True)
+    check_server_ready()
+
+    # Test token count functionality based on the test type
+    tokenizer_dir = f"{tiny_llama_model_root}"
+
+    # Prepare different test commands based on token count test type
+    if TOKEN_COUNT_TEST == "input_only":
+        test_args = ["--return-num-input-tokens"]
+    elif TOKEN_COUNT_TEST == "output_only":
+        test_args = ["--return-num-output-tokens"]
+    elif TOKEN_COUNT_TEST == "both":
+        test_args = ["--return-num-input-tokens", "--return-num-output-tokens"]
+
+    if DECOUPLED_MODE == "False":
+        run_cmd = [
+            f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
+            f"--tokenizer-dir={tokenizer_dir}",
+            "--tokenizer-type=auto",
+            "--request-output-len=20",
+        ] + test_args
+
+        output = venv_check_output(llm_backend_venv, run_cmd)
+    else:
+        run_cmd = [
+            f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
+            f"--tokenizer-dir={tokenizer_dir}",
+            "--tokenizer-type=auto",
+            "--request-output-len=20",
+            "--streaming",
+        ] + test_args
+
+        output = venv_check_output(llm_backend_venv, run_cmd)
+
+    print(output)
+    if TOKEN_COUNT_TEST == "input_only":
+        assert "Input token count: [[13]]" in output
+    elif TOKEN_COUNT_TEST == "output_only":
+        if DECOUPLED_MODE == "False":
+            assert "Output token count: [[33]]" in output
+        else:
+            assert "Output token count: [[1]]" in output and not "Output token count: [[20]]" in output
+    elif TOKEN_COUNT_TEST == "both":
+        assert "Input token count: [[13]]" in output
+        if DECOUPLED_MODE == "False":
+            assert "Output token count: [[33]]" in output
+        else:
+            assert "Output token count: [[1]]" in output and not "Output token count: [[20]]" in output
+    print_info(
+        f"Successfully tested token count functionality for {TOKEN_COUNT_TEST} mode"
+    )
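
For context, here is a minimal client-side sketch of how the new flags could be exercised directly with tritonclient over gRPC, independent of the inflight_batcher_llm_client.py helper the test drives. Only the return_num_input_tokens, return_num_output_tokens, num_input_tokens, and num_output_tokens tensor names come from this commit; the text_input, max_tokens, and text_output names, the "ensemble" model name, and the localhost:8001 endpoint are assumptions about a typical tensorrtllm_backend model repository.

# Hypothetical sketch (not part of this commit): request token counts over gRPC.
import numpy as np
import tritonclient.grpc as grpcclient

def bool_input(name, value):
    # Boolean scalar input with a batch dimension of 1.
    t = grpcclient.InferInput(name, [1, 1], "BOOL")
    t.set_data_from_numpy(np.array([[value]], dtype=bool))
    return t

text = grpcclient.InferInput("text_input", [1, 1], "BYTES")
text.set_data_from_numpy(np.array([["What is machine learning?"]], dtype=object))
max_tokens = grpcclient.InferInput("max_tokens", [1, 1], "INT32")
max_tokens.set_data_from_numpy(np.array([[20]], dtype=np.int32))

inputs = [
    text,
    max_tokens,
    bool_input("return_num_input_tokens", True),
    bool_input("return_num_output_tokens", True),
]
outputs = [
    grpcclient.InferRequestedOutput("text_output"),
    grpcclient.InferRequestedOutput("num_input_tokens"),
    grpcclient.InferRequestedOutput("num_output_tokens"),
]

client = grpcclient.InferenceServerClient("localhost:8001")
result = client.infer("ensemble", inputs, outputs=outputs)
print("Input token count:", result.as_numpy("num_input_tokens"))
print("Output token count:", result.as_numpy("num_output_tokens"))

In the non-decoupled case this mirrors what the test asserts: the input count reflects the tokenized prompt, and the output count reflects however many tokens the single response carries (with EXCLUDE_INPUT_IN_OUTPUT set to "False" above, that includes the echoed prompt, hence the expected [[33]]).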

tests/integration/test_lists/test-db/l0_a30.yml

Lines changed: 6 additions & 0 deletions
@@ -230,3 +230,9 @@ l0_a30:
 - triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
 - triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
 - triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[python-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[python-both-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[python-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]

triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt

Lines changed: 22 additions & 0 deletions
@@ -328,6 +328,18 @@ input [
     dims: [ 1 ]
     optional: true
     allow_ragged_batch: true
+  },
+  {
+    name: "return_num_output_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "return_num_input_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
   }
 ]
 output [
@@ -420,6 +432,16 @@ output [
     name: "total_draft_tokens"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "num_output_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "num_input_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 instance_group [

triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 38 additions & 0 deletions
@@ -158,6 +158,18 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_num_input_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "return_num_output_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
     data_type: TYPE_INT32
@@ -346,6 +358,16 @@ output [
     name: "total_draft_tokens"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "num_input_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "num_output_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 ensemble_scheduling {
@@ -529,6 +551,14 @@ ensemble_scheduling {
        key: "return_perf_metrics"
        value: "return_perf_metrics"
      }
+      input_map {
+        key: "return_num_input_tokens"
+        value: "return_num_input_tokens"
+      }
+      input_map {
+        key: "return_num_output_tokens"
+        value: "return_num_output_tokens"
+      }
      input_map {
        key: "num_return_sequences"
        value: "num_return_sequences"
@@ -652,6 +682,14 @@ ensemble_scheduling {
      output_map {
        key: "total_draft_tokens"
        value: "total_draft_tokens"
+      },
+      output_map {
+        key: "num_input_tokens"
+        value: "num_input_tokens"
+      },
+      output_map {
+        key: "num_output_tokens"
+        value: "num_output_tokens"
      }
    },
    {

triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 29 additions & 3 deletions
@@ -62,6 +62,8 @@ class RequestData:
     num_input_tokens: int
     num_output_tokens: int
     response_sender: Any
+    return_num_input_tokens: bool = False
+    return_num_output_tokens: bool = False
 
 
 def mpi_comm():
@@ -657,7 +659,10 @@ def convert_response(response,
                      batch_index,
                      batch_size,
                      num_return_sequences,
-                     expected_logits_dtype=torch.float32):
+                     expected_logits_dtype=torch.float32,
+                     input_token_count=None,
+                     return_num_input_tokens=False,
+                     return_num_output_tokens=False):
 
     if response.has_error():
         return pb_utils.InferenceResponse(output_tensors=[],
@@ -723,6 +728,18 @@
             "sequence_index",
             np.expand_dims(np.array([result.sequence_index], np.int32), 0)))
 
+    # Add token count outputs if requested
+    if return_num_input_tokens and input_token_count is not None:
+        triton_output_tensor = pb_utils.Tensor(
+            "num_input_tokens",
+            np.expand_dims(np.array([input_token_count], np.int32), 0))
+        output_tensors.append(triton_output_tensor)
+    if return_num_output_tokens:
+        triton_output_tensor = pb_utils.Tensor(
+            "num_output_tokens",
+            np.expand_dims(np.array([output_lengths], np.int32), 0))
+        output_tensors.append(triton_output_tensor)
+
     if result.request_perf_metrics is not None:
         kv_cache_metrics = result.request_perf_metrics.kv_cache_metrics
         output_tensors.append(
@@ -1420,7 +1437,13 @@ def execute(self, requests):
                     triton_req_id, triton_user_id, batch_index,
                     len(batch_indices),
                     executor_request.sampling_config.num_return_sequences, 0, 0,
-                    triton_request.get_response_sender())
+                    triton_request.get_response_sender(),
+                    get_input_scalar_by_name(triton_request,
+                                             'return_num_input_tokens',
+                                             batch_index=batch_index),
+                    get_input_scalar_by_name(triton_request,
+                                             'return_num_output_tokens',
+                                             batch_index=batch_index))
                 self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
                 input_len = len(
                     executor_request.input_token_ids
@@ -1451,7 +1474,10 @@ def awaiter_loop(self):
 
             triton_response, is_final, output_length = convert_response(
                 response, request_data.batch_index, request_data.batch_size,
-                request_data.num_return_sequences, self.logits_dtype)
+                request_data.num_return_sequences, self.logits_dtype,
+                request_data.num_input_tokens,
+                request_data.return_num_input_tokens,
+                request_data.return_num_output_tokens)
             with self.lock:
                 self.req_id_to_request_data[
                     req_id].num_output_tokens += output_length
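
Because convert_response now attaches num_output_tokens to every response it converts, a decoupled (streaming) request reports the count per streamed chunk rather than a running total, which is why the streaming variant of the test above expects "[[1]]" rather than "[[20]]". Below is a hedged sketch of how a streaming client might accumulate the total itself; only return_num_output_tokens and num_output_tokens come from this commit, while the text_input, max_tokens, text_output, and stream tensor names, the "ensemble" model name, and the endpoint are assumptions about a typical model repository.

# Hypothetical sketch: sum per-chunk num_output_tokens from a decoupled stream.
import queue
import numpy as np
import tritonclient.grpc as grpcclient

def make_input(name, array, dtype):
    t = grpcclient.InferInput(name, list(array.shape), dtype)
    t.set_data_from_numpy(array)
    return t

inputs = [
    make_input("text_input", np.array([["What is machine learning?"]], dtype=object), "BYTES"),
    make_input("max_tokens", np.array([[20]], dtype=np.int32), "INT32"),
    make_input("stream", np.array([[True]], dtype=bool), "BOOL"),  # assumed ensemble input name
    make_input("return_num_output_tokens", np.array([[True]], dtype=bool), "BOOL"),
]
outputs = [
    grpcclient.InferRequestedOutput("text_output"),
    grpcclient.InferRequestedOutput("num_output_tokens"),
]

responses = queue.Queue()
client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=lambda result, error: responses.put((result, error)))
client.async_stream_infer("ensemble", inputs, outputs=outputs, request_id="1")
client.stop_stream()  # close the stream after responses have been delivered to the callback

total_output_tokens = 0
while not responses.empty():
    result, error = responses.get()
    if error is not None:
        raise error
    counts = result.as_numpy("num_output_tokens")
    if counts is not None:
        # Each streamed chunk reports only its own token count (typically 1).
        total_output_tokens += int(counts.sum())
print("Total output tokens:", total_output_tokens)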

triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 24 additions & 0 deletions
@@ -282,6 +282,20 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "return_num_input_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "return_num_output_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "exclude_input_in_output"
     data_type: TYPE_BOOL
@@ -575,6 +589,16 @@ output [
     name: "total_draft_tokens"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "num_input_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "num_output_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 instance_group [
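
The same booleans should also be reachable through Triton's HTTP generate endpoint, which maps top-level JSON fields onto the model's optional inputs and returns the model's outputs as JSON fields. A hedged sketch follows, assuming a server on localhost:8000 and the standard ensemble text_input/max_tokens names; the exact JSON shape of the returned counts is not verified here, and only the four *_tokens field names are taken from this commit.

# Hypothetical sketch: request token counts via the HTTP generate endpoint.
import requests

payload = {
    "text_input": "What is machine learning?",
    "max_tokens": 20,
    "return_num_input_tokens": True,
    "return_num_output_tokens": True,
}
resp = requests.post("http://localhost:8000/v2/models/ensemble/generate", json=payload)
resp.raise_for_status()
body = resp.json()
print("Input token count:", body.get("num_input_tokens"))
print("Output token count:", body.get("num_output_tokens"))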
