
Commit 26b953e

[nvbugs/5309940] Add support for input output token counts (NVIDIA#5445)
Signed-off-by: Iman Tabrizian <[email protected]>
1 parent 5437075 commit 26b953e

11 files changed: +402, -10 lines


tests/integration/defs/triton_server/test_triton_llm.py

Lines changed: 163 additions & 2 deletions
@@ -519,8 +519,7 @@ def test_llama_v2_70b_ifb(
     if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
         pytest.skip("Skipping.")
 
-    llm_backend_repo_root = os.path.join(os.environ["LLM_ROOT"],
-                                         "triton_backend")
+    llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]
     # Build Engine
     ENGINE_PATH = prepare_llama_v2_70b_engine("ifb",
                                               tensorrt_llm_llama_example_root,
@@ -3708,3 +3707,165 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
 
     print_info("DEBUG:: run_cmd: python3 " + " ".join(run_cmd))
     venv_check_call(llm_backend_venv, run_cmd)
+
+
+@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
+@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
+@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"])
+@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"])
+@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"])
+@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""])
+@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""])
+@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["guaranteed_no_evict"])
+@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
+@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
+                         ids=["disableTrtOverlap"])
+@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
+@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
+                         ids=["enableDecoupleMode", "disableDecoupleMode"])
+@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["128"])
+@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"])
+@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"])
+@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"])
+@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"])
+@pytest.mark.parametrize("GPU_DEVICE_IDS", [""])
+@pytest.mark.parametrize("DECODING_MODE", [""])
+@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"])
+@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"])
+@pytest.mark.parametrize("TOKEN_COUNT_TEST",
+                         ["input_only", "output_only", "both"])
+@pytest.mark.parametrize("BACKEND", ["tensorrtllm", "python"])
+def test_tiny_llama_ifb_token_counts(
+        E2E_MODEL_NAME,
+        MAX_TOKENS_IN_KV_CACHE,
+        MAX_ATTENTION_WINDOW_SIZE,
+        BATCH_SCHEDULER_POLICY,
+        KV_CACHE_FREE_GPU_MEM_FRACTION,
+        ENABLE_TRT_OVERLAP,
+        BATCHING_STRATEGY,
+        DECOUPLED_MODE,
+        TRITON_MAX_BATCH_SIZE,
+        MAX_QUEUE_DELAY_MICROSECONDS,
+        MAX_BEAM_WIDTH,
+        ENABLE_KV_CACHE_REUSE,
+        NORMALIZE_LOG_PROBS,
+        ENABLE_CHUNKED_CONTEXT,
+        GPU_DEVICE_IDS,
+        DECODING_MODE,
+        PREPROCESSING_INSTANCE_COUNT,
+        POSTPROCESSING_INSTANCE_COUNT,
+        ACCUMULATE_TOKEN,
+        BLS_INSTANCE_COUNT,
+        EXCLUDE_INPUT_IN_OUTPUT,
+        TOKEN_COUNT_TEST,
+        BACKEND,
+        inflight_batcher_llm_client_root,
+        tensorrt_llm_llama_example_root,
+        tiny_llama_model_root,
+        llm_backend_venv,
+):
+    """Test that the TRT-LLM inflight batcher backend can return input and output token counts."""
+    if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization":
+        pytest.skip("Skipping. V1 doesn't support max_utilization.")
+
+    if E2E_MODEL_NAME == "ensemble" and ACCUMULATE_TOKEN == "True":
+        pytest.skip("Skipping.")
+
+    llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]
+    # Build engine
+    ENGINE_PATH, _ = prepare_tiny_llama_1b_engine(
+        type="ifb",
+        tensorrt_llm_llama_example_root=tensorrt_llm_llama_example_root,
+        tiny_llama_model_root=tiny_llama_model_root,
+        tensorrt_llm_example_root=None,
+    )
+    # Prepare model repo
+    new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
+    prepare_ib_model_repo(llm_backend_repo_root, new_model_repo)
+
+    # Modify config.pbtxt
+    TOKENIZER_PATH = tiny_llama_model_root
+    modify_ib_config_pbtxt(new_model_repo,
+                           ENGINE_PATH,
+                           TOKENIZER_PATH,
+                           llm_backend_repo_root,
+                           DECOUPLED_MODE,
+                           MAX_TOKENS_IN_KV_CACHE,
+                           MAX_ATTENTION_WINDOW_SIZE,
+                           BATCH_SCHEDULER_POLICY,
+                           BATCHING_STRATEGY,
+                           KV_CACHE_FREE_GPU_MEM_FRACTION,
+                           EXCLUDE_INPUT_IN_OUTPUT,
+                           ENABLE_TRT_OVERLAP,
+                           TRITON_MAX_BATCH_SIZE,
+                           MAX_QUEUE_DELAY_MICROSECONDS,
+                           MAX_BEAM_WIDTH,
+                           ENABLE_KV_CACHE_REUSE,
+                           NORMALIZE_LOG_PROBS,
+                           ENABLE_CHUNKED_CONTEXT,
+                           GPU_DEVICE_IDS,
+                           DECODING_MODE,
+                           PREPROCESSING_INSTANCE_COUNT,
+                           POSTPROCESSING_INSTANCE_COUNT,
+                           ACCUMULATE_TOKEN,
+                           BLS_INSTANCE_COUNT,
+                           TENSORRT_LLM_TARGET_MODEL_NAME="tensorrt_llm",
+                           TENSORRT_LLM_DRAFT_MODEL_NAME="",
+                           BACKEND=BACKEND)
+
+    # Launch Triton Server
+    launch_server_py = os.path.join(llm_backend_repo_root, "scripts",
+                                    "launch_triton_server.py")
+    check_call(
+        f"python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}",
+        shell=True)
+    check_server_ready()
+
+    # Test token count functionality based on the test type
+    tokenizer_dir = f"{tiny_llama_model_root}"
+
+    # Prepare different test commands based on token count test type
+    if TOKEN_COUNT_TEST == "input_only":
+        test_args = ["--return-num-input-tokens"]
+    elif TOKEN_COUNT_TEST == "output_only":
+        test_args = ["--return-num-output-tokens"]
+    elif TOKEN_COUNT_TEST == "both":
+        test_args = ["--return-num-input-tokens", "--return-num-output-tokens"]
+
+    if DECOUPLED_MODE == "False":
+        run_cmd = [
+            f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
+            f"--tokenizer-dir={tokenizer_dir}",
+            "--tokenizer-type=auto",
+            "--request-output-len=20",
+        ] + test_args
+
+        output = venv_check_output(llm_backend_venv, run_cmd)
+    else:
+        run_cmd = [
+            f"{inflight_batcher_llm_client_root}/inflight_batcher_llm_client.py",
+            f"--tokenizer-dir={tokenizer_dir}",
+            "--tokenizer-type=auto",
+            "--request-output-len=20",
+            "--streaming",
+        ] + test_args
+
+        output = venv_check_output(llm_backend_venv, run_cmd)
+
+    print(output)
+    if TOKEN_COUNT_TEST == "input_only":
+        assert "Input token count: [[13]]" in output
+    elif TOKEN_COUNT_TEST == "output_only":
+        if DECOUPLED_MODE == "False":
+            assert "Output token count: [[33]]" in output
+        else:
+            assert "Output token count: [[1]]" in output and not "Output token count: [[20]]" in output
+    elif TOKEN_COUNT_TEST == "both":
+        assert "Input token count: [[13]]" in output
+        if DECOUPLED_MODE == "False":
+            assert "Output token count: [[33]]" in output
+        else:
+            assert "Output token count: [[1]]" in output and not "Output token count: [[20]]" in output
+    print_info(
+        f"Successfully tested token count functionality for {TOKEN_COUNT_TEST} mode"
+    )
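
For context, here is a minimal client-side sketch of how the new flags could be exercised directly with tritonclient over gRPC, independent of the inflight_batcher_llm_client.py helper the test drives. Only the return_num_input_tokens, return_num_output_tokens, num_input_tokens, and num_output_tokens tensor names come from this commit; the text_input, max_tokens, and text_output names, the "ensemble" model name, and the localhost:8001 endpoint are assumptions about a typical tensorrtllm_backend model repository.

# Hypothetical sketch (not part of this commit): request token counts over gRPC.
import numpy as np
import tritonclient.grpc as grpcclient

def bool_input(name, value):
    # Boolean scalar input with a batch dimension of 1.
    t = grpcclient.InferInput(name, [1, 1], "BOOL")
    t.set_data_from_numpy(np.array([[value]], dtype=bool))
    return t

text = grpcclient.InferInput("text_input", [1, 1], "BYTES")
text.set_data_from_numpy(np.array([["What is machine learning?"]], dtype=object))
max_tokens = grpcclient.InferInput("max_tokens", [1, 1], "INT32")
max_tokens.set_data_from_numpy(np.array([[20]], dtype=np.int32))

inputs = [
    text,
    max_tokens,
    bool_input("return_num_input_tokens", True),
    bool_input("return_num_output_tokens", True),
]
outputs = [
    grpcclient.InferRequestedOutput("text_output"),
    grpcclient.InferRequestedOutput("num_input_tokens"),
    grpcclient.InferRequestedOutput("num_output_tokens"),
]

client = grpcclient.InferenceServerClient("localhost:8001")
result = client.infer("ensemble", inputs, outputs=outputs)
print("Input token count:", result.as_numpy("num_input_tokens"))
print("Output token count:", result.as_numpy("num_output_tokens"))

In the non-decoupled case this mirrors what the test asserts: the input count reflects the tokenized prompt, and the output count reflects however many tokens the single response carries (with EXCLUDE_INPUT_IN_OUTPUT set to "False" above, that includes the echoed prompt, hence the expected [[33]]).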

tests/integration/test_lists/test-db/l0_a30.yml

Lines changed: 6 additions & 0 deletions
@@ -230,3 +230,9 @@ l0_a30:
 - triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
 - triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
 - triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[python-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[python-both-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[python-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
+- triton_server/test_triton_llm.py::test_tiny_llama_ifb_token_counts[tensorrtllm-both-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]

triton_backend/all_models/disaggregated_serving/disaggregated_serving_bls/config.pbtxt

Lines changed: 22 additions & 0 deletions
@@ -328,6 +328,18 @@ input [
     dims: [ 1 ]
     optional: true
     allow_ragged_batch: true
+  },
+  {
+    name: "return_num_output_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "return_num_input_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
   }
 ]
 output [
@@ -420,6 +432,16 @@ output [
     name: "total_draft_tokens"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "num_output_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "num_input_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 instance_group [

triton_backend/all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 38 additions & 0 deletions
@@ -158,6 +158,18 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_num_input_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "return_num_output_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
     data_type: TYPE_INT32
@@ -346,6 +358,16 @@ output [
     name: "total_draft_tokens"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "num_input_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "num_output_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 ensemble_scheduling {
@@ -529,6 +551,14 @@ ensemble_scheduling {
        key: "return_perf_metrics"
        value: "return_perf_metrics"
      }
+      input_map {
+        key: "return_num_input_tokens"
+        value: "return_num_input_tokens"
+      }
+      input_map {
+        key: "return_num_output_tokens"
+        value: "return_num_output_tokens"
+      }
      input_map {
        key: "num_return_sequences"
        value: "num_return_sequences"
@@ -652,6 +682,14 @@ ensemble_scheduling {
      output_map {
        key: "total_draft_tokens"
        value: "total_draft_tokens"
+      },
+      output_map {
+        key: "num_input_tokens"
+        value: "num_input_tokens"
+      },
+      output_map {
+        key: "num_output_tokens"
+        value: "num_output_tokens"
      }
    },
    {

triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 29 additions & 3 deletions
@@ -62,6 +62,8 @@ class RequestData:
     num_input_tokens: int
     num_output_tokens: int
     response_sender: Any
+    return_num_input_tokens: bool = False
+    return_num_output_tokens: bool = False
 
 
 def mpi_comm():
@@ -657,7 +659,10 @@ def convert_response(response,
                      batch_index,
                      batch_size,
                      num_return_sequences,
-                     expected_logits_dtype=torch.float32):
+                     expected_logits_dtype=torch.float32,
+                     input_token_count=None,
+                     return_num_input_tokens=False,
+                     return_num_output_tokens=False):
 
     if response.has_error():
         return pb_utils.InferenceResponse(output_tensors=[],
@@ -723,6 +728,18 @@
             "sequence_index",
             np.expand_dims(np.array([result.sequence_index], np.int32), 0)))
 
+    # Add token count outputs if requested
+    if return_num_input_tokens and input_token_count is not None:
+        triton_output_tensor = pb_utils.Tensor(
+            "num_input_tokens",
+            np.expand_dims(np.array([input_token_count], np.int32), 0))
+        output_tensors.append(triton_output_tensor)
+    if return_num_output_tokens:
+        triton_output_tensor = pb_utils.Tensor(
+            "num_output_tokens",
+            np.expand_dims(np.array([output_lengths], np.int32), 0))
+        output_tensors.append(triton_output_tensor)
+
     if result.request_perf_metrics is not None:
         kv_cache_metrics = result.request_perf_metrics.kv_cache_metrics
         output_tensors.append(
@@ -1420,7 +1437,13 @@ def execute(self, requests):
                     triton_req_id, triton_user_id, batch_index,
                     len(batch_indices),
                     executor_request.sampling_config.num_return_sequences, 0, 0,
-                    triton_request.get_response_sender())
+                    triton_request.get_response_sender(),
+                    get_input_scalar_by_name(triton_request,
+                                             'return_num_input_tokens',
+                                             batch_index=batch_index),
+                    get_input_scalar_by_name(triton_request,
+                                             'return_num_output_tokens',
+                                             batch_index=batch_index))
                 self.triton_req_id_to_req_ids[triton_req_id].add(req_id)
                 input_len = len(
                     executor_request.input_token_ids
@@ -1451,7 +1474,10 @@ def awaiter_loop(self):
 
             triton_response, is_final, output_length = convert_response(
                 response, request_data.batch_index, request_data.batch_size,
-                request_data.num_return_sequences, self.logits_dtype)
+                request_data.num_return_sequences, self.logits_dtype,
+                request_data.num_input_tokens,
+                request_data.return_num_input_tokens,
+                request_data.return_num_output_tokens)
             with self.lock:
                 self.req_id_to_request_data[
                     req_id].num_output_tokens += output_length
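
Because convert_response now attaches num_output_tokens to every response it converts, a decoupled (streaming) request reports the count per streamed chunk rather than a running total, which is why the streaming variant of the test above expects "[[1]]" rather than "[[20]]". Below is a hedged sketch of how a streaming client might accumulate the total itself; only return_num_output_tokens and num_output_tokens come from this commit, while the text_input, max_tokens, text_output, and stream tensor names, the "ensemble" model name, and the endpoint are assumptions about a typical model repository.

# Hypothetical sketch: sum per-chunk num_output_tokens from a decoupled stream.
import queue
import numpy as np
import tritonclient.grpc as grpcclient

def make_input(name, array, dtype):
    t = grpcclient.InferInput(name, list(array.shape), dtype)
    t.set_data_from_numpy(array)
    return t

inputs = [
    make_input("text_input", np.array([["What is machine learning?"]], dtype=object), "BYTES"),
    make_input("max_tokens", np.array([[20]], dtype=np.int32), "INT32"),
    make_input("stream", np.array([[True]], dtype=bool), "BOOL"),  # assumed ensemble input name
    make_input("return_num_output_tokens", np.array([[True]], dtype=bool), "BOOL"),
]
outputs = [
    grpcclient.InferRequestedOutput("text_output"),
    grpcclient.InferRequestedOutput("num_output_tokens"),
]

responses = queue.Queue()
client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=lambda result, error: responses.put((result, error)))
client.async_stream_infer("ensemble", inputs, outputs=outputs, request_id="1")
client.stop_stream()  # close the stream after responses have been delivered to the callback

total_output_tokens = 0
while not responses.empty():
    result, error = responses.get()
    if error is not None:
        raise error
    counts = result.as_numpy("num_output_tokens")
    if counts is not None:
        # Each streamed chunk reports only its own token count (typically 1).
        total_output_tokens += int(counts.sum())
print("Total output tokens:", total_output_tokens)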

triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt

Lines changed: 24 additions & 0 deletions
@@ -282,6 +282,20 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "return_num_input_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "return_num_output_tokens"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "exclude_input_in_output"
     data_type: TYPE_BOOL
@@ -575,6 +589,16 @@ output [
     name: "total_draft_tokens"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "num_input_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+  },
+  {
+    name: "num_output_tokens"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 instance_group [
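
The same booleans should also be reachable through Triton's HTTP generate endpoint, which maps top-level JSON fields onto the model's optional inputs and returns the model's outputs as JSON fields. A hedged sketch follows, assuming a server on localhost:8000 and the standard ensemble text_input/max_tokens names; the exact JSON shape of the returned counts is not verified here, and only the four *_tokens field names are taken from this commit.

# Hypothetical sketch: request token counts via the HTTP generate endpoint.
import requests

payload = {
    "text_input": "What is machine learning?",
    "max_tokens": 20,
    "return_num_input_tokens": True,
    "return_num_output_tokens": True,
}
resp = requests.post("http://localhost:8000/v2/models/ensemble/generate", json=payload)
resp.raise_for_status()
body = resp.json()
print("Input token count:", body.get("num_input_tokens"))
print("Output token count:", body.get("num_output_tokens"))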
