diff --git a/python/openai/README.md b/python/openai/README.md
index 5dda869bab..796596e531 100644
--- a/python/openai/README.md
+++ b/python/openai/README.md
@@ -689,3 +689,5 @@ curl -H "api-key: my-secret-key" \
 # Multiple APIs in single argument with shared authentication
 --openai-restricted-api "inference,model-repository shared-key shared-secret"
 ```
+
+#### Add a note about usage metrics limitation
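Note: the README hunk above adds only a placeholder heading for the usage-metrics note. For context, the `usage` object it refers to is the standard OpenAI usage payload returned by the frontend. Below is a minimal sketch of checking it with the `openai` client; the base URL, API key, and model name are placeholders, not values taken from this patch.

```python
import openai

# Placeholder endpoint and model name; adjust for your deployment.
client = openai.OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "What is machine learning?"}],
    max_tokens=16,
)

# With this change, usage is reported for non-vLLM backends too, as long as
# the model returns input/output token counts to the frontend.
usage = response.usage
print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)
```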
- if backend != "vllm": - return None - prompt_tokens = None completion_tokens = None @@ -286,12 +285,22 @@ def _get_usage_from_response( input_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_uint32) ) prompt_tokens = prompt_tokens_ptr[0] + elif input_token_tensor.data_type == tritonserver.DataType.INT32: + prompt_tokens_ptr = ctypes.cast( + input_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_int32) + ) + prompt_tokens = prompt_tokens_ptr[0] if output_token_tensor.data_type == tritonserver.DataType.UINT32: completion_tokens_ptr = ctypes.cast( output_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_uint32) ) completion_tokens = completion_tokens_ptr[0] + elif output_token_tensor.data_type == tritonserver.DataType.INT32: + completion_tokens_ptr = ctypes.cast( + output_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_int32) + ) + completion_tokens = completion_tokens_ptr[0] if prompt_tokens is not None and completion_tokens is not None: total_tokens = prompt_tokens + completion_tokens diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 5402be451d..3939d9a7a8 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -41,9 +41,7 @@ class TestChatCompletions: def client(self, fastapi_client_class_scope): yield fastapi_client_class_scope - def test_chat_completions_defaults( - self, client, model: str, messages: List[dict], backend: str - ): + def test_chat_completions_defaults(self, client, model: str, messages: List[dict]): response = client.post( "/v1/chat/completions", json={"model": model, "messages": messages}, @@ -55,10 +53,7 @@ def test_chat_completions_defaults( assert message["role"] == "assistant" usage = response.json().get("usage") - if backend == "vllm": - assert usage is not None - else: - assert usage is None + assert usage is not None def test_chat_completions_system_prompt(self, client, model: str): # NOTE: Currently just sanity check that there are no issues when a @@ -536,14 +531,7 @@ def test_request_logprobs(self): def test_request_logit_bias(self): pass - def test_usage_response( - self, client, model: str, messages: List[dict], backend: str - ): - if backend != "vllm": - pytest.skip( - "Usage reporting is currently available only for the vLLM backend." 
diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py
index 5402be451d..3939d9a7a8 100644
--- a/python/openai/tests/test_chat_completions.py
+++ b/python/openai/tests/test_chat_completions.py
@@ -41,9 +41,7 @@ class TestChatCompletions:
     def client(self, fastapi_client_class_scope):
         yield fastapi_client_class_scope
 
-    def test_chat_completions_defaults(
-        self, client, model: str, messages: List[dict], backend: str
-    ):
+    def test_chat_completions_defaults(self, client, model: str, messages: List[dict]):
         response = client.post(
             "/v1/chat/completions",
             json={"model": model, "messages": messages},
@@ -55,10 +53,7 @@ def test_chat_completions_defaults(
         assert message["role"] == "assistant"
 
         usage = response.json().get("usage")
-        if backend == "vllm":
-            assert usage is not None
-        else:
-            assert usage is None
+        assert usage is not None
 
     def test_chat_completions_system_prompt(self, client, model: str):
         # NOTE: Currently just sanity check that there are no issues when a
@@ -536,14 +531,7 @@ def test_request_logprobs(self):
     def test_request_logit_bias(self):
         pass
 
-    def test_usage_response(
-        self, client, model: str, messages: List[dict], backend: str
-    ):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
+    def test_usage_response(self, client, model: str, messages: List[dict]):
         response = client.post(
             "/v1/chat/completions",
             json={"model": model, "messages": messages},
diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py
index 9ec3ffe7f7..ecba399398 100644
--- a/python/openai/tests/test_completions.py
+++ b/python/openai/tests/test_completions.py
@@ -35,7 +35,7 @@ class TestCompletions:
     def client(self, fastapi_client_class_scope):
         yield fastapi_client_class_scope
 
-    def test_completions_defaults(self, client, model: str, prompt: str, backend: str):
+    def test_completions_defaults(self, client, model: str, prompt: str):
         response = client.post(
             "/v1/completions",
             json={"model": model, "prompt": prompt},
@@ -48,10 +48,7 @@ def test_completions_defaults(self, client, model: str, prompt: str, backend: st
         assert response.json()["choices"][0]["text"].strip()
 
         usage = response.json().get("usage")
-        if backend == "vllm":
-            assert usage is not None
-        else:
-            assert usage is None
+        assert usage is not None
 
     @pytest.mark.parametrize(
         "sampling_parameter, value",
@@ -371,12 +368,7 @@ def test_lora(self):
     def test_multi_lora(self):
         pass
 
-    def test_usage_response(self, client, model: str, prompt: str, backend: str):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
+    def test_usage_response(self, client, model: str, prompt: str):
         response = client.post(
             "/v1/completions",
             json={"model": model, "prompt": prompt},
diff --git a/python/openai/tests/test_openai_client.py b/python/openai/tests/test_openai_client.py
index 1a1001329b..5ffcbe4f1d 100644
--- a/python/openai/tests/test_openai_client.py
+++ b/python/openai/tests/test_openai_client.py
@@ -49,7 +49,7 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str):
             raise Exception(f"Unexpected backend {backend=}")
 
     def test_openai_client_completion(
-        self, client: openai.OpenAI, model: str, prompt: str, backend: str
+        self, client: openai.OpenAI, model: str, prompt: str
     ):
         completion = client.completions.create(
             prompt=prompt,
@@ -61,19 +61,16 @@ def test_openai_client_completion(
         assert completion.choices[0].finish_reason == "stop"
 
         usage = completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
     def test_openai_client_chat_completion(
-        self, client: openai.OpenAI, model: str, messages: List[dict], backend: str
+        self, client: openai.OpenAI, model: str, messages: List[dict]
     ):
         chat_completion = client.chat.completions.create(
             messages=messages,
@@ -85,16 +82,13 @@ def test_openai_client_chat_completion(
         assert chat_completion.choices[0].finish_reason == "stop"
 
         usage = chat_completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
     @pytest.mark.parametrize("echo", [False, True])
     def test_openai_client_completion_echo(
@@ -141,7 +135,7 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: s
 
     @pytest.mark.asyncio
     async def test_openai_client_completion(
-        self, client: openai.AsyncOpenAI, model: str, prompt: str, backend: str
+        self, client: openai.AsyncOpenAI, model: str, prompt: str
     ):
         completion = await client.completions.create(
             prompt=prompt,
@@ -153,20 +147,17 @@ async def test_openai_client_completion(
         assert completion.choices[0].finish_reason == "stop"
 
         usage = completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
     @pytest.mark.asyncio
     async def test_openai_client_chat_completion(
-        self, client: openai.AsyncOpenAI, model: str, messages: List[dict], backend: str
+        self, client: openai.AsyncOpenAI, model: str, messages: List[dict]
     ):
         chat_completion = await client.chat.completions.create(
             messages=messages,
@@ -177,16 +168,13 @@ async def test_openai_client_chat_completion(
         assert chat_completion.choices[0].finish_reason == "stop"
 
         usage = chat_completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
         print(f"Chat completion results: {chat_completion}")
 
@@ -300,13 +288,8 @@ async def test_chat_streaming(
 
     @pytest.mark.asyncio
     async def test_chat_streaming_usage_option(
-        self, client: openai.AsyncOpenAI, model: str, messages: List[dict], backend: str
+        self, client: openai.AsyncOpenAI, model: str, messages: List[dict]
     ):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
         seed = 0
         temperature = 0.0
         max_tokens = 16
@@ -397,13 +380,8 @@ async def test_chat_streaming_usage_option(
 
     @pytest.mark.asyncio
     async def test_completion_streaming_usage_option(
-        self, client: openai.AsyncOpenAI, model: str, prompt: str, backend: str
+        self, client: openai.AsyncOpenAI, model: str, prompt: str
     ):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
         seed = 0
         temperature = 0.0
         max_tokens = 16
@@ -509,36 +487,3 @@ async def test_stream_options_without_streaming(
             stream_options={"include_usage": True},
         )
         assert "`stream_options` can only be used when `stream` is True" in str(e.value)
-
-    @pytest.mark.asyncio
-    async def test_streaming_usage_unsupported_backend(
-        self, client: openai.AsyncOpenAI, model: str, messages: List[dict], backend: str
-    ):
-        if backend == "vllm":
-            pytest.skip(
-                "This test is for backends that do not support usage reporting."
-            )
-
-        with pytest.raises(openai.BadRequestError) as e:
-            await client.completions.create(
-                model=model,
-                prompt="Test prompt",
-                stream=True,
-                stream_options={"include_usage": True},
-            )
-        assert (
-            "`stream_options.include_usage` is currently only supported for the vLLM backend"
-            in str(e.value)
-        )
-
-        with pytest.raises(openai.BadRequestError) as e:
-            await client.chat.completions.create(
-                model=model,
-                messages=messages,
-                stream=True,
-                stream_options={"include_usage": True},
-            )
-        assert (
-            "`stream_options.include_usage` is currently only supported for the vLLM backend"
-            in str(e.value)
-        )
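For reference, the streaming behavior these updated tests now exercise on every backend can be verified with a short client snippet like the one below; this is a sketch only, and the base URL and model name are placeholders.

```python
import openai

# Placeholder endpoint and model name; adjust for your deployment.
client = openai.OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "What is machine learning?"}],
    stream=True,
    # With include_usage, the final chunk carries usage and has no choices.
    stream_options={"include_usage": True},
)

usage = None
for chunk in stream:
    if chunk.usage is not None:
        usage = chunk.usage

assert usage is not None
assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
```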
diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh
index 9e098a4140..16288efea1 100755
--- a/qa/L0_openai/test.sh
+++ b/qa/L0_openai/test.sh
@@ -70,8 +70,8 @@ function prepare_tensorrtllm() {
     FILL_TEMPLATE="/app/tools/fill_template.py"
     python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${ENGINE_PATH},triton_max_batch_size:64,preprocessing_instance_count:1,max_queue_size:0
    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${ENGINE_PATH},triton_max_batch_size:64,postprocessing_instance_count:1
-    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
-    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32,prompt_embedding_table_data_type:TYPE_FP16
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True,prompt_embedding_table_data_type:TYPE_FP16
 }
 
 function pre_test() {