diff --git a/python/openai/README.md b/python/openai/README.md
index 5dda869bab..796596e531 100644
--- a/python/openai/README.md
+++ b/python/openai/README.md
@@ -689,3 +689,5 @@ curl -H "api-key: my-secret-key" \
 # Multiple APIs in single argument with shared authentication
 --openai-restricted-api "inference,model-repository shared-key shared-secret"
 ```
+
+#### Add a note about usage metrics limitation
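Note: the README hunk above adds only a placeholder heading for the usage-metrics note. For context, the `usage` object it refers to is the standard OpenAI usage payload returned by the frontend. Below is a minimal sketch of checking it with the `openai` client; the base URL, API key, and model name are placeholders, not values taken from this patch.

```python
import openai

# Placeholder endpoint and model name; adjust for your deployment.
client = openai.OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "What is machine learning?"}],
    max_tokens=16,
)

# With this change, usage is reported for non-vLLM backends too, as long as
# the model returns input/output token counts to the frontend.
usage = response.usage
print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)
```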
- if backend != "vllm": - return None - prompt_tokens = None completion_tokens = None @@ -286,12 +285,22 @@ def _get_usage_from_response( input_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_uint32) ) prompt_tokens = prompt_tokens_ptr[0] + elif input_token_tensor.data_type == tritonserver.DataType.INT32: + prompt_tokens_ptr = ctypes.cast( + input_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_int32) + ) + prompt_tokens = prompt_tokens_ptr[0] if output_token_tensor.data_type == tritonserver.DataType.UINT32: completion_tokens_ptr = ctypes.cast( output_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_uint32) ) completion_tokens = completion_tokens_ptr[0] + elif output_token_tensor.data_type == tritonserver.DataType.INT32: + completion_tokens_ptr = ctypes.cast( + output_token_tensor.data_ptr, ctypes.POINTER(ctypes.c_int32) + ) + completion_tokens = completion_tokens_ptr[0] if prompt_tokens is not None and completion_tokens is not None: total_tokens = prompt_tokens + completion_tokens diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 5402be451d..3939d9a7a8 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -41,9 +41,7 @@ class TestChatCompletions: def client(self, fastapi_client_class_scope): yield fastapi_client_class_scope - def test_chat_completions_defaults( - self, client, model: str, messages: List[dict], backend: str - ): + def test_chat_completions_defaults(self, client, model: str, messages: List[dict]): response = client.post( "/v1/chat/completions", json={"model": model, "messages": messages}, @@ -55,10 +53,7 @@ def test_chat_completions_defaults( assert message["role"] == "assistant" usage = response.json().get("usage") - if backend == "vllm": - assert usage is not None - else: - assert usage is None + assert usage is not None def test_chat_completions_system_prompt(self, client, model: str): # NOTE: Currently just sanity check that there are no issues when a @@ -536,14 +531,7 @@ def test_request_logprobs(self): def test_request_logit_bias(self): pass - def test_usage_response( - self, client, model: str, messages: List[dict], backend: str - ): - if backend != "vllm": - pytest.skip( - "Usage reporting is currently available only for the vLLM backend." 
diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py
index 5402be451d..3939d9a7a8 100644
--- a/python/openai/tests/test_chat_completions.py
+++ b/python/openai/tests/test_chat_completions.py
@@ -41,9 +41,7 @@ class TestChatCompletions:
     def client(self, fastapi_client_class_scope):
         yield fastapi_client_class_scope
 
-    def test_chat_completions_defaults(
-        self, client, model: str, messages: List[dict], backend: str
-    ):
+    def test_chat_completions_defaults(self, client, model: str, messages: List[dict]):
         response = client.post(
             "/v1/chat/completions",
             json={"model": model, "messages": messages},
@@ -55,10 +53,7 @@ def test_chat_completions_defaults(
         assert message["role"] == "assistant"
 
         usage = response.json().get("usage")
-        if backend == "vllm":
-            assert usage is not None
-        else:
-            assert usage is None
+        assert usage is not None
 
     def test_chat_completions_system_prompt(self, client, model: str):
         # NOTE: Currently just sanity check that there are no issues when a
@@ -536,14 +531,7 @@ def test_request_logprobs(self):
     def test_request_logit_bias(self):
         pass
 
-    def test_usage_response(
-        self, client, model: str, messages: List[dict], backend: str
-    ):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
+    def test_usage_response(self, client, model: str, messages: List[dict]):
         response = client.post(
             "/v1/chat/completions",
             json={"model": model, "messages": messages},
diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py
index 9ec3ffe7f7..ecba399398 100644
--- a/python/openai/tests/test_completions.py
+++ b/python/openai/tests/test_completions.py
@@ -35,7 +35,7 @@ class TestCompletions:
     def client(self, fastapi_client_class_scope):
         yield fastapi_client_class_scope
 
-    def test_completions_defaults(self, client, model: str, prompt: str, backend: str):
+    def test_completions_defaults(self, client, model: str, prompt: str):
         response = client.post(
             "/v1/completions",
             json={"model": model, "prompt": prompt},
@@ -48,10 +48,7 @@ def test_completions_defaults(self, client, model: str, prompt: str, backend: st
         assert response.json()["choices"][0]["text"].strip()
 
         usage = response.json().get("usage")
-        if backend == "vllm":
-            assert usage is not None
-        else:
-            assert usage is None
+        assert usage is not None
 
     @pytest.mark.parametrize(
         "sampling_parameter, value",
@@ -371,12 +368,7 @@ def test_lora(self):
     def test_multi_lora(self):
         pass
 
-    def test_usage_response(self, client, model: str, prompt: str, backend: str):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
+    def test_usage_response(self, client, model: str, prompt: str):
         response = client.post(
             "/v1/completions",
             json={"model": model, "prompt": prompt},
diff --git a/python/openai/tests/test_openai_client.py b/python/openai/tests/test_openai_client.py
index 1a1001329b..5ffcbe4f1d 100644
--- a/python/openai/tests/test_openai_client.py
+++ b/python/openai/tests/test_openai_client.py
@@ -49,7 +49,7 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str):
             raise Exception(f"Unexpected backend {backend=}")
 
     def test_openai_client_completion(
-        self, client: openai.OpenAI, model: str, prompt: str, backend: str
+        self, client: openai.OpenAI, model: str, prompt: str
     ):
         completion = client.completions.create(
             prompt=prompt,
@@ -61,19 +61,16 @@ def test_openai_client_completion(
         assert completion.choices[0].finish_reason == "stop"
 
         usage = completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
     def test_openai_client_chat_completion(
-        self, client: openai.OpenAI, model: str, messages: List[dict], backend: str
+        self, client: openai.OpenAI, model: str, messages: List[dict]
     ):
         chat_completion = client.chat.completions.create(
             messages=messages,
@@ -85,16 +82,13 @@ def test_openai_client_chat_completion(
         assert chat_completion.choices[0].finish_reason == "stop"
 
         usage = chat_completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
     @pytest.mark.parametrize("echo", [False, True])
     def test_openai_client_completion_echo(
@@ -141,7 +135,7 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: s
 
     @pytest.mark.asyncio
     async def test_openai_client_completion(
-        self, client: openai.AsyncOpenAI, model: str, prompt: str, backend: str
+        self, client: openai.AsyncOpenAI, model: str, prompt: str
     ):
         completion = await client.completions.create(
             prompt=prompt,
@@ -153,20 +147,17 @@ async def test_openai_client_completion(
         assert completion.choices[0].finish_reason == "stop"
 
         usage = completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
     @pytest.mark.asyncio
     async def test_openai_client_chat_completion(
-        self, client: openai.AsyncOpenAI, model: str, messages: List[dict], backend: str
+        self, client: openai.AsyncOpenAI, model: str, messages: List[dict]
     ):
         chat_completion = await client.chat.completions.create(
             messages=messages,
@@ -177,16 +168,13 @@ async def test_openai_client_chat_completion(
         assert chat_completion.choices[0].finish_reason == "stop"
 
         usage = chat_completion.usage
-        if backend == "vllm":
-            assert usage is not None
-            assert isinstance(usage.prompt_tokens, int)
-            assert isinstance(usage.completion_tokens, int)
-            assert isinstance(usage.total_tokens, int)
-            assert usage.prompt_tokens > 0
-            assert usage.completion_tokens > 0
-            assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
-        else:
-            assert usage is None
+        assert usage is not None
+        assert isinstance(usage.prompt_tokens, int)
+        assert isinstance(usage.completion_tokens, int)
+        assert isinstance(usage.total_tokens, int)
+        assert usage.prompt_tokens > 0
+        assert usage.completion_tokens > 0
+        assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
 
         print(f"Chat completion results: {chat_completion}")
 
@@ -300,13 +288,8 @@ async def test_chat_streaming(
 
     @pytest.mark.asyncio
     async def test_chat_streaming_usage_option(
-        self, client: openai.AsyncOpenAI, model: str, messages: List[dict], backend: str
+        self, client: openai.AsyncOpenAI, model: str, messages: List[dict]
     ):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
         seed = 0
         temperature = 0.0
         max_tokens = 16
@@ -397,13 +380,8 @@ async def test_chat_streaming_usage_option(
 
     @pytest.mark.asyncio
     async def test_completion_streaming_usage_option(
-        self, client: openai.AsyncOpenAI, model: str, prompt: str, backend: str
+        self, client: openai.AsyncOpenAI, model: str, prompt: str
     ):
-        if backend != "vllm":
-            pytest.skip(
-                "Usage reporting is currently available only for the vLLM backend."
-            )
-
         seed = 0
         temperature = 0.0
         max_tokens = 16
@@ -509,36 +487,3 @@ async def test_stream_options_without_streaming(
             stream_options={"include_usage": True},
         )
         assert "`stream_options` can only be used when `stream` is True" in str(e.value)
-
-    @pytest.mark.asyncio
-    async def test_streaming_usage_unsupported_backend(
-        self, client: openai.AsyncOpenAI, model: str, messages: List[dict], backend: str
-    ):
-        if backend == "vllm":
-            pytest.skip(
-                "This test is for backends that do not support usage reporting."
-            )
-
-        with pytest.raises(openai.BadRequestError) as e:
-            await client.completions.create(
-                model=model,
-                prompt="Test prompt",
-                stream=True,
-                stream_options={"include_usage": True},
-            )
-        assert (
-            "`stream_options.include_usage` is currently only supported for the vLLM backend"
-            in str(e.value)
-        )
-
-        with pytest.raises(openai.BadRequestError) as e:
-            await client.chat.completions.create(
-                model=model,
-                messages=messages,
-                stream=True,
-                stream_options={"include_usage": True},
-            )
-        assert (
-            "`stream_options.include_usage` is currently only supported for the vLLM backend"
-            in str(e.value)
-        )
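For reference, the streaming behavior these updated tests now exercise on every backend can be verified with a short client snippet like the one below; this is a sketch only, and the base URL and model name are placeholders.

```python
import openai

# Placeholder endpoint and model name; adjust for your deployment.
client = openai.OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "What is machine learning?"}],
    stream=True,
    # With include_usage, the final chunk carries usage and has no choices.
    stream_options={"include_usage": True},
)

usage = None
for chunk in stream:
    if chunk.usage is not None:
        usage = chunk.usage

assert usage is not None
assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens
```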
diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh
index 9e098a4140..16288efea1 100755
--- a/qa/L0_openai/test.sh
+++ b/qa/L0_openai/test.sh
@@ -70,8 +70,8 @@ function prepare_tensorrtllm() {
     FILL_TEMPLATE="/app/tools/fill_template.py"
     python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${ENGINE_PATH},triton_max_batch_size:64,preprocessing_instance_count:1,max_queue_size:0
    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${ENGINE_PATH},triton_max_batch_size:64,postprocessing_instance_count:1
-    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
-    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32,prompt_embedding_table_data_type:TYPE_FP16
+    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True,prompt_embedding_table_data_type:TYPE_FP16
 }
 
 function pre_test() {