Commit 30e20cb

fix: Support max_completion_tokens option in OpenAI frontend (#8226)
1 parent 4184494 commit 30e20cb

File tree: 9 files changed, +145 −71 lines changed

python/openai/README.md

Lines changed: 4 additions & 29 deletions

@@ -216,7 +216,7 @@ completion = client.chat.completions.create(
         },
         {"role": "user", "content": "What are LLMs?"},
     ],
-    max_tokens=256,
+    max_completion_tokens=256,
 )
 
 print(completion.choices[0].message.content)

@@ -487,7 +487,7 @@ messages = [
 ]
 
 tool_calls = client.chat.completions.create(
-    messages=messages, model=model, tools=tools, max_tokens=128
+    messages=messages, model=model, tools=tools, max_completion_tokens=128
 )
 function_name = tool_calls.choices[0].message.tool_calls[0].function.name
 function_arguments = tool_calls.choices[0].message.tool_calls[0].function.arguments

@@ -504,31 +504,6 @@ function arguments: {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}
 tool calling result: The weather in Dallas, Texas is 85 degrees fahrenheit. It is partly cloudly, with highs in the 90's.
 ```
 
-<!-- TODO: Remove this warning when the openai api supports the max_completion_tokens instead of max_tokens -->
-> [!WARNING]
-> When using LangChain to call the `v1/chat/completions` endpoint, you might encounter an exception related to `max_completion_tokens` if you have specified `max_tokens` in the request.
->
-> Example: `openai.BadRequestError: Error code: 400 - {'object': 'error', 'message': "[{'type': 'extra_forbidden', 'loc': ('body', 'max_completion_tokens'), 'msg': 'Extra inputs are not permitted', 'input': 800}]", 'type': 'BadRequestError', 'param': None, 'code': 400}`
->
-> This issue is due to an incompatibility between Triton's OpenAI API frontend and the latest OpenAI API. We are actively working to address this gap. A workaround is adding the `max_tokens` into the `model_kwargs` of the LangChain OpenAI request.
->
-> Example:
-```python
-from langchain.llms import OpenAI
-
-llm = OpenAI(
-    model_name="llama-3.1-8b-instruct",
-    temperature=0.0,
-    model_kwargs={
-        "max_tokens": 4096
-    }
-)
-
-response = llm("Write a short poem about a sunset.")
-print(response)
-
-```
-
 #### Named Tool Calling
 
 The OpenAI frontend supports named function calling, utilizing guided decoding in the vLLM and TensorRT-LLM backends. Users can specify one of the tools in `tool_choice` to force the model to select a specific tool for function calling.

@@ -639,12 +614,12 @@ messages = [
 ]
 
 tool_calls = client.chat.completions.create(
-    messages=messages, model=model, tools=tools, tool_choice=tool_choice, max_tokens=128
+    messages=messages, model=model, tools=tools, tool_choice=tool_choice, max_completion_tokens=128
 )
 function_name = tool_calls.choices[0].message.tool_calls[0].function.name
 function_arguments = tool_calls.choices[0].message.tool_calls[0].function.arguments
 
-print(f"function name: "{function_name}")
+print(f"function name: {function_name}")
 print(f"function arguments: {function_arguments}")
 print(f"tool calling result: {available_tools[function_name](**json.loads(function_arguments))}")
 ```
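
For readers following along outside the diff, here is a minimal end-to-end sketch of the updated usage. It assumes the OpenAI-compatible frontend is already running at `http://localhost:9000/v1`, serving a model named `llama-3.1-8b-instruct`, and not validating API keys; all three are illustrative assumptions, not taken from this commit. The LangChain warning removed above is obsolete for the same reason this sketch works: the frontend now accepts `max_completion_tokens` directly.

```python
# Minimal sketch of the documented usage after this change (not code from the commit).
# Assumptions: frontend reachable at localhost:9000, model name "llama-3.1-8b-instruct",
# and no API-key validation server-side, so a placeholder key is fine.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "What are LLMs?"}],
    max_completion_tokens=256,  # preferred field; max_tokens still works but is deprecated
)
print(completion.choices[0].message.content)
```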

python/openai/openai_frontend/engine/triton_engine.py

Lines changed: 10 additions & 2 deletions

@@ -102,6 +102,7 @@ def __init__(
         self,
         server: tritonserver.Server,
         tokenizer: str,
+        default_max_tokens: int,
         backend: Optional[str] = None,
         lora_separator: Optional[str] = None,
         tool_call_parser: Optional[str] = None,

@@ -113,6 +114,7 @@ def __init__(
         # TODO: Reconsider name of "backend" vs. something like "request_format"
         self.backend = backend
         self.lora_separator = lora_separator
+        self.default_max_tokens = default_max_tokens
 
         # NOTE: Creation time and model metadata will be static at startup for
         # now, and won't account for dynamically loading/unloading models.

@@ -184,7 +186,9 @@ async def chat(
 
         # Convert to Triton request format and perform inference
         responses = metadata.model.async_infer(
-            metadata.request_converter(metadata.model, prompt, request, lora_name)
+            metadata.request_converter(
+                metadata.model, prompt, request, lora_name, self.default_max_tokens
+            )
         )
 
         # Prepare and send responses back to client in OpenAI format

@@ -302,7 +306,11 @@ async def completion(
         # Convert to Triton request format and perform inference
         responses = metadata.model.async_infer(
             metadata.request_converter(
-                metadata.model, request.prompt, request, lora_name
+                metadata.model,
+                request.prompt,
+                request,
+                lora_name,
+                self.default_max_tokens,
             )
         )
 

python/openai/openai_frontend/engine/utils/triton.py

Lines changed: 38 additions & 1 deletion

@@ -46,6 +46,7 @@ def _create_vllm_inference_request(
     prompt,
     request: CreateChatCompletionRequest | CreateCompletionRequest,
     lora_name: str | None,
+    default_max_tokens: int,
 ):
     inputs = {}
     # Exclude non-sampling parameters so they aren't passed to vLLM

@@ -67,6 +68,9 @@ def _create_vllm_inference_request(
         "function_call",
         "functions",
         "suffix",
+        "max_completion_tokens",
+        # will be handled explicitly
+        "max_tokens",
     }
 
     # NOTE: The exclude_none is important, as internals may not support

@@ -75,6 +79,23 @@ def _create_vllm_inference_request(
         exclude=excludes,
         exclude_none=True,
     )
+
+    # Indicates CreateChatCompletionRequest
+    if hasattr(request, "max_completion_tokens"):
+        if request.max_completion_tokens is not None:
+            sampling_parameters["max_tokens"] = request.max_completion_tokens
+        # Fallback to deprecated request.max_tokens
+        elif request.max_tokens is not None:
+            sampling_parameters["max_tokens"] = request.max_tokens
+        # If neither is set, use a default value for max_tokens
+        else:
+            sampling_parameters["max_tokens"] = default_max_tokens
+    # Indicates CreateCompletionRequest
+    elif request.max_tokens is not None:
+        sampling_parameters["max_tokens"] = request.max_tokens
+    else:
+        sampling_parameters["max_tokens"] = default_max_tokens
+
     if lora_name is not None:
         sampling_parameters["lora_name"] = lora_name
     sampling_parameters = json.dumps(sampling_parameters)

@@ -108,15 +129,31 @@ def _create_trtllm_inference_request(
     prompt,
     request: CreateChatCompletionRequest | CreateCompletionRequest,
     lora_name: str | None,
+    default_max_tokens: int,
 ):
     if lora_name is not None:
         raise Exception("LoRA selection is currently not supported for TRT-LLM backend")
 
     inputs = {}
     inputs["text_input"] = [[prompt]]
     inputs["stream"] = np.bool_([[request.stream]])
-    if request.max_tokens:
+
+    # Indicates CreateChatCompletionRequest
+    if hasattr(request, "max_completion_tokens"):
+        if request.max_completion_tokens is not None:
+            inputs["max_tokens"] = np.int32([[request.max_completion_tokens]])
+        # Fallback to deprecated request.max_tokens
+        elif request.max_tokens is not None:
+            inputs["max_tokens"] = np.int32([[request.max_tokens]])
+        # If neither is set, use a default value for max_tokens
+        else:
+            inputs["max_tokens"] = np.int32([[default_max_tokens]])
+    # Indicates CreateCompletionRequest
+    elif request.max_tokens is not None:
         inputs["max_tokens"] = np.int32([[request.max_tokens]])
+    else:
+        inputs["max_tokens"] = np.int32([[default_max_tokens]])
+
     if request.stop:
         if isinstance(request.stop, str):
            request.stop = [request.stop]
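
Both converters apply the same resolution order: `max_completion_tokens` (present only on chat requests) wins, then the deprecated `max_tokens`, then the server-side default. The following is a standalone sketch of that precedence, not code from the commit; `resolve_max_tokens` and `_Req` are made-up names for illustration.

```python
def resolve_max_tokens(request, default_max_tokens: int) -> int:
    """Mirror the precedence used above; only chat requests carry max_completion_tokens."""
    if getattr(request, "max_completion_tokens", None) is not None:
        return request.max_completion_tokens
    if getattr(request, "max_tokens", None) is not None:
        return request.max_tokens
    return default_max_tokens


class _Req:  # made-up stand-in for a parsed chat request
    max_completion_tokens = None
    max_tokens = 128


print(resolve_max_tokens(_Req(), default_max_tokens=16))  # -> 128 (deprecated field honored)
```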

python/openai/openai_frontend/main.py

Lines changed: 8 additions & 0 deletions

@@ -143,6 +143,13 @@ def parse_args():
         help="The path to the custom Jinja chat template file. This is useful if you'd like to use a different chat template than the one provided by the model.",
     )
 
+    triton_group.add_argument(
+        "--default-max-tokens",
+        type=int,
+        default=16,
+        help="The default maximum number of tokens to generate if not specified in the request. The default is 16.",
+    )
+
     # OpenAI-Compatible Frontend (FastAPI)
     openai_group = parser.add_argument_group("Triton OpenAI-Compatible Frontend")
     openai_group.add_argument(

@@ -199,6 +206,7 @@ def main():
         lora_separator=args.lora_separator,
         tool_call_parser=args.tool_call_parser,
         chat_template=args.chat_template,
+        default_max_tokens=args.default_max_tokens,
     )
 
     # Attach TritonLLMEngine as the backbone for inference and model management
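
In practice this makes the fallback applied by the converters configurable at launch time, for example by passing `--default-max-tokens 64` to the frontend's `main.py` alongside its other launch options (the value 64 is illustrative). The flag's default of 16 matches the value that was previously hard-coded as the schema default for `max_tokens`.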

python/openai/openai_frontend/schemas/openai.py

Lines changed: 9 additions & 13 deletions

@@ -103,7 +103,7 @@ class CreateCompletionRequest(BaseModel):
         description="Include the log probabilities on the `logprobs` most likely output tokens, as well the chosen tokens. For example, if `logprobs` is 5, the API will return a list of the 5 most likely tokens. The API will always return the `logprob` of the sampled token, so there may be up to `logprobs+1` elements in the response.\n\nThe maximum value for `logprobs` is 5.\n",
     )
     max_tokens: Optional[conint(ge=0)] = Field(
-        16,
+        None,
         description="The maximum number of [tokens](/tokenizer) that can be generated in the completion.\n\nThe token count of your prompt plus `max_tokens` cannot exceed the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n",
         examples=[16],
     )

@@ -526,14 +526,6 @@ class Logprobs2(BaseModel):
     )
 
 
-class ChatCompletionFinishReason(Enum):
-    stop = "stop"
-    length = "length"
-    tool_calls = "tool_calls"
-    content_filter = "content_filter"
-    function_call = "function_call"
-
-
 class ChatCompletionStreamingResponseChoice(BaseModel):
     delta: ChatCompletionStreamResponseDelta
     logprobs: Optional[Logprobs2] = Field(

@@ -850,11 +842,15 @@ class CreateChatCompletionRequest(BaseModel):
         None,
         description="An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. `logprobs` must be set to `true` if this parameter is used.",
     )
-    # TODO: Consider new max_completion_tokens field in the future: https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_completion_tokens
-    max_tokens: Optional[conint(ge=0)] = Field(
-        16,
+    max_completion_tokens: Optional[conint(ge=0)] = Field(
+        None,
         description="The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n",
     )
+    # TODO: Remove support for max_tokens field in the future: https://platform.openai.com/docs/api-reference/chat/create#chat-create-max_completion_tokens
+    max_tokens: Optional[conint(ge=0)] = Field(
+        None,
+        description="DEPRECATED: Use `max_completion_tokens` instead. The maximum number of [tokens](/tokenizer) that can be generated in the chat completion.\n\nThe total length of input tokens and generated tokens is limited by the model's context length. [Example Python code](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken) for counting tokens.\n",
+    )
     # TODO: Extension, flesh out description and defaults
     min_tokens: Optional[conint(ge=0)] = Field(
         None,

@@ -871,7 +867,7 @@ class CreateChatCompletionRequest(BaseModel):
     )
     response_format: Optional[ResponseFormat] = Field(
         None,
-        description='An object specifying the format that the model must output. Compatible with [GPT-4 Turbo](/docs/models/gpt-4-and-gpt-4-turbo) and all GPT-3.5 Turbo models newer than `gpt-3.5-turbo-1106`.\n\nSetting to `{ "type": "json_object" }` enables JSON mode, which guarantees the message the model generates is valid JSON.\n\n**Important:** when using JSON mode, you **must** also instruct the model to produce JSON yourself via a system or user message. Without this, the model may generate an unending stream of whitespace until the generation reaches the token limit, resulting in a long-running and seemingly "stuck" request. Also note that the message content may be partially cut off if `finish_reason="length"`, which indicates the generation exceeded `max_tokens` or the conversation exceeded the max context length.\n',
+        description='An object specifying the format that the model must output. Compatible with [GPT-4 Turbo](/docs/models/gpt-4-and-gpt-4-turbo) and all GPT-3.5 Turbo models newer than `gpt-3.5-turbo-1106`.\n\nSetting to `{ "type": "json_object" }` enables JSON mode, which guarantees the message the model generates is valid JSON.\n\n**Important:** when using JSON mode, you **must** also instruct the model to produce JSON yourself via a system or user message. Without this, the model may generate an unending stream of whitespace until the generation reaches the token limit, resulting in a long-running and seemingly "stuck" request. Also note that the message content may be partially cut off if `finish_reason="length"`, which indicates the generation exceeded `max_completion_tokens` or the conversation exceeded the max context length.\n',
     )
     seed: Optional[conint(ge=-9223372036854775808, le=9223372036854775807)] = Field(
         None,
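
The net effect of the schema change is that neither field carries an implicit default of 16 any more: an unset value reaches the engine as `None`, and the `--default-max-tokens` fallback is applied server-side. Below is a minimal, self-contained sketch of the new field shapes; the model name `ChatRequestSketch` is made up for illustration and is not the project's actual schema class.

```python
from typing import Optional

from pydantic import BaseModel, Field, conint


class ChatRequestSketch(BaseModel):
    # Preferred field, mirroring the new schema: optional, non-negative, no implicit default.
    max_completion_tokens: Optional[conint(ge=0)] = Field(None)
    # Deprecated alias kept for backward compatibility.
    max_tokens: Optional[conint(ge=0)] = Field(None)


print(ChatRequestSketch().max_completion_tokens)  # None -> server-side default applies
print(ChatRequestSketch(max_completion_tokens=256).max_completion_tokens)  # 256
```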

python/openai/tests/test_chat_completions.py

Lines changed: 48 additions & 10 deletions

@@ -123,6 +123,7 @@ def test_chat_completions_user_prompt_dict(self, client, model: str):
        [
            ("temperature", 0.7),
            ("max_tokens", 10),
+            ("max_completion_tokens", 10),
            ("top_p", 0.9),
            ("frequency_penalty", 0.5),
            ("presence_penalty", 0.2),

@@ -172,6 +173,7 @@ def test_chat_completions_sampling_parameters(
            ("temperature", 2.1),
            ("temperature", -0.1),
            ("max_tokens", -1),
+            ("max_completion_tokens", -1),
            ("top_p", 1.1),
            ("frequency_penalty", 3),
            ("frequency_penalty", -3),

@@ -199,14 +201,21 @@ def test_chat_completions_invalid_sampling_parameters(
         assert response.status_code == 422
 
     # Simple tests to verify max_tokens roughly behaves as expected
+    @pytest.mark.parametrize(
+        "max_tokens_key",
+        [
+            "max_tokens",
+            "max_completion_tokens",
+        ],
+    )
     def test_chat_completions_max_tokens(
-        self, client, model: str, messages: List[dict]
+        self, client, max_tokens_key, model: str, messages: List[dict]
     ):
         responses = []
-        payload = {"model": model, "messages": messages, "max_tokens": 1}
+        payload = {"model": model, "messages": messages}
 
-        # Send two requests with max_tokens = 1 to check their similarity
-        payload["max_tokens"] = 1
+        # Send two requests with max_tokens/max_completion_tokens = 1 to check their similarity
+        payload[max_tokens_key] = 1
         responses.append(
             client.post(
                 "/v1/chat/completions",

@@ -219,8 +228,8 @@ def test_chat_completions_max_tokens(
                 json=payload,
             )
         )
-        # Send one requests with larger max_tokens to check its dis-similarity
-        payload["max_tokens"] = 100
+        # Send one requests with larger max_tokens/max_completion_tokens to check its dis-similarity
+        payload[max_tokens_key] = 100
         responses.append(
             client.post(
                 "/v1/chat/completions",

@@ -245,6 +254,30 @@ def test_chat_completions_max_tokens(
         assert len(response1_text) == len(response2_text) == 1
         assert len(response3_text) > len(response1_text)
 
+    def test_chat_completions_max_completion_tokens_precedence(
+        self, client, model: str, messages: List[dict]
+    ):
+        payload = {
+            "model": model,
+            "messages": messages,
+            "max_tokens": 50,  # Higher value for max_tokens
+            "max_completion_tokens": 1,  # Lower, expected to take precedence
+        }
+
+        response = client.post(
+            "/v1/chat/completions",
+            json=payload,
+        )
+
+        print("Response:", response.json())
+        assert response.status_code == 200
+
+        response_text_words = (
+            response.json()["choices"][0]["message"]["content"].strip().split()
+        )
+        # Check if the number of words is around max_completion_tokens
+        assert len(response_text_words) == 1
+
     @pytest.mark.parametrize(
         "temperature",
         [0.0, 1.0],

@@ -260,7 +293,7 @@ def test_chat_completions_temperature_vllm(
         payload = {
             "model": model,
             "messages": messages,
-            "max_tokens": 256,
+            "max_completion_tokens": 256,
             "temperature": temperature,
         }
 

@@ -321,7 +354,7 @@ def test_chat_completions_temperature_tensorrtllm(
             "model": model,
             "messages": messages,
             # Increase token length to allow more room for variability
-            "max_tokens": 200,
+            "max_completion_tokens": 200,
             "temperature": 0.0,
             # TRT-LLM requires certain settings of `top_k` / `top_p` to
             # respect changes in `temperature`

@@ -376,7 +409,7 @@ def test_chat_completions_seed(self, client, model: str, messages: List[dict]):
             "model": model,
             "messages": messages,
             # Increase token length to allow more room for variability
-            "max_tokens": 200,
+            "max_completion_tokens": 200,
             "seed": 1,
         }
         payload2 = copy.deepcopy(payload1)

@@ -559,7 +592,12 @@ def test_chat_completions_custom_tokenizer(
 
         responses = []
         with TestClient(app_local) as client_local, TestClient(app_hf) as client_hf:
-            payload = {"model": model, "messages": messages, "temperature": 0}
+            payload = {
+                "model": model,
+                "messages": messages,
+                "temperature": 0,
+                "seed": 0,
+            }
             responses.append(client_local.post("/v1/chat/completions", json=payload))
             responses.append(client_hf.post("/v1/chat/completions", json=payload))
 
