
Commit 802e7dd

ultmaster and claude committed
Add return_token_ids_alongside parameter to OpenAI API endpoints
- Add optional return_token_ids_alongside parameter to ChatCompletionRequest and CompletionRequest
- Include token_ids and prompt_token_ids fields in response models when requested
- Implement conditional logic in serving endpoints to return token IDs alongside generated text
- Useful for debugging and agent scenarios where token-level tracing is needed

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent c498483 commit 802e7dd

File tree

3 files changed (+45 −1 lines):
- vllm/entrypoints/openai/protocol.py
- vllm/entrypoints/openai/serving_chat.py
- vllm/entrypoints/openai/serving_completion.py

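As a quick orientation before the diffs, here is a minimal client-side sketch of how the new flag might be exercised against a vLLM OpenAI-compatible server. The base URL and model name are placeholders, and the response field names follow the protocol changes below.

# Sketch only: assumes a server built with this commit is running at
# localhost:8000 and serves a model named "my-model" (both placeholders).
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "return_token_ids_alongside": True,  # new vLLM-specific request flag
    },
)
body = resp.json()

# Per the protocol changes: prompt token IDs sit on the top-level chat
# response, generated token IDs on each choice.
print(body.get("prompt_token_ids"))
print(body["choices"][0].get("token_ids"))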

vllm/entrypoints/openai/protocol.py

Lines changed: 23 additions & 0 deletions
@@ -567,6 +567,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "If specified with 'logprobs', tokens are represented "
         " as strings of the form 'token_id:{token_id}' so that tokens "
         "that are not JSON-encodable can be identified."))
+    return_token_ids_alongside: Optional[bool] = Field(
+        default=False,
+        description=(
+            "If specified, the result will include both prompt and response "
+            "token ids alongside the generated text. "
+            "This is useful for debugging or when you "
+            "need to map generated text back to input tokens."
+        )
+    )
     cache_salt: Optional[str] = Field(
         default=None,
         description=(
@@ -1053,6 +1062,15 @@ class CompletionRequest(OpenAIBaseModel):
         "If specified with 'logprobs', tokens are represented "
         " as strings of the form 'token_id:{token_id}' so that tokens "
         "that are not JSON-encodable can be identified."))
+    return_token_ids_alongside: Optional[bool] = Field(
+        default=False,
+        description=(
+            "If specified, the result will include both prompt and response "
+            "token ids alongside the generated text. "
+            "This is useful for debugging or when you "
+            "need to map generated text back to input tokens."
+        )
+    )

     cache_salt: Optional[str] = Field(
         default=None,
@@ -1471,7 +1489,9 @@ class CompletionResponseChoice(OpenAIBaseModel):
         "to stop, None if the completion finished for some other reason "
         "including encountering the EOS token"),
     )
+    token_ids: Optional[list[int]] = None
     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+    prompt_token_ids: Optional[list[int]] = None


 class CompletionResponse(OpenAIBaseModel):
@@ -1671,6 +1691,8 @@ class ChatCompletionResponseChoice(OpenAIBaseModel):
     finish_reason: Optional[str] = "stop"
     # not part of the OpenAI spec but included in vLLM for legacy reasons
     stop_reason: Optional[Union[int, str]] = None
+    # not part of the OpenAI spec but is useful for tracing the tokens in agent scenarios
+    token_ids: Optional[list[int]] = None


 class ChatCompletionResponse(OpenAIBaseModel):
@@ -1686,6 +1708,7 @@ class ChatCompletionResponse(OpenAIBaseModel):

     # vLLM-specific fields that are not in OpenAI spec
     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+    prompt_token_ids: Optional[list[int]] = None
     kv_transfer_params: Optional[dict[str, Any]] = Field(
         default=None, description="KVTransfer parameters.")
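To illustrate the request-side change in isolation, here is a rough sketch of the new field on the Pydantic models (assuming a vLLM install that includes this commit; the exact set of other required fields on these models may differ slightly between versions):

# Rough sketch: the flag defaults to False on both request models.
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionRequest)

chat_req = ChatCompletionRequest(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Hi"}],
    return_token_ids_alongside=True,
)
assert chat_req.return_token_ids_alongside is True

comp_req = CompletionRequest(model="my-model", prompt="Hi")
assert comp_req.return_token_ids_alongside is False  # default per this diff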

vllm/entrypoints/openai/serving_chat.py

Lines changed: 12 additions & 1 deletion
@@ -1259,7 +1259,13 @@ async def chat_completion_full_generator(
                 logprobs=logprobs,
                 finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
-                stop_reason=output.stop_reason)
+                stop_reason=output.stop_reason,
+                token_ids=(
+                    token_ids
+                    if request.return_token_ids_alongside
+                    else None
+                ),
+            )

             choices.append(choice_data)

@@ -1300,6 +1306,11 @@ async def chat_completion_full_generator(
             choices=choices,
             usage=usage,
             prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),
+            prompt_token_ids=(
+                final_res.prompt_token_ids
+                if request.return_token_ids_alongside
+                else None
+            ),
             kv_transfer_params=final_res.kv_transfer_params,
         )

vllm/entrypoints/openai/serving_completion.py

Lines changed: 10 additions & 0 deletions
@@ -548,6 +548,16 @@ def request_output_to_completion_response(
                 finish_reason=output.finish_reason,
                 stop_reason=output.stop_reason,
                 prompt_logprobs=final_res.prompt_logprobs,
+                prompt_token_ids=(
+                    prompt_token_ids
+                    if request.return_token_ids_alongside
+                    else None
+                ),
+                token_ids=(
+                    token_ids
+                    if request.return_token_ids_alongside
+                    else None
+                ),
             )
             choices.append(choice_data)

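Note that for the legacy completions endpoint the placement differs from chat: per the CompletionResponseChoice change above, both prompt_token_ids and token_ids are returned on each choice. A sketch under the same placeholder server/model assumptions as the chat example:

# Sketch only: same placeholder assumptions as the chat example above.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "my-model",
        "prompt": "The capital of France is",
        "max_tokens": 8,
        "return_token_ids_alongside": True,
    },
)
choice = resp.json()["choices"][0]

# For /v1/completions both ID lists live on the choice itself.
print(choice.get("prompt_token_ids"))
print(choice.get("token_ids"))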