
Commit 802e7dd

ultmaster and claude committed
Add return_token_ids_alongside parameter to OpenAI API endpoints
- Add optional return_token_ids_alongside parameter to ChatCompletionRequest and CompletionRequest
- Include token_ids and prompt_token_ids fields in response models when requested
- Implement conditional logic in serving endpoints to return token IDs alongside generated text
- Useful for debugging and agent scenarios where token-level tracing is needed

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent c498483 commit 802e7dd

File tree

3 files changed (+45 −1 lines):
- vllm/entrypoints/openai/protocol.py
- vllm/entrypoints/openai/serving_chat.py
- vllm/entrypoints/openai/serving_completion.py

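As a quick orientation before the diffs, here is a minimal client-side sketch of how the new flag might be exercised against a vLLM OpenAI-compatible server. The base URL and model name are placeholders, and the response field names follow the protocol changes below.

# Sketch only: assumes a server built with this commit is running at
# localhost:8000 and serves a model named "my-model" (both placeholders).
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "return_token_ids_alongside": True,  # new vLLM-specific request flag
    },
)
body = resp.json()

# Per the protocol changes: prompt token IDs sit on the top-level chat
# response, generated token IDs on each choice.
print(body.get("prompt_token_ids"))
print(body["choices"][0].get("token_ids"))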

vllm/entrypoints/openai/protocol.py

Lines changed: 23 additions & 0 deletions
@@ -567,6 +567,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "If specified with 'logprobs', tokens are represented "
         " as strings of the form 'token_id:{token_id}' so that tokens "
         "that are not JSON-encodable can be identified."))
+    return_token_ids_alongside: Optional[bool] = Field(
+        default=False,
+        description=(
+            "If specified, the result will include both prompt and response "
+            "token ids alongside the generated text. "
+            "This is useful for debugging or when you "
+            "need to map generated text back to input tokens."
+        )
+    )
     cache_salt: Optional[str] = Field(
         default=None,
         description=(
@@ -1053,6 +1062,15 @@ class CompletionRequest(OpenAIBaseModel):
         "If specified with 'logprobs', tokens are represented "
         " as strings of the form 'token_id:{token_id}' so that tokens "
         "that are not JSON-encodable can be identified."))
+    return_token_ids_alongside: Optional[bool] = Field(
+        default=False,
+        description=(
+            "If specified, the result will include both prompt and response "
+            "token ids alongside the generated text. "
+            "This is useful for debugging or when you "
+            "need to map generated text back to input tokens."
+        )
+    )

     cache_salt: Optional[str] = Field(
         default=None,
@@ -1471,7 +1489,9 @@ class CompletionResponseChoice(OpenAIBaseModel):
         "to stop, None if the completion finished for some other reason "
         "including encountering the EOS token"),
     )
+    token_ids: Optional[list[int]] = None
     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+    prompt_token_ids: Optional[list[int]] = None


 class CompletionResponse(OpenAIBaseModel):
@@ -1671,6 +1691,8 @@ class ChatCompletionResponseChoice(OpenAIBaseModel):
     finish_reason: Optional[str] = "stop"
     # not part of the OpenAI spec but included in vLLM for legacy reasons
     stop_reason: Optional[Union[int, str]] = None
+    # not part of the OpenAI spec but is useful for tracing the tokens in agent scenarios
+    token_ids: Optional[list[int]] = None


 class ChatCompletionResponse(OpenAIBaseModel):
@@ -1686,6 +1708,7 @@ class ChatCompletionResponse(OpenAIBaseModel):

     # vLLM-specific fields that are not in OpenAI spec
     prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
+    prompt_token_ids: Optional[list[int]] = None
     kv_transfer_params: Optional[dict[str, Any]] = Field(
         default=None, description="KVTransfer parameters.")
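To illustrate the request-side change in isolation, here is a rough sketch of the new field on the Pydantic models (assuming a vLLM install that includes this commit; the exact set of other required fields on these models may differ slightly between versions):

# Rough sketch: the flag defaults to False on both request models.
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionRequest)

chat_req = ChatCompletionRequest(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Hi"}],
    return_token_ids_alongside=True,
)
assert chat_req.return_token_ids_alongside is True

comp_req = CompletionRequest(model="my-model", prompt="Hi")
assert comp_req.return_token_ids_alongside is False  # default per this diff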

vllm/entrypoints/openai/serving_chat.py

Lines changed: 12 additions & 1 deletion
@@ -1259,7 +1259,13 @@ async def chat_completion_full_generator(
                 logprobs=logprobs,
                 finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
-                stop_reason=output.stop_reason)
+                stop_reason=output.stop_reason,
+                token_ids=(
+                    token_ids
+                    if request.return_token_ids_alongside
+                    else None
+                ),
+            )

             choices.append(choice_data)

@@ -1300,6 +1306,11 @@ async def chat_completion_full_generator(
             choices=choices,
             usage=usage,
             prompt_logprobs=clamp_prompt_logprobs(final_res.prompt_logprobs),
+            prompt_token_ids=(
+                final_res.prompt_token_ids
+                if request.return_token_ids_alongside
+                else None
+            ),
             kv_transfer_params=final_res.kv_transfer_params,
         )

vllm/entrypoints/openai/serving_completion.py

Lines changed: 10 additions & 0 deletions
@@ -548,6 +548,16 @@ def request_output_to_completion_response(
                 finish_reason=output.finish_reason,
                 stop_reason=output.stop_reason,
                 prompt_logprobs=final_res.prompt_logprobs,
+                prompt_token_ids=(
+                    prompt_token_ids
+                    if request.return_token_ids_alongside
+                    else None
+                ),
+                token_ids=(
+                    token_ids
+                    if request.return_token_ids_alongside
+                    else None
+                ),
             )
             choices.append(choice_data)

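Note that for the legacy completions endpoint the placement differs from chat: per the CompletionResponseChoice change above, both prompt_token_ids and token_ids are returned on each choice. A sketch under the same placeholder server/model assumptions as the chat example:

# Sketch only: same placeholder assumptions as the chat example above.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "my-model",
        "prompt": "The capital of France is",
        "max_tokens": 8,
        "return_token_ids_alongside": True,
    },
)
choice = resp.json()["choices"][0]

# For /v1/completions both ID lists live on the choice itself.
print(choice.get("prompt_token_ids"))
print(choice.get("token_ids"))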