Skip to content

Commit 686f5e3

Browse files
authored
Return usage for openai streaming requests (#1663)
1 parent 415d109 commit 686f5e3

File tree

2 files changed

+29
-3
lines changed

2 files changed

+29
-3
lines changed

vllm/entrypoints/openai/api_server.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ def create_stream_response_json(
245245
index: int,
246246
text: str,
247247
finish_reason: Optional[str] = None,
248+
usage: Optional[UsageInfo] = None,
248249
) -> str:
249250
choice_data = ChatCompletionResponseStreamChoice(
250251
index=index,
@@ -257,7 +258,10 @@ def create_stream_response_json(
257258
model=model_name,
258259
choices=[choice_data],
259260
)
260-
response_json = response.json(ensure_ascii=False)
261+
if usage is not None:
262+
response.usage = usage
263+
# exclude unset to leave details out of each sse
264+
response_json = response.json(exclude_unset=True, ensure_ascii=False)
261265

262266
return response_json
263267

@@ -283,17 +287,25 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
283287
i = output.index
284288
delta_text = output.text[len(previous_texts[i]):]
285289
previous_texts[i] = output.text
286-
previous_num_tokens[i] = len(output.token_ids)
290+
completion_tokens = len(output.token_ids)
291+
previous_num_tokens[i] = completion_tokens
287292
response_json = create_stream_response_json(
288293
index=i,
289294
text=delta_text,
290295
)
291296
yield f"data: {response_json}\n\n"
292297
if output.finish_reason is not None:
298+
prompt_tokens = len(res.prompt_token_ids)
299+
final_usage = UsageInfo(
300+
prompt_tokens=prompt_tokens,
301+
completion_tokens=completion_tokens,
302+
total_tokens=prompt_tokens + completion_tokens,
303+
)
293304
response_json = create_stream_response_json(
294305
index=i,
295306
text="",
296307
finish_reason=output.finish_reason,
308+
usage=final_usage,
297309
)
298310
yield f"data: {response_json}\n\n"
299311
yield "data: [DONE]\n\n"
@@ -462,6 +474,7 @@ def create_stream_response_json(
462474
text: str,
463475
logprobs: Optional[LogProbs] = None,
464476
finish_reason: Optional[str] = None,
477+
usage: Optional[UsageInfo] = None,
465478
) -> str:
466479
choice_data = CompletionResponseStreamChoice(
467480
index=index,
@@ -475,7 +488,9 @@ def create_stream_response_json(
475488
model=model_name,
476489
choices=[choice_data],
477490
)
478-
response_json = response.json(ensure_ascii=False)
491+
if usage is not None:
492+
response.usage = usage
493+
response_json = response.json(exclude_unset=True, ensure_ascii=False)
479494

480495
return response_json
481496

@@ -505,11 +520,19 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
505520
if output.finish_reason is not None:
506521
logprobs = (LogProbs()
507522
if request.logprobs is not None else None)
523+
prompt_tokens = len(res.prompt_token_ids)
524+
completion_tokens = len(output.token_ids)
525+
final_usage = UsageInfo(
526+
prompt_tokens=prompt_tokens,
527+
completion_tokens=completion_tokens,
528+
total_tokens=prompt_tokens + completion_tokens,
529+
)
508530
response_json = create_stream_response_json(
509531
index=i,
510532
text="",
511533
logprobs=logprobs,
512534
finish_reason=output.finish_reason,
535+
usage=final_usage,
513536
)
514537
yield f"data: {response_json}\n\n"
515538
yield "data: [DONE]\n\n"

vllm/entrypoints/openai/protocol.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ class CompletionStreamResponse(BaseModel):
139139
created: int = Field(default_factory=lambda: int(time.time()))
140140
model: str
141141
choices: List[CompletionResponseStreamChoice]
142+
usage: Optional[UsageInfo]
142143

143144

144145
class ChatMessage(BaseModel):
@@ -178,3 +179,5 @@ class ChatCompletionStreamResponse(BaseModel):
178179
created: int = Field(default_factory=lambda: int(time.time()))
179180
model: str
180181
choices: List[ChatCompletionResponseStreamChoice]
182+
usage: Optional[UsageInfo] = Field(
183+
default=None, description="data about request and response")

0 commit comments

Comments (0)