@@ -222,7 +222,11 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
         finish_reason = None
         from .req_id_generator import convert_sub_id_to_group_id
 
+        prompt_tokens = 0
+        completion_tokens = 0
         async for sub_req_id, request_output, metadata, finish_status in results_generator:
+            prompt_tokens = metadata["prompt_tokens"]
+            completion_tokens += 1
             if request.tool_choice != "none" and request.tools:
                 delta = request_output
                 group_request_id = convert_sub_id_to_group_id(sub_req_id)
@@ -309,6 +313,22 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
                 choices=[stream_choice],
             )
             yield ("data: " + json.dumps(stream_resp.dict(), ensure_ascii=False) + "\n\n").encode("utf-8")
+        # Additional usage chunk, emitted after the loop when requested
+
+        if request.stream_options and request.stream_options.include_usage:
+            usage = UsageInfo(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            )
+            usage_chunk = ChatCompletionStreamResponse(
+                id=group_request_id,
+                created=created_time,
+                choices=[],  # empty choices array as per the OpenAI spec
+                model=request.model,
+                usage=usage,
+            )
+            yield f"data: {usage_chunk.model_dump_json()}\n\n".encode("utf-8")
 
     background_tasks = BackgroundTasks()
     return StreamingResponse(stream_results(), media_type="text/event-stream", background=background_tasks)
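
With stream_options.include_usage set, the chat stream now ends with one extra SSE event whose choices array is empty and whose usage field carries the token totals. Roughly (field values below are illustrative, and the exact field set depends on how ChatCompletionStreamResponse serializes):

    data: {"id": "...", "object": "chat.completion.chunk", "created": 1700000000, "model": "your-model", "choices": [], "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46}}
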
@@ -453,9 +473,13 @@ async def _handle_streaming_completion(
     async def stream_results() -> AsyncGenerator[bytes, None]:
         from .req_id_generator import convert_sub_id_to_group_id
 
+        prompt_tokens = 0
+        completion_tokens = 0
+
         async for sub_req_id, request_output, metadata, finish_status in results_generator:
             group_request_id = convert_sub_id_to_group_id(sub_req_id)
-
+            prompt_tokens = metadata["prompt_tokens"]
+            completion_tokens += 1
             current_finish_reason = None
             if finish_status.is_finished():
                 current_finish_reason = finish_status.get_finish_reason()
@@ -483,6 +507,21 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
 
-        yield "data: [DONE]\n\n".encode("utf-8")
+        if request.stream_options and request.stream_options.include_usage:
+            usage = UsageInfo(
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+                total_tokens=prompt_tokens + completion_tokens,
+            )
+            usage_chunk = CompletionStreamResponse(
+                id=group_request_id,
+                created=created_time,
+                choices=[],  # empty choices array as per the OpenAI spec
+                model=request.model,
+                usage=usage,
+            )
+            yield f"data: {usage_chunk.model_dump_json()}\n\n".encode("utf-8")
+
+        yield "data: [DONE]\n\n".encode("utf-8")  # usage chunk must precede [DONE] per the OpenAI spec
 
     background_tasks = BackgroundTasks()
     return StreamingResponse(stream_results(), media_type="text/event-stream", background=background_tasks)
 
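For a quick end-to-end check, the new path can be exercised with the OpenAI Python client pointed at the local server. This is a sketch: the base URL, API key, and model name are placeholders, not values from this repo.

    # Sketch: stream a chat completion and read the trailing usage chunk
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")  # placeholder endpoint

    stream = client.chat.completions.create(
        model="your-model",  # placeholder model name
        messages=[{"role": "user", "content": "Hello"}],
        stream=True,
        stream_options={"include_usage": True},  # requests the final usage-only chunk
    )

    for chunk in stream:
        if chunk.usage is not None:
            # terminal chunk: empty choices, usage totals populated
            print(chunk.usage.prompt_tokens, chunk.usage.completion_tokens, chunk.usage.total_tokens)
        elif chunk.choices:
            print(chunk.choices[0].delta.content or "", end="")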