     _create_trtllm_inference_request,
     _create_vllm_inference_request,
     _get_output,
+    _get_usage_from_response,
     _get_vllm_lora_names,
+    _StreamingUsageAccumulator,
     _validate_triton_responses_non_streaming,
 )
 from schemas.openai import (
     ChatCompletionStreamResponseDelta,
     ChatCompletionToolChoiceOption1,
     Choice,
+    CompletionUsage,
     CreateChatCompletionRequest,
     CreateChatCompletionResponse,
     CreateChatCompletionStreamResponse,
@@ -229,6 +232,8 @@ async def chat(
             backend=metadata.backend,
         )
 
+        usage = _get_usage_from_response(response, metadata.backend)
+
         return CreateChatCompletionResponse(
             id=request_id,
             choices=[
@@ -243,6 +248,7 @@ async def chat(
             model=request.model,
             system_fingerprint=None,
             object=ObjectType.chat_completion,
+            usage=usage,
         )
 
     def _get_chat_completion_response_message(
@@ -319,7 +325,7 @@ async def completion(
         created = int(time.time())
         if request.stream:
             return self._streaming_completion_iterator(
-                request_id, created, request.model, responses
+                request_id, created, request, responses, metadata.backend
             )
 
         # Response validation with decoupled models in mind
@@ -328,6 +334,8 @@ async def completion(
         response = responses[0]
         text = _get_output(response)
 
+        usage = _get_usage_from_response(response, metadata.backend)
+
         choice = Choice(
             finish_reason=FinishReason.stop,
             index=0,
@@ -341,6 +349,7 @@ async def completion(
             object=ObjectType.text_completion,
             created=created,
             model=request.model,
+            usage=usage,
         )
 
         # TODO: This behavior should be tested further
@@ -421,6 +430,7 @@ def _get_streaming_chat_response_chunk(
         request_id: str,
         created: int,
         model: str,
+        usage: Optional[CompletionUsage] = None,
     ) -> CreateChatCompletionStreamResponse:
         return CreateChatCompletionStreamResponse(
             id=request_id,
@@ -429,6 +439,7 @@ def _get_streaming_chat_response_chunk(
             model=model,
             system_fingerprint=None,
             object=ObjectType.chat_completion_chunk,
+            usage=usage,
         )
 
     def _get_first_streaming_chat_response(
@@ -444,7 +455,7 @@ def _get_first_streaming_chat_response(
             finish_reason=None,
         )
         chunk = self._get_streaming_chat_response_chunk(
-            choice, request_id, created, model
+            choice, request_id, created, model, usage=None
         )
         return chunk
 
@@ -470,6 +481,8 @@ async def _streaming_chat_iterator(
         )
 
         previous_text = ""
+        include_usage = request.stream_options and request.stream_options.include_usage
+        usage_accumulator = _StreamingUsageAccumulator(backend)
 
         chunk = self._get_first_streaming_chat_response(
             request_id, created, model, role
@@ -478,6 +491,8 @@ async def _streaming_chat_iterator(
 
         async for response in responses:
             delta_text = _get_output(response)
+            if include_usage:
+                usage_accumulator.update(response)
 
             (
                 response_delta,
@@ -512,10 +527,25 @@ async def _streaming_chat_iterator(
             )
 
             chunk = self._get_streaming_chat_response_chunk(
-                choice, request_id, created, model
+                choice, request_id, created, model, usage=None
             )
             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
 
+        # Send the final usage chunk if requested via stream_options.
+        if include_usage:
+            usage_payload = usage_accumulator.get_final_usage()
+            if usage_payload:
+                final_usage_chunk = CreateChatCompletionStreamResponse(
+                    id=request_id,
+                    choices=[],
+                    created=created,
+                    model=model,
+                    system_fingerprint=None,
+                    object=ObjectType.chat_completion_chunk,
+                    usage=usage_payload,
+                )
+                yield f"data: {final_usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
+
         yield "data: [DONE]\n\n"
 
     def _get_streaming_response_delta(
@@ -662,6 +692,18 @@ def _validate_chat_request(
 
         self._verify_chat_tool_call_settings(request=request)
 
+        if request.stream_options and not request.stream:
+            raise Exception("`stream_options` can only be used when `stream` is True")
+
+        if (
+            request.stream_options
+            and request.stream_options.include_usage
+            and metadata.backend != "vllm"
+        ):
+            raise Exception(
+                "`stream_options.include_usage` is currently only supported for the vLLM backend"
+            )
+
     def _verify_chat_tool_call_settings(self, request: CreateChatCompletionRequest):
         if (
             request.tool_choice
@@ -698,9 +740,21 @@ def _verify_chat_tool_call_settings(self, request: CreateChatCompletionRequest):
             )
 
     async def _streaming_completion_iterator(
-        self, request_id: str, created: int, model: str, responses: AsyncIterable
+        self,
+        request_id: str,
+        created: int,
+        request: CreateCompletionRequest,
+        responses: AsyncIterable,
+        backend: str,
     ) -> AsyncIterator[str]:
+        model = request.model
+        include_usage = request.stream_options and request.stream_options.include_usage
+        usage_accumulator = _StreamingUsageAccumulator(backend)
+
         async for response in responses:
+            if include_usage:
+                usage_accumulator.update(response)
+
             text = _get_output(response)
             choice = Choice(
                 finish_reason=FinishReason.stop if response.final else None,
@@ -715,10 +769,26 @@ async def _streaming_completion_iterator(
                 object=ObjectType.text_completion,
                 created=created,
                 model=model,
+                usage=None,
             )
 
             yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
 
+        # Send the final usage chunk if requested via stream_options.
+        if include_usage:
+            usage_payload = usage_accumulator.get_final_usage()
+            if usage_payload:
+                final_usage_chunk = CreateCompletionResponse(
+                    id=request_id,
+                    choices=[],
+                    system_fingerprint=None,
+                    object=ObjectType.text_completion,
+                    created=created,
+                    model=model,
+                    usage=usage_payload,
+                )
+                yield f"data: {final_usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
+
         yield "data: [DONE]\n\n"
 
     def _validate_completion_request(
@@ -771,6 +841,18 @@ def _validate_completion_request(
         if request.logit_bias is not None or request.logprobs is not None:
             raise Exception("logit bias and log probs not supported")
 
+        if request.stream_options and not request.stream:
+            raise Exception("`stream_options` can only be used when `stream` is True")
+
+        if (
+            request.stream_options
+            and request.stream_options.include_usage
+            and metadata.backend != "vllm"
+        ):
+            raise Exception(
+                "`stream_options.include_usage` is currently only supported for the vLLM backend"
+            )
+
     def _should_stream_with_auto_tool_parsing(
         self, request: CreateChatCompletionRequest
     ):
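For reference, a minimal client-side sketch of how the usage reporting added in this diff could be exercised once the frontend serves a vLLM-backed model. It uses the standard `openai` Python client; the base URL, port, and model name below are placeholder assumptions, not values defined by this commit.

# Sketch only: base_url and model name are hypothetical.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

# Non-streaming: usage is populated directly on the response object.
response = client.chat.completions.create(
    model="llama-3.1-8b-instruct",  # assumed vLLM-backed model name
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.usage)  # prompt_tokens / completion_tokens / total_tokens

# Streaming: with stream_options.include_usage, a final chunk with empty
# choices carries the accumulated usage, mirroring the iterator changes above.
stream = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
    stream_options={"include_usage": True},
)
for chunk in stream:
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    if chunk.usage is not None:
        print("\n", chunk.usage)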