@@ -35,6 +35,9 @@ class Valves(BaseModel):
         # Request timeout in seconds
         timeout: int = 300

+        # Prefer the OpenAI Responses API instead of Chat Completions
+        use_responses_api: bool = True
+
     def __init__(self):
         # Important: type should be "manifold" instead of "pipe"
         # manifold type Pipeline will be displayed in the model list
@@ -51,6 +54,7 @@ def __init__(self):
                 "log_vsr_info": True,
                 "debug": True,
                 "timeout": 300,
+                "use_responses_api": True,
             }
         )

@@ -380,7 +384,10 @@ def pipe(
         print("=" * 80)

         # Prepare the request to vLLM Semantic Router
-        url = f"{self.valves.vsr_base_url}/v1/chat/completions"
+        if self.valves.use_responses_api:
+            url = f"{self.valves.vsr_base_url}/v1/responses"
+        else:
+            url = f"{self.valves.vsr_base_url}/v1/chat/completions"

         if self.valves.debug:
             print(f"\n📡 Sending request to: {url}")
@@ -412,6 +419,10 @@ def pipe(
             print(f"   Streaming: {is_streaming}")
             print(f"   Timeout: {self.valves.timeout}s")

+        # If using the Responses API for streaming, set the Accept header for SSE
+        if self.valves.use_responses_api and is_streaming:
+            headers["Accept"] = "text/event-stream"
+
         try:
             if self.valves.debug:
                 print(f"\n🔌 Connecting to vLLM Semantic Router...")
@@ -459,7 +470,12 @@ def pipe(
             if self.valves.debug:
                 print(f"\n📺 Handling streaming response...")
             # Handle streaming response
-            return self._handle_streaming_response(response, vsr_headers)
+            if self.valves.use_responses_api:
+                return self._handle_streaming_response_responses(
+                    response, vsr_headers
+                )
+            else:
+                return self._handle_streaming_response(response, vsr_headers)
         else:
             if self.valves.debug:
                 print(f"\n📄 Handling non-streaming response...")
@@ -493,13 +509,29 @@ def pipe(
                 print("=" * 80 + "\n")
                 return f"{error_msg}: {str(e)}"

-            if self.valves.debug:
-                print(f"   Response data keys: {list(response_data.keys())}")
-                if "choices" in response_data:
-                    print(f"   Choices count: {len(response_data['choices'])}")
-
-            # Add VSR info to the response if enabled
-            if self.valves.show_vsr_info and vsr_headers:
+            # Transform the Responses API JSON into Chat Completions JSON
+            if self.valves.use_responses_api:
+                response_data = self._responses_to_chat_completions(
+                    response_data, vsr_headers
+                )
+                if self.valves.debug:
+                    print(
+                        f"   Transformed Responses → ChatCompletions. keys: {list(response_data.keys())}"
+                    )
+                    if "choices" in response_data:
+                        print(f"   Choices count: {len(response_data['choices'])}")
+            else:
+                if self.valves.debug:
+                    print(f"   Response data keys: {list(response_data.keys())}")
+                    if "choices" in response_data:
+                        print(f"   Choices count: {len(response_data['choices'])}")
+
+            # Add VSR info only for the Chat Completions shape; the Responses path injects it during conversion
+            if (
+                not self.valves.use_responses_api
+                and self.valves.show_vsr_info
+                and vsr_headers
+            ):
                 vsr_info = self._format_vsr_info(vsr_headers, position="prefix")

                 if self.valves.debug:
@@ -540,6 +572,69 @@ def pipe(
             print("=" * 80 + "\n")
             return error_msg

+    def _responses_to_chat_completions(self, resp: dict, vsr_headers: dict) -> dict:
+        """
+        Convert a minimal OpenAI Responses API JSON body to the legacy
+        Chat Completions shape and inject VSR info as a prefix to the
+        assistant content.
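+
+        Example (shapes assumed from the OpenAI Responses API; the exact
+        fields returned by vLLM Semantic Router may differ, and ``pipeline``
+        stands for any Pipeline instance):
+
+            >>> resp = {
+            ...     "id": "resp_1",
+            ...     "model": "auto",
+            ...     "output": [
+            ...         {"type": "message", "role": "assistant", "content": "Hi"}
+            ...     ],
+            ...     "usage": {"input_tokens": 3, "output_tokens": 1},
+            ... }
+            >>> chat = pipeline._responses_to_chat_completions(resp, {})
+            >>> chat["choices"][0]["message"]["content"]
+            'Hi'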
579+ """
+        # Extract assistant text from the output array. Per the Responses
+        # API, message content may be a plain string or a list of content
+        # parts carrying "text" fields; handle both shapes.
+        content_parts = []
+        output = resp.get("output", [])
+        if isinstance(output, list):
+            for item in output:
+                if isinstance(item, dict) and item.get("type") == "message":
+                    if item.get("role") == "assistant":
+                        text = item.get("content", "")
+                        if isinstance(text, str) and text:
+                            content_parts.append(text)
+                        elif isinstance(text, list):
+                            for part in text:
+                                if isinstance(part, dict) and part.get("text"):
+                                    content_parts.append(part["text"])
+        content = "".join(content_parts)
+
+        # Map Responses usage fields onto the Chat Completions names
+        usage = resp.get("usage", {}) or {}
+        prompt_tokens = usage.get("input_tokens", 0)
+        completion_tokens = usage.get("output_tokens", 0)
+        total_tokens = usage.get("total_tokens", prompt_tokens + completion_tokens)
+
+        usage_block = {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "prompt_tokens_details": {"cached_tokens": 0},
+            "completion_tokens_details": {"reasoning_tokens": 0},
+        }
+
+        # Build the Chat Completions JSON; emit usage under both "usage"
+        # and "token_usage" for compatibility
+        chat = {
+            "id": resp.get("id", ""),
+            "object": "chat.completion",
+            "created": resp.get("created", 0),
+            "model": resp.get("model", "auto"),
+            "system_fingerprint": "vsr",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": content},
+                    "logprobs": None,
+                    "finish_reason": resp.get("stop_reason", "stop"),
+                }
+            ],
+            "usage": usage_block,
+            "token_usage": dict(usage_block),
+        }
+
+        # Prepend VSR info if enabled
+        if self.valves.show_vsr_info and vsr_headers:
+            vsr_info = self._format_vsr_info(vsr_headers, position="prefix")
+            chat["choices"][0]["message"]["content"] = (
+                vsr_info + chat["choices"][0]["message"]["content"]
+            )
+
+        return chat
+
     def _handle_streaming_response(
         self, response: requests.Response, vsr_headers: dict
     ) -> Generator:
@@ -646,3 +741,106 @@ def _handle_streaming_response(
             except json.JSONDecodeError:
                 # If not valid JSON, pass through as-is
                 yield f"data: {data_str}\n\n"
+
+    def _handle_streaming_response_responses(
+        self, response: requests.Response, vsr_headers: dict
+    ) -> Generator:
+        """
+        Handle the SSE stream of the Responses API, converting each event
+        into a Chat Completions chunk. VSR info is injected at the first
+        assistant content delta.
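+
+        For example, a delta event like the following (event and field
+        names assumed from the OpenAI Responses API; the exact events
+        emitted by vLLM Semantic Router may differ)
+
+            {"type": "response.output_text.delta", "delta": "Hello"}
+
+        is re-emitted as a "chat.completion.chunk" whose
+        choices[0]["delta"]["content"] is "Hello".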
751+ """
+        vsr_info_added = False
+
+        for line in response.iter_lines(decode_unicode=True):
+            if not line:
+                continue
+
+            if not line.startswith("data: "):
+                continue
+
+            data_str = line[6:].strip()
+
+            if data_str == "[DONE]":
+                yield "data: [DONE]\n\n"
+                if self.valves.debug:
+                    print("✅ Streaming completed (Responses)")
+                continue
+
+            try:
+                ev = json.loads(data_str)
+            except json.JSONDecodeError:
+                # Pass through unknown payloads unchanged
+                yield f"data: {data_str}\n\n"
+                continue
+
+            etype = ev.get("type", "")
+
+            if etype == "response.output_text.delta":
+                delta_text = ev.get("delta", "")
+                # Prefix the first content delta with the VSR info block
+                if self.valves.show_vsr_info and not vsr_info_added:
+                    vsr_info = self._format_vsr_info(vsr_headers, position="prefix")
+                    delta_text = vsr_info + (delta_text or "")
+                    vsr_info_added = True
+
+                chunk = {
+                    "id": f"chatcmpl-{ev.get('created', 0)}",
+                    "object": "chat.completion.chunk",
+                    "created": ev.get("created", 0),
+                    "model": "auto",
+                    "system_fingerprint": "vsr",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {"content": delta_text},
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+            elif etype == "response.tool_calls.delta":
+                chunk = {
+                    "id": f"chatcmpl-{ev.get('created', 0)}",
+                    "object": "chat.completion.chunk",
+                    "created": ev.get("created", 0),
+                    "model": "auto",
+                    "system_fingerprint": "vsr",
+                    "choices": [
+                        {
+                            "index": 0,
+                            # Map the tool-call delta onto the legacy
+                            # "function_call" delta shape
+                            "delta": {
+                                "function_call": {
+                                    "name": ev.get("name", ""),
+                                    "arguments": ev.get("arguments_delta", ""),
+                                }
+                            },
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+
+            elif etype == "response.completed":
+                finish = ev.get("stop_reason", "stop")
+                chunk = {
+                    "id": "chatcmpl-end",
+                    "object": "chat.completion.chunk",
+                    "created": ev.get("created", 0),
+                    "model": "auto",
+                    "system_fingerprint": "vsr",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {},
+                            "logprobs": None,
+                            "finish_reason": finish,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+
+            else:
+                # Unknown event type: pass through unchanged
+                yield f"data: {data_str}\n\n"