
Commit 91918b6

committed
add openwebui support
Signed-off-by: JaredforReal <[email protected]>
1 parent d83c9ec commit 91918b6

File tree

2 files changed (+207, -9 lines)


dashboard/backend/.gitkeep

Whitespace-only changes.

deploy/docker-compose/addons/vllm_semantic_router_pipe.py

Lines changed: 207 additions & 9 deletions
@@ -35,6 +35,9 @@ class Valves(BaseModel):
         # Request timeout in seconds
         timeout: int = 300

+        # Prefer OpenAI Responses API instead of Chat Completions
+        use_responses_api: bool = True
+
     def __init__(self):
         # Important: type should be "manifold" instead of "pipe"
         # manifold type Pipeline will be displayed in the model list
@@ -51,6 +54,7 @@ def __init__(self):
                 "log_vsr_info": True,
                 "debug": True,
                 "timeout": 300,
+                "use_responses_api": True,
             }
         )

@@ -380,7 +384,10 @@ def pipe(
             print("=" * 80)

         # Prepare the request to vLLM Semantic Router
-        url = f"{self.valves.vsr_base_url}/v1/chat/completions"
+        if self.valves.use_responses_api:
+            url = f"{self.valves.vsr_base_url}/v1/responses"
+        else:
+            url = f"{self.valves.vsr_base_url}/v1/chat/completions"

         if self.valves.debug:
             print(f"\n📡 Sending request to: {url}")
@@ -412,6 +419,10 @@ def pipe(
             print(f" Streaming: {is_streaming}")
             print(f" Timeout: {self.valves.timeout}s")

+        # If using Responses API for streaming, set Accept header for SSE
+        if self.valves.use_responses_api and is_streaming:
+            headers["Accept"] = "text/event-stream"
+
         try:
             if self.valves.debug:
                 print(f"\n🔌 Connecting to vLLM Semantic Router...")
@@ -459,7 +470,12 @@ def pipe(
                 if self.valves.debug:
                     print(f"\n📺 Handling streaming response...")
                 # Handle streaming response
-                return self._handle_streaming_response(response, vsr_headers)
+                if self.valves.use_responses_api:
+                    return self._handle_streaming_response_responses(
+                        response, vsr_headers
+                    )
+                else:
+                    return self._handle_streaming_response(response, vsr_headers)
             else:
                 if self.valves.debug:
                     print(f"\n📄 Handling non-streaming response...")
@@ -493,13 +509,29 @@ def pipe(
                     print("=" * 80 + "\n")
                     return f"{error_msg}: {str(e)}"

-                if self.valves.debug:
-                    print(f" Response data keys: {list(response_data.keys())}")
-                    if "choices" in response_data:
-                        print(f" Choices count: {len(response_data['choices'])}")
-
-                # Add VSR info to the response if enabled
-                if self.valves.show_vsr_info and vsr_headers:
+                # Transform Responses API JSON to Chat Completions JSON if enabled
+                if self.valves.use_responses_api:
+                    response_data = self._responses_to_chat_completions(
+                        response_data, vsr_headers
+                    )
+                    if self.valves.debug:
+                        print(
+                            f" Transformed Responses → ChatCompletions. keys: {list(response_data.keys())}"
+                        )
+                        if "choices" in response_data:
+                            print(f" Choices count: {len(response_data['choices'])}")
+                else:
+                    if self.valves.debug:
+                        print(f" Response data keys: {list(response_data.keys())}")
+                        if "choices" in response_data:
+                            print(f" Choices count: {len(response_data['choices'])}")
+
+                # Add VSR info to the response if enabled (only for Chat Completions shape)
+                if (
+                    (not self.valves.use_responses_api)
+                    and self.valves.show_vsr_info
+                    and vsr_headers
+                ):
                     vsr_info = self._format_vsr_info(vsr_headers, position="prefix")

                     if self.valves.debug:
@@ -540,6 +572,69 @@ def pipe(
             print("=" * 80 + "\n")
             return error_msg

+    def _responses_to_chat_completions(self, resp: dict, vsr_headers: dict) -> dict:
+        """
+        Convert minimal OpenAI Responses JSON to legacy Chat Completions JSON
+        and inject VSR info as prefix to assistant content.
+        """
+        # Extract assistant text from output array
+        content_parts = []
+        output = resp.get("output", [])
+        if isinstance(output, list):
+            for item in output:
+                if isinstance(item, dict) and item.get("type") == "message":
+                    if item.get("role") == "assistant":
+                        text = item.get("content", "")
+                        if isinstance(text, str) and text:
+                            content_parts.append(text)
+        content = "".join(content_parts)
+
+        # Map usage
+        usage = resp.get("usage", {}) or {}
+        prompt_tokens = usage.get("input_tokens", 0)
+        completion_tokens = usage.get("output_tokens", 0)
+        total_tokens = usage.get("total_tokens", prompt_tokens + completion_tokens)
+
+        # Build Chat Completions JSON
+        chat = {
+            "id": resp.get("id", ""),
+            "object": "chat.completion",
+            "created": resp.get("created", 0),
+            "model": resp.get("model", "auto"),
+            "system_fingerprint": "vsr",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": content},
+                    "logprobs": None,
+                    "finish_reason": resp.get("stop_reason", "stop"),
+                }
+            ],
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": total_tokens,
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
+            },
+            "token_usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": total_tokens,
+                "prompt_tokens_details": {"cached_tokens": 0},
+                "completion_tokens_details": {"reasoning_tokens": 0},
+            },
+        }
+
+        # Prepend VSR info if enabled
+        if self.valves.show_vsr_info and vsr_headers:
+            vsr_info = self._format_vsr_info(vsr_headers, position="prefix")
+            chat["choices"][0]["message"]["content"] = (
+                vsr_info + chat["choices"][0]["message"]["content"]
+            )
+
+        return chat
+
     def _handle_streaming_response(
         self, response: requests.Response, vsr_headers: dict
     ) -> Generator:
@@ -646,3 +741,106 @@ def _handle_streaming_response(
                 except json.JSONDecodeError:
                     # If not valid JSON, pass through as-is
                     yield f"data: {data_str}\n\n"
+
+    def _handle_streaming_response_responses(
+        self, response: requests.Response, vsr_headers: dict
+    ) -> Generator:
+        """
+        Handle SSE stream for Responses API and convert to Chat Completions chunks.
+        Inject VSR info at the first assistant content delta.
+        """
+        vsr_info_added = False
+
+        for line in response.iter_lines(decode_unicode=True):
+            if not line:
+                continue
+
+            if not line.startswith("data: "):
+                continue
+
+            data_str = line[6:].strip()
+
+            if data_str == "[DONE]":
+                yield f"data: [DONE]\n\n"
+                if self.valves.debug:
+                    print(f"✅ Streaming completed (Responses)")
+                continue
+
+            try:
+                ev = json.loads(data_str)
+            except json.JSONDecodeError:
+                # Pass through unknown payloads
+                yield f"data: {data_str}\n\n"
+                continue
+
+            etype = ev.get("type", "")
+
+            if etype == "response.output_text.delta":
+                delta_text = ev.get("delta", "")
+                if self.valves.show_vsr_info and not vsr_info_added:
+                    vsr_info = self._format_vsr_info(vsr_headers, position="prefix")
+                    delta_text = vsr_info + (delta_text or "")
+                    vsr_info_added = True
+
+                chunk = {
+                    "id": f"chatcmpl-{ev.get('created', 0)}",
+                    "object": "chat.completion.chunk",
+                    "created": ev.get("created", 0),
+                    "model": "auto",
+                    "system_fingerprint": "vsr",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {"content": delta_text},
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+
+            elif etype == "response.tool_calls.delta":
+                chunk = {
+                    "id": f"chatcmpl-{ev.get('created', 0)}",
+                    "object": "chat.completion.chunk",
+                    "created": ev.get("created", 0),
+                    "model": "auto",
+                    "system_fingerprint": "vsr",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {
+                                "function_call": {
+                                    "name": ev.get("name", ""),
+                                    "arguments": ev.get("arguments_delta", ""),
+                                }
+                            },
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+
+            elif etype == "response.completed":
+                finish = ev.get("stop_reason", "stop")
+                chunk = {
+                    "id": "chatcmpl-end",
+                    "object": "chat.completion.chunk",
+                    "created": ev.get("created", 0),
+                    "model": "auto",
+                    "system_fingerprint": "vsr",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {},
+                            "logprobs": None,
+                            "finish_reason": finish,
+                        }
+                    ],
+                }
+                yield f"data: {json.dumps(chunk)}\n\n"
+
+            else:
+                # Unknown event type: pass-through
+                yield f"data: {data_str}\n\n"
