diff --git a/deploy/docker-compose/README.md b/deploy/docker-compose/README.md index c226bdaf..09c9ff5f 100644 --- a/deploy/docker-compose/README.md +++ b/deploy/docker-compose/README.md @@ -35,7 +35,20 @@ Example mappings: ## Profiles - `testing` : enables `mock-vllm` and `llm-katan` -- `llm-katan` : enables only `llm-katan` +- `llm-katan` : only `llm-katan` + +## Services and Ports + +These host ports are exposed when you bring the stack up: + +- Dashboard: http://localhost:8700 (Semantic Router Dashboard) +- Envoy proxy: http://localhost:8801 +- Envoy admin: http://localhost:19000 +- Grafana: http://localhost:3000 (admin/admin) +- Prometheus: http://localhost:9090 +- Open WebUI: http://localhost:3001 +- Mock vLLM (testing profile): http://localhost:8000 +- LLM Katan (testing/llm-katan profiles): http://localhost:8002 ## Quick Start @@ -71,6 +84,8 @@ docker compose -f deploy/docker-compose/docker-compose.yml --profile testing up docker compose -f deploy/docker-compose/docker-compose.yml down ``` +After the stack is healthy, open the Dashboard at http://localhost:8700. + ## Overrides You can place a `docker-compose.override.yml` at repo root and combine: @@ -130,18 +145,3 @@ All services join the `semantic-network` bridge network with a fixed subnet to m - Local observability only: `tools/observability/docker-compose.obs.yml` - Tracing stack: `tools/tracing/docker-compose.tracing.yaml` - -## Related Stacks - -- Local observability only: `tools/observability/docker-compose.obs.yml` -- Tracing stack (standalone, dev): `tools/tracing/docker-compose.tracing.yaml` - -## Tracing & Grafana - -- Jaeger UI: http://localhost:16686 -- Grafana: http://localhost:3000 (admin/admin) - - Prometheus datasource (default) for metrics - - Jaeger datasource for exploring traces (search service `vllm-semantic-router`) - -By default, the router container uses `config/config.tracing.yaml` (enabled tracing, exporter to Jaeger). -Override with `CONFIG_FILE=/app/config/config.yaml` if you don’t want tracing. diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index 175763cd..d45dd022 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -1,6 +1,6 @@ # Semantic Router Kubernetes Deployment -This directory contains Kubernetes manifests for deploying the Semantic Router using Kustomize. +Kustomize manifests for deploying the Semantic Router and its observability stack (Prometheus, Grafana, Dashboard, optional Open WebUI + Pipelines) on Kubernetes. 
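+
+To sanity-check what Kustomize will render before applying anything, you can build the manifests locally. This is a quick preview sketch (it assumes a recent `kubectl` with built-in Kustomize support; a standalone `kustomize build` works the same way):
+
+```bash
+# Render the core manifests without applying them
+kubectl kustomize deploy/kubernetes/
+
+# Render the observability stack manifests (see below)
+kubectl kustomize deploy/kubernetes/observability/
+```
+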
 ## Architecture
 
@@ -12,8 +12,9 @@ The deployment consists of:
 - **Init Container**: Downloads/copies model files to persistent volume
 - **Main Container**: Runs the semantic router service
 - **Services**:
-  - Main service exposing gRPC port (50051), Classification API (8080), and metrics port (9190)
-  - Separate metrics service for monitoring
+  - Main service exposing gRPC (50051), Classification API (8080), and metrics (9190)
+  - Separate metrics service for monitoring (`semantic-router-metrics`)
+  - Observability services (Grafana, Prometheus, Dashboard, optional Open WebUI)
 
 ## Ports
 
@@ -23,17 +24,40 @@ The deployment consists of:
 
 ## Quick Start
 
-### Standard Kubernetes Deployment
+### Deploy Core (Router)
 
 ```bash
 kubectl apply -k deploy/kubernetes/
 
 # Check deployment status
-kubectl get pods -l app=semantic-router -n semantic-router
-kubectl get services -l app=semantic-router -n semantic-router
+kubectl get pods -l app=semantic-router -n vllm-semantic-router-system
+kubectl get services -l app=semantic-router -n vllm-semantic-router-system
 
 # View logs
-kubectl logs -l app=semantic-router -n semantic-router -f
+kubectl logs -l app=semantic-router -n vllm-semantic-router-system -f
+```
+
+### Add Observability (Prometheus + Grafana + Dashboard + Playground)
+
+```bash
+kubectl apply -k deploy/kubernetes/observability/
+```
+
+Port-forward to UIs (local dev):
+
+```bash
+kubectl port-forward -n vllm-semantic-router-system svc/prometheus 9090:9090
+kubectl port-forward -n vllm-semantic-router-system svc/grafana 3000:3000
+kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-dashboard 8700:80
+kubectl port-forward -n vllm-semantic-router-system svc/openwebui 3001:8080
+```
+
+Then open:
+
+- Prometheus → http://localhost:9090
+- Grafana → http://localhost:3000
+- Dashboard → http://localhost:8700
+- Open WebUI (Playground) → http://localhost:3001
+
-```
 
 ### Kind (Kubernetes in Docker) Deployment
 
@@ -86,20 +110,20 @@ kubectl wait --for=condition=Ready nodes --all --timeout=300s
 kubectl apply -k deploy/kubernetes/
 
 # Wait for deployment to be ready
-kubectl wait --for=condition=Available deployment/semantic-router -n semantic-router --timeout=600s
+kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s
 ```
 
 **Step 3: Check deployment status**
 
 ```bash
 # Check pods
-kubectl get pods -n semantic-router -o wide
+kubectl get pods -n vllm-semantic-router-system -o wide
 
 # Check services
-kubectl get services -n semantic-router
+kubectl get services -n vllm-semantic-router-system
 
 # View logs
-kubectl logs -l app=semantic-router -n semantic-router -f
+kubectl logs -l app=semantic-router -n vllm-semantic-router-system -f
 ```
 
 #### Resource Requirements for Kind
 
@@ -131,19 +155,30 @@ make port-forward-grpc
 
 # Access metrics
 make port-forward-metrics
+
+# Access Dashboard / Grafana / Open WebUI
+kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-dashboard 8700:80
+kubectl port-forward -n vllm-semantic-router-system svc/grafana 3000:3000
+kubectl port-forward -n vllm-semantic-router-system svc/openwebui 3001:8080
 ```
 
 Or using kubectl directly:
 
 ```bash
 # Access Classification API (HTTP REST)
-kubectl port-forward -n semantic-router svc/semantic-router 8080:8080
+kubectl port-forward -n vllm-semantic-router-system svc/semantic-router 8080:8080
 
 # Access gRPC API
-kubectl port-forward -n semantic-router svc/semantic-router 50051:50051
+kubectl port-forward -n vllm-semantic-router-system svc/semantic-router 50051:50051
 
 # 
Access metrics -kubectl port-forward -n semantic-router svc/semantic-router-metrics 9190:9190 +kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-metrics 9190:9190 + +# Access Prometheus/Grafana/Dashboard/Open WebUI +kubectl port-forward -n vllm-semantic-router-system svc/prometheus 9090:9090 +kubectl port-forward -n vllm-semantic-router-system svc/grafana 3000:3000 +kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-dashboard 8700:80 +kubectl port-forward -n vllm-semantic-router-system svc/openwebui 3001:8080 ``` #### Testing the Deployment @@ -313,7 +348,10 @@ Edit the `resources` section in `deployment.yaml` accordingly. - `namespace.yaml` - Dedicated namespace for the application - `config.yaml` - Application configuration - `tools_db.json` - Tools database for semantic routing -- `kustomization.yaml` - Kustomize configuration for easy deployment +- `kustomization.yaml` - Kustomize configuration for core deployment +- `observability/` - Prometheus, Grafana, Dashboard, optional Open WebUI + Pipelines (with its own `kustomization.yaml`) + +For detailed observability setup and screenshots, see `deploy/kubernetes/observability/README.md`. ### Development Tools diff --git a/deploy/kubernetes/observability/README.md b/deploy/kubernetes/observability/README.md index 640621ce..e5b7e41a 100644 --- a/deploy/kubernetes/observability/README.md +++ b/deploy/kubernetes/observability/README.md @@ -10,6 +10,9 @@ This guide adds a production-ready Prometheus + Grafana stack to the existing Se |--------------|---------|-----------| | Prometheus | Scrapes Semantic Router metrics and stores them with persistent retention | `prometheus/` (`rbac.yaml`, `configmap.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| | Grafana | Visualizes metrics using the bundled LLM Router dashboard and a pre-configured Prometheus datasource | `grafana/` (`secret.yaml`, `configmap-*.yaml`, `deployment.yaml`, `pvc.yaml`, `service.yaml`)| +| Dashboard | Unified UI that links Router, Prometheus, and embeds Grafana; reads Router config | `dashboard/` (`configmap.yaml`, `deployment.yaml`, `service.yaml`)| +| Open WebUI | Playground UI for interacting with the router via a Manifold Pipeline | `openwebui/` (`deployment.yaml`, `service.yaml`)| +| Pipelines | Executes the `vllm_semantic_router_pipe.py` manifold for Open WebUI | `pipelines/deployment.yaml` (includes a ConfigMap with the pipeline code) | | Ingress (optional) | Exposes the UIs outside the cluster | `ingress.yaml`| | Dashboard provisioning | Automatically loads `deploy/llm-router-dashboard.json` into Grafana | `grafana/configmap-dashboard.yaml`| @@ -110,7 +113,7 @@ Verify pods: kubectl get pods -n vllm-semantic-router-system ``` -You should see `prometheus-...` and `grafana-...` pods in `Running` state. +You should see `prometheus-...`, `grafana-...`, and `semantic-router-dashboard-...` pods in `Running` state. ### 5.3. Integration with the core deployment @@ -133,9 +136,11 @@ You should see `prometheus-...` and `grafana-...` pods in `Running` state. 
```bash kubectl port-forward svc/prometheus 9090:9090 -n vllm-semantic-router-system kubectl port-forward svc/grafana 3000:3000 -n vllm-semantic-router-system + kubectl port-forward svc/semantic-router-dashboard 8700:80 -n vllm-semantic-router-system + kubectl port-forward svc/openwebui 3001:8080 -n vllm-semantic-router-system ``` - Prometheus → http://localhost:9090, Grafana → http://localhost:3000 + Prometheus → http://localhost:9090, Grafana → http://localhost:3000, Dashboard → http://localhost:8700, Open WebUI → http://localhost:3001 - **Ingress (production)** – Customize `ingress.yaml` with real domains, TLS secrets, and your ingress class before applying. Replace `*.example.com` and configure HTTPS certificates via cert-manager or your provider. @@ -145,6 +150,7 @@ You should see `prometheus-...` and `grafana-...` pods in `Running` state. 2. Query `rate(llm_model_completion_tokens_total[5m])` – should return data after traffic. 3. Open Grafana, log in with the admin credentials, and confirm the **LLM Router Metrics** dashboard exists under the *Semantic Router* folder. 4. Generate traffic to Semantic Router (classification or routing requests). Key panels should start populating: +5.Playground: open Open WebUI (port-forward or ingress), select the `vllm-semantic-router/auto` model (from the Manifold pipeline), and send prompts. The Dashboard Monitoring page should reflect traffic, and the pipeline will display VSR decision headers inline. - Prompt Category counts - Token usage rate per model - Routing modifications between models diff --git a/deploy/kubernetes/observability/dashboard/configmap.yaml b/deploy/kubernetes/observability/dashboard/configmap.yaml new file mode 100644 index 00000000..5c31dc86 --- /dev/null +++ b/deploy/kubernetes/observability/dashboard/configmap.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: semantic-router-dashboard-config + labels: + app: semantic-router-dashboard + app.kubernetes.io/part-of: semantic-router + app.kubernetes.io/component: observability +data: + TARGET_GRAFANA_URL: http://grafana.vllm-semantic-router-system.svc.cluster.local:3000 + TARGET_PROMETHEUS_URL: http://prometheus.vllm-semantic-router-system.svc.cluster.local:9090 + TARGET_ROUTER_API_URL: http://semantic-router.vllm-semantic-router-system.svc.cluster.local:8080 + TARGET_ROUTER_METRICS_URL: http://semantic-router-metrics.vllm-semantic-router-system.svc.cluster.local:9190/metrics + TARGET_OPENWEBUI_URL: http://openwebui.vllm-semantic-router-system.svc.cluster.local:8080 diff --git a/deploy/kubernetes/observability/dashboard/deployment.yaml b/deploy/kubernetes/observability/dashboard/deployment.yaml new file mode 100644 index 00000000..ac021aa4 --- /dev/null +++ b/deploy/kubernetes/observability/dashboard/deployment.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: semantic-router-dashboard + labels: + app: semantic-router-dashboard +spec: + replicas: 1 + selector: + matchLabels: + app: semantic-router-dashboard + template: + metadata: + labels: + app: semantic-router-dashboard + spec: + containers: + - name: dashboard + image: ghcr.io/vllm-project/semantic-router/dashboard:latest + imagePullPolicy: IfNotPresent + args: ["-port=8700", "-static=/app/frontend", "-config=/app/config/config.yaml"] + env: + - name: TARGET_GRAFANA_URL + valueFrom: + configMapKeyRef: + name: semantic-router-dashboard-config + key: TARGET_GRAFANA_URL + - name: TARGET_PROMETHEUS_URL + valueFrom: + configMapKeyRef: + name: semantic-router-dashboard-config 
+ key: TARGET_PROMETHEUS_URL + - name: TARGET_ROUTER_API_URL + valueFrom: + configMapKeyRef: + name: semantic-router-dashboard-config + key: TARGET_ROUTER_API_URL + - name: TARGET_ROUTER_METRICS_URL + valueFrom: + configMapKeyRef: + name: semantic-router-dashboard-config + key: TARGET_ROUTER_METRICS_URL + - name: TARGET_OPENWEBUI_URL + valueFrom: + configMapKeyRef: + name: semantic-router-dashboard-config + key: TARGET_OPENWEBUI_URL + - name: ROUTER_CONFIG_PATH + value: /app/config/config.yaml + ports: + - name: http + containerPort: 8700 + volumeMounts: + - name: router-config + mountPath: /app/config + readOnly: true + volumes: + - name: router-config + configMap: + name: semantic-router-config diff --git a/deploy/kubernetes/observability/dashboard/service.yaml b/deploy/kubernetes/observability/dashboard/service.yaml new file mode 100644 index 00000000..1f94ee37 --- /dev/null +++ b/deploy/kubernetes/observability/dashboard/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: semantic-router-dashboard + labels: + app: semantic-router-dashboard +spec: + type: ClusterIP + selector: + app: semantic-router-dashboard + ports: + - name: http + port: 80 + targetPort: http diff --git a/deploy/kubernetes/observability/ingress.yaml b/deploy/kubernetes/observability/ingress.yaml index 7ef2cdf4..16c2a34d 100644 --- a/deploy/kubernetes/observability/ingress.yaml +++ b/deploy/kubernetes/observability/ingress.yaml @@ -51,3 +51,59 @@ spec: name: prometheus port: name: http + +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: dashboard + labels: + app: semantic-router-dashboard + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/backend-protocol: HTTP + nginx.ingress.kubernetes.io/ssl-redirect: "true" +spec: + tls: + - hosts: + - dashboard.example.com + secretName: dashboard-tls + rules: + - host: dashboard.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: semantic-router-dashboard + port: + name: http + +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: openwebui + labels: + app: openwebui + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/backend-protocol: HTTP + nginx.ingress.kubernetes.io/ssl-redirect: "true" +spec: + tls: + - hosts: + - openwebui.example.com + secretName: openwebui-tls + rules: + - host: openwebui.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: openwebui + port: + name: http diff --git a/deploy/kubernetes/observability/kustomization.yaml b/deploy/kubernetes/observability/kustomization.yaml index d3ec5569..adb77505 100644 --- a/deploy/kubernetes/observability/kustomization.yaml +++ b/deploy/kubernetes/observability/kustomization.yaml @@ -19,4 +19,18 @@ resources: - grafana/configmap-dashboard.yaml - grafana/deployment.yaml - grafana/service.yaml + - dashboard/configmap.yaml + - dashboard/deployment.yaml + - dashboard/service.yaml + - pipelines/deployment.yaml + - openwebui/deployment.yaml - ingress.yaml + +# Generate ConfigMaps from source files +generatorOptions: + disableNameSuffixHash: true + +configMapGenerator: + - name: openwebui-pipelines-config + files: + - vllm_semantic_router_pipe.py=pipelines/vllm_semantic_router_pipe.py diff --git a/deploy/kubernetes/observability/openwebui/deployment.yaml b/deploy/kubernetes/observability/openwebui/deployment.yaml new file mode 100644 index 00000000..dadf955c --- /dev/null +++ 
b/deploy/kubernetes/observability/openwebui/deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: openwebui + labels: + app: openwebui +spec: + replicas: 1 + selector: + matchLabels: + app: openwebui + template: + metadata: + labels: + app: openwebui + spec: + containers: + - name: openwebui + image: ghcr.io/open-webui/open-webui:main + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 8080 + env: + - name: WEBUI_NAME + value: "Open WebUI" + - name: OPENAI_API_BASE_URL + value: "http://openwebui-pipelines:9099" + - name: OPENAI_API_KEY + value: "0p3n-w3bu!" + volumeMounts: + - name: data + mountPath: /app/backend/data + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 1Gi + volumes: + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: openwebui + labels: + app: openwebui +spec: + selector: + app: openwebui + ports: + - name: http + port: 8080 + targetPort: http + type: ClusterIP diff --git a/deploy/kubernetes/observability/pipelines/deployment.yaml b/deploy/kubernetes/observability/pipelines/deployment.yaml new file mode 100644 index 00000000..8a39c3a6 --- /dev/null +++ b/deploy/kubernetes/observability/pipelines/deployment.yaml @@ -0,0 +1,40 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: openwebui-pipelines + labels: + app: openwebui-pipelines +spec: + replicas: 1 + selector: + matchLabels: + app: openwebui-pipelines + template: + metadata: + labels: + app: openwebui-pipelines + spec: + containers: + - name: pipelines + image: ghcr.io/open-webui/pipelines:main + imagePullPolicy: IfNotPresent + ports: + - name: http + containerPort: 9099 + env: + - name: PYTHONUNBUFFERED + value: "1" + volumeMounts: + - name: pipelines-volume + mountPath: /app/pipelines + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + volumes: + - name: pipelines-volume + configMap: + name: openwebui-pipelines-config diff --git a/deploy/kubernetes/observability/pipelines/vllm_semantic_router_pipe.py b/deploy/kubernetes/observability/pipelines/vllm_semantic_router_pipe.py new file mode 100644 index 00000000..217c5c39 --- /dev/null +++ b/deploy/kubernetes/observability/pipelines/vllm_semantic_router_pipe.py @@ -0,0 +1,648 @@ +""" +title: vLLM Semantic Router Pipe +author: open-webui +date: 2025-10-01 +version: 1.1 +license: Apache-2.0 +description: A pipe for proxying requests to vLLM Semantic Router and displaying decision headers (category, reasoning, model, injection) and security alerts (PII violations, jailbreak detection). 
+requirements: requests, pydantic +""" + +import json +from typing import Generator, Iterator, List, Union + +import requests +from pydantic import BaseModel + + +class Pipeline: + class Valves(BaseModel): + # vLLM Semantic Router endpoint URL + vsr_base_url: str = "http://localhost:8000" + + # API key for authentication (if required) + api_key: str = "" + + # Enable/disable displaying VSR headers in the UI + show_vsr_info: bool = True + + # Enable/disable logging VSR headers to console + log_vsr_info: bool = True + + # Enable/disable debug logging + debug: bool = True + + # Request timeout in seconds + timeout: int = 300 + + def __init__(self): + # Important: type should be "manifold" instead of "pipe" + # manifold type Pipeline will be displayed in the model list + self.type = "manifold" + self.id = "vllm_semantic_router" + self.name = "vllm-semantic-router/" + + # Initialize valves + self.valves = self.Valves( + **{ + "vsr_base_url": "http://localhost:8000", + "api_key": "", + "show_vsr_info": True, + "log_vsr_info": True, + "debug": True, + "timeout": 300, + } + ) + + # Store VSR headers from the last request + self.last_vsr_headers = {} + + print("=" * 80) + print("🚀 vLLM Semantic Router Pipe - Initialization") + print("=" * 80) + print(f" Type: {self.type}") + print(f" ID: {self.id}") + print(f" Name: {self.name}") + print(f" VSR Base URL: {self.valves.vsr_base_url}") + print(f" Debug Mode: {self.valves.debug}") + print("=" * 80) + + async def on_startup(self): + print("\n" + "=" * 80) + print("🔥 on_startup: vLLM Semantic Router Pipe initialized") + print("=" * 80) + print(f" VSR Base URL: {self.valves.vsr_base_url}") + print(f" API Key: {'***' if self.valves.api_key else '(not set)'}") + print(f" Show VSR Info: {self.valves.show_vsr_info}") + print(f" Log VSR Info: {self.valves.log_vsr_info}") + print(f" Debug: {self.valves.debug}") + print(f" Timeout: {self.valves.timeout}s") + + # Test if pipelines() is being called + pipes_list = self.pipelines() + print(f"\n📋 Available Pipes/Models:") + for pipe in pipes_list: + print(f" - ID: {pipe['id']}") + print(f" Name: {pipe['name']}") + print("=" * 80 + "\n") + + async def on_shutdown(self): + print("\n" + "=" * 80) + print("🛑 on_shutdown: vLLM Semantic Router Pipe") + print("=" * 80 + "\n") + + async def on_valves_updated(self): + print("\n" + "=" * 80) + print("⚙️ on_valves_updated: vLLM Semantic Router Pipe valves updated") + print("=" * 80) + print(f" VSR Base URL: {self.valves.vsr_base_url}") + print(f" API Key: {'***' if self.valves.api_key else '(not set)'}") + print(f" Show VSR Info: {self.valves.show_vsr_info}") + print(f" Log VSR Info: {self.valves.log_vsr_info}") + print(f" Debug: {self.valves.debug}") + print(f" Timeout: {self.valves.timeout}s") + print("=" * 80 + "\n") + + def pipes(self) -> List[dict]: + """ + Deprecated: manifold type uses pipelines() method instead of pipes() + The returned model list will be displayed in Open WebUI's model selector + """ + return self.pipelines() + + def pipelines(self) -> List[dict]: + """ + Important: manifold type uses pipelines() method instead of pipes() + The returned model list will be displayed in Open WebUI's model selector + """ + pipelines_list = [ + { + "id": "vllm-semantic-router-auto", + "name": "vllm-semantic-router/auto", + } + ] + + if self.valves.debug: + print("\n" + "=" * 80) + print("📞 pipelines() method called - Returning available models") + print("=" * 80) + for pipeline in pipelines_list: + print(f" - ID: {pipeline['id']}") + print(f" Name: {pipeline['name']}") + 
print("=" * 80 + "\n") + + return pipelines_list + + def _extract_vsr_headers(self, headers: dict) -> dict: + """ + Extract VSR-specific headers from response headers. + """ + vsr_headers = {} + + # List of VSR headers to extract + vsr_header_keys = [ + # Decision headers + "x-vsr-selected-category", + "x-vsr-selected-reasoning", + "x-vsr-selected-model", + "x-vsr-injected-system-prompt", + "x-vsr-cache-hit", + # Security headers + "x-vsr-pii-violation", + "x-vsr-jailbreak-blocked", + "x-vsr-jailbreak-type", + "x-vsr-jailbreak-confidence", + ] + + # Extract headers (case-insensitive) + for key in vsr_header_keys: + # Try lowercase + value = headers.get(key) + if not value: + # Try uppercase + value = headers.get(key.upper()) + if not value: + # Try title case + value = headers.get(key.title()) + + if value: + vsr_headers[key] = value + + return vsr_headers + + def _format_vsr_info(self, vsr_headers: dict, position: str = "prefix") -> str: + """ + Format VSR headers into a readable message for display. + Shows the semantic router's decision chain in 3 stages (multi-line format): + Stage 1: Security Validation + Stage 2: Cache Check + Stage 3: Intelligent Routing + + Args: + vsr_headers: VSR decision headers + position: "prefix" (before response) or "suffix" (after response) + """ + if not vsr_headers: + return "" + + # Build decision chain in stages (multi-line format) + lines = ["**🔀 vLLM Semantic Router - Chain-Of-Thought 🔀**"] + + # ============================================================ + # Stage 1: Security Validation (🛡️) + # ============================================================ + security_parts = [] + + has_jailbreak = vsr_headers.get("x-vsr-jailbreak-blocked") == "true" + has_pii = vsr_headers.get("x-vsr-pii-violation") == "true" + is_blocked = has_jailbreak or has_pii + + # Jailbreak check + if has_jailbreak: + jailbreak_type = vsr_headers.get("x-vsr-jailbreak-type", "unknown") + jailbreak_confidence = vsr_headers.get("x-vsr-jailbreak-confidence", "N/A") + security_parts.append( + f"🚨 *Jailbreak Detected, Confidence: {jailbreak_confidence}*" + ) + else: + security_parts.append("✅ *No Jailbreak*") + + # PII check + if has_pii: + security_parts.append("🚨 *PII Detected*") + else: + security_parts.append("✅ *No PII*") + + # Result + if is_blocked: + security_parts.append("❌ ***BLOCKED***") + else: + security_parts.append("💯 ***Continue***") + + lines.append( + " → 🛡️ ***Stage 1 - Prompt Guard***: " + " → ".join(security_parts) + ) + + # If blocked, stop here + if is_blocked: + result = "\n".join(lines) + if position == "prefix": + return result + "\n\n---\n\n" + else: + return "\n\n---\n\n" + result + + # ============================================================ + # Stage 2: Cache Check (🔥) + # ============================================================ + cache_parts = [] + has_cache_hit = vsr_headers.get("x-vsr-cache-hit") == "true" + + if has_cache_hit: + cache_parts.append("🔥 *HIT*") + cache_parts.append("⚡️ *Retrieve Memory*") + cache_parts.append("💯 ***Fast Response***") + else: + cache_parts.append("🌊 *MISS*") + cache_parts.append("🧠 *Update Memory*") + cache_parts.append("💯 ***Continue***") + + lines.append(" → 🔥 ***Stage 2 - Router Memory***: " + " → ".join(cache_parts)) + + # If cache hit, stop here + if has_cache_hit: + result = "\n".join(lines) + if position == "prefix": + return result + "\n\n---\n\n" + else: + return "\n\n---\n\n" + result + + # ============================================================ + # Stage 3: Intelligent Routing (🧠) + # 
============================================================ + routing_parts = [] + + # Domain + category = vsr_headers.get("x-vsr-selected-category", "").strip() + if not category: + category = "other" + routing_parts.append(f"📂 *{category}*") + + # Reasoning mode + if vsr_headers.get("x-vsr-selected-reasoning"): + reasoning = vsr_headers["x-vsr-selected-reasoning"] + if reasoning == "on": + routing_parts.append("🧠 *Reasoning On*") + else: + routing_parts.append("⚡ *Reasoning Off*") + + # Model + if vsr_headers.get("x-vsr-selected-model"): + model = vsr_headers["x-vsr-selected-model"] + routing_parts.append(f"🥷 *{model}*") + + # Prompt optimization + if vsr_headers.get("x-vsr-injected-system-prompt") == "true": + routing_parts.append("🎯 *Prompt Optimized*") + + routing_parts.append(f"💯 ***Continue***") + + if routing_parts: + lines.append( + " → 🧠 ***Stage 3 - Smart Routing***: " + " → ".join(routing_parts) + ) + + # Combine all lines + result = "\n".join(lines) + + if position == "prefix": + return result + "\n\n---\n\n" + else: + return "\n\n---\n\n" + result + + def _log_vsr_info(self, vsr_headers: dict): + """ + Log VSR information to console. + """ + if not vsr_headers or not self.valves.log_vsr_info: + return + + # Check if there are security violations + has_security_violation = ( + vsr_headers.get("x-vsr-pii-violation") == "true" + or vsr_headers.get("x-vsr-jailbreak-blocked") == "true" + ) + + print("=" * 60) + if has_security_violation: + print("🛡️ SECURITY ALERT & Routing Decision:") + else: + print("vLLM Semantic Router Decision:") + print("=" * 60) + + # Log security violations first + if vsr_headers.get("x-vsr-pii-violation") == "true": + print(" 🚨 PII VIOLATION: Request blocked") + + if vsr_headers.get("x-vsr-jailbreak-blocked") == "true": + print(" 🚨 JAILBREAK BLOCKED: Potential attack detected") + if vsr_headers.get("x-vsr-jailbreak-type"): + print(f" Type: {vsr_headers['x-vsr-jailbreak-type']}") + if vsr_headers.get("x-vsr-jailbreak-confidence"): + print(f" Confidence: {vsr_headers['x-vsr-jailbreak-confidence']}") + + # Log routing decision information + if vsr_headers.get("x-vsr-selected-category"): + print(f" Category: {vsr_headers['x-vsr-selected-category']}") + + if vsr_headers.get("x-vsr-selected-reasoning"): + print(f" Reasoning Mode: {vsr_headers['x-vsr-selected-reasoning']}") + + if vsr_headers.get("x-vsr-selected-model"): + print(f" Selected Model: {vsr_headers['x-vsr-selected-model']}") + + if vsr_headers.get("x-vsr-injected-system-prompt"): + print( + f" System Prompt Injected: {vsr_headers['x-vsr-injected-system-prompt']}" + ) + + if vsr_headers.get("x-vsr-cache-hit"): + cache_hit = vsr_headers["x-vsr-cache-hit"].lower() + print(f" Cache Hit: {cache_hit}") + + print("=" * 60) + + def pipe( + self, user_message: str, model_id: str, messages: List[dict], body: dict + ) -> Union[str, Generator, Iterator]: + """ + Main pipe function that handles the request/response flow. + + Manifold type pipe() method signature: + - user_message: User's last message + - model_id: Selected model ID + - messages: Complete message history + - body: Complete request body + """ + + if self.valves.debug: + print("\n" + "=" * 80) + print("🔄 pipe() method called - Processing request") + print("=" * 80) + print( + f" User message: {user_message[:100]}..." 
+ if len(user_message) > 100 + else f" User message: {user_message}" + ) + print(f" Model ID: {model_id}") + print(f" Model requested: {body.get('model', 'N/A')}") + print(f" Stream mode: {body.get('stream', False)}") + print(f" Messages count: {len(messages)}") + print("=" * 80) + + # Prepare the request to vLLM Semantic Router + url = f"{self.valves.vsr_base_url}/v1/chat/completions" + + if self.valves.debug: + print(f"\n📡 Sending request to: {url}") + + headers = { + "Content-Type": "application/json", + } + + if self.valves.api_key: + headers["Authorization"] = f"Bearer {self.valves.api_key}" + if self.valves.debug: + print(f" Authorization: Bearer ***") + + # Important: Change model in body to "auto" + # VSR backend only accepts model="auto", then automatically selects model based on request content + request_body = body.copy() + original_model = request_body.get("model", "N/A") + request_body["model"] = "auto" + + if self.valves.debug: + print(f"\n🔄 Model mapping:") + print(f" Original model: {original_model}") + print(f" Sending to VSR: auto") + + # Check if streaming is requested + is_streaming = request_body.get("stream", False) + + if self.valves.debug: + print(f" Streaming: {is_streaming}") + print(f" Timeout: {self.valves.timeout}s") + + try: + if self.valves.debug: + print(f"\n🔌 Connecting to vLLM Semantic Router...") + + response = requests.post( + url, + json=request_body, # Use modified request_body + headers=headers, + timeout=self.valves.timeout, + stream=request_body.get("stream", False), + ) + + if self.valves.debug: + print(f"✅ Response received - Status: {response.status_code}") + print(f" Response headers count: {len(response.headers)}") + + # Check for HTTP errors + if response.status_code != 200: + error_msg = f"Error: vLLM Semantic Router returned status {response.status_code}" + if self.valves.debug: + print(f"\n❌ {error_msg}") + print(f" Response text: {response.text[:500]}") + print("=" * 80 + "\n") + return f"{error_msg}: {response.text}" + + # Extract VSR headers from response + vsr_headers = self._extract_vsr_headers(dict(response.headers)) + self.last_vsr_headers = vsr_headers + + if self.valves.debug: + print(f" VSR headers found: {len(vsr_headers)}") + for key, value in vsr_headers.items(): + print(f" {key}: {value}") + + # Print all response headers for debugging + print(f"\n All response headers:") + for key, value in response.headers.items(): + if key.lower().startswith("x-vsr"): + print(f" {key}: {value}") + + # Log VSR information + self._log_vsr_info(vsr_headers) + + if is_streaming: + if self.valves.debug: + print(f"\n📺 Handling streaming response...") + # Handle streaming response + return self._handle_streaming_response(response, vsr_headers) + else: + if self.valves.debug: + print(f"\n📄 Handling non-streaming response...") + print(f" Response status: {response.status_code}") + print(f" Response content length: {len(response.content)}") + print( + f" Response content type: {response.headers.get('content-type', 'unknown')}" + ) + + # Check if response is empty + if not response.content: + error_msg = "Error: Empty response from vLLM Semantic Router" + if self.valves.debug: + print(f"\n❌ {error_msg}") + print("=" * 80 + "\n") + return error_msg + + # Try to parse JSON response + try: + response_data = response.json() + except json.JSONDecodeError as e: + error_msg = ( + f"Error: Invalid JSON response from vLLM Semantic Router" + ) + if self.valves.debug: + print(f"\n❌ {error_msg}") + print(f" JSON error: {str(e)}") + print( + f" Response text 
(first 500 chars): {response.text[:500]}" + ) + print("=" * 80 + "\n") + return f"{error_msg}: {str(e)}" + + if self.valves.debug: + print(f" Response data keys: {list(response_data.keys())}") + if "choices" in response_data: + print(f" Choices count: {len(response_data['choices'])}") + + # Add VSR info to the response if enabled + if self.valves.show_vsr_info and vsr_headers: + vsr_info = self._format_vsr_info(vsr_headers, position="prefix") + + if self.valves.debug: + print( + f" Adding VSR info to response (length: {len(vsr_info)})" + ) + + # Prepend to the assistant's message + if "choices" in response_data and len(response_data["choices"]) > 0: + for choice in response_data["choices"]: + if "message" in choice and "content" in choice["message"]: + choice["message"]["content"] = ( + vsr_info + choice["message"]["content"] + ) + if self.valves.debug: + print(f" ✅ VSR info prepended to response") + + if self.valves.debug: + print(f"\n✅ Request completed successfully") + print("=" * 80 + "\n") + + return response_data + + except requests.exceptions.Timeout: + error_msg = f"Error: Request to vLLM Semantic Router timed out after {self.valves.timeout} seconds" + if self.valves.debug: + print(f"\n❌ {error_msg}") + print("=" * 80 + "\n") + return error_msg + except Exception as e: + error_msg = ( + f"Error: Failed to communicate with vLLM Semantic Router: {str(e)}" + ) + if self.valves.debug: + print(f"\n❌ {error_msg}") + print(f" Exception type: {type(e).__name__}") + print(f" Exception details: {str(e)}") + print("=" * 80 + "\n") + return error_msg + + def _handle_streaming_response( + self, response: requests.Response, vsr_headers: dict + ) -> Generator: + """ + Handle streaming SSE response from vLLM Semantic Router. + Manually parse SSE stream, no need for sseclient-py dependency. + + Strategy: + 1. Add VSR info before the first content chunk (if enabled) + 2. Detect VSR header updates during streaming (via SSE events) + 3. 
Ensure it's only added once + """ + vsr_info_added = False + first_content_chunk = True # Mark whether it's the first content chunk + # Use initial vsr_headers, but may be updated during streaming + current_vsr_headers = vsr_headers.copy() + + if self.valves.debug: + print(f"\n📝 Initial VSR headers:") + for key, value in current_vsr_headers.items(): + print(f" {key}: {value}") + + # Read streaming response line by line + for line in response.iter_lines(decode_unicode=True): + if not line: + continue + + # SSE format: data: {...} + if line.startswith("data: "): + data_str = line[6:].strip() # Remove "data: " prefix + + if data_str == "[DONE]": + yield f"data: [DONE]\n\n" + + if self.valves.debug: + print( + f"✅ Streaming completed, VSR info added: {vsr_info_added}" + ) + else: + try: + chunk_data = json.loads(data_str) + + # Check if chunk contains updated VSR header information + # Some SSE implementations may include updated headers in chunk metadata + if "vsr_headers" in chunk_data: + if self.valves.debug: + print(f"🔄 VSR headers updated in stream:") + for key, value in chunk_data["vsr_headers"].items(): + full_key = ( + f"x-vsr-{key}" + if not key.startswith("x-vsr-") + else key + ) + if current_vsr_headers.get(full_key) != value: + if self.valves.debug: + print( + f" {full_key}: {current_vsr_headers.get(full_key)} → {value}" + ) + current_vsr_headers[full_key] = value + + # Add VSR info before the first content chunk + if ( + first_content_chunk + and self.valves.show_vsr_info + and not vsr_info_added + ): + if ( + "choices" in chunk_data + and len(chunk_data["choices"]) > 0 + ): + choice = chunk_data["choices"][0] + delta = choice.get("delta", {}) + + # Check if there is content (role or content) + if "role" in delta or "content" in delta: + if self.valves.debug: + print( + f"✅ Adding VSR info at first content chunk" + ) + print(f" VSR headers:") + for key, value in current_vsr_headers.items(): + print(f" {key}: {value}") + + # Format VSR info (using prefix mode) + vsr_info = self._format_vsr_info( + current_vsr_headers, position="prefix" + ) + + # Add VSR info before the first content + current_content = delta.get("content", "") + delta["content"] = vsr_info + current_content + chunk_data["choices"][0]["delta"] = delta + vsr_info_added = True + first_content_chunk = False + + # If not the first chunk, mark as False + if "choices" in chunk_data and len(chunk_data["choices"]) > 0: + choice = chunk_data["choices"][0] + delta = choice.get("delta", {}) + if "role" in delta or "content" in delta: + first_content_chunk = False + + yield f"data: {json.dumps(chunk_data)}\n\n" + except json.JSONDecodeError: + # If not valid JSON, pass through as-is + yield f"data: {data_str}\n\n"
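+
+
+# ----------------------------------------------------------------------------
+# Illustrative local smoke test (not used by Open WebUI). It assumes a router
+# endpoint is reachable; point vsr_base_url at your environment, e.g. the
+# docker-compose Envoy listener on http://localhost:8801, then run:
+#   python vllm_semantic_router_pipe.py
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+    pipeline = Pipeline()
+    # Assumed endpoint for a local stack; adjust as needed.
+    pipeline.valves.vsr_base_url = "http://localhost:8801"
+
+    prompt = "What is the capital of France?"
+    result = pipeline.pipe(
+        user_message=prompt,
+        model_id="vllm-semantic-router-auto",
+        messages=[{"role": "user", "content": prompt}],
+        body={
+            "model": "vllm-semantic-router-auto",
+            "messages": [{"role": "user", "content": prompt}],
+            "stream": False,
+        },
+    )
+
+    # Non-streaming calls return either the upstream response dict (with the
+    # VSR decision banner prepended when headers are present) or an error string.
+    print(result)
+    print("Last VSR headers:", pipeline.last_vsr_headers)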