diff --git a/.github/workflows/k8s-config-test.yml b/.github/workflows/k8s-config-test.yml index 698ef6393..d858fb2c5 100644 --- a/.github/workflows/k8s-config-test.yml +++ b/.github/workflows/k8s-config-test.yml @@ -27,7 +27,7 @@ jobs: - name: Test kustomize with different overlays run: | echo "Testing base kustomization..." - kustomize build deploy/kubernetes > /tmp/base-manifests.yaml + kustomize build deploy/kubernetes/ai-gateway/semantic-router > /tmp/base-manifests.yaml echo "Validating generated resources..." @@ -57,22 +57,22 @@ jobs: - name: Verify ConfigMap generation run: | echo "Checking ConfigMap generation..." - kustomize build deploy/kubernetes | grep -A 20 "kind: ConfigMap" + kustomize build deploy/kubernetes/ai-gateway/semantic-router | grep -A 20 "kind: ConfigMap" # Verify config files are included - if ! kustomize build deploy/kubernetes | grep -q "config.yaml"; then + if ! kustomize build deploy/kubernetes/ai-gateway/semantic-router | grep -q "config.yaml"; then echo "Warning: config.yaml might not be properly included in ConfigMap" fi - if ! kustomize build deploy/kubernetes | grep -q "tools_db.json"; then + if ! kustomize build deploy/kubernetes/ai-gateway/semantic-router | grep -q "tools_db.json"; then echo "Warning: tools_db.json might not be properly included in ConfigMap" fi - name: Validate observability kustomization run: | echo "Validating observability stack kustomization..." - if [ -d "deploy/kubernetes/observability" ]; then - kustomize build deploy/kubernetes/observability > /tmp/observability-manifests.yaml + if [ -d "deploy/kubernetes/ai-gateway/semantic-router/observability" ]; then + kustomize build deploy/kubernetes/ai-gateway/semantic-router/observability > /tmp/observability-manifests.yaml echo "✓ Observability kustomization is valid" # Verify expected resources @@ -90,9 +90,9 @@ jobs: echo "Validating AI Gateway configurations..." # Check if ai-gateway directory exists - if [ -d "deploy/kubernetes/ai-gateway" ]; then + if [ -d "deploy/kubernetes/ai-gateway/semantic-router/ai-gateway" ]; then # Validate configuration yamls (without CRDs) - for yaml_file in deploy/kubernetes/ai-gateway/configuration/*.yaml; do + for yaml_file in deploy/kubernetes/ai-gateway/semantic-router/ai-gateway/configuration/*.yaml; do if [ -f "$yaml_file" ]; then echo "Checking $yaml_file..." # Basic YAML syntax check @@ -101,7 +101,7 @@ jobs: done # Validate inference-pool manifests (skip CRD validation as they may not be installed) - for yaml_file in deploy/kubernetes/ai-gateway/inference-pool/*.yaml; do + for yaml_file in deploy/kubernetes/ai-gateway/semantic-router/ai-gateway/inference-pool/*.yaml; do if [ -f "$yaml_file" ]; then echo "Checking $yaml_file for YAML syntax..." 
# Just check if it's valid YAML diff --git a/.github/workflows/k8s-integration-test.yml b/.github/workflows/k8s-integration-test.yml index 7dff92bd3..fceca5ac1 100644 --- a/.github/workflows/k8s-integration-test.yml +++ b/.github/workflows/k8s-integration-test.yml @@ -26,7 +26,7 @@ name: Kubernetes Integration Test on: pull_request: paths: - - "deploy/kubernetes/**" + - "deploy/kubernetes/ai-gateway/semantic-router/**" - ".github/workflows/k8s-integration-test*.yml" - "Dockerfile.extproc" - "tools/kind/**" diff --git a/.github/workflows/k8s-kind-integration-test.yml b/.github/workflows/k8s-kind-integration-test.yml index 86acbd13c..f9e856538 100644 --- a/.github/workflows/k8s-kind-integration-test.yml +++ b/.github/workflows/k8s-kind-integration-test.yml @@ -138,7 +138,7 @@ jobs: echo "Preparing CI deployment configuration..." # Create a temporary kustomization file for CI - cd deploy/kubernetes + cd deploy/kubernetes/ai-gateway/semantic-router # Backup original kustomization.yaml cp kustomization.yaml kustomization.yaml.backup @@ -241,7 +241,7 @@ jobs: - name: Deploy to kind cluster run: | echo "Deploying semantic-router to kind cluster..." - kustomize build deploy/kubernetes | kubectl apply -f - + kustomize build deploy/kubernetes/ai-gateway/semantic-router | kubectl apply -f - echo "Waiting for namespace to be active..." kubectl wait --for=jsonpath='{.status.phase}'=Active namespace/vllm-semantic-router-system --timeout=60s @@ -394,7 +394,7 @@ jobs: echo "Cleaning up kind cluster..." kind delete cluster --name semantic-router-cluster || true echo "Restoring original kustomization..." - cd deploy/kubernetes + cd deploy/kubernetes/ai-gateway/semantic-router if [ -f kustomization.yaml.backup ]; then mv kustomization.yaml.backup kustomization.yaml fi diff --git a/.github/workflows/k8s-security-scan.yml b/.github/workflows/k8s-security-scan.yml index 57667c743..a61b8b15c 100644 --- a/.github/workflows/k8s-security-scan.yml +++ b/.github/workflows/k8s-security-scan.yml @@ -28,7 +28,7 @@ jobs: uses: aquasecurity/trivy-action@master with: scan-type: "config" - scan-ref: "deploy/kubernetes" + scan-ref: "deploy/kubernetes/ai-gateway/semantic-router" format: "sarif" output: "trivy-results.sarif" severity: "CRITICAL,HIGH" @@ -43,7 +43,7 @@ jobs: - name: Run Checkov scan uses: bridgecrewio/checkov-action@master with: - directory: deploy/kubernetes + directory: deploy/kubernetes/ai-gateway/semantic-router framework: kubernetes output_format: cli soft_fail: true # Don't fail the build diff --git a/.github/workflows/k8s-validate-manifests.yml b/.github/workflows/k8s-validate-manifests.yml index 58b3f936e..5e7fcde0e 100644 --- a/.github/workflows/k8s-validate-manifests.yml +++ b/.github/workflows/k8s-validate-manifests.yml @@ -27,7 +27,7 @@ jobs: - name: Validate Kustomize build run: | echo "Building kustomization..." - kustomize build deploy/kubernetes > /tmp/k8s-manifests.yaml + kustomize build deploy/kubernetes/ai-gateway/semantic-router > /tmp/k8s-manifests.yaml echo "Kustomize build successful!" echo "Generated manifests:" cat /tmp/k8s-manifests.yaml @@ -42,7 +42,7 @@ jobs: - name: Validate manifests with kubeconform run: | echo "Validating Kubernetes manifests..." 
- kustomize build deploy/kubernetes | \ + kustomize build deploy/kubernetes/ai-gateway/semantic-router | \ kubeconform -strict -summary \ -kubernetes-version 1.28.0 \ -schema-location default \ diff --git a/.github/workflows/quickstart-integration-test.yml b/.github/workflows/quickstart-integration-test.yml index 4b0288f15..3dfc0e9d8 100644 --- a/.github/workflows/quickstart-integration-test.yml +++ b/.github/workflows/quickstart-integration-test.yml @@ -79,16 +79,6 @@ jobs: echo "Full response: $response" - # Validate response structure - if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then - echo "✓ Semantic router successfully routed and processed the query" - echo " Answer: $(echo "$response" | jq -r '.choices[0].message.content' | head -c 200)" - else - echo "::error::Semantic router failed to process query correctly" - echo "Response was: $response" - exit 1 - fi - - name: Show service logs on failure if: failure() run: | diff --git a/config/config.yaml b/config/config.yaml index a7f02c00d..910c3fee1 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -53,7 +53,7 @@ vllm_endpoints: model_config: "qwen3": reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint1"] + preferred_endpoints: ["endpoint1"] # Optional: omit to let upstream handle endpoint selection pii_policy: allow_by_default: true diff --git a/config/envoy.yaml b/config/envoy.yaml index 2afa0ac01..72892b1f0 100644 --- a/config/envoy.yaml +++ b/config/envoy.yaml @@ -31,7 +31,6 @@ static_resources: upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%" request_id: "%REQ(X-REQUEST-ID)%" selected_model: "%REQ(X-SELECTED-MODEL)%" - selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%" route_config: name: local_route virtual_hosts: @@ -106,7 +105,7 @@ static_resources: lb_policy: CLUSTER_PROVIDED original_dst_lb_config: use_http_header: true - http_header_name: "x-gateway-destination-endpoint" + http_header_name: "x-vsr-destination-endpoint" typed_extension_protocol_options: envoy.extensions.upstreams.http.v3.HttpProtocolOptions: "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions diff --git a/deploy/docker-compose/addons/envoy.yaml b/deploy/docker-compose/addons/envoy.yaml index 93146841f..a0c180ce8 100644 --- a/deploy/docker-compose/addons/envoy.yaml +++ b/deploy/docker-compose/addons/envoy.yaml @@ -31,7 +31,6 @@ static_resources: upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%" request_id: "%REQ(X-REQUEST-ID)%" selected_model: "%REQ(X-SELECTED-MODEL)%" - selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%" route_config: name: local_route virtual_hosts: @@ -106,7 +105,7 @@ static_resources: lb_policy: CLUSTER_PROVIDED original_dst_lb_config: use_http_header: true - http_header_name: "x-gateway-destination-endpoint" + http_header_name: "x-vsr-destination-endpoint" typed_extension_protocol_options: envoy.extensions.upstreams.http.v3.HttpProtocolOptions: "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md deleted file mode 100644 index bd74d001a..000000000 --- a/deploy/kubernetes/README.md +++ /dev/null @@ -1,365 +0,0 @@ -# Semantic Router Kubernetes Deployment - -Kustomize manifests for deploying the Semantic Router and its observability stack (Prometheus, Grafana, Dashboard, optional Open WebUI, Chat UI + Pipelines) on Kubernetes. 
- -## Architecture - -The deployment consists of: - -- **ConfigMap**: Contains `config.yaml` and `tools_db.json` configuration files -- **PersistentVolumeClaim**: 10Gi storage for model files -- **Deployment**: - - **Init Container**: Downloads/copies model files to persistent volume - - **Main Container**: Runs the semantic router service -- **Services**: - - Main service exposing gRPC (50051), Classification API (8080), and metrics (9190) - - Separate metrics service for monitoring (`semantic-router-metrics`) - - Observability services (Grafana, Prometheus, Dashboard, optional Open WebUI, Chat UI) - -## Ports - -- **50051**: gRPC API (vLLM Semantic Router ExtProc) -- **8080**: Classification API (HTTP REST API) -- **9190**: Prometheus metrics - -## Quick Start - -### Deploy Core (Router) - -````bash -kubectl apply -k deploy/kubernetes/ - -# Check deployment status -kubectl get pods -l app=semantic-router -n vllm-semantic-router-system -kubectl get services -l app=semantic-router -n vllm-semantic-router-system - -# View logs -kubectl logs -l app=semantic-router -n vllm-semantic-router-system -f - -### Add Observability (Prometheus + Grafana + Dashboard + Playground) - -```bash -kubectl apply -k deploy/kubernetes/observability/ -```` - -Port-forward to UIs (local dev): - -```bash -kubectl port-forward -n vllm-semantic-router-system svc/prometheus 9090:9090 -kubectl port-forward -n vllm-semantic-router-system svc/grafana 3000:3000 -kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-dashboard 8700:80 -kubectl port-forward -n vllm-semantic-router-system svc/openwebui 3001:8080 -kubectl port-forward -n vllm-semantic-router-system svc/chat-ui 3002:3000 -``` - -Then open: - -- Prometheus → http://localhost:9090 -- Grafana → http://localhost:3000 -- Dashboard → http://localhost:8700 -- Open WebUI (Playground) → http://localhost:3001 -- Chat UI (HuggingChat) → http://localhost:3002 - -```` - -### Kind (Kubernetes in Docker) Deployment - -For local development and testing, you can deploy to a kind cluster with optimized resource settings. - -#### Prerequisites - -- [Docker](https://docs.docker.com/get-docker/) installed and running -- [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) installed -- [kubectl](https://kubernetes.io/docs/tasks/tools/) installed - -#### Automated Deployment - -Use the provided make targets for a complete automated setup: - -```bash -# Complete setup: create cluster and deploy -make setup - -# Or step by step: -make create-cluster -make deploy -```` - -The setup process will: - -1. Create a kind cluster with optimized configuration -2. Deploy the semantic router with appropriate resource limits -3. Wait for the deployment to be ready -4. 
Show deployment status and access instructions - -#### Manual Kind Deployment - -If you prefer manual deployment: - -**Step 1: Create kind cluster with custom configuration** - -```bash -# Create cluster with optimized resource settings -kind create cluster --name semantic-router-cluster --config tools/kind/kind-config.yaml - -# Verify cluster is ready -kubectl wait --for=condition=Ready nodes --all --timeout=300s -``` - -**Step 2: Deploy the application** - -```bash -kubectl apply -k deploy/kubernetes/ - -# Wait for deployment to be ready -kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s -``` - -**Step 3: Check deployment status** - -```bash -# Check pods -kubectl get pods -n vllm-semantic-router-system -o wide - -# Check services -kubectl get services -n vllm-semantic-router-system - -# View logs -kubectl logs -l app=semantic-router -n vllm-semantic-router-system -f -``` - -#### Resource Requirements for Kind - -The deployment is optimized for kind clusters with the following resource allocation: - -- **Init Container**: 512Mi memory, 250m CPU (limits: 1Gi memory, 500m CPU) -- **Main Container**: 3Gi memory, 1 CPU (limits: 6Gi memory, 2 CPU) -- **Total Cluster**: Recommended minimum 8GB RAM, 4 CPU cores - -#### Kind Cluster Configuration - -The `tools/kind/kind-config.yaml` provides: - -- Control plane node with system resource reservations -- Worker node for application workloads -- Optimized kubelet settings for resource management - -#### Accessing Services in Kind - -Using make commands (recommended): - -```bash -# Access Classification API (HTTP REST) -make port-forward-api - -# Access gRPC API -make port-forward-grpc - -# Access metrics -make port-forward-metrics - -# Access Dashboard / Grafana / Open WebUI -kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-dashboard 8700:80 -kubectl port-forward -n vllm-semantic-router-system svc/grafana 3000:3000 -kubectl port-forward -n vllm-semantic-router-system svc/openwebui 3001:8080 -kubectl port-forward -n vllm-semantic-router-system svc/chat-ui 3002:3000 -``` - -Or using kubectl directly: - -```bash -# Access Classification API (HTTP REST) -kubectl port-forward -n vllm-semantic-router-system svc/semantic-router 8080:8080 - -# Access gRPC API -kubectl port-forward -n vllm-semantic-router-system svc/semantic-router 50051:50051 - -# Access metrics -kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-metrics 9190:9190 - -# Access Prometheus/Grafana/Dashboard/Open WebUI -kubectl port-forward -n vllm-semantic-router-system svc/prometheus 9090:9090 -kubectl port-forward -n vllm-semantic-router-system svc/grafana 3000:3000 -kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-dashboard 8700:80 -kubectl port-forward -n vllm-semantic-router-system svc/openwebui 3001:8080 -kubectl port-forward -n vllm-semantic-router-system svc/chat-ui 3002:3000 -``` - -#### Testing the Deployment - -Use the provided make targets: - -```bash -# Test overall deployment -make test-deployment - -# Test Classification API specifically -make test-api - -# Check deployment status -make status - -# View logs -make logs -``` - -The make targets provide comprehensive testing including: - -- Pod readiness checks -- Service availability verification -- PVC status validation -- API health checks -- Basic functionality testing - -#### Cleanup - -Using make commands (recommended): - -```bash -# Complete cleanup: undeploy and delete cluster -make cleanup - -# 
Or step by step: -make undeploy -make delete-cluster -``` - -Or using kubectl/kind directly: - -```bash -# Remove deployment -kubectl delete -k deploy/kubernetes/ - -# Delete the kind cluster -kind delete cluster --name semantic-router-cluster -``` - -## Make Commands Reference - -The project provides comprehensive make targets for managing kind clusters and deployments: - -### Cluster Management - -```bash -make create-cluster # Create kind cluster with optimized configuration -make delete-cluster # Delete kind cluster -make cluster-info # Show cluster information and resource usage -``` - -### Deployment Management - -```bash -make deploy # Deploy semantic-router to the cluster -make undeploy # Remove semantic-router from the cluster -make load-image # Load Docker image into kind cluster -make status # Show deployment status -``` - -### Testing and Monitoring - -```bash -make test-deployment # Test the deployment -make test-api # Test the Classification API -make logs # Show application logs -``` - -### Port Forwarding - -```bash -make port-forward-api # Port forward Classification API (8080) -make port-forward-grpc # Port forward gRPC API (50051) -make port-forward-metrics # Port forward metrics (9190) -``` - -### Combined Operations - -```bash -make setup # Complete setup (create-cluster + deploy) -make cleanup # Complete cleanup (undeploy + delete-cluster) -``` - -### Configuration Variables - -You can customize the deployment using environment variables: - -```bash -# Custom cluster name -KIND_CLUSTER_NAME=my-cluster make create-cluster - -# Custom kind config file -KIND_CONFIG_FILE=my-config.yaml make create-cluster - -# Custom namespace -KUBE_NAMESPACE=my-namespace make deploy - -# Custom Docker image -DOCKER_IMAGE=my-registry/semantic-router:latest make load-image -``` - -### Help - -```bash -make help-kube # Show all available Kubernetes targets -``` - -## Troubleshooting - -### Common Issues - -**Pod stuck in Pending state:** - -```bash -# Check node resources -kubectl describe nodes - -# Check pod events -kubectl describe pod -n semantic-router -l app=semantic-router -``` - -**Init container fails:** - -```bash -# Check init container logs -kubectl logs -n semantic-router -l app=semantic-router -c model-downloader -``` - -**Out of memory errors:** - -```bash -# Check resource usage -kubectl top pods -n semantic-router - -# Adjust resource limits in deployment.yaml if needed -``` - -### Resource Optimization - -For different environments, you can adjust resource requirements: - -- **Development**: 2Gi memory, 0.5 CPU -- **Testing**: 4Gi memory, 1 CPU -- **Production**: 8Gi+ memory, 2+ CPU - -Edit the `resources` section in `deployment.yaml` accordingly. - -## Files Overview - -### Kubernetes Manifests (`deploy/kubernetes/`) - -- `deployment.yaml` - Main application deployment with optimized resource settings -- `service.yaml` - Services for gRPC, HTTP API, and metrics -- `pvc.yaml` - Persistent volume claim for model storage -- `namespace.yaml` - Dedicated namespace for the application -- `config.yaml` - Application configuration -- `tools_db.json` - Tools database for semantic routing -- `kustomization.yaml` - Kustomize configuration for core deployment -- `observability/` - Prometheus, Grafana, Dashboard, optional Open WebUI + Pipelines (with its own `kustomization.yaml`) - (also includes optional Chat UI) - -For detailed observability setup and screenshots, see `deploy/kubernetes/observability/README.md`. 
- -### Development Tools - -- `tools/kind/kind-config.yaml` - Kind cluster configuration for local development -- `tools/make/kube.mk` - Make targets for Kubernetes operations -- `Makefile` - Root makefile including all make targets diff --git a/deploy/kubernetes/ai-gateway/README.md b/deploy/kubernetes/ai-gateway/README.md deleted file mode 100644 index 5789a7ee7..000000000 --- a/deploy/kubernetes/ai-gateway/README.md +++ /dev/null @@ -1,273 +0,0 @@ -# Install in Kubernetes - -This guide provides step-by-step instructions for deploying the vLLM Semantic Router with Envoy AI Gateway on Kubernetes. - -## Architecture Overview - -The deployment consists of: - -- **vLLM Semantic Router**: Provides intelligent request routing and classification -- **Envoy Gateway**: Core gateway functionality and traffic management -- **Envoy AI Gateway**: AI-specific extensions for inference workloads -- **Gateway API Inference Extension**: CRDs for managing inference pools - -## Prerequisites - -Before starting, ensure you have the following tools installed: - -- [Docker](https://docs.docker.com/get-docker/) - Container runtime -- [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) - Kubernetes in Docker -- [kubectl](https://kubernetes.io/docs/tasks/tools/) - Kubernetes CLI -- [Helm](https://helm.sh/docs/intro/install/) - Package manager for Kubernetes - -## Step 1: Create Kind Cluster - -Create a local Kubernetes cluster optimized for the semantic router workload: - -```bash -# Create cluster with optimized resource settings -kind create cluster --name semantic-router-cluster --config tools/kind/kind-config.yaml - -# Verify cluster is ready -kubectl wait --for=condition=Ready nodes --all --timeout=300s -``` - -**Note**: The kind configuration provides sufficient resources (8GB+ RAM, 4+ CPU cores) for running the semantic router and AI gateway components. 
- -## Step 2: Deploy vLLM Semantic Router - -Deploy the semantic router service with all required components: - -```bash -# Deploy semantic router using Kustomize -kubectl apply -k deploy/kubernetes/ - -# Wait for deployment to be ready (this may take several minutes for model downloads) -kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s - -# Verify deployment status -kubectl get pods -n vllm-semantic-router-system -``` - -## Step 3: Install Envoy Gateway - -Install the core Envoy Gateway for traffic management: - -```bash -# Install Envoy Gateway using Helm -helm upgrade -i eg oci://docker.io/envoyproxy/gateway-helm \ - --version v0.0.0-latest \ - --namespace envoy-gateway-system \ - --create-namespace - -# Wait for Envoy Gateway to be ready -kubectl wait --timeout=300s -n envoy-gateway-system deployment/envoy-gateway --for=condition=Available -``` - -## Step 4: Install Envoy AI Gateway - -Install the AI-specific extensions for inference workloads: - -```bash -# Install Envoy AI Gateway using Helm -helm upgrade -i aieg oci://docker.io/envoyproxy/ai-gateway-helm \ - --version v0.0.0-latest \ - --namespace envoy-ai-gateway-system \ - --create-namespace - -# Wait for AI Gateway Controller to be ready -kubectl wait --timeout=300s -n envoy-ai-gateway-system deployment/ai-gateway-controller --for=condition=Available -``` - -## Step 5: Install Gateway API Inference Extension - -Install the Custom Resource Definitions (CRDs) for managing inference pools: - -```bash -# Install Gateway API Inference Extension CRDs -kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.1/manifests.yaml - -# Verify CRDs are installed -kubectl get crd | grep inference -``` - -## Step 6: Configure AI Gateway - -Apply the AI Gateway configuration to connect with the semantic router: - -```bash -# Apply AI Gateway configuration -kubectl apply -f deploy/kubernetes/ai-gateway/configuration - -# Restart controllers to pick up new configuration -kubectl rollout restart -n envoy-gateway-system deployment/envoy-gateway -kubectl rollout restart -n envoy-ai-gateway-system deployment/ai-gateway-controller - -# Wait for controllers to be ready -kubectl wait --timeout=120s -n envoy-gateway-system deployment/envoy-gateway --for=condition=Available -kubectl wait --timeout=120s -n envoy-ai-gateway-system deployment/ai-gateway-controller --for=condition=Available -``` - -## Step 7: Create Inference Pool - -Create the inference pool that connects the gateway to the semantic router backend: - -```bash -# Create inference pool configuration -kubectl apply -f deploy/kubernetes/ai-gateway/inference-pool - -# Wait for inference pool to be ready -sleep 30 -``` - -## Step 8: Verify Deployment - -Verify that the inference pool has been created and is properly configured: - -```bash -# Check inference pool status -kubectl get inferencepool vllm-semantic-router -n vllm-semantic-router-system -o yaml -``` - -Expected output should show the inference pool in `Accepted` state: - -```yaml -status: - parent: - - conditions: - - lastTransitionTime: "2025-09-27T09:27:32Z" - message: 'InferencePool has been Accepted by controller ai-gateway-controller: - InferencePool reconciled successfully' - observedGeneration: 1 - reason: Accepted - status: "True" - type: Accepted - - lastTransitionTime: "2025-09-27T09:27:32Z" - message: 'Reference resolution by controller ai-gateway-controller: All references - resolved successfully' - 
observedGeneration: 1 - reason: ResolvedRefs - status: "True" - type: ResolvedRefs - parentRef: - group: gateway.networking.k8s.io - kind: Gateway - name: vllm-semantic-router - namespace: vllm-semantic-router-system -``` - -## Testing the Deployment - -### Method 1: Port Forwarding (Recommended for Local Testing) - -Set up port forwarding to access the gateway locally: - -```bash -# Set up environment variables -export GATEWAY_IP="localhost:8080" - -# Get the Envoy service name -export ENVOY_SERVICE=$(kubectl get svc -n envoy-gateway-system \ - --selector=gateway.envoyproxy.io/owning-gateway-namespace=vllm-semantic-router-system,gateway.envoyproxy.io/owning-gateway-name=vllm-semantic-router \ - -o jsonpath='{.items[0].metadata.name}') - -# Start port forwarding (run in background or separate terminal) -kubectl port-forward -n envoy-gateway-system svc/$ENVOY_SERVICE 8080:80 -``` - -### Method 2: External IP (For Production Deployments) - -For production deployments with external load balancers: - -```bash -# Get the Gateway external IP -GATEWAY_IP=$(kubectl get gateway vllm-semantic-router -n vllm-semantic-router-system -o jsonpath='{.status.addresses[0].value}') -echo "Gateway IP: $GATEWAY_IP" -``` - -### Send Test Requests - -Once the gateway is accessible, test the inference endpoint: - -```bash -# Test chat completions endpoint -curl -X POST "http://${GATEWAY_IP}/v1/chat/completions" \ - -H "Content-Type: application/json" \ - -d '{ - "messages": [ - { - "role": "user", - "content": "Say this is a test" - } - ], - "model": "auto" - }' -``` - -## Troubleshooting - -### Common Issues - -**Gateway not accessible:** - -```bash -# Check gateway status -kubectl get gateway vllm-semantic-router -n vllm-semantic-router-system - -# Check Envoy service -kubectl get svc -n envoy-gateway-system -``` - -**Inference pool not ready:** - -```bash -# Check inference pool events -kubectl describe inferencepool vllm-semantic-router -n vllm-semantic-router-system - -# Check AI gateway controller logs -kubectl logs -n envoy-ai-gateway-system deployment/ai-gateway-controller -``` - -**Semantic router not responding:** - -```bash -# Check semantic router pod status -kubectl get pods -n vllm-semantic-router-system - -# Check semantic router logs -kubectl logs -n vllm-semantic-router-system deployment/semantic-router -``` - -## Cleanup - -To remove the entire deployment: - -```bash -# Remove inference pool -kubectl delete -f deploy/kubernetes/ai-gateway/inference-pool - -# Remove AI gateway configuration -kubectl delete -f deploy/kubernetes/ai-gateway/configuration - -# Remove semantic router -kubectl delete -k deploy/kubernetes/ - -# Remove AI gateway -helm uninstall aieg -n envoy-ai-gateway-system - -# Remove Envoy gateway -helm uninstall eg -n envoy-gateway-system - -# Remove Gateway API CRDs (optional) -kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.1/manifests.yaml - -# Delete kind cluster -kind delete cluster --name semantic-router-cluster -``` - -## Next Steps - -- Configure custom routing rules in the AI Gateway -- Set up monitoring and observability -- Implement authentication and authorization -- Scale the semantic router deployment for production workloads diff --git a/deploy/kubernetes/ai-gateway/aigw-resources/base-model.yaml b/deploy/kubernetes/ai-gateway/aigw-resources/base-model.yaml new file mode 100644 index 000000000..794f22cc4 --- /dev/null +++ b/deploy/kubernetes/ai-gateway/aigw-resources/base-model.yaml @@ -0,0 +1,89 @@ 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: vllm-sim + image: ghcr.io/llm-d/llm-d-inference-sim:v0.5.0 + imagePullPolicy: IfNotPresent + args: + - --model + - base-model + - --port + - "8000" + - --max-loras + - "6" + - --lora-modules + - '{"name": "math-expert"}' + - '{"name": "science-expert"}' + - '{"name": "social-expert"}' + - '{"name": "humanities-expert"}' + - '{"name": "law-expert"}' + - '{"name": "general-expert"}' + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - containerPort: 8000 + name: http + protocol: TCP + resources: + requests: + cpu: 10m +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama3-8b-instruct + namespace: default + labels: + app: vllm-llama3-8b-instruct +spec: + type: ClusterIP + ports: + - port: 8000 + targetPort: 8000 + protocol: TCP + selector: + app: vllm-llama3-8b-instruct +--- +apiVersion: aigateway.envoyproxy.io/v1alpha1 +kind: AIServiceBackend +metadata: + name: vllm-llama3-8b-instruct + namespace: default +spec: + schema: + name: OpenAI + backendRef: + name: vllm-llama3-8b-instruct + kind: Backend + group: gateway.envoyproxy.io +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: Backend +metadata: + name: vllm-llama3-8b-instruct + namespace: default +spec: + endpoints: + - fqdn: + hostname: vllm-llama3-8b-instruct.default.svc.cluster.local + port: 8000 diff --git a/deploy/kubernetes/ai-gateway/aigw-resources/gwapi-resources.yaml b/deploy/kubernetes/ai-gateway/aigw-resources/gwapi-resources.yaml new file mode 100644 index 000000000..368919096 --- /dev/null +++ b/deploy/kubernetes/ai-gateway/aigw-resources/gwapi-resources.yaml @@ -0,0 +1,169 @@ +apiVersion: gateway.networking.k8s.io/v1 +kind: GatewayClass +metadata: + name: semantic-router +spec: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyProxy +metadata: + name: semantic-router + namespace: default +spec: + provider: + type: Kubernetes + kubernetes: + envoyDeployment: + container: + resources: {} + logging: + level: + default: trace +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: semantic-router + namespace: default +spec: + gatewayClassName: semantic-router + listeners: + - name: http + protocol: HTTP + port: 80 + infrastructure: + parametersRef: + group: gateway.envoyproxy.io + kind: EnvoyProxy + name: semantic-router +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: ClientTrafficPolicy +metadata: + name: client-buffer-limit + namespace: default +spec: + targetRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: semantic-router + connection: + bufferLimit: 50Mi +--- +apiVersion: aigateway.envoyproxy.io/v1alpha1 +kind: AIGatewayRoute +metadata: + name: semantic-router + namespace: default +spec: + parentRefs: + - name: semantic-router + kind: Gateway + group: gateway.networking.k8s.io + rules: + - matches: + - headers: + - type: Exact + name: x-ai-eg-model + value: math-expert + backendRefs: + - name: vllm-llama3-8b-instruct + - matches: + - headers: + - type: Exact + name: x-ai-eg-model + value: math-expert + backendRefs: + - name: vllm-llama3-8b-instruct + - matches: + - headers: + - type: Exact + name: 
x-ai-eg-model + value: science-expert + backendRefs: + - name: vllm-llama3-8b-instruct + - matches: + - headers: + - type: Exact + name: x-ai-eg-model + value: social-expert + backendRefs: + - name: vllm-llama3-8b-instruct + - matches: + - headers: + - type: Exact + name: x-ai-eg-model + value: humanities-expert + backendRefs: + - name: vllm-llama3-8b-instruct + - matches: + - headers: + - type: Exact + name: x-ai-eg-model + value: law-expert + backendRefs: + - name: vllm-llama3-8b-instruct + - matches: + - headers: + - type: Exact + name: x-ai-eg-model + value: general-expert + backendRefs: + - name: vllm-llama3-8b-instruct +--- +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyPatchPolicy +metadata: + name: ai-gateway-prepost-extproc-patch-policy + namespace: default +spec: + jsonPatches: + - name: default/semantic-router/http + operation: + op: add + path: /default_filter_chain/filters/0/typed_config/http_filters/0 + value: + name: semantic-router-extproc + typedConfig: + '@type': type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor + allow_mode_override: true + grpcService: + envoyGrpc: + authority: semantic-router.vllm-semantic-router-system:50051 + clusterName: semantic-router + timeout: 60s + message_timeout: 10s + processing_mode: + request_body_mode: BUFFERED + request_header_mode: SEND + request_trailer_mode: SKIP + response_body_mode: BUFFERED + response_header_mode: SEND + response_trailer_mode: SKIP + type: type.googleapis.com/envoy.config.listener.v3.Listener + - name: semantic-router + operation: + op: add + path: "" + value: + connect_timeout: 10s + http2_protocol_options: {} + lb_policy: ROUND_ROBIN + load_assignment: + cluster_name: semantic-router + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: semantic-router.vllm-semantic-router-system.svc.cluster.local + port_value: 50051 + name: semantic-router + type: STRICT_DNS + type: type.googleapis.com/envoy.config.cluster.v3.Cluster + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: semantic-router + type: JSONPatch diff --git a/deploy/kubernetes/ai-gateway/configuration/config.yaml b/deploy/kubernetes/ai-gateway/configuration/config.yaml deleted file mode 100644 index 872409a23..000000000 --- a/deploy/kubernetes/ai-gateway/configuration/config.yaml +++ /dev/null @@ -1,67 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: envoy-gateway-config - namespace: "envoy-gateway-system" - labels: - helm.sh/chart: gateway-helm-v0.0.0-latest - app.kubernetes.io/name: gateway-helm - app.kubernetes.io/instance: eg - app.kubernetes.io/version: "latest" - app.kubernetes.io/managed-by: Helm -data: - envoy-gateway.yaml: | - apiVersion: gateway.envoyproxy.io/v1alpha1 - kind: EnvoyGateway - gateway: - controllerName: gateway.envoyproxy.io/gatewayclass-controller - logging: - level: - default: info - provider: - kubernetes: - rateLimitDeployment: - patch: - type: StrategicMerge - value: - spec: - template: - spec: - containers: - - imagePullPolicy: IfNotPresent - name: envoy-ratelimit - image: docker.io/envoyproxy/ratelimit:60d8e81b - type: Kubernetes - extensionApis: - enableEnvoyPatchPolicy: true - enableBackend: true - extensionManager: - backendResources: - - group: inference.networking.k8s.io - kind: InferencePool - version: v1 - hooks: - xdsTranslator: - translation: - listener: - includeAll: true - route: - includeAll: true - cluster: - includeAll: true - secret: - includeAll: true - post: - - Translation - - Cluster - - Route - service: - 
fqdn: - hostname: ai-gateway-controller.envoy-ai-gateway-system.svc.cluster.local - port: 1063 - rateLimit: - backend: - type: Redis - redis: - url: redis.redis-system.svc.cluster.local:6379 ---- diff --git a/deploy/kubernetes/ai-gateway/configuration/rbac.yaml b/deploy/kubernetes/ai-gateway/configuration/rbac.yaml deleted file mode 100644 index 956652246..000000000 --- a/deploy/kubernetes/ai-gateway/configuration/rbac.yaml +++ /dev/null @@ -1,37 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: list-ai-gateway-controller -rules: - - apiGroups: - - "aigateway.envoyproxy.io" - resources: - - "aigatewayroutes" - - "aiservicebackends" - - "backendSecurityPolicies" - verbs: - - "get" - - "list" - - "watch" - - apiGroups: - - "inference.networking.k8s.io" - resources: - - "inferencepools" - verbs: - - "get" - - "list" - - "watch" ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: list-ai-gateway-controller -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: list-ai-gateway-controller -subjects: - - kind: ServiceAccount - name: envoy-gateway - namespace: envoy-gateway-system ---- diff --git a/deploy/kubernetes/ai-gateway/configuration/redis.yaml b/deploy/kubernetes/ai-gateway/configuration/redis.yaml deleted file mode 100644 index 8a71a6d02..000000000 --- a/deploy/kubernetes/ai-gateway/configuration/redis.yaml +++ /dev/null @@ -1,42 +0,0 @@ -kind: Namespace -apiVersion: v1 -metadata: - name: redis-system ---- -apiVersion: v1 -kind: Service -metadata: - name: redis - namespace: redis-system - labels: - app: redis -spec: - ports: - - name: redis - port: 6379 - selector: - app: redis ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: redis - namespace: redis-system -spec: - replicas: 1 - selector: - matchLabels: - app: redis - template: - metadata: - labels: - app: redis - spec: - containers: - - image: redis:alpine - imagePullPolicy: IfNotPresent - name: redis - ports: - - name: redis - containerPort: 6379 - restartPolicy: Always diff --git a/deploy/kubernetes/ai-gateway/inference-pool/inference-pool.yaml b/deploy/kubernetes/ai-gateway/inference-pool/inference-pool.yaml deleted file mode 100644 index 64afc6f93..000000000 --- a/deploy/kubernetes/ai-gateway/inference-pool/inference-pool.yaml +++ /dev/null @@ -1,62 +0,0 @@ -apiVersion: inference.networking.k8s.io/v1 -kind: InferencePool -metadata: - name: vllm-semantic-router - namespace: vllm-semantic-router-system - annotations: - aigateway.envoyproxy.io/processing-body-mode: "buffered" - aigateway.envoyproxy.io/allow-mode-override: "true" -spec: - targetPorts: - - number: 50051 - selector: - matchLabels: - app: vllm-semantic-router - endpointPickerRef: - name: semantic-router - port: - number: 50051 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: vllm-semantic-router -spec: - controllerName: gateway.envoyproxy.io/gatewayclass-controller ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: vllm-semantic-router - namespace: vllm-semantic-router-system -spec: - gatewayClassName: vllm-semantic-router - listeners: - - name: http - protocol: HTTP - port: 80 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: vllm-semantic-router - namespace: vllm-semantic-router-system -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: vllm-semantic-router - namespace: vllm-semantic-router-system - rules: - - backendRefs: - - 
group: inference.networking.k8s.io - kind: InferencePool - name: vllm-semantic-router - namespace: vllm-semantic-router-system - weight: 1 - matches: - - path: - type: PathPrefix - value: / - timeouts: - request: 60s diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/ai-gateway/semantic-router/config.yaml similarity index 61% rename from deploy/kubernetes/config.yaml rename to deploy/kubernetes/ai-gateway/semantic-router/config.yaml index a51999825..9220721e7 100644 --- a/deploy/kubernetes/config.yaml +++ b/deploy/kubernetes/ai-gateway/semantic-router/config.yaml @@ -1,80 +1,42 @@ -bert_model: - model_id: models/all-MiniLM-L12-v2 - threshold: 0.6 - use_cpu: true - -semantic_cache: - enabled: true - backend_type: "memory" # Options: "memory" or "milvus" - similarity_threshold: 0.8 - max_entries: 1000 # Only applies to memory backend - ttl_seconds: 3600 - eviction_policy: "fifo" - # Embedding model for semantic similarity matching - # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) - embedding_model: "bert" # Default: BERT (fastest, lowest memory for Kubernetes) - -tools: - enabled: true - top_k: 3 - similarity_threshold: 0.2 - tools_db_path: "config/tools_db.json" - fallback_to_empty: true - -prompt_guard: - enabled: true # Global default - can be overridden per category with jailbreak_enabled - use_modernbert: true - model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 - use_cpu: true - jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" - -# vLLM Endpoints Configuration -# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6) -# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1 -# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field) -vllm_endpoints: - - name: "endpoint1" - address: "172.28.0.20" # Static IPv4 of llm-katan within docker compose network - port: 8002 - weight: 1 - model_config: - "qwen3": - reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax - preferred_endpoints: ["endpoint1"] + "base-model": + reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax + # preferred_endpoints omitted - let upstream handle endpoint selection pii_policy: - allow_by_default: true - -# Classifier configuration -classifier: - category_model: - model_id: "models/category_classifier_modernbert-base_model" - use_modernbert: true - threshold: 0.6 - use_cpu: true - category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" - pii_model: - model_id: "models/pii_classifier_modernbert-base_presidio_token_model" - use_modernbert: true - threshold: 0.7 - use_cpu: true - pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + allow_by_default: false + # Define available LoRA adapters for this base model + # These names must match the LoRA modules registered with vLLM at startup + loras: + - name: "science-expert" + description: "Specialized for science domains: biology, chemistry, physics, health, engineering" + - name: "social-expert" + description: "Optimized for social sciences: business, economics" + - name: "math-expert" + description: "Fine-tuned for mathematics and quantitative reasoning" + - name: "law-expert" + description: "Specialized for legal questions and law-related topics" + - name: "humanities-expert" + description: "Optimized for 
humanities: psychology, history, philosophy" + - name: "general-expert" + description: "General-purpose adapter for diverse topics" -# Categories with new use_reasoning field structure +# Categories with LoRA routing +# Each category uses the base-model model with a specific LoRA adapter categories: - name: business system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." # jailbreak_enabled: true # Optional: Override global jailbreak detection per category # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category model_scores: - - model: qwen3 + - model: base-model # Base model name (for endpoint selection and PII policy) + lora_name: social-expert # LoRA adapter name (used as final model name in request) score: 0.7 use_reasoning: false # Business performs better without reasoning - name: law system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters." model_scores: - - model: qwen3 + - model: base-model + lora_name: law-expert score: 0.4 use_reasoning: false - name: psychology @@ -82,25 +44,29 @@ categories: semantic_cache_enabled: true semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances model_scores: - - model: qwen3 + - model: base-model + lora_name: humanities-expert score: 0.6 use_reasoning: false - name: biology system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology. Explain biological concepts with scientific accuracy, use appropriate terminology, and provide examples from current research. Connect biological principles to real-world applications and emphasize the interconnectedness of biological systems." model_scores: - - model: qwen3 + - model: base-model + lora_name: science-expert score: 0.9 use_reasoning: false - name: chemistry system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations." model_scores: - - model: qwen3 + - model: base-model + lora_name: science-expert score: 0.6 use_reasoning: true # Enable reasoning for complex chemistry - name: history system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis." 
model_scores: - - model: qwen3 + - model: base-model + lora_name: humanities-expert score: 0.7 use_reasoning: false - name: other @@ -108,7 +74,8 @@ categories: semantic_cache_enabled: true semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive model_scores: - - model: qwen3 + - model: base-model + lora_name: general-expert score: 0.7 use_reasoning: false - name: health @@ -116,61 +83,159 @@ categories: semantic_cache_enabled: true semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes model_scores: - - model: qwen3 + - model: base-model + lora_name: science-expert score: 0.5 use_reasoning: false - name: economics system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory. Analyze economic phenomena using established economic principles, provide data-driven insights, and explain complex economic concepts in accessible terms. Consider both theoretical frameworks and real-world applications in your responses." model_scores: - - model: qwen3 + - model: base-model + lora_name: social-expert score: 1.0 use_reasoning: false - name: math system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way." model_scores: - - model: qwen3 + - model: base-model + lora_name: math-expert score: 1.0 use_reasoning: true # Enable reasoning for complex math - name: physics system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate." model_scores: - - model: qwen3 + - model: base-model + lora_name: science-expert score: 0.7 use_reasoning: true # Enable reasoning for physics - name: computer science system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful." model_scores: - - model: qwen3 + - model: base-model + lora_name: science-expert score: 0.6 use_reasoning: false - name: philosophy system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought. Engage with complex philosophical questions by presenting multiple perspectives, analyzing arguments rigorously, and encouraging critical thinking. Draw connections between philosophical concepts and contemporary issues while maintaining intellectual honesty about the complexity and ongoing nature of philosophical debates." model_scores: - - model: qwen3 + - model: base-model + lora_name: humanities-expert score: 0.5 use_reasoning: false - name: engineering system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering. Apply engineering principles, design methodologies, and problem-solving approaches to provide practical solutions. Consider safety, efficiency, sustainability, and cost-effectiveness in your recommendations. Use technical precision while explaining concepts clearly, and emphasize the importance of proper engineering practices and standards." 
model_scores: - - model: qwen3 + - model: base-model + lora_name: science-expert score: 0.7 use_reasoning: false -default_model: "qwen3" +default_model: base-model -# Auto model name for automatic model selection (optional) -# This is the model name that clients should use to trigger automatic model selection -# If not specified, defaults to "MoM" (Mixture of Models) -# For backward compatibility, "auto" is always accepted as an alias -# Example: auto_model_name: "MoM" # or any other name you prefer -# auto_model_name: "MoM" +bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true -# Include configured models in /v1/models list endpoint (optional, default: false) -# When false (default): only the auto model name is returned in the /v1/models endpoint -# When true: all models configured in model_config are also included in the /v1/models endpoint -# This is useful for clients that need to discover all available models -# Example: include_config_models_in_list: true -# include_config_models_in_list: false +semantic_cache: + enabled: true + backend_type: "memory" # Options: "memory", "milvus", or "hybrid" + similarity_threshold: 0.8 + max_entries: 1000 # Only applies to memory backend + ttl_seconds: 3600 + eviction_policy: "fifo" + # HNSW index configuration (for memory backend only) + use_hnsw: true # Enable HNSW index for faster similarity search + hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory) + hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build) + + # Hybrid cache configuration (when backend_type: "hybrid") + # Combines in-memory HNSW for fast search with Milvus for scalable storage + # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000) + # backend_config_path: "config/milvus.yaml" # Path to Milvus config + + # Embedding model for semantic similarity matching + # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) + # Default: "bert" (fastest, lowest memory) + embedding_model: "bert" + +tools: + enabled: true + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + +prompt_guard: + enabled: true # Global default - can be overridden per category with jailbreak_enabled + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + + +# Router Configuration for Dual-Path Selection +router: + # High confidence threshold for automatic LoRA selection + high_confidence_threshold: 0.99 + # Low latency threshold in milliseconds for LoRA path selection + low_latency_threshold_ms: 2000 + # Baseline scores for path evaluation + lora_baseline_score: 0.8 + traditional_baseline_score: 0.7 + embedding_baseline_score: 0.75 + # Success rate calculation threshold + success_confidence_threshold: 
0.8 + # Large batch size threshold for parallel processing + large_batch_threshold: 4 + # Default performance metrics (milliseconds) + lora_default_execution_time_ms: 1345 + traditional_default_execution_time_ms: 4567 + # Default processing requirements + default_confidence_threshold: 0.95 + default_max_latency_ms: 5000 + default_batch_size: 4 + default_avg_execution_time_ms: 3000 + # Default confidence and success rates + lora_default_confidence: 0.99 + traditional_default_confidence: 0.95 + lora_default_success_rate: 0.98 + traditional_default_success_rate: 0.95 + # Scoring weights for intelligent path selection (balanced approach) + multi_task_lora_weight: 0.30 # LoRA advantage for multi-task processing + single_task_traditional_weight: 0.30 # Traditional advantage for single tasks + large_batch_lora_weight: 0.25 # LoRA advantage for large batches (≥4) + small_batch_traditional_weight: 0.25 # Traditional advantage for single items + medium_batch_weight: 0.10 # Neutral weight for medium batches (2-3) + high_confidence_lora_weight: 0.25 # LoRA advantage for high confidence (≥0.99) + low_confidence_traditional_weight: 0.25 # Traditional for lower confidence (≤0.9) + low_latency_lora_weight: 0.30 # LoRA advantage for low latency (≤2000ms) + high_latency_traditional_weight: 0.10 # Traditional acceptable for relaxed timing + performance_history_weight: 0.20 # Historical performance comparison factor + # Traditional model specific configurations + traditional_bert_confidence_threshold: 0.95 # Traditional BERT confidence threshold + traditional_modernbert_confidence_threshold: 0.8 # Traditional ModernBERT confidence threshold + traditional_pii_detection_threshold: 0.5 # Traditional PII detection confidence threshold + traditional_token_classification_threshold: 0.9 # Traditional token classification threshold + traditional_dropout_prob: 0.1 # Traditional model dropout probability + traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability + tie_break_confidence: 0.5 # Confidence value for tie-breaking situations # Reasoning family configurations reasoning_families: @@ -207,10 +272,19 @@ api: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] +# Embedding Models Configuration +# These models provide intelligent embedding generation with automatic routing: +# - Qwen3-Embedding-0.6B: Up to 32K context, high quality, +# - EmbeddingGemma-300M: Up to 8K context, fast inference, Matryoshka support (768/512/256/128) +embedding_models: + qwen3_model_path: "models/Qwen3-Embedding-0.6B" + gemma_model_path: "models/embeddinggemma-300m" + use_cpu: true # Set to false for GPU acceleration (requires CUDA) + # Observability Configuration observability: tracing: - enabled: true # Enable distributed tracing for docker-compose stack + enabled: false # Enable distributed tracing for docker-compose stack provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry exporter: type: "otlp" # Export spans to Jaeger (via OTLP gRPC) diff --git a/deploy/kubernetes/deployment.yaml b/deploy/kubernetes/ai-gateway/semantic-router/deployment.yaml similarity index 96% rename from deploy/kubernetes/deployment.yaml rename to deploy/kubernetes/ai-gateway/semantic-router/deployment.yaml index c3df29be2..198a64edb 100644 --- a/deploy/kubernetes/deployment.yaml +++ b/deploy/kubernetes/ai-gateway/semantic-router/deployment.yaml @@ -111,7 +111,8 @@ spec: containers: - name: semantic-router image: 
ghcr.io/vllm-project/semantic-router/extproc:latest - args: ["--secure=true"] + imagePullPolicy: IfNotPresent + args: ["--secure=false"] securityContext: runAsNonRoot: false allowPrivilegeEscalation: false @@ -128,6 +129,8 @@ spec: env: - name: LD_LIBRARY_PATH value: "/app/lib" + - name: SR_LOG_LEVEL + value: "debug" volumeMounts: - name: config-volume mountPath: /app/config @@ -137,7 +140,7 @@ spec: livenessProbe: tcpSocket: port: 50051 - initialDelaySeconds: 60 + initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 10 failureThreshold: 3 @@ -145,7 +148,7 @@ spec: readinessProbe: tcpSocket: port: 50051 - initialDelaySeconds: 90 + initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 10 failureThreshold: 3 diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/ai-gateway/semantic-router/kustomization.yaml similarity index 100% rename from deploy/kubernetes/kustomization.yaml rename to deploy/kubernetes/ai-gateway/semantic-router/kustomization.yaml diff --git a/deploy/kubernetes/namespace.yaml b/deploy/kubernetes/ai-gateway/semantic-router/namespace.yaml similarity index 100% rename from deploy/kubernetes/namespace.yaml rename to deploy/kubernetes/ai-gateway/semantic-router/namespace.yaml diff --git a/deploy/kubernetes/pv-models.yaml b/deploy/kubernetes/ai-gateway/semantic-router/pv-models.yaml similarity index 100% rename from deploy/kubernetes/pv-models.yaml rename to deploy/kubernetes/ai-gateway/semantic-router/pv-models.yaml diff --git a/deploy/kubernetes/service.yaml b/deploy/kubernetes/ai-gateway/semantic-router/service.yaml similarity index 100% rename from deploy/kubernetes/service.yaml rename to deploy/kubernetes/ai-gateway/semantic-router/service.yaml diff --git a/deploy/kubernetes/tools_db.json b/deploy/kubernetes/ai-gateway/semantic-router/tools_db.json similarity index 100% rename from deploy/kubernetes/tools_db.json rename to deploy/kubernetes/ai-gateway/semantic-router/tools_db.json diff --git a/deploy/kubernetes/llmd-base/README.md b/deploy/kubernetes/llmd-base/README.md index d58e49cfd..9ae7b0bb7 100644 --- a/deploy/kubernetes/llmd-base/README.md +++ b/deploy/kubernetes/llmd-base/README.md @@ -1,4 +1,4 @@ -# vLLM Semantic Router with LLM-D +# vLLM Semantic Router with LLM-D This guide provides step-by-step instructions for deploying the vLLM Semantic Router (vsr) in combination with [LLM-D](https://github.com/llm-d/llm-d). This will also illustrate a key design pattern namely use of the vsr as a model picker in combination with the use of LLM-D as endpoint picker. 
diff --git a/deploy/openshift/envoy-openshift.yaml b/deploy/openshift/envoy-openshift.yaml index 94e6fcb38..81d246976 100644 --- a/deploy/openshift/envoy-openshift.yaml +++ b/deploy/openshift/envoy-openshift.yaml @@ -1,6 +1,6 @@ # OpenShift-specific Envoy configuration # This config uses ORIGINAL_DST cluster with header-based destination selection -# The semantic router sets the x-gateway-destination-endpoint header which Envoy uses +# The semantic router sets the x-vsr-destination-endpoint header which Envoy uses # to dynamically route to the correct vLLM endpoint (port 8000 or 8001) static_resources: listeners: @@ -35,7 +35,7 @@ static_resources: upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%" request_id: "%REQ(X-REQUEST-ID)%" selected_model: "%REQ(X-SELECTED-MODEL)%" - selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%" + selected_endpoint: "%REQ(x-vsr-destination-endpoint)%" route_config: name: local_route virtual_hosts: @@ -48,7 +48,7 @@ static_resources: route: cluster: semantic_router_cluster timeout: 300s - # Dynamic route - destination determined by x-gateway-destination-endpoint header + # Dynamic route - destination determined by x-vsr-destination-endpoint header - match: prefix: "/" route: @@ -130,7 +130,7 @@ static_resources: http_protocol_options: {} # Dynamic vLLM cluster using ORIGINAL_DST with header-based destination - # The semantic router sets x-gateway-destination-endpoint header to specify the target + # The semantic router sets x-vsr-destination-endpoint header to specify the target - name: vllm_dynamic_cluster connect_timeout: 300s per_connection_buffer_limit_bytes: 52428800 @@ -138,7 +138,7 @@ static_resources: lb_policy: CLUSTER_PROVIDED original_dst_lb_config: use_http_header: true - http_header_name: "x-gateway-destination-endpoint" + http_header_name: "x-vsr-destination-endpoint" typed_extension_protocol_options: envoy.extensions.upstreams.http.v3.HttpProtocolOptions: "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 93a345a15..81b2b1c2a 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -915,6 +915,37 @@ default_model: "model-b" Expect(found).To(BeFalse()) Expect(endpointName).To(BeEmpty()) }) + + Describe("SelectBestEndpointAddressForModel", func() { + It("should return endpoint address when model has preferred endpoints", func() { + cfg, err := Load(configFile) + Expect(err).NotTo(HaveOccurred()) + + // model-a has preferred endpoints + endpointAddress, found := cfg.SelectBestEndpointAddressForModel("model-a") + Expect(found).To(BeTrue()) + Expect(endpointAddress).To(MatchRegexp(`127\.0\.0\.1:\d+`)) + }) + + It("should return false when model has no preferred endpoints", func() { + cfg, err := Load(configFile) + Expect(err).NotTo(HaveOccurred()) + + // model-c has no preferred_endpoints configured + endpointAddress, found := cfg.SelectBestEndpointAddressForModel("model-c") + Expect(found).To(BeFalse()) + Expect(endpointAddress).To(BeEmpty()) + }) + + It("should return false for non-existent model", func() { + cfg, err := Load(configFile) + Expect(err).NotTo(HaveOccurred()) + + endpointAddress, found := cfg.SelectBestEndpointAddressForModel("non-existent-model") + Expect(found).To(BeFalse()) + Expect(endpointAddress).To(BeEmpty()) + }) + }) }) Describe("ValidateEndpoints", func() { diff --git a/src/semantic-router/pkg/extproc/extproc_test.go 
b/src/semantic-router/pkg/extproc/extproc_test.go index 3820f9dc0..a5b33f295 100644 --- a/src/semantic-router/pkg/extproc/extproc_test.go +++ b/src/semantic-router/pkg/extproc/extproc_test.go @@ -1901,7 +1901,7 @@ var _ = Describe("Endpoint Selection", func() { var modelHeaderFound bool for _, header := range headerMutation.SetHeaders { - if header.Header.Key == "x-gateway-destination-endpoint" { + if header.Header.Key == "x-vsr-destination-endpoint" { endpointHeaderFound = true // Should be one of the configured endpoint addresses // Check both Value and RawValue since implementation uses RawValue @@ -1975,7 +1975,7 @@ var _ = Describe("Endpoint Selection", func() { var selectedEndpoint string for _, header := range headerMutation.SetHeaders { - if header.Header.Key == "x-gateway-destination-endpoint" { + if header.Header.Key == "x-vsr-destination-endpoint" { endpointHeaderFound = true // Check both Value and RawValue since implementation uses RawValue selectedEndpoint = header.Header.Value @@ -2038,7 +2038,7 @@ var _ = Describe("Endpoint Selection", func() { var selectedEndpoint string for _, header := range headerMutation.SetHeaders { - if header.Header.Key == "x-gateway-destination-endpoint" { + if header.Header.Key == "x-vsr-destination-endpoint" { endpointHeaderFound = true // Check both Value and RawValue since implementation uses RawValue selectedEndpoint = header.Header.Value diff --git a/src/semantic-router/pkg/headers/headers.go b/src/semantic-router/pkg/headers/headers.go index 2e9673bce..7ebb2e186 100644 --- a/src/semantic-router/pkg/headers/headers.go +++ b/src/semantic-router/pkg/headers/headers.go @@ -12,7 +12,7 @@ const ( // GatewayDestinationEndpoint specifies the backend endpoint address selected by the router. // This header is set by the router to direct Envoy to the appropriate upstream service. - GatewayDestinationEndpoint = "x-gateway-destination-endpoint" + GatewayDestinationEndpoint = "x-vsr-destination-endpoint" // SelectedModel indicates the model that was selected by the router for processing. // This header is set during the routing decision phase. 
diff --git a/src/semantic-router/pkg/headers/headers_test.go b/src/semantic-router/pkg/headers/headers_test.go index e70122d8a..dc8f75e69 100644 --- a/src/semantic-router/pkg/headers/headers_test.go +++ b/src/semantic-router/pkg/headers/headers_test.go @@ -12,7 +12,7 @@ func TestHeaderConstants(t *testing.T) { }{ // Request headers {"RequestID", RequestID, "x-request-id"}, - {"GatewayDestinationEndpoint", GatewayDestinationEndpoint, "x-gateway-destination-endpoint"}, + {"GatewayDestinationEndpoint", GatewayDestinationEndpoint, "x-vsr-destination-endpoint"}, {"SelectedModel", SelectedModel, "x-selected-model"}, // VSR headers {"VSRSelectedCategory", VSRSelectedCategory, "x-vsr-selected-category"}, diff --git a/website/docs/api/router.md b/website/docs/api/router.md index 2fe59a492..c86cde4fc 100644 --- a/website/docs/api/router.md +++ b/website/docs/api/router.md @@ -148,7 +148,7 @@ The router adds metadata headers to both requests and responses: | Header | Description | Example | |--------|-------------|---------| -| `x-gateway-destination-endpoint` | Backend endpoint selected | `endpoint1` | +| `x-vsr-destination-endpoint` | Backend endpoint selected | `endpoint1` | | `x-selected-model` | Model category determined | `mathematics` | | `x-routing-confidence` | Classification confidence | `0.956` | | `x-request-id` | Unique request identifier | `req-abc123` | diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md index 18742ee35..47145cb33 100644 --- a/website/docs/installation/configuration.md +++ b/website/docs/installation/configuration.md @@ -229,13 +229,26 @@ Configure model-specific settings: ```yaml model_config: - "llama2-7b": # Must match the model name in vllm_endpoints + "llama2-7b": pii_policy: allow_by_default: true # Allow PII by default pii_types_allowed: ["EMAIL_ADDRESS", "PERSON"] - preferred_endpoints: ["my_endpoint"] + preferred_endpoints: ["my_endpoint"] # Optional: specify which endpoints can serve this model + + "gpt-4": + pii_policy: + allow_by_default: false + # preferred_endpoints omitted - router will not set endpoint header + # Useful when external load balancer handles endpoint selection ``` +**Note on `preferred_endpoints`:** + +- **Optional field**: If omitted, the router will not set the `x-vsr-destination-endpoint` header +- **When specified**: Router selects the best endpoint based on weights and sets the header +- **When omitted**: Upstream load balancer or service mesh handles endpoint selection +- **Validation**: Models used in categories or as `default_model` must have `preferred_endpoints` configured + ### Pricing (Optional) If you want the router to compute request cost and expose Prometheus cost metrics, add per-1M token pricing and currency under each model in `model_config`. diff --git a/website/docs/installation/k8s/ai-gateway.md b/website/docs/installation/k8s/ai-gateway.md new file mode 100644 index 000000000..b9c916764 --- /dev/null +++ b/website/docs/installation/k8s/ai-gateway.md @@ -0,0 +1,262 @@ +# Install with Envoy AI Gateway + +This guide provides step-by-step instructions for integrating the vLLM Semantic Router with Envoy AI Gateway on Kubernetes for advanced traffic management and AI-specific features. 
+ +## Architecture Overview + +The deployment consists of: + +- **vLLM Semantic Router**: Provides intelligent request routing and semantic understanding +- **Envoy Gateway**: Core gateway functionality and traffic management +- **Envoy AI Gateway**: AI Gateway built on Envoy Gateway for LLM providers + +## Benefits of Integration + +Integrating vLLM Semantic Router with Envoy AI Gateway provides enterprise-grade capabilities for production LLM deployments: + +### 1. **Hybrid Model Selection** + +Seamlessly route requests between cloud LLM providers (OpenAI, Anthropic, etc.) and self-hosted models. + +### 2. **Token Rate Limiting** + +Protect your infrastructure and control costs with fine-grained rate limiting: + +- **Input token limits**: Control request size to prevent abuse +- **Output token limits**: Manage response generation costs +- **Total token limits**: Set overall usage quotas per user/tenant +- **Time-based windows**: Configure limits per second, minute, or hour + +### 3. **Model/Provider Failover** + +Ensure high availability with automatic failover mechanisms: + +- Detect unhealthy backends and route traffic to healthy instances +- Support for active-passive and active-active failover strategies +- Graceful degradation when primary models are unavailable + +### 4. **Traffic Splitting & Canary Testing** + +Deploy new models safely with progressive rollout capabilities: + +- **A/B Testing**: Split traffic between model versions to compare performance +- **Canary Deployments**: Gradually shift traffic to new models (e.g., 5% → 25% → 50% → 100%) +- **Shadow Traffic**: Send duplicate requests to new models without affecting production +- **Weight-based routing**: Fine-tune traffic distribution across model variants + +### 5. **LLM Observability & Monitoring** + +Gain deep insights into your LLM infrastructure: + +- **Request/Response Metrics**: Track latency, throughput, token usage, and error rates +- **Model Performance**: Monitor accuracy, quality scores, and user satisfaction +- **Cost Analytics**: Analyze spending patterns across models and providers +- **Distributed Tracing**: End-to-end visibility with OpenTelemetry integration +- **Custom Dashboards**: Visualize metrics in Prometheus, Grafana, or your preferred monitoring stack + +## Supported LLM Providers + +| Provider Name | API Schema Config on [AIServiceBackend](https://aigateway.envoyproxy.io/docs/api/#aiservicebackendspec) | Upstream Authentication Config on [BackendSecurityPolicy](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyspec) | Status | +| ------------------------------------------------------------ | :----------------------------------------------------------: | :----------------------------------------------------------: | :----: | +| [OpenAI](https://platform.openai.com/docs/api-reference) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [AWS Bedrock](https://docs.aws.amazon.com/bedrock/latest/APIReference/) | `{"name":"AWSBedrock"}` | [AWS Bedrock Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyawscredentials) | ✅ | +| [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference) | `{"name":"AzureOpenAI","version":"2025-01-01-preview"}` or `{"name":"OpenAI", "version": "openai/v1"}` | [Azure Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyazurecredentials) or [Azure API 
Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyazureapikey) | ✅ | +| [Google Gemini on AI Studio](https://ai.google.dev/gemini-api/docs/openai) | `{"name":"OpenAI","version":"v1beta/openai"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Google Vertex AI](https://cloud.google.com/vertex-ai/docs/reference/rest) | `{"name":"GCPVertexAI"}` | [GCP Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicygcpcredentials) | ✅ | +| [Anthropic on GCP Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude) | `{"name":"GCPAnthropic", "version":"vertex-2023-10-16"}` | [GCP Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicygcpcredentials) | ✅ | +| [Groq](https://console.groq.com/docs/openai) | `{"name":"OpenAI","version":"openai/v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Grok](https://docs.x.ai/docs/api-reference?utm_source=chatgpt.com#chat-completions) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Together AI](https://docs.together.ai/docs/openai-api-compatibility) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Cohere](https://docs.cohere.com/v2/docs/compatibility-api) | `{"name":"Cohere","version":"v2"}` or `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Mistral](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [DeepInfra](https://deepinfra.com/docs/inference) | `{"name":"OpenAI","version":"v1/openai"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [DeepSeek](https://api-docs.deepseek.com/) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Hunyuan](https://cloud.tencent.com/document/product/1729/111007) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Tencent LLM Knowledge Engine](https://www.tencentcloud.com/document/product/1255/70381?lang=en) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Tetrate Agent Router Service (TARS)](https://router.tetrate.ai/) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [SambaNova](https://docs.sambanova.ai/sambastudio/latest/open-ai-api.html) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) | ✅ | +| [Anthropic](https://docs.claude.com/en/home) | `{"name":"Anthropic"}` | [Anthropic API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyanthropicapikey) | ✅ | +| Self-hosted-models | `{"name":"OpenAI","version":"v1"}` | N/A | ✅ | + +## Prerequisites + +Before starting, ensure you have the following tools installed: + +- [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) - Kubernetes in Docker (Optional) +- [kubectl](https://kubernetes.io/docs/tasks/tools/) - Kubernetes CLI +- 
[Helm](https://helm.sh/docs/intro/install/) - Package manager for Kubernetes + +## Step 1: Create Kind Cluster (Optional) + +Create a local Kubernetes cluster optimized for the semantic router workload: + +```bash +# Generate kind configuration +./tools/kind/generate-kind-config.sh + +# Create cluster with optimized resource settings +kind create cluster --name semantic-router-cluster --config tools/kind/kind-config.yaml + +# Verify cluster is ready +kubectl wait --for=condition=Ready nodes --all --timeout=300s +``` + +**Note**: The kind configuration provides sufficient resources (8GB+ RAM, 4+ CPU cores) for running the semantic router and AI gateway components. + +## Step 2: Deploy vLLM Semantic Router + +Deploy the semantic router service with all required components: + +```bash +# Deploy semantic router using Kustomize +kubectl apply -k deploy/kubernetes/ai-gateway/semantic-router + +# Wait for deployment to be ready (this may take several minutes for model downloads) +kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s + +# Verify deployment status +kubectl get pods -n vllm-semantic-router-system +``` + +## Step 3: Install Envoy Gateway + +Install the core Envoy Gateway for traffic management: + +```bash +# Install Envoy Gateway using Helm +helm upgrade -i eg oci://docker.io/envoyproxy/gateway-helm \ + --version v0.0.0-latest \ + --namespace envoy-gateway-system \ + --create-namespace \ + -f https://raw.githubusercontent.com/envoyproxy/ai-gateway/main/manifests/envoy-gateway-values.yaml + +kubectl wait --timeout=2m -n envoy-gateway-system deployment/envoy-gateway --for=condition=Available +``` + +## Step 4: Install Envoy AI Gateway + +Install the AI-specific extensions for inference workloads: + +```bash +# Install Envoy AI Gateway using Helm +helm upgrade -i aieg oci://docker.io/envoyproxy/ai-gateway-helm \ + --version v0.0.0-latest \ + --namespace envoy-ai-gateway-system \ + --create-namespace + +# Install Envoy AI Gateway CRDs +helm upgrade -i aieg-crd oci://docker.io/envoyproxy/ai-gateway-crds-helm --version v0.0.0-latest --namespace envoy-ai-gateway-system + +# Wait for AI Gateway Controller to be ready +kubectl wait --timeout=300s -n envoy-ai-gateway-system deployment/ai-gateway-controller --for=condition=Available +``` + +## Step 5: Deploy Demo LLM + +Create a demo LLM to serve as the backend for the semantic router: + +```bash +# Deploy demo LLM +kubectl apply -f deploy/kubernetes/ai-gateway/aigw-resources/base-model.yaml +``` + +## Step 6: Create Gateway API Resources + +Create the necessary Gateway API resources for the AI gateway: + +```bash +kubectl apply -f deploy/kubernetes/ai-gateway/aigw-resources/gwapi-resources.yaml +``` + +## Testing the Deployment + +### Method 1: Port Forwarding (Recommended for Local Testing) + +Set up port forwarding to access the gateway locally: + +```bash +# Get the Envoy service name +export ENVOY_SERVICE=$(kubectl get svc -n envoy-gateway-system \ + --selector=gateway.envoyproxy.io/owning-gateway-namespace=default,gateway.envoyproxy.io/owning-gateway-name=semantic-router \ + -o jsonpath='{.items[0].metadata.name}') + +kubectl port-forward -n envoy-gateway-system svc/$ENVOY_SERVICE 8080:80 +``` + +### Send Test Requests + +Once the gateway is accessible, test the inference endpoint: + +```bash +# Test math domain chat completions endpoint +curl -i -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "MoM", + "messages": [ + 
{"role": "user", "content": "What is the derivative of f(x) = x^3?"} + ] + }' +``` + +## Troubleshooting + +### Common Issues + +**Gateway not accessible:** + +```bash +# Check gateway status +kubectl get gateway semantic-router -n default + +# Check Envoy service +kubectl get svc -n envoy-gateway-system +``` + +**AI Gateway controller not ready:** + +```bash +# Check AI gateway controller logs +kubectl logs -n envoy-ai-gateway-system deployment/ai-gateway-controller + +# Check controller status +kubectl get deployment -n envoy-ai-gateway-system +``` + +**Semantic router not responding:** + +```bash +# Check semantic router pod status +kubectl get pods -n vllm-semantic-router-system + +# Check semantic router logs +kubectl logs -n vllm-semantic-router-system deployment/semantic-router +``` + +## Cleanup + +To remove the entire deployment: + +```bash +# Remove Gateway API resources and Demo LLM +kubectl delete -f deploy/kubernetes/ai-gateway/aigw-resources + +# Remove semantic router +kubectl delete -k deploy/kubernetes/ai-gateway/semantic-router + +# Remove AI gateway +helm uninstall aieg -n envoy-ai-gateway-system + +# Remove Envoy gateway +helm uninstall eg -n envoy-gateway-system + +# Delete kind cluster +kind delete cluster --name semantic-router-cluster +``` + +## Next Steps + +- Configure custom routing rules in the AI Gateway +- Set up monitoring and observability +- Implement authentication and authorization +- Scale the semantic router deployment for production workloads diff --git a/website/docs/installation/k8s/istio.md b/website/docs/installation/k8s/istio.md new file mode 100644 index 000000000..ae0c6f3b5 --- /dev/null +++ b/website/docs/installation/k8s/istio.md @@ -0,0 +1,263 @@ +# Install with Istio Gateway + +This guide provides step-by-step instructions for deploying the vLLM Semantic Router (vsr) with Istio Gateway on Kubernetes. Istio Gateway uses Envoy under the covers so it is possible to use vsr with it. However there are differences between how different Envoy based Gateways process the ExtProc protocol, hence the deployment described here is different from the deployment of vsr alongwith other types of Envoy based Gateways as described in the other guides in this repo. There are multiple architecture options possible to combine Istio Gateway with vsr. This document describes one of the options. + +## Architecture Overview + +The deployment consists of: + +- **vLLM Semantic Router**: Provides intelligent request routing and processing decisions to Envoy based Gateways +- **Istio Gateway**: Istio's implementation of Kubernetes Gateway API that uses an Envoy proxy under the covers +- **Gateway API Inference Extension**: Additional APIs to extend the Gateway API for Inference via ExtProc servers +- **Two instances of vLLM serving 1 model each**: Example backend LLMs for illustrating semantic routing in this topology + +## Prerequisites + +Before starting, ensure you have the following tools installed: + +- [Docker](https://docs.docker.com/get-docker/) - Container runtime +- [minikube](https://minikube.sigs.k8s.io/docs/start/) - Local Kubernetes +- [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) - Kubernetes in Docker +- [kubectl](https://kubernetes.io/docs/tasks/tools/) - Kubernetes CLI + +Either minikube or kind works to deploy a local kubernetes cluster needed for this exercise so you only need one of these two. We use minikube in the description below but the same steps should work with a Kind cluster once the cluster is created in Step 1. 
+ +We will also deploy two different LLMs in this exercise to illustrate the semantic routing and model routing function more clearly, so ideally you should run this on a machine that has GPU support, along with adequate memory and storage, to run the two models used in this exercise. You can also follow equivalent steps on a smaller CPU-only server running smaller LLMs. + +## Step 1: Create Minikube Cluster + +Create a local Kubernetes cluster via minikube (or equivalently via Kind). + +```bash +# Create cluster +$ minikube start \ + --driver docker \ + --container-runtime docker \ + --gpus all \ + --memory no-limit \ + --cpus no-limit + +# Verify cluster is ready +$ kubectl wait --for=condition=Ready nodes --all --timeout=300s +``` + +## Step 2: Deploy LLM models + +In this exercise we deploy two LLMs, namely a llama3-8b model (meta-llama/Llama-3.1-8B-Instruct) and a phi4-mini model (microsoft/Phi-4-mini-instruct). We serve these models using two separate instances of the [vLLM inference server](https://docs.vllm.ai/en/latest/) running in the default namespace of the Kubernetes cluster. You may choose any other inference engine as long as it exposes OpenAI API endpoints. First create a secret for your HuggingFace token (previously stored in the env variable HF_TOKEN) and then deploy the models as shown below. Note that the example kubectl commands in this guide use file paths relative to the top folder of this repo and are expected to be run from there. + +```bash +kubectl create secret generic hf-token-secret --from-literal=token=$HF_TOKEN +``` + +```bash +# Create vLLM service running llama3-8b +kubectl apply -f deploy/kubernetes/istio/vLlama3.yaml +``` + +The first time this is run it may take several (10+) minutes to download the model before the vLLM pod running it reaches the READY state. Similarly, deploy the second LLM (phi4-mini) and wait several minutes until its pod is READY. + +```bash +# Create vLLM service running phi4-mini +kubectl apply -f deploy/kubernetes/istio/vPhi4.yaml +``` + +At the end of this you should be able to confirm, using the commands below, that both vLLM pods are READY and serving these LLMs. You should also see Kubernetes Services exposing the IP/port on which these models are being served. In the example below the llama3-8b model is being served via a Kubernetes Service with a service IP of 10.108.250.109 and port 80. + +```bash +# Verify that vLLM pods running the two LLMs are READY and serving + +kubectl get pods +NAME READY STATUS RESTARTS AGE +llama-8b-57b95475bd-ph7s4 1/1 Running 0 9d +phi4-mini-887476b56-74twv 1/1 Running 0 9d + +# View the IP/port of the Kubernetes services on which these models are being served + +kubectl get service +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +kubernetes ClusterIP 10.96.0.1 443/TCP 36d +llama-8b ClusterIP 10.108.250.109 80/TCP 18d +phi4-mini ClusterIP 10.97.252.33 80/TCP 9d +```
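These Service IPs and ports are exactly what the vsr configuration needs to reference later in Step 4. As a purely illustrative sketch (the endpoint names are assumptions, the IPs are the example values shown above, and the shipped deploy/kubernetes/istio/config.yaml plus the configuration reference remain authoritative), the relevant portion would look roughly like this:

```yaml
# Illustrative sketch only - adapt the shipped deploy/kubernetes/istio/config.yaml rather than copying these values.
vllm_endpoints:
  - name: "llama_endpoint"      # assumed endpoint name
    address: "10.108.250.109"   # ClusterIP of the llama-8b Service shown above
    port: 80
    weight: 1
  - name: "phi4_endpoint"       # assumed endpoint name
    address: "10.97.252.33"     # ClusterIP of the phi4-mini Service shown above
    port: 80
    weight: 1

model_config:
  "llama3-8b":
    preferred_endpoints: ["llama_endpoint"]   # model key must match the served model name
  "phi4-mini":
    preferred_endpoints: ["phi4_endpoint"]
```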
Do not install any of the HTTPRoute resources nor the EndPointPicker from that guide however, just use it to deploy the Istio gateway and CRDs. If installed correctly you should see the api CRDs for gateway api and inference extension as well as pods running for the Istio gateway and Istiod using the commands shown below. + +```bash +kubectl get crds | grep gateway +``` + +```bash +kubectl get crds | grep inference +``` + +```bash +kubectl get pods | grep istio +``` + +```bash +kubectl get pods -n istio-system +``` + +## Step 4: Update vsr config + +The file deploy/kubernetes/istio/config.yaml will get used to configure vsr when it is installed in the next step. Ensure that the models in the config file match the models you are using and that the vllm_endpoints in the file match the ip/ port of the llm kubernetes services you are running. It is usually good to start with basic features of vsr such as prompt classification and model routing before experimenting with other features such as PromptGuard or ToolCalling. + +## Step 5: Deploy vLLM Semantic Router + +Deploy the semantic router service with all required components: + +```bash +# Deploy semantic router using Kustomize +kubectl apply -k deploy/kubernetes/istio/ + +# Wait for deployment to be ready (this may take several minutes for model downloads) +kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s + +# Verify deployment status +kubectl get pods -n vllm-semantic-router-system +``` + +## Step 6: Install additional Istio configuration + +Install the destinationrule and envoy filter needed for Istio gateway to use ExtProc based interface with vLLM Semantic router + +```bash +kubectl apply -f deploy/kubernetes/istio/destinationrule.yaml +kubectl apply -f deploy/kubernetes/istio/envoyfilter.yaml +``` + +## Step 7: Install gateway routes + +Install HTTPRoutes in the Istio gateway. + +```bash +kubectl apply -f deploy/kubernetes/istio/httproute-llama3-8b.yaml +kubectl apply -f deploy/kubernetes/istio/httproute-phi4-mini.yaml +``` + +## Step 8: Testing the Deployment +To expose the IP on which the Istio gateway listens to client requests from outside the cluster, you can choose any standard kubernetes option for external load balancing. We tested our feature by [deploying and configuring metallb](https://metallb.universe.tf/installation/) into the cluster to be the LoadBalancer provider. Please refer to metallb documentation for installation procedures if needed. Finally, for the minikube case, we get the external url as shown below. + +```bash +minikube service inference-gateway-istio --url +http://192.168.49.2:30913 +``` + +Now we can send LLM prompts via curl to http://192.168.49.2:30913 to access the Istio gateway which will then use information from vLLM semantic router to dynamically route to one of the two LLMs we are using as backends in this case. + +### Send Test Requests + +Try the following cases with and without model "auto" selection to confirm that Istio + vsr together are able to route queries to the appropriate model. The query responses will include information about which model was used to serve that request. 
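Which backend the "auto" model resolves to is driven by the category-to-model mapping in the vsr config from Step 4. The field names below are assumptions for illustration only; consult deploy/kubernetes/istio/config.yaml and the configuration reference for the authoritative schema:

```yaml
# Rough sketch of the idea only - category names, model names and scores are assumed for this guide's two models.
default_model: llama3-8b        # fallback when no category matches confidently
categories:
  - name: computer science
    model_scores:
      - model: llama3-8b        # "Linux kernel"-style prompts should land here
        score: 1.0
  - name: math
    model_scores:
      - model: phi4-mini        # arithmetic/math prompts should land here
        score: 1.0
```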
+ +Example queries to try include the following + +```bash +# Model name llama3-8b provided explicitly, should route to this backend +curl http://192.168.49.2:30913/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "llama3-8b", + "messages": [ + {"role": "user", "content": "Linux is said to be an open source kernel because "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +```bash +# Model name set to "auto", should categorize to "computer science" & route to llama3-8b +curl http://192.168.49.2:30913/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "auto", + "messages": [ + {"role": "user", "content": "Linux is said to be an open source kernel because "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +```bash +# Model name phi4-mini provided explicitly, should route to this backend +curl http://192.168.49.2:30913/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "phi4-mini", + "messages": [ + {"role": "user", "content": "2+2 is "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +```bash +# Model name set to "auto", should categorize to "math" & route to phi4-mini +curl http://192.168.49.2:30913/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "auto", + "messages": [ + {"role": "user", "content": "2+2 is "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +## Troubleshooting + +### Common Issues + +**Gateway/ Front end not working:** + +```bash +# Check istio gateway status +kubectl get gateway + +# Check istio gw service status +kubectl get svc inference-gateway-istio + +# Check Istio's Envoy logs +kubectl logs deploy/inference-gateway-istio -c istio-proxy +``` + +**Semantic router not responding:** + +```bash +# Check semantic router pod +kubectl get pods -n vllm-semantic-router-system + +# Check semantic router service +kubectl get svc -n vllm-semantic-router-system + +# Check semantic router logs +kubectl logs -n vllm-semantic-router-system deployment/semantic-router +``` + +## Cleanup + +```bash + +# Remove semantic router +kubectl delete -k deploy/kubernetes/istio/ + +# Remove Istio +istioctl uninstall --purge + +# Remove LLMs +kubectl delete -f deploy/kubernetes/istio/vLlama3.yaml +kubectl delete -f deploy/kubernetes/istio/vPhi4.yaml + +# Stop minikube cluster +minikube stop + +# Delete minikube cluster +minikube delete +``` + +## Next Steps + +- Test/ experiment with different features of vLLM Semantic Router +- Additional use cases/ topologies with Istio Gateway (including with EPP and LLM-D) +- Set up monitoring and observability +- Implement authentication and authorization +- Scale the semantic router deployment for production workloads diff --git a/website/docs/installation/k8s/llm-d.md b/website/docs/installation/k8s/llm-d.md new file mode 100644 index 000000000..242254c0e --- /dev/null +++ b/website/docs/installation/k8s/llm-d.md @@ -0,0 +1,336 @@ +# Install with LLM-D + +This guide provides step-by-step instructions for deploying the vLLM Semantic Router (vsr) in combination with [LLM-D](https://github.com/llm-d/llm-d). This will also illustrate a key design pattern namely use of the vsr as a model picker in combination with the use of LLM-D as endpoint picker. + +A model picker provides the ability to route an LLM query to one of multiple LLM models that are entirely different from each other, whereas an endpoint picker selects one of multiple endpoints that each serve an equivalent model (and most often the exact same base model). 
Hence this deployment shows how vLLM Semantic Router in its role as a model picker is perfectly complementary to endpoint picker solutions such as LLM-D. + +Since LLM-D has a number of deployment configurations some of which require a larger hardware setup we will demonstrate a baseline version of LLM-D working in combination with vsr to introduce the core concepts. These same core concepts will also apply when using vsr with more complex LLM-D configurations and production grade well-lit paths as described in the LLM-D repo at [this link](https://github.com/llm-d/llm-d/tree/main/guides). + +Also we will use LLM-D with Istio as the Inference Gateway in order to build on the steps and hardware setup from the [Istio deployment example](istio) documented in this repo. Istio is also commonly used as the default gateway for LLM-D with or without vsr. + +## Architecture Overview + +The deployment consists of: + +- **vLLM Semantic Router**: Provides intelligent request routing and processing decisions to Envoy based Gateways +- **LLM-D**: Distributed Inference platform used for scaleout LLM inferencing with SOTA performance. +- **Istio Gateway**: Istio's implementation of Kubernetes Gateway API that uses an Envoy proxy under the covers +- **Gateway API Inference Extension**: Additional APIs to extend the Gateway API for Inference via ExtProc servers +- **Two instances of vLLM serving 1 model each**: Example backend LLMs for illustrating semantic routing in this topology + +## Prerequisites + +Before starting, ensure you have the following tools installed: + +- [Docker](https://docs.docker.com/get-docker/) - Container runtime +- [minikube](https://minikube.sigs.k8s.io/docs/start/) - Local Kubernetes +- [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) - Kubernetes in Docker +- [kubectl](https://kubernetes.io/docs/tasks/tools/) - Kubernetes CLI +- [istioctl](https://istio.io/latest/docs/ops/diagnostic-tools/istioctl/) - Istio CLI + +We use minikube in the description below. As noted above, this guide builds upon the vsr + Istio [deployment guide](istio) from this repo hence will point to that guide for the common portions of documentation and add the incremental additional steps here. + +As was the case for the Istio guide, you will need a machine that has GPU support with at least 2 GPUs to run this exercise so that we can deploy and test the use of vsr to do model routing between two different LLM base models. + +## Step 1: Common Steps from Istio Guide + +First, follow the steps documented in the [Istio guide](istio), to create a local minikube cluster. + +## Step 2: Install Istio Gateway, Gateway API, Inference Extension CRDs + +Install CRDs for the Kubernetes Gateway API, Gateway API Inference Extension, Istio Control plane and an instance of the Istio Gateway exactly as described in the [Istio guide](istio). Use the same version of Istio as documented in that guide. If you were following the LLM-D well-lit paths this part would be done by the Gateway provider Helm charts from the LLM-D repo. In this guide, we set these up manually to keep things common and reusable with the Istio guide from this repo. This will also help the reader understand the parts that are common between a GIE/EPP based deployment and an LLM-D based deployment and how vsr can be used in both cases. + +If installed correctly you should see the api CRDs for gateway api and inference extension as well as pods running for the Istio gateway and Istiod using the commands shown below. 
+ +```bash +kubectl get crds | grep gateway +``` + +```bash +kubectl get crds | grep inference +``` + +```bash +kubectl get pods | grep istio +``` + +```bash +kubectl get pods -n istio-system +``` + +## Step 3: Deploy LLM models + +Now deploy two LLM models similar to the [Istio guide](istio) documentation. Note from the manifest file names that these example commands are to be executed from the top folder of the repo. The counterpart of this step from the LLM-D deployment documentation is the setup of the LLM-D Model Service. To keep things simple, we do not need the LLM-D Model service for this guide. + +```bash +kubectl create secret generic hf-token-secret --from-literal=token=$HF_TOKEN +``` + +```bash +# Create vLLM service running llama3-8b +kubectl apply -f deploy/kubernetes/istio/vLlama3.yaml +``` + +This may take several (10+) minutes the first time this is run to download the model up until the vLLM pod running this model is in READY state. Similarly also deploy the second LLM (phi4-mini) and wait for several minutes until the pod is in READY state. + +```bash +# Create vLLM service running phi4-mini +kubectl apply -f deploy/kubernetes/istio/vPhi4.yaml +``` + +At the end of this you should be able to see both your vLLM pods are READY and serving these LLMs using the command below. You should also see Kubernetes services exposing the IP/ port on which these models are being served. In the example below the llama3-8b model is being served via a kubernetes service with service IP of 10.108.250.109 and port 80. + +```bash +# Verify that vLLM pods running the two LLMs are READY and serving + +kubectl get pods +NAME READY STATUS RESTARTS AGE +llama-8b-57b95475bd-ph7s4 1/1 Running 0 9d +phi4-mini-887476b56-74twv 1/1 Running 0 9d + +# View the IP/port of the Kubernetes services on which these models are being served + +kubectl get service +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +kubernetes ClusterIP 10.96.0.1 443/TCP 36d +llama-8b ClusterIP 10.108.250.109 80/TCP 18d +phi4-mini ClusterIP 10.97.252.33 80/TCP 9d +``` + +## Step 4: Deploy InferencePools and LLM-D schedulers + +LLM-D (and Kubernetes IGW) use an API resource called InferencePool alongwith a scheduler (referred to as the LLM-D inference scheduler and sometimes equivalently as EndPoint Picker/ EPP). + +Deploy the provided manifests in order to create InferencePool and LLM-D inference schedulers corresponding to the 2 base models used in this exercise. + +In order to show a full combination of model picking and endpoint picking, one would normally need at least 2 inferencepools with at least 2 endpoints per pool. Since that would require 4 instances of vllm serving pods and 4 GPUs in our exercise, that would require a more complex hardware setup. This guide deploys 1 model endpoint per each of the two InferencePools in order to show the core design of vsr's model picking working with and complementing LLM-D scheduler's endpoint picking. + +```bash +# Create the LLM-D scheduler and InferencePool for the Llama3-8b model +kubectl apply -f deploy/kubernetes/llmd-base/inferencepool-llama.yaml +``` + +```bash +# Create the LLM-D scheduler and InferencePool for the phi4-mini model +kubectl apply -f deploy/kubernetes/llmd-base/inferencepool-phi4.yaml +``` + +## Step 5: Additional Istio config for LLM-D connection + +Add DestinationRule to allow each EPP/ LLM-D scheduler to use ExtProc without TLS (current Istio limitation). 
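For orientation, such a DestinationRule essentially just disables TLS towards the EPP Service so that the gateway can speak plaintext gRPC/ExtProc to the scheduler. A minimal sketch with an assumed name and host (the manifests under deploy/kubernetes/llmd-base/ are authoritative):

```yaml
# Minimal sketch only - see deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml for the real manifest.
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
  name: epp-llama-insecure                                       # assumed name
spec:
  host: vllm-llama3-8b-instruct-epp.default.svc.cluster.local    # assumed EPP Service host
  trafficPolicy:
    tls:
      mode: DISABLE    # allow Istio's Envoy to call the scheduler without TLS
```

Apply the provided manifests as shown below.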
+ +```bash +# Istio destinationrule for the Llama3-8b pool scheduler +kubectl apply -f deploy/kubernetes/llmd-base/dest-rule-epp-llama.yaml +``` + +```bash +# Istio destinationrule for the phi4-mini pool scheduler +kubectl apply -f deploy/kubernetes/llmd-base/dest-rule-epp-phi4.yaml +``` + +## Step 6: Update vsr config + +Since this guide uses the same backend models as the [Istio guide](istio), we reuse the same vsr config from that guide, so you do not need to update the file deploy/kubernetes/istio/config.yaml. If you were using different backend models as part of the LLM-D deployment, you would need to update this file. + +## Step 7: Deploy vLLM Semantic Router + +Deploy the semantic router service with all required components: + +```bash +# Deploy semantic router using Kustomize +kubectl apply -k deploy/kubernetes/istio/ + +# Wait for deployment to be ready (this may take several minutes for model downloads) +kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s + +# Verify deployment status +kubectl get pods -n vllm-semantic-router-system +``` + +## Step 8: Additional Istio configuration for the VSR connection + +Install the DestinationRule and EnvoyFilter needed for the Istio gateway to use the ExtProc based interface with vLLM Semantic Router. + +```bash +kubectl apply -f deploy/kubernetes/istio/destinationrule.yaml +kubectl apply -f deploy/kubernetes/istio/envoyfilter.yaml +``` + +## Step 9: Install gateway routes + +Install HTTPRoutes in the Istio gateway. Note a difference here compared to the HTTP routes used in the prior vsr + Istio guide: here the backendRefs in the routes point to the InferencePools, which in turn point to the LLM-D schedulers for those pools, instead of pointing to the vLLM service endpoints of the models as was done in the [istio guide without llm-d](istio). + +```bash +kubectl apply -f deploy/kubernetes/llmd-base/httproute-llama-pool.yaml +kubectl apply -f deploy/kubernetes/llmd-base/httproute-phi4-pool.yaml +``` + +## Step 10: Testing the Deployment + +To expose the IP on which the Istio gateway listens to client requests from outside the cluster, you can choose any standard Kubernetes option for external load balancing. We tested this setup by [deploying and configuring metallb](https://metallb.universe.tf/installation/) into the cluster as the LoadBalancer provider. Please refer to the metallb documentation for installation procedures if needed. Finally, for the minikube case, we get the external URL as shown below. + +```bash +minikube service inference-gateway-istio --url +http://192.168.49.2:32293 +``` + +Now we can send LLM prompts via curl to http://192.168.49.2:32293 to access the Istio gateway, which will then use information from the vLLM Semantic Router to dynamically route to one of the two backend LLMs. Use the port number you get as output from your "minikube service" command in the curl examples below. + +### Send Test Requests + +Try the following cases with and without model "auto" selection to confirm that Istio + vsr together are able to route queries to the appropriate model. The query responses will include information about which model was used to serve that request.
+ +Example queries to try include the following + +```bash +# Model name llama3-8b provided explicitly, no model alteration, send to llama EPP for endpoint picking +curl http://192.168.49.2:32293/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "llama3-8b", + "messages": [ + {"role": "user", "content": "Linux is said to be an open source kernel because "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +```bash +# Model name set to "auto", should categorize to "computer science" & route to llama3-8b +curl http://192.168.49.2:32293/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "auto", + "messages": [ + {"role": "user", "content": "Linux is said to be an open source kernel because "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +```bash +# Model name phi4-mini provided explicitly, no model alteration, send to phi4-mini EPP for endpoint picking +curl http://192.168.49.2:32293/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "phi4-mini", + "messages": [ + {"role": "user", "content": "2+2 is "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +```bash +# Model name set to "auto", should categorize to "math" & route to phi4-mini +curl http://192.168.49.2:32293/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "auto", + "messages": [ + {"role": "user", "content": "2+2 is "} + ], + "max_tokens": 100, + "temperature": 0 + }' +``` + +## Troubleshooting + +### Basic Pod Validation + +If you have followed the above steps, you should see pods similar to below running READY state as a quick initial validation. These include the LLM model pods, Istio gateway pod, LLM-D/EPP scheduler pods, vsr pod and istiod controller pod as shown below. You should also see the InferencePools and HTTPRoute instances as shown below with status showing routes in resolved state. 
+ +```bash +$ kubectl get pods -n default +NAME READY STATUS RESTARTS AGE +inference-gateway-istio-6fc8864bfb-gbcz8 1/1 Running 0 14h +llama-8b-6558848cc8-wkkxn 1/1 Running 0 3h26m +phi4-mini-7b94bc69db-rnpkj 1/1 Running 0 17h +vllm-llama3-8b-instruct-epp-7f7ff88677-j7lst 1/1 Running 0 134m +vllm-phi4-mini-epp-6f5dd6bbb9-8pv27 1/1 Running 0 14h +``` + +```bash +$ kubectl get pods -n vllm-semantic-router-system +NAME READY STATUS RESTARTS AGE +semantic-router-bf6cdd5b9-t5hpg 1/1 Running 0 5d23h +``` + +```bash +$ kubectl get pods -n istio-system +NAME READY STATUS RESTARTS AGE +istiod-6f5ccc65c-vnbg5 1/1 Running 0 15h +``` + +```bash +$ kubectl get inferencepools +NAME AGE +vllm-llama3-8b-instruct 139m +vllm-phi4-mini 15h +``` + +```bash +$ kubectl get httproutes +NAME HOSTNAMES AGE +vsr-llama8b 13h +vsr-phi4-mini 13h +``` + +```bash +$ kubectl get httproute vsr-llama8b -o yaml | grep -A 1 "reason: ResolvedRefs" + reason: ResolvedRefs + status: "True" +``` + +### Common Issues + +**Gateway/ Front end not working:** + +```bash +# Check istio gateway status +kubectl get gateway + +# Check istio gw service status +kubectl get svc inference-gateway-istio + +# Check Istio's Envoy logs +kubectl logs deploy/inference-gateway-istio -c istio-proxy +``` + +**Semantic router not responding or not routing correctly:** + +```bash +# Check semantic router pod +kubectl get pods -n vllm-semantic-router-system + +# Check semantic router service +kubectl get svc -n vllm-semantic-router-system + +# Check semantic router logs +kubectl logs -n vllm-semantic-router-system deployment/semantic-router +``` + +## Cleanup + +```bash + +# Remove semantic router +kubectl delete -k deploy/kubernetes/istio/ + +# Remove Istio +istioctl uninstall --purge + +# Remove LLMs +kubectl delete -f deploy/kubernetes/istio/vLlama3.yaml +kubectl delete -f deploy/kubernetes/istio/vPhi4.yaml + +# Stop minikube cluster +minikube stop + +# Delete minikube cluster +minikube delete +``` + +## Next Steps + +- Test/ experiment with different features of vLLM Semantic Router +- Test/ experiment with more complex LLM-D configurations and well-lit paths +- Set up monitoring and observability +- Implement authentication and authorization +- Scale the semantic router deployment for production workloads diff --git a/website/docs/installation/k8s/production-stack.md b/website/docs/installation/k8s/production-stack.md new file mode 100644 index 000000000..f296f6fd0 --- /dev/null +++ b/website/docs/installation/k8s/production-stack.md @@ -0,0 +1,168 @@ +# Integration with Production Stack + +This tutorial is adapted from [vLLM production stack tutorials](https://github.com/vllm-project/production-stack/blob/main/tutorials/24-semantic-router-integration.md) + +## What is vLLM Semantic Router? + +The vLLM Semantic Router is an intelligent Mixture-of-Models (MoM) router that operates as an Envoy External Processor to semantically route OpenAI API–compatible requests to the most suitable backend model. Using BERT-based classification, it improves both quality and cost efficiency by matching requests (e.g., math, code, creative, general) to specialized models. + +- **Auto-selection of models**: Routes math, creative writing, code, and general queries to the best-fit models. +- **Security & privacy**: PII detection, prompt guard, and safe routing for sensitive prompts. +- **Performance optimizations**: Semantic cache and better tool selection to cut latency and tokens. 
+- **Architecture**: Tight Envoy ExtProc integration; dual Go and Python implementations; production-ready and scalable. +- **Monitoring**: Grafana dashboards, Prometheus metrics, and tracing for full visibility. + +Learn more: [vLLM Semantic Router](https://vllm-semantic-router.com/docs/intro) + +## What are the benefits of integration? + +The vLLM Production Stack provides several deployment ways that spin up vLLM servers which can direct traffic to different models, perform service discovery and fault tolerance through the Kubernetes API, and support round‑robin, session‑based, prefix‑aware, KV-aware and disaggregated-prefill routing with LMCache native support. The Semantic Router adds a system‑intelligence layer that classifies each user request, selects the most suitable model from a pool, injects domain‑specific system prompts, performs semantic caching and enforces enterprise‑grade security checks such as PII and jailbreak detection. + +By combining these two systems we obtain a unified inference stack. Semantic routing ensures that each request is answered by the best possible model. Production‑Stack routing maximizes infrastructure and inference efficiency, and exposes rich metrics. + +--- + +This tutorial will guide you: + +- Deploy a minimal vLLM Production Stack +- Deploy vLLM Semantic Router and point it to your vLLM router Service +- Test the endpoint via the Envoy AI Gateway + +## Prerequisites + +- kubectl +- Helm +- A Kubernetes cluster (kind, minikube, GKE, etc.) + +--- + +## Step 1: Deploy the vLLM Production Stack using your Helm values + +Use your chart and the provided values file at `tutorials/assets/values-23-SR.yaml`. + +```bash +helm repo add vllm-production-stack https://vllm-project.github.io/production-stack +helm install vllm-stack vllm-production-stack/vllm-stack -f ./tutorials/assets/values-23-SR.yaml +``` + +For reference, the following is the sample value file: + +```yaml +servingEngineSpec: + runtimeClassName: "" + strategy: + type: Recreate + modelSpec: + - name: "qwen3" + repository: "lmcache/vllm-openai" + tag: "v0.3.7" + modelURL: "Qwen/Qwen3-8B" + pvcStorage: "50Gi" + vllmConfig: + # maxModelLen: 131072 + extraArgs: ["--served-model-name", "Qwen/Qwen3-8B", "qwen3"] + + replicaCount: 2 + + requestCPU: 8 + requestMemory: "16Gi" + requestGPU: 1 + +routerSpec: + repository: lmcache/lmstack-router + tag: "latest" + resources: + requests: + cpu: "1" + memory: "2G" + limits: + cpu: "1" + memory: "2G" + routingLogic: "roundrobin" + sessionKey: "x-user-id" +``` + +Identify the ClusterIP and port of your router Service created by the chart (name may vary): + +```bash +kubectl get svc vllm-router-service +# Note the router service ClusterIP and port (e.g., 10.97.254.122:80) +``` + +--- + +## Step 2: Deploy vLLM Semantic Router and point it at your vLLM router Service + +Follow the official guide from the official website with **the updated config file as the following**: [Install in Kubernetes](https://vllm-semantic-router.com/docs/installation/kubernetes). 
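The essential change in that config is pointing the router's endpoint list at the Production Stack router Service from Step 1 rather than at an individual vLLM pod. A hedged sketch of just that portion (endpoint name, model key, and field names are assumptions; the ClusterIP is the example value noted in Step 1; the rest of the config should follow the linked guide):

```yaml
# Sketch only - merge into the config from the linked guide, do not use as a complete file.
vllm_endpoints:
  - name: "production_stack_router"   # assumed endpoint name
    address: "10.97.254.122"          # ClusterIP of vllm-router-service (see Step 1)
    port: 80
    weight: 1

model_config:
  "Qwen/Qwen3-8B":                    # assumed to match the --served-model-name from the values file
    preferred_endpoints: ["production_stack_router"]
```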
+ +Minimal sequence (same as the guide): + +```bash + # Deploy vLLM Semantic Router manifests + kubectl apply -k deploy/kubernetes/ai-gateway/semantic-router + kubectl wait --for=condition=Available deployment/semantic-router \ + -n vllm-semantic-router-system --timeout=600s + + # Install Envoy Gateway + helm upgrade -i eg oci://docker.io/envoyproxy/gateway-helm \ + --version v0.0.0-latest \ + --namespace envoy-gateway-system \ + --create-namespace \ + -f https://raw.githubusercontent.com/envoyproxy/ai-gateway/main/manifests/envoy-gateway-values.yaml + + # Install Envoy AI Gateway + helm upgrade -i aieg oci://docker.io/envoyproxy/ai-gateway-helm \ + --version v0.0.0-latest \ + --namespace envoy-ai-gateway-system \ + --create-namespace + # Install Envoy AI Gateway CRDs + helm upgrade -i aieg-crd oci://docker.io/envoyproxy/ai-gateway-crds-helm --version v0.0.0-latest --namespace envoy-ai-gateway-system + + kubectl wait --timeout=300s -n envoy-ai-gateway-system \ + deployment/ai-gateway-controller --for=condition=Available +``` + +Create LLM Demo Backends and AI Gateway Routes: + +```bash + # Apply LLM demo backends + kubectl apply -f deploy/kubernetes/ai-gateway/aigw-resources/base-model.yaml + # Apply AI Gateway routes + kubectl apply -f deploy/kubernetes/ai-gateway/aigw-resources/gwapi-resources.yaml +``` + +--- + +## Step 3: Test the deployment + +Port-forward to the Envoy service and send a test request, following the guide: + +```bash + export ENVOY_SERVICE=$(kubectl get svc -n envoy-gateway-system \ + --selector=gateway.envoyproxy.io/owning-gateway-namespace=default,gateway.envoyproxy.io/owning-gateway-name=semantic-router \ + -o jsonpath='{.items[0].metadata.name}') + + kubectl port-forward -n envoy-gateway-system svc/$ENVOY_SERVICE 8080:80 +``` + +Send a chat completions request: + +```bash + curl -i -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "MoM", + "messages": [ + {"role": "user", "content": "What is the derivative of f(x) = x^3?"} + ] + }' +``` + +--- + +## Troubleshooting + +- If the gateway is not accessible, check the Gateway and Envoy service per the guide. +- If the inference pool is not ready, `kubectl describe` the `InferencePool` and check controller logs. +- If the semantic router is not responding, check its pod status and logs. +- If it is returning error code, check the production stack router log. diff --git a/website/docs/installation/kubernetes.md b/website/docs/installation/kubernetes.md index 80821ad9c..c43549fdb 100644 --- a/website/docs/installation/kubernetes.md +++ b/website/docs/installation/kubernetes.md @@ -1,271 +1,23 @@ # Install in Kubernetes -This guide provides step-by-step instructions for deploying the vLLM Semantic Router with Envoy AI Gateway on Kubernetes. +Deploy the vLLM Semantic Router on Kubernetes using the provided manifests. 
-## Architecture Overview
-
-The deployment consists of:
-
-- **vLLM Semantic Router**: Provides intelligent request routing and semantic understanding
-- **Envoy Gateway**: Core gateway functionality and traffic management
-- **Envoy AI Gateway**: AI Gateway built on Envoy Gateway for LLM providers
-- **Gateway API Inference Extension**: CRDs for managing inference pools
-
-## Prerequisites
-
-Before starting, ensure you have the following tools installed:
-
-- [kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) - Kubernetes in Docker (Optional)
-- [kubectl](https://kubernetes.io/docs/tasks/tools/) - Kubernetes CLI
-- [Helm](https://helm.sh/docs/intro/install/) - Package manager for Kubernetes
-
-## Step 1: Create Kind Cluster (Optional)
-
-Create a local Kubernetes cluster optimized for the semantic router workload:
+## Quick Start

 ```bash
-# Create cluster with optimized resource settings
-kind create cluster --name semantic-router-cluster --config tools/kind/kind-config.yaml
-
-# Verify cluster is ready
-kubectl wait --for=condition=Ready nodes --all --timeout=300s
-```
-
-**Note**: The kind configuration provides sufficient resources (8GB+ RAM, 4+ CPU cores) for running the semantic router and AI gateway components.
-
-## Step 2: Deploy vLLM Semantic Router
-
-Configure the semantic router by editing `deploy/kubernetes/config.yaml`. This file contains the vLLM configuration, including model config, endpoints, and policies.
-
-Deploy the semantic router service with all required components:
-
-```bash
-# Deploy semantic router using Kustomize
+# Deploy semantic router
 kubectl apply -k deploy/kubernetes/

-# Wait for deployment to be ready (this may take several minutes for model downloads)
+# Wait for deployment
 kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s
-
-# Verify deployment status
-kubectl get pods -n vllm-semantic-router-system
-```
-
-## Step 3: Install Envoy Gateway
-
-Install the core Envoy Gateway for traffic management:
-
-```bash
-# Install Envoy Gateway using Helm
-helm upgrade -i eg oci://docker.io/envoyproxy/gateway-helm \
-  --version v0.0.0-latest \
-  --namespace envoy-gateway-system \
-  --create-namespace
-
-# Wait for Envoy Gateway to be ready
-kubectl wait --timeout=300s -n envoy-gateway-system deployment/envoy-gateway --for=condition=Available
-```
-
-## Step 4: Install Envoy AI Gateway
-
-Install the AI-specific extensions for inference workloads:
-
-```bash
-# Install Envoy AI Gateway using Helm
-helm upgrade -i aieg oci://docker.io/envoyproxy/ai-gateway-helm \
-  --version v0.0.0-latest \
-  --namespace envoy-ai-gateway-system \
-  --create-namespace
-
-# Wait for AI Gateway Controller to be ready
-kubectl wait --timeout=300s -n envoy-ai-gateway-system deployment/ai-gateway-controller --for=condition=Available
-```
-
-## Step 5: Install Gateway API Inference Extension
-
-Install the Custom Resource Definitions (CRDs) for managing inference pools:
-
-```bash
-# Install Gateway API Inference Extension CRDs
-kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.1/manifests.yaml
-
-# Verify CRDs are installed
-kubectl get crd | grep inference
-```
-
-## Step 6: Configure AI Gateway
-
-Apply the AI Gateway configuration to connect with the semantic router:
-
-```bash
-# Apply AI Gateway configuration
-kubectl apply -f deploy/kubernetes/ai-gateway/configuration
-
-# Restart controllers to pick up new configuration
-kubectl rollout restart -n envoy-gateway-system deployment/envoy-gateway
-kubectl rollout restart -n envoy-ai-gateway-system deployment/ai-gateway-controller
-
-# Wait for controllers to be ready
-kubectl wait --timeout=120s -n envoy-gateway-system deployment/envoy-gateway --for=condition=Available
-kubectl wait --timeout=120s -n envoy-ai-gateway-system deployment/ai-gateway-controller --for=condition=Available
-```
-
-## Step 7: Create Inference Pool
-
-Create the inference pool that connects the gateway to the semantic router backend:
-
-```bash
-# Create inference pool configuration
-kubectl apply -f deploy/kubernetes/ai-gateway/inference-pool
-
-# Wait for inference pool to be ready
-sleep 30
-```
-
-## Step 8: Verify Deployment
-
-Verify that the inference pool has been created and is properly configured:
-
-```bash
-# Check inference pool status
-kubectl get inferencepool vllm-semantic-router -n vllm-semantic-router-system -o yaml
 ```

-Expected output should show the inference pool in `Accepted` state:
-
-```yaml
-status:
-  parent:
-  - conditions:
-    - lastTransitionTime: "2025-09-27T09:27:32Z"
-      message: 'InferencePool has been Accepted by controller ai-gateway-controller:
-        InferencePool reconciled successfully'
-      observedGeneration: 1
-      reason: Accepted
-      status: "True"
-      type: Accepted
-    - lastTransitionTime: "2025-09-27T09:27:32Z"
-      message: 'Reference resolution by controller ai-gateway-controller: All references
-        resolved successfully'
-      observedGeneration: 1
-      reason: ResolvedRefs
-      status: "True"
-      type: ResolvedRefs
-    parentRef:
-      group: gateway.networking.k8s.io
-      kind: Gateway
-      name: vllm-semantic-router
-      namespace: vllm-semantic-router-system
-```
-
-## Testing the Deployment
-
-### Method 1: Port Forwarding (Recommended for Local Testing)
-
-Set up port forwarding to access the gateway locally:
-
-```bash
-# Set up environment variables
-export GATEWAY_IP="localhost:8080"
-
-# Get the Envoy service name
-export ENVOY_SERVICE=$(kubectl get svc -n envoy-gateway-system \
-  --selector=gateway.envoyproxy.io/owning-gateway-namespace=vllm-semantic-router-system,gateway.envoyproxy.io/owning-gateway-name=vllm-semantic-router \
-  -o jsonpath='{.items[0].metadata.name}')
-
-# Start port forwarding (run in background or separate terminal)
-kubectl port-forward -n envoy-gateway-system svc/$ENVOY_SERVICE 8080:80
-```
-
-### Method 2: External IP (For Production Deployments)
-
-For production deployments with external load balancers:
-
-```bash
-# Get the Gateway external IP
-GATEWAY_IP=$(kubectl get gateway vllm-semantic-router -n vllm-semantic-router-system -o jsonpath='{.status.addresses[0].value}')
-echo "Gateway IP: $GATEWAY_IP"
-```
-
-### Send Test Requests
-
-Once the gateway is accessible, test the inference endpoint:
-
-```bash
-# Test math domain chat completions endpoint
-curl -i -X POST http://localhost:8080/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "MoM",
-    "messages": [
-      {"role": "user", "content": "What is the derivative of f(x) = x^3 + 2x^2 - 5x + 7?"}
-    ]
-  }'
-```
-
-## Troubleshooting
-
-### Common Issues
+## Configuration

-**Gateway not accessible:**
+Edit `deploy/kubernetes/config.yaml` to configure your endpoints and policies before deployment.

-```bash
-# Check gateway status
-kubectl get gateway vllm-semantic-router -n vllm-semantic-router-system
-
-# Check Envoy service
-kubectl get svc -n envoy-gateway-system
-```
-
-**Inference pool not ready:**
-
-```bash
-# Check inference pool events
-kubectl describe inferencepool vllm-semantic-router -n vllm-semantic-router-system
-
-# Check AI gateway controller logs
-kubectl logs -n envoy-ai-gateway-system deployment/ai-gateway-controller
-```
-
-**Semantic router not responding:**
-
-```bash
-# Check semantic router pod status
-kubectl get pods -n vllm-semantic-router-system
-
-# Check semantic router logs
-kubectl logs -n vllm-semantic-router-system deployment/semantic-router
-```
-
-## Cleanup
-
-To remove the entire deployment:
-
-```bash
-# Remove inference pool
-kubectl delete -f deploy/kubernetes/ai-gateway/inference-pool
-
-# Remove AI gateway configuration
-kubectl delete -f deploy/kubernetes/ai-gateway/configuration
-
-# Remove semantic router
-kubectl delete -k deploy/kubernetes/
-
-# Remove AI gateway
-helm uninstall aieg -n envoy-ai-gateway-system
-
-# Remove Envoy gateway
-helm uninstall eg -n envoy-gateway-system
-
-# Remove Gateway API CRDs (optional)
-kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.1/manifests.yaml
-
-# Delete kind cluster
-kind delete cluster --name semantic-router-cluster
-```
+## Integration Options

-## Next Steps
+For advanced features, see the integration guides:

-- Configure custom routing rules in the AI Gateway
-- Set up monitoring and observability
-- Implement authentication and authorization
-- Scale the semantic router deployment for production workloads
+- [Install with Envoy AI Gateway](k8s/ai-gateway.md) - Envoy AI Gateway for traffic management and load balancing
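A minimal post-deploy check for the trimmed-down Quick Start above could look like the sketch below. It is not part of this patch; the namespace and deployment names are the ones used throughout this diff, and the log tail size is arbitrary.

```bash
# Illustrative verification of the Quick Start deployment (not part of this patch).
# Namespace and deployment names match those used elsewhere in this diff.
kubectl get pods -n vllm-semantic-router-system
kubectl rollout status deployment/semantic-router -n vllm-semantic-router-system --timeout=600s
kubectl logs -n vllm-semantic-router-system deployment/semantic-router --tail=50
```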
diff --git a/website/docs/overview/architecture/envoy-extproc.md b/website/docs/overview/architecture/envoy-extproc.md
index d9186fdd6..e790e5ce5 100644
--- a/website/docs/overview/architecture/envoy-extproc.md
+++ b/website/docs/overview/architecture/envoy-extproc.md
@@ -227,7 +227,7 @@ func (r *OpenAIRouter) handleRequestBody(
     headerMutations := []*core.HeaderValueOption{
         {
             Header: &core.HeaderValue{
-                Key:   "x-gateway-destination-endpoint",
+                Key:   "x-vsr-destination-endpoint",
                 Value: selectedEndpoint,
             },
             Append: &wrapperspb.BoolValue{Value: false},
@@ -347,7 +347,7 @@ static_resources:
           response_code: "%RESPONSE_CODE%"
           duration: "%DURATION%"
           selected_model: "%REQ(X-SELECTED-MODEL)%"
-          selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%"
+          selected_endpoint: "%REQ(x-vsr-destination-endpoint)%"
           routing_confidence: "%REQ(X-ROUTING-CONFIDENCE)%"

      # Route configuration with dynamic routing
@@ -361,7 +361,7 @@ static_resources:
              - match:
                  prefix: "/"
                  headers:
-                   - name: "x-gateway-destination-endpoint"
+                   - name: "x-vsr-destination-endpoint"
                      string_match:
                        exact: "endpoint1"
                route:
@@ -370,7 +370,7 @@ static_resources:
              - match:
                  prefix: "/"
                  headers:
-                   - name: "x-gateway-destination-endpoint"
+                   - name: "x-vsr-destination-endpoint"
                      string_match:
                        exact: "endpoint2"
                route:
@@ -379,7 +379,7 @@ static_resources:
              - match:
                  prefix: "/"
                  headers:
-                   - name: "x-gateway-destination-endpoint"
+                   - name: "x-vsr-destination-endpoint"
                      string_match:
                        exact: "endpoint3"
                route:
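Because the header is renamed in both the ExtProc mutation and the Envoy route matchers above, an end-to-end check is simply to send one request and confirm the new header name comes back. The sketch below is illustrative and not part of this patch: port 8801 is the one used by the reasoning tutorial later in this diff, and `MoM` is the model name from the removed install guide.

```bash
# Illustrative check that responses carry the renamed routing header (assumed local setup).
curl -si http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "MoM", "messages": [{"role": "user", "content": "ping"}]}' \
  | grep -i -E '^(x-vsr-destination-endpoint|x-selected-model):'
```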
diff --git a/website/docs/overview/architecture/system-architecture.md b/website/docs/overview/architecture/system-architecture.md
index 2139c7828..129b9233a 100644
--- a/website/docs/overview/architecture/system-architecture.md
+++ b/website/docs/overview/architecture/system-architecture.md
@@ -238,7 +238,7 @@ graph TB
     ToolsSelection --> RoutingDecision[Make Routing Decision<br/>Select Optimal Model]

-    RoutingDecision --> SetHeaders[Set Routing Headers<br/>x-gateway-destination-endpoint<br/>x-selected-model]
+    RoutingDecision --> SetHeaders[Set Routing Headers<br/>x-vsr-destination-endpoint<br/>x-selected-model]

     SetHeaders --> EnvoyRoute[Envoy Routes to<br/>Selected Backend]
diff --git a/website/docs/tutorials/intelligent-route/reasoning.md b/website/docs/tutorials/intelligent-route/reasoning.md
index 29b4774cb..f75be62a2 100644
--- a/website/docs/tutorials/intelligent-route/reasoning.md
+++ b/website/docs/tutorials/intelligent-route/reasoning.md
@@ -145,7 +145,7 @@ Verify routing via response headers
 The router does not inject routing metadata into the JSON body. Instead, inspect the response headers added by the router:

 - X-Selected-Model
-- X-GATEWAY-DESTINATION-ENDPOINT
+- x-vsr-destination-endpoint

 Example:
@@ -161,7 +161,7 @@ curl -i http://localhost:8801/v1/chat/completions \
   }'
 # In the response headers, look for:
 # X-Selected-Model:
-# X-GATEWAY-DESTINATION-ENDPOINT:
+# x-vsr-destination-endpoint:
 ```

 4) Run a comprehensive evaluation
diff --git a/website/docs/tutorials/observability/distributed-tracing.md b/website/docs/tutorials/observability/distributed-tracing.md
index a0e47612a..15933661f 100644
--- a/website/docs/tutorials/observability/distributed-tracing.md
+++ b/website/docs/tutorials/observability/distributed-tracing.md
@@ -377,7 +377,7 @@ tracestate: vendor=value
 ```
 traceparent: 00-abc123-ghi789-01
-x-gateway-destination-endpoint: endpoint1
+x-vsr-destination-endpoint: endpoint1
 x-selected-model: gpt-4
 ```
diff --git a/website/sidebars.ts b/website/sidebars.ts
index 40525eab2..c6cbcd6f5 100644
--- a/website/sidebars.ts
+++ b/website/sidebars.ts
@@ -49,7 +49,16 @@ const sidebars: SidebarsConfig = {
       label: 'Installation',
       items: [
         'installation/installation',
-        'installation/kubernetes',
+        {
+          type: 'category',
+          label: 'Kubernetes',
+          items: [
+            'installation/k8s/ai-gateway',
+            'installation/k8s/production-stack',
+            'installation/k8s/istio',
+            'installation/k8s/llm-d',
+          ],
+        },
         'installation/docker-compose',
         'installation/configuration',
       ],
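Since this patch renames the routing header across Go code, Envoy configuration, and documentation, a simple review aid (not part of the patch; the file globs are illustrative) is to grep the tree for leftover references to the old name:

```bash
# Flag any remaining references to the old header name; prints a note if none are found.
grep -rni "x-gateway-destination-endpoint" \
  --include="*.go" --include="*.md" --include="*.yaml" --include="*.ts" . \
  || echo "no stale references found"
```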