
Commit 4407fd8 (parent: 47b83d2)

project: add docs and clean-up

Signed-off-by: bitliu <[email protected]>

58 files changed: +758 −5397 lines

.gitignore

Lines changed: 4 additions & 1 deletion

@@ -98,4 +98,7 @@ scripts/prd.txt
 .env.taskmaster
 package-lock.json

-website/build
+website/build
+.docusaurus
+spec/
+results/

CONTRIBUTING.md

Lines changed: 3 additions & 4 deletions

@@ -21,10 +21,9 @@ Before you begin, ensure you have the following installed:

 - **Rust** (latest stable version)
 - **Go** 1.24.1 or later
-- **Python** 3.8+ (for training and testing)
-- **Envoy Proxy**
 - **Hugging Face CLI** (`pip install huggingface_hub`)
 - **Make** (for build automation)
+- **Python** 3.8+ (Optional: for training and testing)

 ### Initial Setup

@@ -40,7 +39,7 @@ Before you begin, ensure you have the following installed:
 ```
 This downloads the pre-trained classification models from Hugging Face.

-3. **Install Python dependencies:**
+3. **Install Python dependencies (Optional):**
 ```bash
 # For training and development
 pip install -r requirements.txt

@@ -245,7 +244,7 @@ The test suite includes:

 ## Getting Help

-- Check the [documentation](https://llm-semantic-router.readthedocs.io/en/latest/)
+- Check the [documentation](https://vllm-semantic-router.com/)
 - Review existing issues and pull requests
 - Ask questions in discussions or create a new issue
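A quick way to confirm the remaining required tools from the updated prerequisite list (a sketch; the version-check commands are illustrative, not part of the commit):

```bash
# Required toolchain per CONTRIBUTING.md
rustc --version              # Rust (latest stable)
go version                   # Go 1.24.1 or later
make --version               # Make (build automation)
pip show huggingface_hub     # Hugging Face CLI

# Optional, only for training and testing
python3 --version            # Python 3.8+
```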

Makefile

Lines changed: 9 additions & 9 deletions
@@ -4,7 +4,7 @@
 all: build

 # vLLM env var
-VLLM_ENDPOINT ?= http://192.168.12.90:11434
+VLLM_ENDPOINT ?=

 # Build the Rust library and Golang binding
 build: rust build-router

@@ -80,19 +80,19 @@ clean:
 	rm -f bin/router

 # Test the Envoy extproc
-test-prompt:
+test-auto-prompt-reasoning:
 	@echo "Testing Envoy extproc with curl (Math)..."
 	curl -X POST http://localhost:8801/v1/chat/completions \
 		-H "Content-Type: application/json" \
-		-d '{"model": "auto", "messages": [{"role": "assistant", "content": "You are a professional math teacher. Explain math concepts clearly and show step-by-step solutions to problems."}, {"role": "user", "content": "What is the derivative of f(x) = x^3 + 2x^2 - 5x + 7?"}], "temperature": 0.7}'
-	@echo "Testing Envoy extproc with curl (Creative Writing)..."
-	curl -X POST http://localhost:8801/v1/chat/completions \
-		-H "Content-Type: application/json" \
-		-d '{"model": "auto", "messages": [{"role": "assistant", "content": "You are a story writer. Create interesting stories with good characters and settings."}, {"role": "user", "content": "Write a short story about a space cat."}], "temperature": 0.7}'
-	@echo "Testing Envoy extproc with curl (Default/General)..."
+		-d '{"model": "auto", "messages": [{"role": "system", "content": "You are a professional math teacher. Explain math concepts clearly and show step-by-step solutions to problems."}, {"role": "user", "content": "What is the derivative of f(x) = x^3 + 2x^2 - 5x + 7?"}]}'
+
+# Test the Envoy extproc
+test-auto-prompt-no-reasoning:
+	@echo "Testing Envoy extproc with curl (Math)..."
 	curl -X POST http://localhost:8801/v1/chat/completions \
 		-H "Content-Type: application/json" \
-		-d '{"model": "auto", "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}], "temperature": 0.7}'
+		-d '{"model": "auto", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who are you?"}]}'
+
 # Test prompts that contain PII
 test-pii:
 	@echo "Testing Envoy extproc with curl (Credit card number)..."

config/config.yaml

Lines changed: 3 additions & 3 deletions
@@ -28,22 +28,22 @@ gpu_config:
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
   - name: "endpoint1"
-    address: "192.168.12.90"
+    address: "127.0.0.1"
     port: 11434
     models:
       - "phi4"
       - "gemma3:27b"
     weight: 1  # Load balancing weight
     health_check_path: "/health"  # Optional health check endpoint
   - name: "endpoint2"
-    address: "192.168.12.91"
+    address: "127.0.0.1"
     port: 11434
     models:
       - "mistral-small3.1"
     weight: 1
     health_check_path: "/health"
   - name: "endpoint3"
-    address: "192.168.12.92"
+    address: "127.0.0.1"
     port: 11434
     models:
       - "phi4"  # Same model can be served by multiple endpoints for redundancy

config/envoy-docker.yaml

Lines changed: 120 additions & 0 deletions
New file:

static_resources:
  listeners:
    - name: listener_0
      address:
        socket_address:
          address: 0.0.0.0
          port_value: 8801
      filter_chains:
        - filters:
            - name: envoy.filters.network.http_connection_manager
              typed_config:
                "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                stat_prefix: ingress_http
                access_log:
                  - name: envoy.access_loggers.stdout
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog
                      log_format:
                        json_format:
                          time: "%START_TIME%"
                          protocol: "%PROTOCOL%"
                          request_method: "%REQ(:METHOD)%"
                          request_path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%"
                          response_code: "%RESPONSE_CODE%"
                          response_flags: "%RESPONSE_FLAGS%"
                          bytes_received: "%BYTES_RECEIVED%"
                          bytes_sent: "%BYTES_SENT%"
                          duration: "%DURATION%"
                          upstream_host: "%UPSTREAM_HOST%"
                          upstream_cluster: "%UPSTREAM_CLUSTER%"
                          upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%"
                          request_id: "%REQ(X-REQUEST-ID)%"
                          selected_model: "%REQ(X-SELECTED-MODEL)%"
                          selected_endpoint: "%REQ(X-SEMANTIC-DESTINATION-ENDPOINT)%"
                route_config:
                  name: local_route
                  virtual_hosts:
                    - name: local_service
                      domains: ["*"]
                      routes:
                        # Single route using original destination cluster
                        - match:
                            prefix: "/"
                          route:
                            cluster: vllm_dynamic_cluster
                            timeout: 300s
                http_filters:
                  - name: envoy.filters.http.ext_proc
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
                      grpc_service:
                        envoy_grpc:
                          cluster_name: extproc_service
                      allow_mode_override: true
                      processing_mode:
                        request_header_mode: "SEND"
                        response_header_mode: "SEND"
                        request_body_mode: "BUFFERED"
                        response_body_mode: "BUFFERED"
                        request_trailer_mode: "SKIP"
                        response_trailer_mode: "SKIP"
                      failure_mode_allow: true
                      message_timeout: 300s
                  - name: envoy.filters.http.router
                    typed_config:
                      "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
                      suppress_envoy_headers: true
                http2_protocol_options:
                  max_concurrent_streams: 100
                  initial_stream_window_size: 65536
                  initial_connection_window_size: 1048576
                stream_idle_timeout: "300s"
                request_timeout: "300s"
                common_http_protocol_options:
                  idle_timeout: "300s"

  clusters:
    - name: extproc_service
      connect_timeout: 300s
      per_connection_buffer_limit_bytes: 52428800
      type: STATIC
      lb_policy: ROUND_ROBIN
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http2_protocol_options:
              connection_keepalive:
                interval: 300s
                timeout: 300s
      load_assignment:
        cluster_name: extproc_service
        endpoints:
          - lb_endpoints:
              - endpoint:
                  address:
                    socket_address:
                      address: semantic-router  # Use Docker service name
                      port_value: 50051

    # Dynamic vLLM cluster using original destination
    - name: vllm_dynamic_cluster
      connect_timeout: 300s
      per_connection_buffer_limit_bytes: 52428800
      type: ORIGINAL_DST
      lb_policy: CLUSTER_PROVIDED
      original_dst_lb_config:
        use_http_header: true
        http_header_name: "x-semantic-destination-endpoint"
      typed_extension_protocol_options:
        envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
          "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
          explicit_http_config:
            http_protocol_options: {}

admin:
  address:
    socket_address:
      address: "0.0.0.0"
      port_value: 19000
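The ORIGINAL_DST cluster forwards each request to whatever address the external processor writes into the x-semantic-destination-endpoint header. A minimal sketch for observing that wiring through the admin interface, assuming the stack is running locally:

```bash
# Confirm the dynamic cluster is registered with Envoy
curl -s http://localhost:19000/clusters | grep vllm_dynamic_cluster

# Verify the routing header is present in the active configuration
curl -s http://localhost:19000/config_dump | grep x-semantic-destination-endpoint
```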

deploy/kubernetes/config.yaml

Lines changed: 3 additions & 3 deletions
@@ -28,22 +28,22 @@ gpu_config:
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
   - name: "endpoint1"
-    address: "192.168.12.90"
+    address: "127.0.0.1"
     port: 11434
     models:
       - "phi4"
       - "gemma3:27b"
     weight: 1  # Load balancing weight
     health_check_path: "/health"  # Optional health check endpoint
   - name: "endpoint2"
-    address: "192.168.12.91"
+    address: "127.0.0.1"
     port: 11434
     models:
       - "mistral-small3.1"
     weight: 1
     health_check_path: "/health"
   - name: "endpoint3"
-    address: "192.168.12.92"
+    address: "127.0.0.1"
     port: 11434
     models:
       - "phi4"  # Same model can be served by multiple endpoints for redundancy

docker-compose.yml

Lines changed: 54 additions & 0 deletions
New file:

version: '3.8'

services:
  # Semantic Router External Processor Service
  semantic-router:
    build:
      context: .
      dockerfile: Dockerfile.extproc
    container_name: semantic-router
    ports:
      - "50051:50051"
    volumes:
      - ./config:/app/config:ro
      - ./models:/app/models:ro
    environment:
      - LD_LIBRARY_PATH=/app/lib
    networks:
      - semantic-network
    healthcheck:
      test: ["CMD", "nc", "-z", "localhost", "50051"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # Envoy Proxy Service
  envoy:
    image: envoyproxy/envoy:v1.31.7
    container_name: envoy-proxy
    ports:
      - "8801:8801"    # Main proxy port
      - "19000:19000"  # Admin interface
    volumes:
      - ./config/envoy-docker.yaml:/etc/envoy/envoy.yaml:ro
    command: ["/usr/local/bin/envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "ext_proc:trace,router:trace,http:trace"]
    depends_on:
      semantic-router:
        condition: service_healthy
    networks:
      - semantic-network
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:19000/ready"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s

networks:
  semantic-network:
    driver: bridge

volumes:
  models-cache:
    driver: local
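Both services define healthchecks, so readiness can be confirmed before sending traffic. A short sketch using standard Compose commands:

```bash
# Both containers should report (healthy)
docker-compose ps

# Envoy readiness, the same probe the compose healthcheck hits
curl -s http://localhost:19000/ready

# Tail the external processor logs while testing
docker-compose logs -f semantic-router
```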

docker/README.md

Lines changed: 54 additions & 0 deletions
New file:

# Docker Compose Quick Start Guide

This Docker Compose configuration allows you to quickly run Semantic Router + Envoy proxy locally.

## Prerequisites

- Docker and Docker Compose
- Ensure ports 8801, 50051, and 19000 are not in use

## Install with Docker Compose

1. **Clone the repository and navigate to the project directory**

   ```bash
   git clone <repository-url>
   cd semantic_router
   ```

2. **Download required models** (if not already present):

   ```bash
   make download-models
   ```

   This will download the necessary ML models for classification:
   - Category classifier (ModernBERT-base)
   - PII classifier (ModernBERT-base)
   - Jailbreak classifier (ModernBERT-base)

3. **Start the services using Docker Compose**

   ```bash
   # Start core services (semantic-router + envoy)
   docker-compose up --build

   # Or run in background
   docker-compose up --build -d

   # Start with testing services (includes mock vLLM)
   docker-compose --profile testing up --build
   ```

4. **Verify the installation**
   - Semantic Router: http://localhost:50051 (gRPC service)
   - Envoy Proxy: http://localhost:8801 (main endpoint)
   - Envoy Admin: http://localhost:19000 (admin interface)

## Quick Start

### 1. Build and Start Services

```bash
# Start core services (semantic-router + envoy)
docker-compose up --build

# Or run in background
docker-compose up --build -d
```
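Once the services are up, a smoke test through the proxy confirms end-to-end routing. A minimal sketch, reusing the request shape from the repository's Makefile test targets:

```bash
# Route a chat completion through Envoy; the router selects the backing model
curl -X POST http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "auto", "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who are you?"}]}'
```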
