
Commit 5973a9c

Merge branch 'main' into 1022-yuluo/helm
2 parents 03baf65 + 0fc81e6 commit 5973a9c

67 files changed, +10646 −656 lines changed


.github/workflows/k8s-kind-integration-test.yml

Lines changed: 15 additions & 4 deletions
@@ -186,6 +186,17 @@ jobs:
           - op: replace
             path: /spec/template/spec/containers/0/resources/limits/cpu
             value: "1"
+          - op: replace
+            path: /spec/template/spec/containers/0/readinessProbe
+            value:
+              httpGet:
+                path: /health
+                port: classify-api
+                scheme: HTTP
+              initialDelaySeconds: 120
+              periodSeconds: 15
+              timeoutSeconds: 5
+              failureThreshold: 20
           - op: add
             path: /spec/template/spec/containers/0/imagePullPolicy
             value: "IfNotPresent"
@@ -244,22 +255,22 @@ jobs:
 
           # Wait for PVC to be bound
           echo "Waiting for PVC to be bound..."
-          kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/semantic-router-models -n vllm-semantic-router-system --timeout=120s || {
+          kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/semantic-router-models -n vllm-semantic-router-system --timeout=300s || {
             echo "PVC binding timeout. Checking PVC status..."
             kubectl describe pvc -n vllm-semantic-router-system
             exit 1
           }
 
           # Wait for pods to be created
           echo "Waiting for pods to be created..."
-          timeout 120 bash -c 'until kubectl get pods -n vllm-semantic-router-system | grep -q semantic-router; do echo "Waiting for pod creation..."; sleep 5; done'
+          timeout 300 bash -c 'until kubectl get pods -n vllm-semantic-router-system | grep -q semantic-router; do echo "Waiting for pod creation..."; sleep 5; done'
 
           # Show pod status
           kubectl get pods -n vllm-semantic-router-system
 
           # Wait for init container to complete (model download)
           echo "Waiting for init container to complete (downloading models)..."
-          kubectl wait --for=condition=Initialized pods -l app=semantic-router -n vllm-semantic-router-system --timeout=600s || {
+          kubectl wait --for=condition=Initialized pods -l app=semantic-router -n vllm-semantic-router-system --timeout=1200s || {
             echo "❌ Init container did not complete in time. Showing logs..."
             kubectl logs -n vllm-semantic-router-system -l app=semantic-router -c model-downloader --tail=200 || true
             kubectl describe pods -n vllm-semantic-router-system -l app=semantic-router
@@ -268,7 +279,7 @@ jobs:
 
           # Wait for main container to be ready (increased timeout for model loading)
           echo "Waiting for main container to be ready..."
-          kubectl wait --for=condition=Ready pods -l app=semantic-router -n vllm-semantic-router-system --timeout=600s || {
+          kubectl wait --for=condition=Ready pods -l app=semantic-router -n vllm-semantic-router-system --timeout=1200s || {
             echo "❌ Pod did not become ready in time. Showing status and logs..."
             kubectl describe pods -n vllm-semantic-router-system -l app=semantic-router
             kubectl logs -n vllm-semantic-router-system -l app=semantic-router --tail=200 || true
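For reference, a sketch of how the patched readiness probe would appear in the rendered semantic-router container spec once the JSON patch above is applied. The probe values come directly from the patch; the Deployment skeleton, container name, and container port number are illustrative assumptions:

spec:
  template:
    spec:
      containers:
        - name: semantic-router            # assumed container name
          ports:
            - name: classify-api           # named port referenced by the probe
              containerPort: 8080          # assumed port number
          readinessProbe:
            httpGet:
              path: /health
              port: classify-api
              scheme: HTTP
            initialDelaySeconds: 120       # allow time for model loading before the first check
            periodSeconds: 15
            timeoutSeconds: 5
            failureThreshold: 20           # probe can keep failing for several more minutes before the pod is marked unready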

.github/workflows/pre-commit.yml

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ jobs:
       - name: Set up Rust
         uses: dtolnay/rust-toolchain@stable
         with:
-          toolchain: 1.85
+          toolchain: 1.90
           components: rustfmt, clippy
 
       - name: Install system dependencies

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+name: Quickstart Integration Test
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'scripts/quickstart.sh'
+      - 'deploy/docker-compose/**'
+      - 'config/config.yaml'
+      - 'tools/make/common.mk'
+      - 'tools/make/models.mk'
+      - 'tools/make/docker.mk'
+  workflow_dispatch: # Allow manual triggering
+
+jobs:
+  test-quickstart:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Free up disk space
+        run: |
+          echo "Disk space before cleanup:"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          echo "Disk space after cleanup:"
+          df -h
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y \
+            make \
+            curl \
+            docker-compose
+
+      - name: Run quickstart script
+        id: quickstart
+        run: |
+          timeout 1200 bash scripts/quickstart.sh || {
+            exit_code=$?
+            if [ $exit_code -eq 124 ]; then
+              echo "::error::Quickstart script timed out after 20 minutes"
+            else
+              echo "::error::Quickstart script failed with exit code $exit_code"
+            fi
+            exit $exit_code
+          }
+        env:
+          CI: true
+          CI_MINIMAL_MODELS: true
+          TERM: xterm
+          HF_HUB_ENABLE_HF_TRANSFER: 1
+          HF_HUB_DISABLE_TELEMETRY: 1
+
+      - name: Test semantic routing functionality
+        run: |
+          echo "Testing semantic router with a sample query..."
+
+          response=$(curl -s -X POST http://localhost:8801/v1/chat/completions \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "qwen3",
+              "messages": [{"role": "user", "content": "What is 2 + 2?"}],
+              "temperature": 0.7
+            }')
+
+          echo "Full response: $response"
+
+          # Validate response structure
+          if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
+            echo "✓ Semantic router successfully routed and processed the query"
+            echo "  Answer: $(echo "$response" | jq -r '.choices[0].message.content' | head -c 200)"
+          else
+            echo "::error::Semantic router failed to process query correctly"
+            echo "Response was: $response"
+            exit 1
+          fi
+
+      - name: Show service logs on failure
+        if: failure()
+        run: |
+          echo "=== Docker Compose Logs ==="
+          docker compose -f deploy/docker-compose/docker-compose.yml logs
+          echo "=== Container Status ==="
+          docker ps -a
+          echo "=== Semantic Router Logs ==="
+          docker logs semantic-router || true
+          echo "=== Envoy Logs ==="
+          docker logs envoy-proxy || true
+          echo "=== Dashboard Logs ==="
+          docker logs semantic-router-dashboard || true
+
+      - name: Clean up
+        if: always()
+        run: |
+          make docker-compose-down || true
+          docker system prune -af --volumes || true
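For reference, a minimal sketch of running the same smoke test by hand against a local quickstart stack. The endpoint, model name, and jq check mirror the workflow step above; having jq installed and the stack already running are assumptions:

#!/usr/bin/env bash
# Manual version of the "Test semantic routing functionality" step above (assumes the quickstart stack is up).
set -euo pipefail

response=$(curl -s -X POST http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3", "messages": [{"role": "user", "content": "What is 2 + 2?"}], "temperature": 0.7}')

# A routed response should contain an assistant message; anything else is treated as a failure.
if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
  echo "Routing OK: $(echo "$response" | jq -r '.choices[0].message.content' | head -c 200)"
else
  echo "Routing failed: $response" >&2
  exit 1
fi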

.github/workflows/test-and-build.yml

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ jobs:
       - name: Set up Rust
         uses: dtolnay/rust-toolchain@stable
         with:
-          toolchain: 1.85
+          toolchain: 1.90
 
       - name: Set up Go
         uses: actions/setup-go@v5

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -75,6 +75,9 @@ bin/
 !*/models/README.md
 models/
 
+# Training data
+wikipedia_data/
+
 # Added by Claude Task Master
 # Logs
 logs
@@ -94,7 +97,7 @@ node_modules/
 *.sw?
 # Task files
 tasks.json
-tasks/
+tasks/
 .cursor/
 .roo/
 .env.example

Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-FROM quay.io/centos/centos:stream9
+FROM quay.io/centos/centos:stream10
 
 RUN dnf -y update && \
     dnf -y install epel-release && \
@@ -32,7 +32,7 @@ RUN ARCH=$(uname -m) && \
     curl -OL https://github.com/envoyproxy/envoy/releases/download/v${ENVOY_VERSION}/envoy-${ENVOY_VERSION}-linux-${ENVOY_ARCH} && \
     chmod +x envoy-${ENVOY_VERSION}-linux-${ENVOY_ARCH} && \
    mv envoy-${ENVOY_VERSION}-linux-${ENVOY_ARCH} /usr/local/bin/envoy
-
+
 # Install Golang
 ENV GOLANG_VERSION=1.24.1
 RUN ARCH=$(uname -m) && \

Dockerfile.extproc

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Build the Rust library using Makefile
-FROM rust:1.85 AS rust-builder
+FROM rust:1.90 AS rust-builder
 
 # Install make and other build dependencies including cross-compilation tools
 RUN apt-get update && apt-get install -y \
@@ -82,7 +82,7 @@ RUN mkdir -p bin && cd src/semantic-router && \
     go build -ldflags="-w -s" -o ../../bin/router cmd/main.go
 
 # Final stage: copy the binary and the shared library
-FROM quay.io/centos/centos:stream9
+FROM quay.io/centos/centos:stream10
 
 WORKDIR /app
 

Dockerfile.extproc.cross

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Cross-compilation optimized Dockerfile for ARM64
-FROM --platform=linux/amd64 rust:1.85 AS rust-cross-builder
+FROM --platform=linux/amd64 rust:1.90 AS rust-cross-builder
 
 # Install cross-compilation dependencies
 RUN dpkg --add-architecture arm64 && \
@@ -212,7 +212,7 @@ RUN mkdir -p bin && cd src/semantic-router && \
     fi
 
 # Final stage: copy the binary and the shared library
-FROM quay.io/centos/centos:stream9
+FROM quay.io/centos/centos:stream10
 
 # Install OpenSSL runtime libraries
 RUN dnf update -y && \

README.md

Lines changed: 13 additions & 18 deletions
@@ -16,6 +16,7 @@
 
 *Latest News* 🔥
 
+- [2025/10/26] We reached 2000 stars on GitHub! 🔥
 - [2025/10/21] We announced the [2025 Q4 Roadmap: Journey to Iris](https://vllm-semantic-router.com/blog/q4-roadmap-iris) 📅.
 - [2025/10/16] We established the [vLLM Semantic Router Youtube Channel](https://www.youtube.com/@vLLMSemanticRouter) ✨.
 - [2025/10/15] We announced the [vLLM Semantic Router Dashboard](https://www.youtube.com/watch?v=E2IirN8PsFw) 🚀.
@@ -25,13 +26,6 @@
 - [2025/09/15] We reached 1000 stars on GitHub! 🔥
 - [2025/09/01] We released the project officially: [vLLM Semantic Router: Next Phase in LLM inference](https://blog.vllm.ai/2025/09/11/semantic-router.html) 🚀.
 
-<!-- <details>
-<summary>Previous News 🔥</summary>
-
--
-
-</details> -->
-
 ---
 
 ## Innovations ✨
@@ -44,30 +38,36 @@
 
 An **Mixture-of-Models** (MoM) router that intelligently directs OpenAI API requests to the most suitable models from a defined pool based on **Semantic Understanding** of the request's intent (Complexity, Task, Tools).
 
-This is achieved using BERT classification. Conceptually similar to Mixture-of-Experts (MoE) which lives *within* a model, this system selects the best *entire model* for the nature of the task.
+![](./website/static/img/mom-overview.png)
+
+Conceptually similar to Mixture-of-Experts (MoE) which lives *within* a model, this system selects the best *entire model* for the nature of the task.
 
 As such, the overall inference accuracy is improved by using a pool of models that are better suited for different types of tasks:
 
 ![Model Accuracy](./website/static/img/category_accuracies.png)
 
-The screenshot below shows the LLM Router dashboard in Grafana.
-
-![LLM Router Dashboard](./website/static/img/grafana_screenshot.png)
-
 The router is implemented in two ways:
 
 - Golang (with Rust FFI based on the [candle](https://github.com/huggingface/candle) rust ML framework)
 - Python
 Benchmarking will be conducted to determine the best implementation.
 
+#### Request Flow
+
+![architecture](./website/static/img/flow.png)
+
 #### Auto-Selection of Tools
 
 Select the tools to use based on the prompt, avoiding the use of tools that are not relevant to the prompt so as to reduce the number of prompt tokens and improve tool selection accuracy by the LLM.
 
-#### Category-Specific System Prompts
+#### Domain Aware System Prompts
 
 Automatically inject specialized system prompts based on query classification, ensuring optimal model behavior for different domains (math, coding, business, etc.) without manual prompt engineering.
 
+#### Domain Aware Similarity Caching ⚡️
+
+Cache the semantic representation of the prompt so as to reduce the number of prompt tokens and improve the overall inference latency.
+
 ### Enterprise Security 🔒
 
 #### PII detection
@@ -78,10 +78,6 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p
 
 Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. Can be configured globally or at the category level for fine-grained security control.
 
-### Similarity Caching ⚡️
-
-Cache the semantic representation of the prompt so as to reduce the number of prompt tokens and improve the overall inference latency.
-
 ### Distributed Tracing 🔍
 
 Comprehensive observability with OpenTelemetry distributed tracing provides fine-grained visibility into the request processing pipeline.
@@ -128,7 +124,6 @@ The documentation includes:
 - **[Model Training](https://vllm-semantic-router.com/docs/training/training-overview/)** - How classification models work
 - **[API Reference](https://vllm-semantic-router.com/docs/api/router/)** - Complete API documentation
 - **[Dashboard](https://vllm-semantic-router.com/docs/overview/dashboard)** - vLLM Semantic Router Dashboard
-- **[Distributed Tracing](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/)** - Observability and debugging guide
 
 ## Community 👋