
Commit 5973a9c

Merge branch 'main' into 1022-yuluo/helm
2 parents 03baf65 + 0fc81e6 commit 5973a9c

67 files changed, +10646 −656 lines changed


.github/workflows/k8s-kind-integration-test.yml

Lines changed: 15 additions & 4 deletions
@@ -186,6 +186,17 @@ jobs:
           - op: replace
             path: /spec/template/spec/containers/0/resources/limits/cpu
             value: "1"
+          - op: replace
+            path: /spec/template/spec/containers/0/readinessProbe
+            value:
+              httpGet:
+                path: /health
+                port: classify-api
+                scheme: HTTP
+              initialDelaySeconds: 120
+              periodSeconds: 15
+              timeoutSeconds: 5
+              failureThreshold: 20
           - op: add
             path: /spec/template/spec/containers/0/imagePullPolicy
             value: "IfNotPresent"
@@ -244,22 +255,22 @@ jobs:
 
           # Wait for PVC to be bound
           echo "Waiting for PVC to be bound..."
-          kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/semantic-router-models -n vllm-semantic-router-system --timeout=120s || {
+          kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/semantic-router-models -n vllm-semantic-router-system --timeout=300s || {
             echo "PVC binding timeout. Checking PVC status..."
             kubectl describe pvc -n vllm-semantic-router-system
             exit 1
           }
 
           # Wait for pods to be created
           echo "Waiting for pods to be created..."
-          timeout 120 bash -c 'until kubectl get pods -n vllm-semantic-router-system | grep -q semantic-router; do echo "Waiting for pod creation..."; sleep 5; done'
+          timeout 300 bash -c 'until kubectl get pods -n vllm-semantic-router-system | grep -q semantic-router; do echo "Waiting for pod creation..."; sleep 5; done'
 
           # Show pod status
           kubectl get pods -n vllm-semantic-router-system
 
           # Wait for init container to complete (model download)
           echo "Waiting for init container to complete (downloading models)..."
-          kubectl wait --for=condition=Initialized pods -l app=semantic-router -n vllm-semantic-router-system --timeout=600s || {
+          kubectl wait --for=condition=Initialized pods -l app=semantic-router -n vllm-semantic-router-system --timeout=1200s || {
             echo "❌ Init container did not complete in time. Showing logs..."
             kubectl logs -n vllm-semantic-router-system -l app=semantic-router -c model-downloader --tail=200 || true
             kubectl describe pods -n vllm-semantic-router-system -l app=semantic-router
@@ -268,7 +279,7 @@ jobs:
 
           # Wait for main container to be ready (increased timeout for model loading)
           echo "Waiting for main container to be ready..."
-          kubectl wait --for=condition=Ready pods -l app=semantic-router -n vllm-semantic-router-system --timeout=600s || {
+          kubectl wait --for=condition=Ready pods -l app=semantic-router -n vllm-semantic-router-system --timeout=1200s || {
             echo "❌ Pod did not become ready in time. Showing status and logs..."
             kubectl describe pods -n vllm-semantic-router-system -l app=semantic-router
             kubectl logs -n vllm-semantic-router-system -l app=semantic-router --tail=200 || true
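For reference, a sketch of how the patched readiness probe would appear in the rendered semantic-router container spec once the JSON patch above is applied. The probe values come directly from the patch; the Deployment skeleton, container name, and container port number are illustrative assumptions:

spec:
  template:
    spec:
      containers:
        - name: semantic-router            # assumed container name
          ports:
            - name: classify-api           # named port referenced by the probe
              containerPort: 8080          # assumed port number
          readinessProbe:
            httpGet:
              path: /health
              port: classify-api
              scheme: HTTP
            initialDelaySeconds: 120       # allow time for model loading before the first check
            periodSeconds: 15
            timeoutSeconds: 5
            failureThreshold: 20           # probe can keep failing for several more minutes before the pod is marked unready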

.github/workflows/pre-commit.yml

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ jobs:
       - name: Set up Rust
         uses: dtolnay/rust-toolchain@stable
         with:
-          toolchain: 1.85
+          toolchain: 1.90
           components: rustfmt, clippy
 
       - name: Install system dependencies

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+name: Quickstart Integration Test
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'scripts/quickstart.sh'
+      - 'deploy/docker-compose/**'
+      - 'config/config.yaml'
+      - 'tools/make/common.mk'
+      - 'tools/make/models.mk'
+      - 'tools/make/docker.mk'
+  workflow_dispatch: # Allow manual triggering
+
+jobs:
+  test-quickstart:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Free up disk space
+        run: |
+          echo "Disk space before cleanup:"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          echo "Disk space after cleanup:"
+          df -h
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y \
+            make \
+            curl \
+            docker-compose
+
+      - name: Run quickstart script
+        id: quickstart
+        run: |
+          timeout 1200 bash scripts/quickstart.sh || {
+            exit_code=$?
+            if [ $exit_code -eq 124 ]; then
+              echo "::error::Quickstart script timed out after 20 minutes"
+            else
+              echo "::error::Quickstart script failed with exit code $exit_code"
+            fi
+            exit $exit_code
+          }
+        env:
+          CI: true
+          CI_MINIMAL_MODELS: true
+          TERM: xterm
+          HF_HUB_ENABLE_HF_TRANSFER: 1
+          HF_HUB_DISABLE_TELEMETRY: 1
+
+      - name: Test semantic routing functionality
+        run: |
+          echo "Testing semantic router with a sample query..."
+
+          response=$(curl -s -X POST http://localhost:8801/v1/chat/completions \
+            -H "Content-Type: application/json" \
+            -d '{
+              "model": "qwen3",
+              "messages": [{"role": "user", "content": "What is 2 + 2?"}],
+              "temperature": 0.7
+            }')
+
+          echo "Full response: $response"
+
+          # Validate response structure
+          if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
+            echo "✓ Semantic router successfully routed and processed the query"
+            echo "  Answer: $(echo "$response" | jq -r '.choices[0].message.content' | head -c 200)"
+          else
+            echo "::error::Semantic router failed to process query correctly"
+            echo "Response was: $response"
+            exit 1
+          fi
+
+      - name: Show service logs on failure
+        if: failure()
+        run: |
+          echo "=== Docker Compose Logs ==="
+          docker compose -f deploy/docker-compose/docker-compose.yml logs
+          echo "=== Container Status ==="
+          docker ps -a
+          echo "=== Semantic Router Logs ==="
+          docker logs semantic-router || true
+          echo "=== Envoy Logs ==="
+          docker logs envoy-proxy || true
+          echo "=== Dashboard Logs ==="
+          docker logs semantic-router-dashboard || true
+
+      - name: Clean up
+        if: always()
+        run: |
+          make docker-compose-down || true
+          docker system prune -af --volumes || true
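For reference, a minimal sketch of running the same smoke test by hand against a local quickstart stack. The endpoint, model name, and jq check mirror the workflow step above; having jq installed and the stack already running are assumptions:

#!/usr/bin/env bash
# Manual version of the "Test semantic routing functionality" step above (assumes the quickstart stack is up).
set -euo pipefail

response=$(curl -s -X POST http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3", "messages": [{"role": "user", "content": "What is 2 + 2?"}], "temperature": 0.7}')

# A routed response should contain an assistant message; anything else is treated as a failure.
if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
  echo "Routing OK: $(echo "$response" | jq -r '.choices[0].message.content' | head -c 200)"
else
  echo "Routing failed: $response" >&2
  exit 1
fi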

.github/workflows/test-and-build.yml

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ jobs:
       - name: Set up Rust
         uses: dtolnay/rust-toolchain@stable
         with:
-          toolchain: 1.85
+          toolchain: 1.90
 
       - name: Set up Go
         uses: actions/setup-go@v5

.gitignore

Lines changed: 4 additions & 1 deletion
@@ -75,6 +75,9 @@ bin/
 !*/models/README.md
 models/
 
+# Training data
+wikipedia_data/
+
 # Added by Claude Task Master
 # Logs
 logs
@@ -94,7 +97,7 @@ node_modules/
 *.sw?
 # Task files
 tasks.json
-tasks/
+tasks/
 .cursor/
 .roo/
 .env.example

Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-FROM quay.io/centos/centos:stream9
+FROM quay.io/centos/centos:stream10
 
 RUN dnf -y update && \
     dnf -y install epel-release && \
@@ -32,7 +32,7 @@ RUN ARCH=$(uname -m) && \
     curl -OL https://github.com/envoyproxy/envoy/releases/download/v${ENVOY_VERSION}/envoy-${ENVOY_VERSION}-linux-${ENVOY_ARCH} && \
     chmod +x envoy-${ENVOY_VERSION}-linux-${ENVOY_ARCH} && \
    mv envoy-${ENVOY_VERSION}-linux-${ENVOY_ARCH} /usr/local/bin/envoy
-
+
 # Install Golang
 ENV GOLANG_VERSION=1.24.1
 RUN ARCH=$(uname -m) && \

Dockerfile.extproc

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Build the Rust library using Makefile
-FROM rust:1.85 AS rust-builder
+FROM rust:1.90 AS rust-builder
 
 # Install make and other build dependencies including cross-compilation tools
 RUN apt-get update && apt-get install -y \
@@ -82,7 +82,7 @@ RUN mkdir -p bin && cd src/semantic-router && \
     go build -ldflags="-w -s" -o ../../bin/router cmd/main.go
 
 # Final stage: copy the binary and the shared library
-FROM quay.io/centos/centos:stream9
+FROM quay.io/centos/centos:stream10
 
 WORKDIR /app
 

Dockerfile.extproc.cross

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 # Cross-compilation optimized Dockerfile for ARM64
-FROM --platform=linux/amd64 rust:1.85 AS rust-cross-builder
+FROM --platform=linux/amd64 rust:1.90 AS rust-cross-builder
 
 # Install cross-compilation dependencies
 RUN dpkg --add-architecture arm64 && \
@@ -212,7 +212,7 @@ RUN mkdir -p bin && cd src/semantic-router && \
     fi
 
 # Final stage: copy the binary and the shared library
-FROM quay.io/centos/centos:stream9
+FROM quay.io/centos/centos:stream10
 
 # Install OpenSSL runtime libraries
 RUN dnf update -y && \

README.md

Lines changed: 13 additions & 18 deletions
@@ -16,6 +16,7 @@
 
 *Latest News* 🔥
 
+- [2025/10/26] We reached 2000 stars on GitHub! 🔥
 - [2025/10/21] We announced the [2025 Q4 Roadmap: Journey to Iris](https://vllm-semantic-router.com/blog/q4-roadmap-iris) 📅.
 - [2025/10/16] We established the [vLLM Semantic Router Youtube Channel](https://www.youtube.com/@vLLMSemanticRouter) ✨.
 - [2025/10/15] We announced the [vLLM Semantic Router Dashboard](https://www.youtube.com/watch?v=E2IirN8PsFw) 🚀.
@@ -25,13 +26,6 @@
 - [2025/09/15] We reached 1000 stars on GitHub! 🔥
 - [2025/09/01] We released the project officially: [vLLM Semantic Router: Next Phase in LLM inference](https://blog.vllm.ai/2025/09/11/semantic-router.html) 🚀.
 
-<!-- <details>
-<summary>Previous News 🔥</summary>
-
--
-
-</details> -->
-
 ---
 
 ## Innovations ✨
@@ -44,30 +38,36 @@
 
 An **Mixture-of-Models** (MoM) router that intelligently directs OpenAI API requests to the most suitable models from a defined pool based on **Semantic Understanding** of the request's intent (Complexity, Task, Tools).
 
-This is achieved using BERT classification. Conceptually similar to Mixture-of-Experts (MoE) which lives *within* a model, this system selects the best *entire model* for the nature of the task.
+![](./website/static/img/mom-overview.png)
+
+Conceptually similar to Mixture-of-Experts (MoE) which lives *within* a model, this system selects the best *entire model* for the nature of the task.
 
 As such, the overall inference accuracy is improved by using a pool of models that are better suited for different types of tasks:
 
 ![Model Accuracy](./website/static/img/category_accuracies.png)
 
-The screenshot below shows the LLM Router dashboard in Grafana.
-
-![LLM Router Dashboard](./website/static/img/grafana_screenshot.png)
-
 The router is implemented in two ways:
 
 - Golang (with Rust FFI based on the [candle](https://github.com/huggingface/candle) rust ML framework)
 - Python
 Benchmarking will be conducted to determine the best implementation.
 
+#### Request Flow
+
+![architecture](./website/static/img/flow.png)
+
 #### Auto-Selection of Tools
 
 Select the tools to use based on the prompt, avoiding the use of tools that are not relevant to the prompt so as to reduce the number of prompt tokens and improve tool selection accuracy by the LLM.
 
-#### Category-Specific System Prompts
+#### Domain Aware System Prompts
 
 Automatically inject specialized system prompts based on query classification, ensuring optimal model behavior for different domains (math, coding, business, etc.) without manual prompt engineering.
 
+#### Domain Aware Similarity Caching ⚡️
+
+Cache the semantic representation of the prompt so as to reduce the number of prompt tokens and improve the overall inference latency.
+
 ### Enterprise Security 🔒
 
 #### PII detection
@@ -78,10 +78,6 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p
 
 Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. Can be configured globally or at the category level for fine-grained security control.
 
-### Similarity Caching ⚡️
-
-Cache the semantic representation of the prompt so as to reduce the number of prompt tokens and improve the overall inference latency.
-
 ### Distributed Tracing 🔍
 
 Comprehensive observability with OpenTelemetry distributed tracing provides fine-grained visibility into the request processing pipeline.
@@ -128,7 +124,6 @@ The documentation includes:
 - **[Model Training](https://vllm-semantic-router.com/docs/training/training-overview/)** - How classification models work
 - **[API Reference](https://vllm-semantic-router.com/docs/api/router/)** - Complete API documentation
 - **[Dashboard](https://vllm-semantic-router.com/docs/overview/dashboard)** - vLLM Semantic Router Dashboard
-- **[Distributed Tracing](https://vllm-semantic-router.com/docs/tutorials/observability/distributed-tracing/)** - Observability and debugging guide
 
 ## Community 👋