110 changes: 110 additions & 0 deletions .github/workflows/quickstart-integration-test.yml
@@ -0,0 +1,110 @@
name: Quickstart Integration Test

on:
pull_request:
branches:
- main
paths:
- 'scripts/quickstart.sh'
- 'deploy/docker-compose/**'
- 'config/config.yaml'
- 'tools/make/common.mk'
- 'tools/make/models.mk'
- 'tools/make/docker.mk'
workflow_dispatch: # Allow manual triggering

jobs:
test-quickstart:
runs-on: ubuntu-latest
timeout-minutes: 30

steps:
- name: Check out the repo
uses: actions/checkout@v4

- name: Free up disk space
run: |
echo "Disk space before cleanup:"
df -h
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
echo "Disk space after cleanup:"
df -h

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y \
make \
curl \
docker-compose

- name: Run quickstart script
id: quickstart
run: |
timeout 1200 bash scripts/quickstart.sh || {
exit_code=$?
if [ $exit_code -eq 124 ]; then
echo "::error::Quickstart script timed out after 20 minutes"
else
echo "::error::Quickstart script failed with exit code $exit_code"
fi
exit $exit_code
}
env:
CI: true
CI_MINIMAL_MODELS: true
TERM: xterm
HF_HUB_ENABLE_HF_TRANSFER: 1
HF_HUB_DISABLE_TELEMETRY: 1

- name: Test semantic routing functionality
run: |
echo "Testing semantic router with a sample query..."

response=$(curl -s -X POST http://localhost:8801/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3",
"messages": [{"role": "user", "content": "What is 2 + 2?"}],
"temperature": 0.7
}')

echo "Full response: $response"

# Validate response structure
if echo "$response" | jq -e '.choices[0].message.content' > /dev/null 2>&1; then
echo "✓ Semantic router successfully routed and processed the query"
echo " Answer: $(echo "$response" | jq -r '.choices[0].message.content' | head -c 200)"
else
echo "::error::Semantic router failed to process query correctly"
echo "Response was: $response"
exit 1
fi

- name: Show service logs on failure
if: failure()
run: |
echo "=== Docker Compose Logs ==="
docker compose -f deploy/docker-compose/docker-compose.yml logs
echo "=== Container Status ==="
docker ps -a
echo "=== Semantic Router Logs ==="
docker logs semantic-router || true
echo "=== Envoy Logs ==="
docker logs envoy-proxy || true
echo "=== Dashboard Logs ==="
docker logs semantic-router-dashboard || true

- name: Clean up
if: always()
run: |
make docker-compose-down || true
docker system prune -af --volumes || true
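To reproduce this smoke test locally, a minimal sketch (assuming the repository root as working directory and jq installed; CI_MINIMAL_MODELS mirrors the CI environment set above):

    # Bring the stack up with the same trimmed model set CI uses
    CI_MINIMAL_MODELS=true bash scripts/quickstart.sh

    # Send one completion through the Envoy proxy, as the workflow does
    response=$(curl -s -X POST http://localhost:8801/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "qwen3", "messages": [{"role": "user", "content": "What is 2 + 2?"}]}')

    # A successful route returns .choices[0].message.content
    echo "$response" | jq -r '.choices[0].message.content'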
11 changes: 1 addition & 10 deletions config/config.yaml
@@ -24,15 +24,6 @@ semantic_cache:
# Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
# Default: "bert" (fastest, lowest memory)
embedding_model: "bert"
# HNSW index configuration (for memory backend only)
use_hnsw: true # Enable HNSW index for faster similarity search
hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)

# Hybrid cache configuration (when backend_type: "hybrid")
# Combines in-memory HNSW for fast search with Milvus for scalable storage
# max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
# backend_config_path: "config/milvus.yaml" # Path to Milvus config

tools:
enabled: true
@@ -223,7 +214,7 @@ router:
traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability
tie_break_confidence: 0.5 # Confidence value for tie-breaking situations

default_model: openai/gpt-oss-20b
default_model: qwen3

# Reasoning family configurations
reasoning_families:
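Note that default_model must name a model the quickstart stack actually serves; the workflow above requests "qwen3" in its smoke test, so the two have to stay in sync. A quick consistency check, as a sketch run from the repository root:

    # Both should mention qwen3 after this change
    grep -n 'default_model' config/config.yaml
    grep -n '"model"' .github/workflows/quickstart-integration-test.yml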
41 changes: 25 additions & 16 deletions deploy/docker-compose/docker-compose.yml
@@ -7,9 +7,9 @@ services:
ports:
- "50051:50051"
volumes:
- ../../config:/app/config:ro
- ../../models:/app/models:ro
- ~/.cache/huggingface:/root/.cache/huggingface
- ../../config:/app/config:ro,z
- ../../models:/app/models:ro,z
- ~/.cache/huggingface:/root/.cache/huggingface:z
environment:
- LD_LIBRARY_PATH=/app/lib
# Use main config by default; override via CONFIG_FILE if needed
@@ -32,19 +32,21 @@
envoy:
image: envoyproxy/envoy:v1.31.7
container_name: envoy-proxy
security_opt:
- label=disable
ports:
- "8801:8801" # Main proxy port
- "19000:19000" # Admin interface
volumes:
- ./addons/envoy.yaml:/etc/envoy/envoy.yaml:ro
- ./addons/envoy.yaml:/etc/envoy/envoy.yaml:ro,z
command: ["/usr/local/bin/envoy", "-c", "/etc/envoy/envoy.yaml", "--component-log-level", "ext_proc:trace,router:trace,http:trace"]
depends_on:
semantic-router:
condition: service_healthy
networks:
- semantic-network
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:19000/ready"]
test: ["CMD", "bash", "-c", "(echo -e 'GET /ready HTTP/1.1\\r\\nHost: localhost\\r\\n\\r\\n' >&3; timeout 2 cat <&3) 3<>/dev/tcp/localhost/19000 | grep -q LIVE"]
interval: 10s
timeout: 5s
retries: 5
@@ -86,7 +88,7 @@ services:
image: prom/prometheus:v2.53.0
container_name: prometheus
volumes:
- ./addons/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro
- ./addons/prometheus.yaml:/etc/prometheus/prometheus.yaml:ro,z
- prometheus-data:/prometheus
command:
- --config.file=/etc/prometheus/prometheus.yaml
@@ -106,11 +108,11 @@ services:
ports:
- "3000:3000"
volumes:
- ./addons/grafana.ini:/etc/grafana/grafana.ini:ro
- ./addons/grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro
- ./addons/grafana-datasource-jaeger.yaml:/etc/grafana/provisioning/datasources/datasource_jaeger.yaml:ro
- ./addons/grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboard.yaml:ro
- ./addons/llm-router-dashboard.json:/etc/grafana/provisioning/dashboards/llm-router-dashboard.json:ro
- ./addons/grafana.ini:/etc/grafana/grafana.ini:ro,z
- ./addons/grafana-datasource.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro,z
- ./addons/grafana-datasource-jaeger.yaml:/etc/grafana/provisioning/datasources/datasource_jaeger.yaml:ro,z
- ./addons/grafana-dashboard.yaml:/etc/grafana/provisioning/dashboards/dashboard.yaml:ro,z
- ./addons/llm-router-dashboard.json:/etc/grafana/provisioning/dashboards/llm-router-dashboard.json:ro,z
- grafana-data:/var/lib/grafana
networks:
- semantic-network
@@ -175,9 +177,16 @@ services:
- PYTHONUNBUFFERED=1
volumes:
# Persistent pipelines storage (auto-loaded on start)
- openwebui-pipelines:/app/pipelines
# Mount our vLLM Semantic Router pipeline
- ./addons/vllm_semantic_router_pipe.py:/app/pipelines/vllm_semantic_router_pipe.py:ro
- type: volume
source: openwebui-pipelines
target: /app/pipelines
volume:
nocopy: true
# Mount our vLLM Semantic Router pipeline (read-only) into the persistent dir
- type: bind
source: ./addons/vllm_semantic_router_pipe.py
target: /app/pipelines/vllm_semantic_router_pipe.py
read_only: true
networks:
- semantic-network

@@ -202,7 +211,7 @@ services:
- HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN:-}
- HF_HUB_ENABLE_HF_TRANSFER=1
volumes:
- ../../models:/app/models:ro
- ../../models:/app/models:ro,z
- hf-cache:/home/llmkatan/.cache/huggingface
networks:
semantic-network:
@@ -235,7 +244,7 @@ services:
- TARGET_CHATUI_URL=http://chat-ui:3000
- ROUTER_CONFIG_PATH=/app/config/config.yaml
volumes:
- ../../config:/app/config:rw
- ../../config:/app/config:rw,z
ports:
- "8700:8700"
networks:
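The recurring ":z" volume suffix and the envoy service's "label=disable" security option are SELinux accommodations: on enforcing hosts (Fedora, RHEL) bind mounts are unreadable to containers until the engine relabels them as shared, which ":z" requests, while "label=disable" skips label separation for that container entirely. The docker CLI equivalent of a relabeled mount, as a sketch (the alpine image here is just an arbitrary test container):

    # ":z" relabels ./config so an SELinux-confined container may read it
    docker run --rm -v "$PWD/config:/app/config:ro,z" alpine ls /app/config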