59 changes: 59 additions & 0 deletions .github/workflows/router-e2e-test.yml
@@ -231,3 +231,62 @@ jobs:
          pkill -f "python3 -m src.vllm_router.app" || true

      - run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."

  router-stress-test:
    runs-on: self-hosted
Collaborator: Could a GitHub-hosted runner handle this job, since no real model is needed?

    needs: e2e-test
Collaborator: The stress test should focus mainly on performance rather than routing correctness, since the latter is already covered by the discovery test. This job could therefore be a separate workflow and be pipelined with the other workflows.

    if: github.event.pull_request.draft == false
    env:
      LOG_DIR: /tmp/router-stress-test-${{ github.event.pull_request.number || 'main' }}

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .

      - name: Install Apache Bench
        run: |
          sudo apt-get update
          sudo apt-get install -y apache2-utils

      - name: Run Router Stress Test
        env:
          PYTHONPATH: ${{ github.workspace }}/src
          VLLM_ROUTER_STRESS_TEST_MODE: true
        run: |
          echo "🧪 Running router stress test with mock backends"
          chmod +x tests/e2e/router-stress-test.sh
          ./tests/e2e/router-stress-test.sh \
            --concurrent 20000 \
            --requests 100000 \
            --port 30080 \
            --log-dir "$LOG_DIR" \
            --model "facebook/opt-125m" \
Collaborator: It would be better to deprecate these fields to improve clarity.

            --backend1-port 8000 \
            --backend2-port 8001
        timeout-minutes: 10

      - name: Archive router stress test results and logs
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: router-stress-test-results-pr-${{ github.event.pull_request.number || 'main' }}
          path: |
            ${{ env.LOG_DIR }}/*
Collaborator: A more concise artifact that better showcases the performance could be uploaded; the current log file is too large.
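One way to do this, sketched below as a hypothetical post-processing step (the helper name, log path, and the "to http://localhost:PORT" log format are assumptions based on this script's own grep patterns, not part of this PR): condense the router log into a small JSON summary of per-backend routing counts and upload that instead of the raw log.

# summarize_router_log.py - hypothetical helper: condenses the router log
# into per-backend routing counts so the uploaded artifact stays small.
import json
import re
import sys
from collections import Counter

ROUTE_PATTERN = re.compile(r"to (http://localhost:\d+)")

def summarize(log_path: str) -> dict:
    counts: Counter = Counter()
    with open(log_path) as log:
        for line in log:
            match = ROUTE_PATTERN.search(line)
            if match:
                counts[match.group(1)] += 1
    return {"routed_requests": dict(counts), "total": sum(counts.values())}

if __name__ == "__main__":
    # Usage: python summarize_router_log.py "$LOG_DIR/router.log" > summary.json
    json.dump(summarize(sys.argv[1]), sys.stdout, indent=2)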


      - name: Cleanup router processes
        if: always()
        run: |
          echo "🧹 Cleaning up router processes"
          pkill -f "python3 -m src.vllm_router.app" || true

      - run: echo "🍏 Router stress test job status is ${{ job.status }}."
18 changes: 18 additions & 0 deletions src/vllm_router/services/request_service/request.py
@@ -95,6 +95,24 @@ async def process_request(
    request.app.state.request_stats_monitor.on_new_request(
        backend_url, request_id, start_time
    )

    # Check if stress test mode is enabled; used by tests/e2e/router-stress-test.sh
    if os.getenv("VLLM_ROUTER_STRESS_TEST_MODE", "false").lower() == "true":
        # Mock response for stress testing - skip backend calls
        mock_headers = {"content-type": "application/json", "x-request-id": request_id}
        mock_response = b'{"id":"test","object":"chat.completion","choices":[{"message":{"role":"assistant","content":"Test"},"index":0,"finish_reason":"stop"}]}'

Review comment (medium): Hardcoding a JSON string makes the response less readable and harder to maintain. It is better to define it as a Python dictionary and serialize it to JSON; this improves clarity and simplifies future modifications.

Suggested change:
-        mock_response = b'{"id":"test","object":"chat.completion","choices":[{"message":{"role":"assistant","content":"Test"},"index":0,"finish_reason":"stop"}]}'
+        mock_response = json.dumps({
+            "id": "test",
+            "object": "chat.completion",
+            "choices": [{
+                "message": {"role": "assistant", "content": "Test"},
+                "index": 0,
+                "finish_reason": "stop"
+            }]
+        }).encode("utf-8")
(Note: this assumes json is imported in request.py.)


        # Yield headers and mock response
        yield mock_headers, 200
        request.app.state.request_stats_monitor.on_request_response(
            backend_url, request_id, time.time()
        )
        yield mock_response
Collaborator: Could we use a stub HTTP server or a dummy backend server instead? That way we would not need to modify the source code.
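A minimal sketch of that idea, using only the Python standard library (the file name, port, endpoints, and response shape below are assumptions mirroring the mock response in this PR, not an existing fixture): a stub backend that answers the router's GET /v1/models probe and POST /v1/chat/completions with canned JSON, so the stress test could run against an unmodified router.

# stub_backend.py - hypothetical dummy backend for router stress testing.
import json
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

CANNED_COMPLETION = {
    "id": "test",
    "object": "chat.completion",
    "choices": [
        {"message": {"role": "assistant", "content": "Test"},
         "index": 0,
         "finish_reason": "stop"}
    ],
}

class StubBackend(BaseHTTPRequestHandler):
    def _send_json(self, payload: dict) -> None:
        body = json.dumps(payload).encode("utf-8")
        self.send_response(200)
        self.send_header("content-type", "application/json")
        self.send_header("content-length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        # Answer the router's /v1/models discovery probe.
        self._send_json({"object": "list", "data": [{"id": "facebook/opt-125m"}]})

    def do_POST(self):
        # Drain the request body, then return the canned completion.
        self.rfile.read(int(self.headers.get("content-length", 0)))
        self._send_json(CANNED_COMPLETION)

    def log_message(self, *args):
        # Silence per-request logging so the stub stays fast.
        pass

if __name__ == "__main__":
    import sys
    port = int(sys.argv[1]) if len(sys.argv) > 1 else 8000
    ThreadingHTTPServer(("0.0.0.0", port), StubBackend).serve_forever()

The trade-off: a real (if trivial) HTTP hop also exercises the router's proxying path, whereas the in-process mock isolates pure routing logic; a stdlib server may itself become the bottleneck at 20000 concurrent connections.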

        request.app.state.request_stats_monitor.on_request_complete(
            backend_url, request_id, time.time()
        )
        return

    # Check if this is a streaming request
    is_streaming = False
    try:
299 changes: 299 additions & 0 deletions tests/e2e/router-stress-test.sh
@@ -0,0 +1,299 @@
#!/bin/bash

# Router Stress Test - Pure Router Logic Testing
# Tests round-robin routing logic under high concurrent loads
#
# IMPORTANT NOTES:
# - This test uses MOCK backends and MOCK responses (no real vLLM servers)
# - Backend ports are dummy placeholders - no actual services run on them
# - Model names are dummy placeholders - no real models are loaded
# - When VLLM_ROUTER_STRESS_TEST_MODE=true, the router returns mock responses
# instead of forwarding requests to backends (see src/vllm_router/services/request_service/request.py)
# - This test validates ONLY the router's routing logic, load balancing, and performance
# under high concurrent loads, not actual inference capabilities
#
# Purpose: Verify that the router correctly distributes requests using round-robin
# logic and can handle high concurrency without routing logic failures.
#
# TODO: add tests for the prefix-aware and session-based routers
# TODO: add performance-comparison tests and thresholds

set -euo pipefail

# Default values
ROUTER_PORT=30080
CONCURRENT=20000
REQUESTS=100000
LOG_DIR="/tmp/router-stress-logs"
MODEL="facebook/opt-125m"
BACKEND1_PORT=8000
BACKEND2_PORT=8001
BACKENDS_URL="http://localhost:$BACKEND1_PORT,http://localhost:$BACKEND2_PORT"

# Colors for output
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m'

print_status() {
echo -e "${GREEN}[INFO]${NC} $1"
}

print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}

print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}

show_usage() {
cat << EOF
Router Stress Test - Tests round-robin routing logic

Usage: $0 [options]

Options:
-c, --concurrent N Concurrent requests (default: 20000)
-n, --requests N Total requests (default: 100000)
-p, --port PORT Router port (default: 30080)
-l, --log-dir DIR Log directory (default: /tmp/router-stress-logs)
-m, --model MODEL Model to use (default: facebook/opt-125m)
--backend1-port PORT First backend port (default: 8000)
--backend2-port PORT Second backend port (default: 8001)
-h, --help Show this help

Examples:
$0 # Basic test (20000 concurrent, 100000 requests)
$0 -c 500 -n 20000 # Reduced-load test
$0 -p 8080 -c 100 # Different port, lower load
$0 --backend1-port 9000 --backend2-port 9001 # Custom backend ports

Prerequisites:
- Router must be started with VLLM_ROUTER_STRESS_TEST_MODE=true
EOF
}

# Check if Apache Bench is available
check_ab() {
if ! command -v ab >/dev/null 2>&1; then
print_error "Apache Bench (ab) not found!"
print_error "Install with: sudo apt-get install apache2-utils"
exit 1
fi
}

# Function to cleanup processes
cleanup() {
print_status "Cleaning up router processes..."
pkill -f "python3 -m src.vllm_router.app" || true
sleep 2
}

# Function to start router
start_router() {
local log_file="$LOG_DIR/router.log"

print_status "Starting router with round-robin routing (stress test mode)"

# Create log directory
mkdir -p "$(dirname "$log_file")"

# Set stress test mode
export VLLM_ROUTER_STRESS_TEST_MODE=true

# Start router with detailed logging
python3 -m src.vllm_router.app --port "$ROUTER_PORT" \
--service-discovery static \
--static-backends "$BACKENDS_URL" \
--static-models "$MODEL,$MODEL" \
--static-model-types "chat,chat" \
--routing-logic roundrobin \
--log-stats \
--log-stats-interval 5 > "$log_file" 2>&1 &

ROUTER_PID=$!
print_status "Router started with PID: $ROUTER_PID"

# Wait for router to be ready
print_status "Waiting for router to be ready..."
timeout 30 bash -c "until curl -s http://localhost:$ROUTER_PORT/v1/models > /dev/null 2>&1; do sleep 1; done" || {
print_error "Router failed to start within 30 seconds"
print_error "Router log:"
tail -20 "$log_file" || true
exit 1
}
print_status "Router is ready"
}

# Function to run stress test
run_stress_test() {
print_status "Running stress test with Apache Bench"
print_status "Concurrent: $CONCURRENT, Total: $REQUESTS"

# Create payload file
local payload_file="/tmp/stress_payload.json"

Review comment (medium): Using a fixed filename in /tmp for the payload file can cause conflicts if multiple instances of this script run concurrently, leading to race conditions and unpredictable behavior. It is safer to use mktemp to generate a unique temporary filename.

Suggested change:
-    local payload_file="/tmp/stress_payload.json"
+    local payload_file=$(mktemp /tmp/stress_payload.XXXXXX.json)

cat > "$payload_file" << EOF
{
"model": "$MODEL",
"messages": [
{"role": "user", "content": "Test message for stress testing"}
],
"max_tokens": 10,
"temperature": 0.7
}
EOF

# Run Apache Bench
ab -c "$CONCURRENT" \
-n "$REQUESTS" \
-p "$payload_file" \
-T "application/json" \
-H "Authorization: Bearer test" \
-H "x-user-id: stress-test-user" \
"http://localhost:$ROUTER_PORT/v1/chat/completions"

# Clean up payload file
rm -f "$payload_file"

print_status "Stress test completed"

# Small delay to ensure all logs are written
sleep 2
}

# Function to check round-robin correctness
check_roundrobin_correctness() {
local log_file="$LOG_DIR/router.log"

print_status "Checking round-robin routing correctness..."

if [ ! -f "$log_file" ]; then
print_error "Router log file not found: $log_file"
return 1
fi

# Extract backend routing decisions from logs
# Look for "Routing request ... to http://localhost:XXXX"
local backend1_count
# grep -c prints 0 even when nothing matches but exits non-zero, so use
# "|| true" (not "|| echo 0", which would append a second line to the count).
backend1_count=$(grep -c "to http://localhost:$BACKEND1_PORT" "$log_file" || true)

local backend2_count
backend2_count=$(grep -c "to http://localhost:$BACKEND2_PORT" "$log_file" || true)

local total_routed=$((backend1_count + backend2_count))

print_status "Round-robin routing results:"
print_status " Backend localhost:$BACKEND1_PORT: $backend1_count requests"
print_status " Backend localhost:$BACKEND2_PORT: $backend2_count requests"
print_status " Total routed: $total_routed requests"

if [ "$total_routed" -eq 0 ]; then
print_error "No routing decisions found in logs"
return 1
fi

# Calculate percentages
local backend1_pct=$((backend1_count * 100 / total_routed))
local backend2_pct=$((backend2_count * 100 / total_routed))

print_status " Backend localhost:$BACKEND1_PORT: ${backend1_pct}%"
print_status " Backend localhost:$BACKEND2_PORT: ${backend2_pct}%"

# Check if distribution is roughly even (within 20% tolerance)
local diff=$((backend1_pct > backend2_pct ? backend1_pct - backend2_pct : backend2_pct - backend1_pct))

if [ "$diff" -le 20 ]; then
print_status "✅ Round-robin routing is working correctly (${diff}% difference)"
return 0
else
print_error "❌ Round-robin routing appears uneven (${diff}% difference)"
print_status "Last 10 routing decisions from logs:"
grep "Routing request.*to http://localhost:" "$log_file" | tail -10 | sed 's/^/ /' || true
return 1
fi
}

# Function to show log summary
show_log_summary() {
local log_file="$LOG_DIR/router.log"

if [ -f "$log_file" ]; then
print_status "Log summary (last 20 lines):"
tail -20 "$log_file" | sed 's/^/ /'
fi
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
-c|--concurrent)
CONCURRENT="$2"
shift 2
;;
-n|--requests)
REQUESTS="$2"
shift 2
;;
-p|--port)
ROUTER_PORT="$2"
shift 2
;;
-l|--log-dir)
LOG_DIR="$2"
shift 2
;;
-m|--model)
MODEL="$2"
shift 2
;;
--backend1-port)
BACKEND1_PORT="$2"
shift 2
;;
--backend2-port)
BACKEND2_PORT="$2"
shift 2
;;
-h|--help)
show_usage
exit 0
;;
*)
print_error "Unknown option: $1"
show_usage
exit 1
;;
esac
done

# Set trap for cleanup
trap cleanup EXIT

# Update backends URL with final port values
BACKENDS_URL="http://localhost:$BACKEND1_PORT,http://localhost:$BACKEND2_PORT"

# Check prerequisites
print_status "Checking prerequisites..."
check_ab

print_status "Router stress test configuration:"
print_status " Concurrent requests: $CONCURRENT"
print_status " Total requests: $REQUESTS"
print_status " Router port: $ROUTER_PORT"
print_status " Backend ports: $BACKEND1_PORT, $BACKEND2_PORT"
print_status " Model: $MODEL"

# Run test
start_router
run_stress_test

# Check correctness and show results
if check_roundrobin_correctness; then
print_status "Test completed successfully!"
else
print_error "Test completed but round-robin routing correctness check failed!"
show_log_summary
exit 1
fi