From e2ea70ad594cad596b32140574005c518ea1a9dc Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Tue, 1 Jul 2025 22:59:39 -0700
Subject: [PATCH 1/9] add simple round-robin stress test for router

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 .../services/request_service/request.py       |  14 +
 tests/e2e/stress-test.sh                      | 263 ++++++++++++++++++
 2 files changed, 277 insertions(+)
 create mode 100755 tests/e2e/stress-test.sh

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 46969b2b2..bc8a9d984 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -95,6 +95,20 @@ async def process_request(
     request.app.state.request_stats_monitor.on_new_request(
         backend_url, request_id, start_time
     )
+    
+    # Check if stress test mode is enabled; used for tests/e2e/stress-test.sh
+    if os.getenv("VLLM_ROUTER_STRESS_TEST_MODE", "false").lower() == "true":
+        # Mock response for stress testing - skip backend calls
+        mock_headers = {"content-type": "application/json", "x-request-id": request_id}
+        mock_response = b'{"id":"test","object":"chat.completion","choices":[{"message":{"role":"assistant","content":"Test"},"index":0,"finish_reason":"stop"}]}'
+        
+        # Yield headers and mock response
+        yield mock_headers, 200
+        request.app.state.request_stats_monitor.on_request_response(backend_url, request_id, time.time())
+        yield mock_response
+        request.app.state.request_stats_monitor.on_request_complete(backend_url, request_id, time.time())
+        return
+    
     # Check if this is a streaming request
     is_streaming = False
     try:
diff --git a/tests/e2e/stress-test.sh b/tests/e2e/stress-test.sh
new file mode 100755
index 000000000..766b0d8ef
--- /dev/null
+++ b/tests/e2e/stress-test.sh
@@ -0,0 +1,263 @@
+#!/bin/bash
+
+# Simple Router Stress Test
+# Tests round-robin routing logic under high concurrent loads
+
+set -euo pipefail
+
+# Default values
+ROUTER_PORT=30080
+CONCURRENT=200
+REQUESTS=1000
+LOG_DIR="/tmp/router-stress-logs"
+MODEL="facebook/opt-125m"
+BACKENDS_URL="http://localhost:8000,http://localhost:8001"
+
+# Colors for output
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+print_status() {  # Print an informational message ($1) with a green [INFO] tag on stdout.
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+print_error() {  # Print an error message ($1) with a red [ERROR] tag on stderr.
+    echo -e "${RED}[ERROR]${NC} $1" >&2
+}
+
+print_warning() {  # Print a warning message ($1) with a yellow [WARNING] tag on stderr.
+    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
+}
+
+show_usage() {
+    cat << EOF
+Router Stress Test - Tests round-robin routing logic
+
+Usage: $0 [options]
+
+Options:
+    -c, --concurrent N      Concurrent requests (default: 2000)
+    -n, --requests N        Total requests (default: 10000)
+    -p, --port PORT         Router port (default: 30080)
+    -l, --log-dir DIR       Log directory (default: /tmp/router-stress-logs)
+    -m, --model MODEL       Model to use (default: facebook/opt-125m)
+    -h, --help              Show this help
+
+Examples:
+    $0                      # Basic test (2000 concurrent, 10000 requests)
+    $0 -c 500 -n 20000     # High load test
+    $0 -p 8080 -c 100      # Different port, lower load
+
+Prerequisites:
+    - Router must be started with VLLM_ROUTER_STRESS_TEST_MODE=true
+EOF
+}
+
+# Exit with status 1 and an install hint unless Apache Bench (`ab`) is found on PATH.
+check_ab() {
+    if ! command -v ab >/dev/null 2>&1; then
+        print_error "Apache Bench (ab) not found!"
+        print_error "Install with: sudo apt-get install apache2-utils"
+        exit 1
+    fi
+}
+
+# Stop router processes launched as "python3 -m src.vllm_router.app" (installed below as the EXIT trap).
+cleanup() {
+    print_status "Cleaning up router processes..."
+    pkill -f "python3 -m src.vllm_router.app" || true  # -f matches the full command line of ANY such process; "|| true" tolerates "no match" under `set -e`
+    sleep 2  # presumably gives the signalled processes time to exit
+}
+
+# Start the router in the background (output in $LOG_DIR/router.log) and block until /v1/models answers; exits 1 after 30 s.
+start_router() {
+    local log_file="$LOG_DIR/router.log"
+    
+    print_status "Starting router with round-robin routing (stress test mode)"
+    
+    # Create the log directory if it does not exist yet
+    mkdir -p "$(dirname "$log_file")"
+    
+    # Exported so the router child process inherits stress-test mode (mock responses, no backend calls)
+    export VLLM_ROUTER_STRESS_TEST_MODE=true
+    
+    # Launch the router in the background: static discovery, round-robin routing; stdout and stderr go to the log file
+    python3 -m src.vllm_router.app --port "$ROUTER_PORT" \
+        --service-discovery static \
+        --static-backends "$BACKENDS_URL" \
+        --static-models "$MODEL,$MODEL" \
+        --static-model-types "chat,chat" \
+        --routing-logic roundrobin \
+        --log-stats \
+        --log-stats-interval 5 > "$log_file" 2>&1 &
+    
+    ROUTER_PID=$!  # PID of the background job; deliberately global (not `local`)
+    print_status "Router started with PID: $ROUTER_PID"
+    
+    # Retry curl against /v1/models with a 1-second sleep between attempts; `timeout` aborts the loop after 30 seconds
+    print_status "Waiting for router to be ready..."
+    timeout 30 bash -c "until curl -s http://localhost:$ROUTER_PORT/v1/models > /dev/null 2>&1; do sleep 1; done" || {
+        print_error "Router failed to start within 30 seconds"
+        print_error "Router log:"
+        tail -20 "$log_file" || true
+        exit 1
+    }
+    print_status "Router is ready"
+}
+
+# Send $REQUESTS chat-completion POSTs to the router with Apache Bench, $CONCURRENT at a time.
+run_stress_test() {
+    print_status "Running stress test with Apache Bench"
+    print_status "Concurrent: $CONCURRENT, Total: $REQUESTS"
+    
+    # Write the payload to a unique temp file so parallel runs on a shared host cannot clobber or delete each other's copy
+    local payload_file; payload_file="$(mktemp "${TMPDIR:-/tmp}/stress_payload.XXXXXX")"
+    cat > "$payload_file" << EOF
+{
+    "model": "$MODEL",
+    "messages": [
+        {"role": "user", "content": "Test message for stress testing"}
+    ],
+    "max_tokens": 10,
+    "temperature": 0.7
+}
+EOF
+    
+    # Run Apache Bench (under `set -e`, a non-zero exit status from ab aborts the script here)
+    ab -c "$CONCURRENT" \
+       -n "$REQUESTS" \
+       -p "$payload_file" \
+       -T "application/json" \
+       -H "Authorization: Bearer test" \
+       -H "x-user-id: stress-test-user" \
+       "http://localhost:$ROUTER_PORT/v1/chat/completions"
+    
+    # Remove the temporary payload file
+    rm -f "$payload_file"
+    
+    print_status "Stress test completed"
+    
+    # Short pause so the router can flush its log before the log is analysed
+    sleep 2
+}
+
+# Function to check round-robin correctness
+check_roundrobin_correctness() {
+    local log_file="$LOG_DIR/router.log"
+    
+    print_status "Checking round-robin routing correctness..."
+    
+    if [ ! -f "$log_file" ]; then
+        print_error "Router log file not found: $log_file"
+        return 1
+    fi
+    
+    # Extract backend routing decisions from logs
+    # Look for "Routing request ... to http://localhost:XXXX"
+    local backend1_count=$(grep -c "to http://localhost:8000" "$log_file" || echo "0")
+    local backend2_count=$(grep -c "to http://localhost:8001" "$log_file" || echo "0")
+    local total_routed=$((backend1_count + backend2_count))
+    
+    print_status "Round-robin routing results:"
+    print_status "  Backend localhost:8000: $backend1_count requests"
+    print_status "  Backend localhost:8001: $backend2_count requests"
+    print_status "  Total routed: $total_routed requests"
+    
+    if [ "$total_routed" -eq 0 ]; then
+        print_error "No routing decisions found in logs"
+        return 1
+    fi
+    
+    # Calculate percentages
+    local backend1_pct=$((backend1_count * 100 / total_routed))
+    local backend2_pct=$((backend2_count * 100 / total_routed))
+    
+    print_status "  Backend localhost:8000: ${backend1_pct}%"
+    print_status "  Backend localhost:8001: ${backend2_pct}%"
+    
+    # Check if distribution is roughly even (within 20% tolerance)
+    local diff=$((backend1_pct > backend2_pct ? backend1_pct - backend2_pct : backend2_pct - backend1_pct))
+    
+    if [ "$diff" -le 20 ]; then
+        print_status "✅ Round-robin routing is working correctly (${diff}% difference)"
+        return 0
+    else
+        print_error "❌ Round-robin routing appears uneven (${diff}% difference)"
+        print_status "Last 10 routing decisions from logs:"
+        grep "Routing request.*to http://localhost:" "$log_file" | tail -10 | sed 's/^/  /' || true
+        return 1
+    fi
+}
+
+# Print the last 20 lines of $LOG_DIR/router.log, each indented by two spaces; does nothing if the file is missing.
+show_log_summary() {
+    local log_file="$LOG_DIR/router.log"
+    
+    if [ -f "$log_file" ]; then
+        print_status "Log summary (last 20 lines):"
+        tail -20 "$log_file" | sed 's/^/  /'
+    fi
+}
+
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        -c|--concurrent)
+            CONCURRENT="$2"
+            shift 2
+            ;;
+        -n|--requests)
+            REQUESTS="$2"
+            shift 2
+            ;;
+        -p|--port)
+            ROUTER_PORT="$2"
+            shift 2
+            ;;
+        -l|--log-dir)
+            LOG_DIR="$2"
+            shift 2
+            ;;
+        -m|--model)
+            MODEL="$2"
+            shift 2
+            ;;
+        -h|--help)
+            show_usage
+            exit 0
+            ;;
+        *)
+            print_error "Unknown option: $1"
+            show_usage
+            exit 1
+            ;;
+    esac
+done
+
+# Set trap for cleanup
+trap cleanup EXIT
+
+# Check prerequisites
+print_status "Checking prerequisites..."
+check_ab
+
+print_status "Router stress test configuration:"
+print_status "  Concurrent requests: $CONCURRENT"
+print_status "  Total requests: $REQUESTS"
+print_status "  Router port: $ROUTER_PORT"
+print_status "  Model: $MODEL"
+
+# Run test
+start_router
+run_stress_test
+
+# Check correctness and show results
+if check_roundrobin_correctness; then
+    print_status "Test completed successfully!"
+else
+    print_error "Test completed but round-robin routing correctness check failed!"
+    show_log_summary
+    exit 1
+fi 
\ No newline at end of file

From 48c089283abd813aa878632ff5d6e593965faa3e Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 00:08:26 -0700
Subject: [PATCH 2/9] add github action

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 .github/workflows/router-e2e-test.yml | 59 +++++++++++++++++++++++++++
 tests/e2e/stress-test.sh              | 31 ++++++++++----
 2 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/router-e2e-test.yml b/.github/workflows/router-e2e-test.yml
index b689223ad..0bed007f8 100644
--- a/.github/workflows/router-e2e-test.yml
+++ b/.github/workflows/router-e2e-test.yml
@@ -231,3 +231,62 @@ jobs:
           pkill -f "python3 -m src.vllm_router.app" || true
 
       - run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."
+
+  stress-test:
+    runs-on: self-hosted
+    needs: e2e-test
+    if: github.event.pull_request.draft == false
+    env:
+      LOG_DIR: /tmp/stress-test-${{ github.event.pull_request.number || 'main' }}
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .
+
+      - name: Install Apache Bench
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y apache2-utils
+
+      - name: Run Router Stress Test
+        env:
+          PYTHONPATH: ${{ github.workspace }}/src
+          VLLM_ROUTER_STRESS_TEST_MODE: true
+        run: |
+          echo "🧪 Running router stress test with mock backends"
+          chmod +x tests/e2e/stress-test.sh
+          ./tests/e2e/stress-test.sh \
+            --concurrent 100 \
+            --requests 1000 \
+            --port 30080 \
+            --log-dir "$LOG_DIR" \
+            --model "facebook/opt-125m" \
+            --backend1-port 8000 \
+            --backend2-port 8001
+        timeout-minutes: 2
+
+      - name: Archive stress test results and logs
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: stress-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+          path: |
+            ${{ env.LOG_DIR }}/*
+
+      - name: Cleanup router processes
+        if: always()
+        run: |
+          echo "🧹 Cleaning up router processes"
+          pkill -f "python3 -m src.vllm_router.app" || true
+
+      - run: echo "🍏 Stress test job status is ${{ job.status }}."
\ No newline at end of file
diff --git a/tests/e2e/stress-test.sh b/tests/e2e/stress-test.sh
index 766b0d8ef..9c7b37cc8 100755
--- a/tests/e2e/stress-test.sh
+++ b/tests/e2e/stress-test.sh
@@ -11,7 +11,9 @@ CONCURRENT=200
 REQUESTS=1000
 LOG_DIR="/tmp/router-stress-logs"
 MODEL="facebook/opt-125m"
-BACKENDS_URL="http://localhost:8000,http://localhost:8001"
+BACKEND1_PORT=8000
+BACKEND2_PORT=8001
+BACKENDS_URL="http://localhost:$BACKEND1_PORT,http://localhost:$BACKEND2_PORT"
 
 # Colors for output
 GREEN='\033[0;32m'
@@ -43,12 +45,15 @@ Options:
     -p, --port PORT         Router port (default: 30080)
     -l, --log-dir DIR       Log directory (default: /tmp/router-stress-logs)
     -m, --model MODEL       Model to use (default: facebook/opt-125m)
+    --backend1-port PORT    First backend port (default: 8000)
+    --backend2-port PORT    Second backend port (default: 8001)
     -h, --help              Show this help
 
 Examples:
     $0                      # Basic test (2000 concurrent, 10000 requests)
     $0 -c 500 -n 20000     # High load test
     $0 -p 8080 -c 100      # Different port, lower load
+    $0 --backend1-port 9000 --backend2-port 9001  # Custom backend ports
 
 Prerequisites:
     - Router must be started with VLLM_ROUTER_STRESS_TEST_MODE=true
@@ -156,13 +161,13 @@ check_roundrobin_correctness() {
     
     # Extract backend routing decisions from logs
     # Look for "Routing request ... to http://localhost:XXXX"
-    local backend1_count=$(grep -c "to http://localhost:8000" "$log_file" || echo "0")
-    local backend2_count=$(grep -c "to http://localhost:8001" "$log_file" || echo "0")
+    local backend1_count=$(grep -c "to http://localhost:$BACKEND1_PORT" "$log_file" || true)  # grep -c already prints 0 on no match; an extra `echo 0` would yield "0\n0"
+    local backend2_count=$(grep -c "to http://localhost:$BACKEND2_PORT" "$log_file" || true)
     local total_routed=$((backend1_count + backend2_count))
     
     print_status "Round-robin routing results:"
-    print_status "  Backend localhost:8000: $backend1_count requests"
-    print_status "  Backend localhost:8001: $backend2_count requests"
+    print_status "  Backend localhost:$BACKEND1_PORT: $backend1_count requests"
+    print_status "  Backend localhost:$BACKEND2_PORT: $backend2_count requests"
     print_status "  Total routed: $total_routed requests"
     
     if [ "$total_routed" -eq 0 ]; then
@@ -174,8 +179,8 @@ check_roundrobin_correctness() {
     local backend1_pct=$((backend1_count * 100 / total_routed))
     local backend2_pct=$((backend2_count * 100 / total_routed))
     
-    print_status "  Backend localhost:8000: ${backend1_pct}%"
-    print_status "  Backend localhost:8001: ${backend2_pct}%"
+    print_status "  Backend localhost:$BACKEND1_PORT: ${backend1_pct}%"
+    print_status "  Backend localhost:$BACKEND2_PORT: ${backend2_pct}%"
     
     # Check if distribution is roughly even (within 20% tolerance)
     local diff=$((backend1_pct > backend2_pct ? backend1_pct - backend2_pct : backend2_pct - backend1_pct))
@@ -224,6 +229,14 @@ while [[ $# -gt 0 ]]; do
             MODEL="$2"
             shift 2
             ;;
+        --backend1-port)
+            BACKEND1_PORT="$2"
+            shift 2
+            ;;
+        --backend2-port)
+            BACKEND2_PORT="$2"
+            shift 2
+            ;;
         -h|--help)
             show_usage
             exit 0
@@ -239,6 +252,9 @@ done
 # Set trap for cleanup
 trap cleanup EXIT
 
+# Update backends URL with final port values
+BACKENDS_URL="http://localhost:$BACKEND1_PORT,http://localhost:$BACKEND2_PORT"
+
 # Check prerequisites
 print_status "Checking prerequisites..."
 check_ab
@@ -247,6 +263,7 @@ print_status "Router stress test configuration:"
 print_status "  Concurrent requests: $CONCURRENT"
 print_status "  Total requests: $REQUESTS"
 print_status "  Router port: $ROUTER_PORT"
+print_status "  Backend ports: $BACKEND1_PORT, $BACKEND2_PORT"
 print_status "  Model: $MODEL"
 
 # Run test

From 3734898c3cf937571e393c1731e3ff051fb05d4c Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 00:09:59 -0700
Subject: [PATCH 3/9] test github action on stress test

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 .github/workflows/router-e2e-test.yml | 262 +++++++++++++-------------
 1 file changed, 131 insertions(+), 131 deletions(-)

diff --git a/.github/workflows/router-e2e-test.yml b/.github/workflows/router-e2e-test.yml
index 0bed007f8..08d2991c8 100644
--- a/.github/workflows/router-e2e-test.yml
+++ b/.github/workflows/router-e2e-test.yml
@@ -101,139 +101,139 @@ jobs:
             ~/.kube/config
             src/tests/perftest/logs
 
-  k8s-discovery-e2e-test:
-    runs-on: self-hosted
-    needs: e2e-test
-    if: github.event.pull_request.draft == false
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-
-      - name: Install Python dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r benchmarks/multi-round-qa/requirements.txt
-          pip install -e .
-
-      - name: Setup minikube environment
-        env:
-          DOCKER_BUILDKIT: 1
-        run: |
-          echo "🔧 Setting up minikube environment"
-          sudo sysctl fs.protected_regular=0
-          # Verify minikube is running
-          minikube status
-          # Ensure kubectl is configured for minikube
-          kubectl config use-context minikube
-
-      - name: Build and deploy router image
-        env:
-          DOCKER_BUILDKIT: 1
-        run: |
-          echo "🔨 Building router docker image"
-          cd ${{ github.workspace }}
-          eval "$(minikube docker-env)"
-          docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.kvaware .
-
-      - name: Run all k8s discovery routing tests
-        run: |
-          echo "🧪 Running all k8s discovery routing tests"
-          ./tests/e2e/run-k8s-routing-test.sh all \
-            --model "facebook/opt-125m" \
-            --num-requests 25 \
-            --chunk-size 128 \
-            --verbose \
-            --result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
-            --timeout 10
-        timeout-minutes: 10
-
-      - name: Archive k8s discovery routing test results
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
-          path: |
-            /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*
-
-      - run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."
-
-  static-discovery-e2e-test:
-    runs-on: self-hosted
-    needs: e2e-test
-    if: github.event.pull_request.draft == false
-    env:
-      LOG_DIR: /tmp/static-discovery-e2e-test-${{ github.event.pull_request.number || 'main' }}
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-
-      - name: Install Python dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -e .
-
-      - name: Install vLLM and lmcache
-        run: |
-          pip install vllm
-          pip install lmcache
-
-      - name: Start 2 vLLM serve backends
-        run: |
-          echo "🚀 Starting vLLM serve backend"
-          mkdir -p "$LOG_DIR"
-          CUDA_VISIBLE_DEVICES=0 vllm serve facebook/opt-125m --port 8001 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend1.log" 2>&1 &
-          CUDA_VISIBLE_DEVICES=1 vllm serve facebook/opt-125m --port 8002 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend2.log" 2>&1 &
-
-      - name: Wait for backends to be ready
-        run: |
-          echo "⏳ Waiting for backends to be ready"
-          chmod +x tests/e2e/wait-for-backends.sh
-          ./tests/e2e/wait-for-backends.sh 180 "http://localhost:8001" "http://localhost:8002"
-
-      - name: Run All Static Discovery Routing Tests
-        env:
-          PYTHONPATH: ${{ github.workspace }}/src
-        run: |
-          echo "🧪 Running all static discovery routing tests sequentially"
-          chmod +x tests/e2e/run-static-discovery-routing-test.sh
-          ./tests/e2e/run-static-discovery-routing-test.sh all \
-            --pythonpath "$PYTHONPATH" \
-            --log-dir "$LOG_DIR" \
-            --num-requests 20 \
-            --verbose \
-            --backends-url "http://localhost:8001,http://localhost:8002"
-        timeout-minutes: 5
-
-      - name: Archive static discovery test results and logs
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: static-discovery-test-results-pr-${{ github.event.pull_request.number || 'main' }}
-          path: |
-            ${{ env.LOG_DIR }}/*
-
-      - name: Cleanup processes
-        if: always()
-        run: |
-          echo "🧹 Cleaning up processes"
-          pkill -f "vllm serve" || true
-          pkill -f "python3 -m src.vllm_router.app" || true
-
-      - run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."
+  # k8s-discovery-e2e-test:
+  #   runs-on: self-hosted
+  #   needs: e2e-test
+  #   if: github.event.pull_request.draft == false
+  #   steps:
+  #     - name: Check out repository code
+  #       uses: actions/checkout@v4
+
+  #     - name: Setup Python
+  #       uses: actions/setup-python@v5
+  #       with:
+  #         python-version: "3.12"
+
+  #     - name: Install Python dependencies
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install -r benchmarks/multi-round-qa/requirements.txt
+  #         pip install -e .
+
+  #     - name: Setup minikube environment
+  #       env:
+  #         DOCKER_BUILDKIT: 1
+  #       run: |
+  #         echo "🔧 Setting up minikube environment"
+  #         sudo sysctl fs.protected_regular=0
+  #         # Verify minikube is running
+  #         minikube status
+  #         # Ensure kubectl is configured for minikube
+  #         kubectl config use-context minikube
+
+  #     - name: Build and deploy router image
+  #       env:
+  #         DOCKER_BUILDKIT: 1
+  #       run: |
+  #         echo "🔨 Building router docker image"
+  #         cd ${{ github.workspace }}
+  #         eval "$(minikube docker-env)"
+  #         docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.kvaware .
+
+  #     - name: Run all k8s discovery routing tests
+  #       run: |
+  #         echo "🧪 Running all k8s discovery routing tests"
+  #         ./tests/e2e/run-k8s-routing-test.sh all \
+  #           --model "facebook/opt-125m" \
+  #           --num-requests 25 \
+  #           --chunk-size 128 \
+  #           --verbose \
+  #           --result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
+  #           --timeout 10
+  #       timeout-minutes: 10
+
+  #     - name: Archive k8s discovery routing test results
+  #       uses: actions/upload-artifact@v4
+  #       if: always()
+  #       with:
+  #         name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+  #         path: |
+  #           /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*
+
+  #     - run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."
+
+  # static-discovery-e2e-test:
+  #   runs-on: self-hosted
+  #   needs: e2e-test
+  #   if: github.event.pull_request.draft == false
+  #   env:
+  #     LOG_DIR: /tmp/static-discovery-e2e-test-${{ github.event.pull_request.number || 'main' }}
+
+  #   steps:
+  #     - name: Check out repository code
+  #       uses: actions/checkout@v4
+
+  #     - name: Setup Python
+  #       uses: actions/setup-python@v5
+  #       with:
+  #         python-version: "3.12"
+
+  #     - name: Install Python dependencies
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install -e .
+
+  #     - name: Install vLLM and lmcache
+  #       run: |
+  #         pip install vllm
+  #         pip install lmcache
+
+  #     - name: Start 2 vLLM serve backends
+  #       run: |
+  #         echo "🚀 Starting vLLM serve backend"
+  #         mkdir -p "$LOG_DIR"
+  #         CUDA_VISIBLE_DEVICES=0 vllm serve facebook/opt-125m --port 8001 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend1.log" 2>&1 &
+  #         CUDA_VISIBLE_DEVICES=1 vllm serve facebook/opt-125m --port 8002 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend2.log" 2>&1 &
+
+  #     - name: Wait for backends to be ready
+  #       run: |
+  #         echo "⏳ Waiting for backends to be ready"
+  #         chmod +x tests/e2e/wait-for-backends.sh
+  #         ./tests/e2e/wait-for-backends.sh 180 "http://localhost:8001" "http://localhost:8002"
+
+  #     - name: Run All Static Discovery Routing Tests
+  #       env:
+  #         PYTHONPATH: ${{ github.workspace }}/src
+  #       run: |
+  #         echo "🧪 Running all static discovery routing tests sequentially"
+  #         chmod +x tests/e2e/run-static-discovery-routing-test.sh
+  #         ./tests/e2e/run-static-discovery-routing-test.sh all \
+  #           --pythonpath "$PYTHONPATH" \
+  #           --log-dir "$LOG_DIR" \
+  #           --num-requests 20 \
+  #           --verbose \
+  #           --backends-url "http://localhost:8001,http://localhost:8002"
+  #       timeout-minutes: 5
+
+  #     - name: Archive static discovery test results and logs
+  #       uses: actions/upload-artifact@v4
+  #       if: always()
+  #       with:
+  #         name: static-discovery-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+  #         path: |
+  #           ${{ env.LOG_DIR }}/*
+
+  #     - name: Cleanup processes
+  #       if: always()
+  #       run: |
+  #         echo "🧹 Cleaning up processes"
+  #         pkill -f "vllm serve" || true
+  #         pkill -f "python3 -m src.vllm_router.app" || true
+
+  #     - run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."
 
   stress-test:
-    runs-on: self-hosted
+    runs-on: ubuntu-latest
     needs: e2e-test
     if: github.event.pull_request.draft == false
     env:

From 4265fdbcadf118121a31ad2b755d1849d9f1760b Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 00:17:12 -0700
Subject: [PATCH 4/9] Rename stress test job and script for clarity; update
 artifact naming and logging paths accordingly.

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 .github/workflows/router-e2e-test.yml              | 14 +++++++-------
 .../e2e/{stress-test.sh => router-stress-test.sh}  |  0
 2 files changed, 7 insertions(+), 7 deletions(-)
 rename tests/e2e/{stress-test.sh => router-stress-test.sh} (100%)

diff --git a/.github/workflows/router-e2e-test.yml b/.github/workflows/router-e2e-test.yml
index 08d2991c8..a93ff48a9 100644
--- a/.github/workflows/router-e2e-test.yml
+++ b/.github/workflows/router-e2e-test.yml
@@ -232,12 +232,12 @@ jobs:
 
   #     - run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."
 
-  stress-test:
+  router-stress-test:
     runs-on: ubuntu-latest
     needs: e2e-test
     if: github.event.pull_request.draft == false
     env:
-      LOG_DIR: /tmp/stress-test-${{ github.event.pull_request.number || 'main' }}
+      LOG_DIR: /tmp/router-stress-test-${{ github.event.pull_request.number || 'main' }}
 
     steps:
       - name: Check out repository code
@@ -264,8 +264,8 @@ jobs:
           VLLM_ROUTER_STRESS_TEST_MODE: true
         run: |
           echo "🧪 Running router stress test with mock backends"
-          chmod +x tests/e2e/stress-test.sh
-          ./tests/e2e/stress-test.sh \
+          chmod +x tests/e2e/router-stress-test.sh
+          ./tests/e2e/router-stress-test.sh \
             --concurrent 100 \
             --requests 1000 \
             --port 30080 \
@@ -275,11 +275,11 @@ jobs:
             --backend2-port 8001
         timeout-minutes: 2
 
-      - name: Archive stress test results and logs
+      - name: Archive router stress test results and logs
         uses: actions/upload-artifact@v4
         if: always()
         with:
-          name: stress-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+          name: router-stress-test-results-pr-${{ github.event.pull_request.number || 'main' }}
           path: |
             ${{ env.LOG_DIR }}/*
 
@@ -289,4 +289,4 @@ jobs:
           echo "🧹 Cleaning up router processes"
           pkill -f "python3 -m src.vllm_router.app" || true
 
-      - run: echo "🍏 Stress test job status is ${{ job.status }}."
\ No newline at end of file
+      - run: echo "🍏 Router stress test job status is ${{ job.status }}."
\ No newline at end of file
diff --git a/tests/e2e/stress-test.sh b/tests/e2e/router-stress-test.sh
similarity index 100%
rename from tests/e2e/stress-test.sh
rename to tests/e2e/router-stress-test.sh

From e03b3a633676ce76856abd8a8f6e18bca12e648f Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 00:24:42 -0700
Subject: [PATCH 5/9] add comments

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 tests/e2e/router-stress-test.sh | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/router-stress-test.sh b/tests/e2e/router-stress-test.sh
index 9c7b37cc8..986f70d39 100755
--- a/tests/e2e/router-stress-test.sh
+++ b/tests/e2e/router-stress-test.sh
@@ -1,7 +1,22 @@
 #!/bin/bash
 
-# Simple Router Stress Test
+# Router Stress Test - Pure Router Logic Testing
 # Tests round-robin routing logic under high concurrent loads
+#
+# IMPORTANT NOTES:
+# - This test uses MOCK backends and MOCK responses (no real vLLM servers)
+# - Backend ports are dummy placeholders - no actual services run on them
+# - Model names are dummy placeholders - no real models are loaded
+# - When VLLM_ROUTER_STRESS_TEST_MODE=true, the router returns mock responses
+#   instead of forwarding requests to backends (see src/vllm_router/services/request_service/request.py)
+# - This test validates ONLY the router's routing logic, load balancing, and performance
+#   under high concurrent loads, not actual inference capabilities
+#
+# Purpose: Verify that the router correctly distributes requests using round-robin
+# logic and can handle high concurrency without routing logic failures.
+#
+# TODO: will add tests for prefix-aware and session-based router later
+# TODO: will add performance comparison tests & threshold later
 
 set -euo pipefail
 

From d58f4caa4e67bae612f7950eb925c07a4de6306d Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 00:36:27 -0700
Subject: [PATCH 6/9] bring back other tests and increase load

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 .github/workflows/router-e2e-test.yml | 268 +++++++++++++-------------
 tests/e2e/router-stress-test.sh       |   4 +-
 2 files changed, 136 insertions(+), 136 deletions(-)

diff --git a/.github/workflows/router-e2e-test.yml b/.github/workflows/router-e2e-test.yml
index a93ff48a9..cfb4e4ebb 100644
--- a/.github/workflows/router-e2e-test.yml
+++ b/.github/workflows/router-e2e-test.yml
@@ -101,139 +101,139 @@ jobs:
             ~/.kube/config
             src/tests/perftest/logs
 
-  # k8s-discovery-e2e-test:
-  #   runs-on: self-hosted
-  #   needs: e2e-test
-  #   if: github.event.pull_request.draft == false
-  #   steps:
-  #     - name: Check out repository code
-  #       uses: actions/checkout@v4
-
-  #     - name: Setup Python
-  #       uses: actions/setup-python@v5
-  #       with:
-  #         python-version: "3.12"
-
-  #     - name: Install Python dependencies
-  #       run: |
-  #         python -m pip install --upgrade pip
-  #         pip install -r benchmarks/multi-round-qa/requirements.txt
-  #         pip install -e .
-
-  #     - name: Setup minikube environment
-  #       env:
-  #         DOCKER_BUILDKIT: 1
-  #       run: |
-  #         echo "🔧 Setting up minikube environment"
-  #         sudo sysctl fs.protected_regular=0
-  #         # Verify minikube is running
-  #         minikube status
-  #         # Ensure kubectl is configured for minikube
-  #         kubectl config use-context minikube
-
-  #     - name: Build and deploy router image
-  #       env:
-  #         DOCKER_BUILDKIT: 1
-  #       run: |
-  #         echo "🔨 Building router docker image"
-  #         cd ${{ github.workspace }}
-  #         eval "$(minikube docker-env)"
-  #         docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.kvaware .
-
-  #     - name: Run all k8s discovery routing tests
-  #       run: |
-  #         echo "🧪 Running all k8s discovery routing tests"
-  #         ./tests/e2e/run-k8s-routing-test.sh all \
-  #           --model "facebook/opt-125m" \
-  #           --num-requests 25 \
-  #           --chunk-size 128 \
-  #           --verbose \
-  #           --result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
-  #           --timeout 10
-  #       timeout-minutes: 10
-
-  #     - name: Archive k8s discovery routing test results
-  #       uses: actions/upload-artifact@v4
-  #       if: always()
-  #       with:
-  #         name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
-  #         path: |
-  #           /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*
-
-  #     - run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."
-
-  # static-discovery-e2e-test:
-  #   runs-on: self-hosted
-  #   needs: e2e-test
-  #   if: github.event.pull_request.draft == false
-  #   env:
-  #     LOG_DIR: /tmp/static-discovery-e2e-test-${{ github.event.pull_request.number || 'main' }}
-
-  #   steps:
-  #     - name: Check out repository code
-  #       uses: actions/checkout@v4
-
-  #     - name: Setup Python
-  #       uses: actions/setup-python@v5
-  #       with:
-  #         python-version: "3.12"
-
-  #     - name: Install Python dependencies
-  #       run: |
-  #         python -m pip install --upgrade pip
-  #         pip install -e .
-
-  #     - name: Install vLLM and lmcache
-  #       run: |
-  #         pip install vllm
-  #         pip install lmcache
-
-  #     - name: Start 2 vLLM serve backends
-  #       run: |
-  #         echo "🚀 Starting vLLM serve backend"
-  #         mkdir -p "$LOG_DIR"
-  #         CUDA_VISIBLE_DEVICES=0 vllm serve facebook/opt-125m --port 8001 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend1.log" 2>&1 &
-  #         CUDA_VISIBLE_DEVICES=1 vllm serve facebook/opt-125m --port 8002 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend2.log" 2>&1 &
-
-  #     - name: Wait for backends to be ready
-  #       run: |
-  #         echo "⏳ Waiting for backends to be ready"
-  #         chmod +x tests/e2e/wait-for-backends.sh
-  #         ./tests/e2e/wait-for-backends.sh 180 "http://localhost:8001" "http://localhost:8002"
-
-  #     - name: Run All Static Discovery Routing Tests
-  #       env:
-  #         PYTHONPATH: ${{ github.workspace }}/src
-  #       run: |
-  #         echo "🧪 Running all static discovery routing tests sequentially"
-  #         chmod +x tests/e2e/run-static-discovery-routing-test.sh
-  #         ./tests/e2e/run-static-discovery-routing-test.sh all \
-  #           --pythonpath "$PYTHONPATH" \
-  #           --log-dir "$LOG_DIR" \
-  #           --num-requests 20 \
-  #           --verbose \
-  #           --backends-url "http://localhost:8001,http://localhost:8002"
-  #       timeout-minutes: 5
-
-  #     - name: Archive static discovery test results and logs
-  #       uses: actions/upload-artifact@v4
-  #       if: always()
-  #       with:
-  #         name: static-discovery-test-results-pr-${{ github.event.pull_request.number || 'main' }}
-  #         path: |
-  #           ${{ env.LOG_DIR }}/*
-
-  #     - name: Cleanup processes
-  #       if: always()
-  #       run: |
-  #         echo "🧹 Cleaning up processes"
-  #         pkill -f "vllm serve" || true
-  #         pkill -f "python3 -m src.vllm_router.app" || true
-
-  #     - run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."
+  k8s-discovery-e2e-test:
+    runs-on: self-hosted
+    needs: e2e-test
+    if: github.event.pull_request.draft == false
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r benchmarks/multi-round-qa/requirements.txt
+          pip install -e .
+
+      - name: Setup minikube environment
+        env:
+          DOCKER_BUILDKIT: 1
+        run: |
+          echo "🔧 Setting up minikube environment"
+          sudo sysctl fs.protected_regular=0
+          # Verify minikube is running
+          minikube status
+          # Ensure kubectl is configured for minikube
+          kubectl config use-context minikube
+
+      - name: Build and deploy router image
+        env:
+          DOCKER_BUILDKIT: 1
+        run: |
+          echo "🔨 Building router docker image"
+          cd ${{ github.workspace }}
+          eval "$(minikube docker-env)"
+          docker build --build-arg INSTALL_OPTIONAL_DEP=default -t git-act-router -f docker/Dockerfile.kvaware .
+
+      - name: Run all k8s discovery routing tests
+        run: |
+          echo "🧪 Running all k8s discovery routing tests"
+          ./tests/e2e/run-k8s-routing-test.sh all \
+            --model "facebook/opt-125m" \
+            --num-requests 25 \
+            --chunk-size 128 \
+            --verbose \
+            --result-dir /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }} \
+            --timeout 10
+        timeout-minutes: 10
+
+      - name: Archive k8s discovery routing test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: k8s-discovery-routing-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+          path: |
+            /tmp/k8s-discovery-routing-results-pr-${{ github.event.pull_request.number || 'main' }}/*
+
+      - run: echo "🍏 K8s discovery e2e test job status is ${{ job.status }}."
+
+  static-discovery-e2e-test:
+    runs-on: self-hosted
+    needs: e2e-test
+    if: github.event.pull_request.draft == false
+    env:
+      LOG_DIR: /tmp/static-discovery-e2e-test-${{ github.event.pull_request.number || 'main' }}
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .
+
+      - name: Install vLLM and lmcache
+        run: |
+          pip install vllm
+          pip install lmcache
+
+      - name: Start 2 vLLM serve backends
+        run: |
+          echo "🚀 Starting vLLM serve backend"
+          mkdir -p "$LOG_DIR"
+          CUDA_VISIBLE_DEVICES=0 vllm serve facebook/opt-125m --port 8001 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend1.log" 2>&1 &
+          CUDA_VISIBLE_DEVICES=1 vllm serve facebook/opt-125m --port 8002 --gpu-memory-utilization 0.7 --chat-template .github/template-chatml.jinja > "$LOG_DIR/backend2.log" 2>&1 &
+
+      - name: Wait for backends to be ready
+        run: |
+          echo "⏳ Waiting for backends to be ready"
+          chmod +x tests/e2e/wait-for-backends.sh
+          ./tests/e2e/wait-for-backends.sh 180 "http://localhost:8001" "http://localhost:8002"
+
+      - name: Run All Static Discovery Routing Tests
+        env:
+          PYTHONPATH: ${{ github.workspace }}/src
+        run: |
+          echo "🧪 Running all static discovery routing tests sequentially"
+          chmod +x tests/e2e/run-static-discovery-routing-test.sh
+          ./tests/e2e/run-static-discovery-routing-test.sh all \
+            --pythonpath "$PYTHONPATH" \
+            --log-dir "$LOG_DIR" \
+            --num-requests 20 \
+            --verbose \
+            --backends-url "http://localhost:8001,http://localhost:8002"
+        timeout-minutes: 5
+
+      - name: Archive static discovery test results and logs
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: static-discovery-test-results-pr-${{ github.event.pull_request.number || 'main' }}
+          path: |
+            ${{ env.LOG_DIR }}/*
+
+      - name: Cleanup processes
+        if: always()
+        run: |
+          echo "🧹 Cleaning up processes"
+          pkill -f "vllm serve" || true
+          pkill -f "python3 -m src.vllm_router.app" || true
+
+      - run: echo "🍏 Static discovery e2e test job status is ${{ job.status }}."
 
   router-stress-test:
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
     needs: e2e-test
     if: github.event.pull_request.draft == false
     env:
@@ -266,14 +266,14 @@ jobs:
           echo "🧪 Running router stress test with mock backends"
           chmod +x tests/e2e/router-stress-test.sh
           ./tests/e2e/router-stress-test.sh \
-            --concurrent 100 \
-            --requests 1000 \
+            --concurrent 20000 \
+            --requests 100000 \
             --port 30080 \
             --log-dir "$LOG_DIR" \
             --model "facebook/opt-125m" \
             --backend1-port 8000 \
             --backend2-port 8001
-        timeout-minutes: 2
+        timeout-minutes: 10
 
       - name: Archive router stress test results and logs
         uses: actions/upload-artifact@v4
diff --git a/tests/e2e/router-stress-test.sh b/tests/e2e/router-stress-test.sh
index 986f70d39..c3e4492fa 100755
--- a/tests/e2e/router-stress-test.sh
+++ b/tests/e2e/router-stress-test.sh
@@ -22,8 +22,8 @@ set -euo pipefail
 
 # Default values
 ROUTER_PORT=30080
-CONCURRENT=200
-REQUESTS=1000
+CONCURRENT=20000
+REQUESTS=100000
 LOG_DIR="/tmp/router-stress-logs"
 MODEL="facebook/opt-125m"
 BACKEND1_PORT=8000

From ca4515be9f1a54ae02140796602a3d37fbe75972 Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 00:54:57 -0700
Subject: [PATCH 7/9] pass pre-commit

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 .github/workflows/router-e2e-test.yml         |  2 +-
 .../services/request_service/request.py       | 14 +++---
 tests/e2e/router-stress-test.sh               | 44 +++++++++----------
 3 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/router-e2e-test.yml b/.github/workflows/router-e2e-test.yml
index cfb4e4ebb..ab63b7c48 100644
--- a/.github/workflows/router-e2e-test.yml
+++ b/.github/workflows/router-e2e-test.yml
@@ -289,4 +289,4 @@ jobs:
           echo "🧹 Cleaning up router processes"
           pkill -f "python3 -m src.vllm_router.app" || true
 
-      - run: echo "🍏 Router stress test job status is ${{ job.status }}."
\ No newline at end of file
+      - run: echo "🍏 Router stress test job status is ${{ job.status }}."
diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index bc8a9d984..4b130d6ab 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -95,20 +95,24 @@ async def process_request(
     request.app.state.request_stats_monitor.on_new_request(
         backend_url, request_id, start_time
     )
-    
+
     # Check if stress test mode is enabled; used for tests/e2e/stress-test.sh
     if os.getenv("VLLM_ROUTER_STRESS_TEST_MODE", "false").lower() == "true":
         # Mock response for stress testing - skip backend calls
         mock_headers = {"content-type": "application/json", "x-request-id": request_id}
         mock_response = b'{"id":"test","object":"chat.completion","choices":[{"message":{"role":"assistant","content":"Test"},"index":0,"finish_reason":"stop"}]}'
-        
+
         # Yield headers and mock response
         yield mock_headers, 200
-        request.app.state.request_stats_monitor.on_request_response(backend_url, request_id, time.time())
+        request.app.state.request_stats_monitor.on_request_response(
+            backend_url, request_id, time.time()
+        )
         yield mock_response
-        request.app.state.request_stats_monitor.on_request_complete(backend_url, request_id, time.time())
+        request.app.state.request_stats_monitor.on_request_complete(
+            backend_url, request_id, time.time()
+        )
         return
-    
+
     # Check if this is a streaming request
     is_streaming = False
     try:
diff --git a/tests/e2e/router-stress-test.sh b/tests/e2e/router-stress-test.sh
index c3e4492fa..e8dcec705 100755
--- a/tests/e2e/router-stress-test.sh
+++ b/tests/e2e/router-stress-test.sh
@@ -94,15 +94,15 @@ cleanup() {
 # Function to start router
 start_router() {
     local log_file="$LOG_DIR/router.log"
-    
+
     print_status "Starting router with round-robin routing (stress test mode)"
-    
+
     # Create log directory
     mkdir -p "$(dirname "$log_file")"
-    
+
     # Set stress test mode
     export VLLM_ROUTER_STRESS_TEST_MODE=true
-    
+
     # Start router with detailed logging
     python3 -m src.vllm_router.app --port "$ROUTER_PORT" \
         --service-discovery static \
@@ -112,10 +112,10 @@ start_router() {
         --routing-logic roundrobin \
         --log-stats \
         --log-stats-interval 5 > "$log_file" 2>&1 &
-    
+
     ROUTER_PID=$!
     print_status "Router started with PID: $ROUTER_PID"
-    
+
     # Wait for router to be ready
     print_status "Waiting for router to be ready..."
     timeout 30 bash -c "until curl -s http://localhost:$ROUTER_PORT/v1/models > /dev/null 2>&1; do sleep 1; done" || {
@@ -131,7 +131,7 @@ start_router() {
 run_stress_test() {
     print_status "Running stress test with Apache Bench"
     print_status "Concurrent: $CONCURRENT, Total: $REQUESTS"
-    
+
     # Create payload file
     local payload_file="/tmp/stress_payload.json"
     cat > "$payload_file" << EOF
@@ -144,7 +144,7 @@ run_stress_test() {
     "temperature": 0.7
 }
 EOF
-    
+
     # Run Apache Bench
     ab -c "$CONCURRENT" \
        -n "$REQUESTS" \
@@ -153,12 +153,12 @@ EOF
        -H "Authorization: Bearer test" \
        -H "x-user-id: stress-test-user" \
        "http://localhost:$ROUTER_PORT/v1/chat/completions"
-    
+
     # Clean up payload file
     rm -f "$payload_file"
-    
+
     print_status "Stress test completed"
-    
+
     # Small delay to ensure all logs are written
     sleep 2
 }
@@ -166,40 +166,40 @@ EOF
 # Function to check round-robin correctness
 check_roundrobin_correctness() {
     local log_file="$LOG_DIR/router.log"
-    
+
     print_status "Checking round-robin routing correctness..."
-    
+
     if [ ! -f "$log_file" ]; then
         print_error "Router log file not found: $log_file"
         return 1
     fi
-    
+
     # Extract backend routing decisions from logs
     # Look for "Routing request ... to http://localhost:XXXX"
     local backend1_count=$(grep -c "to http://localhost:$BACKEND1_PORT" "$log_file" || echo "0")
     local backend2_count=$(grep -c "to http://localhost:$BACKEND2_PORT" "$log_file" || echo "0")
     local total_routed=$((backend1_count + backend2_count))
-    
+
     print_status "Round-robin routing results:"
     print_status "  Backend localhost:$BACKEND1_PORT: $backend1_count requests"
     print_status "  Backend localhost:$BACKEND2_PORT: $backend2_count requests"
     print_status "  Total routed: $total_routed requests"
-    
+
     if [ "$total_routed" -eq 0 ]; then
         print_error "No routing decisions found in logs"
         return 1
     fi
-    
+
     # Calculate percentages
     local backend1_pct=$((backend1_count * 100 / total_routed))
     local backend2_pct=$((backend2_count * 100 / total_routed))
-    
+
     print_status "  Backend localhost:$BACKEND1_PORT: ${backend1_pct}%"
     print_status "  Backend localhost:$BACKEND2_PORT: ${backend2_pct}%"
-    
+
     # Check if distribution is roughly even (within 20% tolerance)
     local diff=$((backend1_pct > backend2_pct ? backend1_pct - backend2_pct : backend2_pct - backend1_pct))
-    
+
     if [ "$diff" -le 20 ]; then
         print_status "✅ Round-robin routing is working correctly (${diff}% difference)"
         return 0
@@ -214,7 +214,7 @@ check_roundrobin_correctness() {
 # Function to show log summary
 show_log_summary() {
     local log_file="$LOG_DIR/router.log"
-    
+
     if [ -f "$log_file" ]; then
         print_status "Log summary (last 20 lines):"
         tail -20 "$log_file" | sed 's/^/  /'
@@ -292,4 +292,4 @@ else
     print_error "Test completed but round-robin routing correctness check failed!"
     show_log_summary
     exit 1
-fi 
\ No newline at end of file
+fi

From 0e79e3dbbf1fef8f1baafed2ae45db61866834b3 Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 00:57:21 -0700
Subject: [PATCH 8/9] Update tests/e2e/router-stress-test.sh

per gemini's suggestion

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 tests/e2e/router-stress-test.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/e2e/router-stress-test.sh b/tests/e2e/router-stress-test.sh
index e8dcec705..dbd844e52 100755
--- a/tests/e2e/router-stress-test.sh
+++ b/tests/e2e/router-stress-test.sh
@@ -55,8 +55,8 @@ Router Stress Test - Tests round-robin routing logic
 Usage: $0 [options]
 
 Options:
-    -c, --concurrent N      Concurrent requests (default: 2000)
-    -n, --requests N        Total requests (default: 10000)
+    -c, --concurrent N      Concurrent requests (default: 20000)
+    -n, --requests N        Total requests (default: 100000)
     -p, --port PORT         Router port (default: 30080)
     -l, --log-dir DIR       Log directory (default: /tmp/router-stress-logs)
     -m, --model MODEL       Model to use (default: facebook/opt-125m)
@@ -65,7 +65,7 @@ Options:
     -h, --help              Show this help
 
 Examples:
-    $0                      # Basic test (2000 concurrent, 10000 requests)
+    $0                      # Basic test (20000 concurrent, 100000 requests)
     $0 -c 500 -n 20000     # High load test
     $0 -p 8080 -c 100      # Different port, lower load
     $0 --backend1-port 9000 --backend2-port 9001  # Custom backend ports

From 57dfd52410de6d1f6ff8ce631bac504b1a78ea01 Mon Sep 17 00:00:00 2001
From: Kobe Chen <xiaokunchen0@gmail.com>
Date: Wed, 2 Jul 2025 01:22:54 -0700
Subject: [PATCH 9/9] pass shellcheck

Signed-off-by: Kobe Chen <xiaokunchen0@gmail.com>
---
 tests/e2e/router-stress-test.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/router-stress-test.sh b/tests/e2e/router-stress-test.sh
index dbd844e52..107f64d92 100755
--- a/tests/e2e/router-stress-test.sh
+++ b/tests/e2e/router-stress-test.sh
@@ -176,8 +176,12 @@ check_roundrobin_correctness() {
 
     # Extract backend routing decisions from logs
     # Look for "Routing request ... to http://localhost:XXXX"
-    local backend1_count=$(grep -c "to http://localhost:$BACKEND1_PORT" "$log_file" || echo "0")
-    local backend2_count=$(grep -c "to http://localhost:$BACKEND2_PORT" "$log_file" || echo "0")
+    # grep -c prints "0" itself (exit status 1) when nothing matches, so only
+    # mask the status for `set -e`; an extra `echo "0"` would yield "0<newline>0".
+    local backend1_count backend2_count
+    backend1_count=$(grep -c "to http://localhost:$BACKEND1_PORT" "$log_file" || true)
+    backend2_count=$(grep -c "to http://localhost:$BACKEND2_PORT" "$log_file" || true)
+
     local total_routed=$((backend1_count + backend2_count))
 
     print_status "Round-robin routing results:"