
Commit 4aa1a8b

Merge branch 'main' into pr/transcription-whisper
2 parents 10f1caa + f5205ce commit 4aa1a8b

File tree

2 files changed: +77 -3 lines

docs/source/use_cases/kv-cache-aware-routing.rst

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 KV Cache Aware Routing
 ======================

-This tutorial demonstrates how to use KV cache aware routing in the vLLM Production Stack. KV cache aware routing ensures that subsequent requests with the same prompt prefix are routed to the same instance, maximizing KV cache utilization and improving performance.
+In this tutorial, you'll learn how to enable and use KV cache aware routing in the vLLM Production Stack. With KV cache aware routing, incoming requests are routed to the instance with the highest KV cache hit rate, which helps maximize cache efficiency and boost overall performance. Unlike prefix aware routing—which always sends requests with the same prefix to the same instance, even if the cache has been evicted—KV cache aware routing prioritizes cache hits to optimize resource usage.

 Table of Contents
 -----------------
@@ -78,7 +78,7 @@ Then, send another request with the same prompt prefix:
 "max_tokens": 100
 }'

-You should observe that the second request is routed to the same instance as the first request. This is because the KV cache aware router detects that the second request shares a prefix with the first request and routes it to the same instance to maximize KV cache utilization.
+You should observe that the second request is routed to the same instance as the first request. This is because the KV cache aware router detects that the second request has a higher KV cache hit rate in the instance of the first request and routes it to the same instance to maximize KV cache utilization.

 Step 4: Clean Up
 -----------------
@@ -98,4 +98,4 @@ In this tutorial, we've demonstrated how to:
 2. Set up port forwarding to access the router
 3. Test the KV cache aware routing functionality

-The KV cache aware routing feature helps improve performance by ensuring that requests with shared prefixes are routed to the same instance, maximizing KV cache utilization.
+The KV cache aware routing feature helps improve performance by ensuring that requests will be routed to the instance with the highest KV cache hit rate, maximizing KV cache utilization.
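
As a rough illustration of the routing behavior the updated doc text describes, the sketch below picks the endpoint with the highest estimated KV cache hit rate instead of keying on the prompt prefix. It is not the vllm-router implementation; the function name, endpoint labels, and hit-rate values are assumptions made up for the example.

# Hypothetical sketch, not the actual vllm-router routing logic.
from typing import Dict, List


def pick_endpoint(endpoints: List[str], kv_hit_rates: Dict[str, float]) -> str:
    """Route to the endpoint with the highest estimated KV cache hit rate.

    Unlike prefix-aware routing, the decision tracks current cache contents,
    so an instance whose cache entries were evicted scores lower and stops
    attracting requests it can no longer serve from cache.
    """
    return max(endpoints, key=lambda url: kv_hit_rates.get(url, 0.0))


# The second request scores highest on the instance that already holds the
# first request's KV cache, so it is routed back to the same instance.
assert pick_endpoint(["vllm-0", "vllm-1"], {"vllm-0": 0.92, "vllm-1": 0.10}) == "vllm-0"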

src/tests/test_roundrobin_router.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import random
+from typing import Dict, List, Tuple
+
+from vllm_router.routers.routing_logic import RoundRobinRouter
+
+
+class EndpointInfo:
+    def __init__(self, url: str):
+        self.url = url
+
+
+class RequestStats:
+    def __init__(self, qps: float):
+        self.qps = qps
+
+
+class Request:
+    def __init__(self, headers: Dict[str, str]):
+        self.headers = headers
+
+
+class EngineStats:
+    def __init__(self):
+        return
+
+
+def generate_request_args(
+    num_endpoints: int, qps_range: int = 0
+) -> Tuple[List[EndpointInfo], Dict[str, EngineStats], Dict[str, RequestStats]]:
+    endpoints = [
+        EndpointInfo(
+            url=f"{endpoint_index}",
+        )
+        for endpoint_index in range(num_endpoints)
+    ]
+    engine_stats = {
+        f"{endpoint_index}": EngineStats() for endpoint_index in range(num_endpoints)
+    }
+    request_stats = {
+        f"{endpoint_index}": RequestStats(qps=random.uniform(0, qps_range))
+        for endpoint_index in range(num_endpoints)
+    }
+    return endpoints, engine_stats, request_stats
+
+
+def generate_request(request_type="http") -> Request:
+    return Request({"type": request_type})
+
+
+def test_roundrobin_logic(
+    dynamic_discoveries: int = 10, max_endpoints: int = 1000, max_requests: int = 10000
+):
+    """
+    Ensure that all active urls have roughly same number of requests (difference at most 1)
+    """
+    router = RoundRobinRouter()
+
+    def _fixed_router_check(num_endpoints: int, num_requests: int) -> bool:
+        # Make num_requests requests to the router and check even output distribution
+        endpoints, engine_stats, request_stats = generate_request_args(num_endpoints)
+        output_distribution = {}
+        for request_idx in range(num_requests):
+            request = generate_request()
+            url = router.route_request(endpoints, engine_stats, request_stats, request)
+            output_distribution[url] = output_distribution.get(url, 0) + 1
+        request_counts = output_distribution.values()
+        return max(request_counts) - min(request_counts) <= 1
+
+    for _ in range(dynamic_discoveries):
+        num_endpoints = random.randint(1, max_endpoints)
+        num_requests = random.randint(1, max_requests)
+        # Perform router check
+        res = _fixed_router_check(num_endpoints, num_requests)
+        assert res
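
The new test drives RoundRobinRouter only through its route_request(endpoints, engine_stats, request_stats, request) call and asserts that, for any fixed set of endpoints, per-endpoint request counts differ by at most one. As a minimal sketch of a router that would satisfy that invariant (a hypothetical stand-in, not the class from vllm_router.routers.routing_logic):

# Hypothetical stand-in used only to illustrate the invariant the test checks;
# the real router is vllm_router.routers.routing_logic.RoundRobinRouter.
from typing import Any, Dict, List


class SimpleRoundRobinRouter:
    """Cycle through endpoints so request counts differ by at most one."""

    def __init__(self) -> None:
        self._counter = 0

    def route_request(
        self,
        endpoints: List[Any],
        engine_stats: Dict[str, Any],
        request_stats: Dict[str, Any],
        request: Any,
    ) -> str:
        # Engine and request stats are ignored: round robin only needs a counter.
        url = endpoints[self._counter % len(endpoints)].url
        self._counter += 1
        return url

The test itself can be run directly with pytest, e.g. pytest src/tests/test_roundrobin_router.py.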
