vllm-project
diff --git a/‎benchmarks/multi-round-qa/multi-round-qa.py
Lines changed: 17 additions & 0 deletions b/‎benchmarks/multi-round-qa/multi-round-qa.py
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/vllm_router/app.py
Lines changed: 18 additions & 0 deletions b/‎src/vllm_router/app.py
Lines changed: 18 additions & 0 deletions
diff --git a/‎src/vllm_router/parsers/parser.py
Lines changed: 15 additions & 7 deletions b/‎src/vllm_router/parsers/parser.py
Lines changed: 15 additions & 7 deletions
@@ -40,6 +40,9 @@ class WorkloadConfig:
     # Whether to include user id in request header
     enable_user_id: bool
 
+    # Max number of unfinished queries allowed (None means no limit)
+    max_unfinished_queries: Optional[int]
+
 
 @dataclass
 class UserConfig:
@@ -419,6 +422,13 @@ def step(self, timestamp: float, executor: RequestExecutor):
         if self.start_time is None:
             self.start_time = timestamp
 
+        pending_queries = len([s for s in self.sessions if s.has_unfinished_request])
+        # Only check limit if max_unfinished_queries is set
+        if (self.workload_config.max_unfinished_queries is not None and 
+            pending_queries > self.workload_config.max_unfinished_queries):
+            logger.info(f"unfinished queries >{self.workload_config.max_unfinished_queries}, waiting")
+            return
+
         if timestamp - self.last_user_join > self.gap_between_users:
             self._create_user_session()
             self.last_user_join = timestamp
@@ -625,6 +635,12 @@ def parse_arguments() -> WorkloadConfig:
     parser.add_argument(
         "--sharegpt", action="store_true", help="Whether to use ShareGPT dataset"
     )
+    parser.add_argument(
+        "--max-unfinished-queries",
+        type=int,
+        default=None,
+        help="Maximum number of unfinished queries allowed (default: no limit)",
+    )
     args = parser.parse_args()
     return args
 
@@ -675,6 +691,7 @@ def main():
         qps=args.qps,
         model=args.model,
         enable_user_id=args.request_with_user_id,
+        max_unfinished_queries=args.max_unfinished_queries,
     )
 
     manager = UserSessionManager(
 
@@ -109,6 +109,21 @@ async def lifespan(app: FastAPI):
         dyn_cfg_watcher.close()
 
 
+def create_instance_id_to_url(lmcache_instances, static_backends):
+    """Map instance ids to backend URLs by position; None if either is unset/empty."""
+    if lmcache_instances is None or static_backends is None:
+        return None
+    # Trim ids and drop blanks: "".split(",") yields [""], which is truthy and
+    # would otherwise slip past the emptiness check below.
+    instance_ids = [s.strip() for s in lmcache_instances.split(",") if s.strip()]
+    urls = parse_static_urls(static_backends)
+    if not instance_ids or not urls:
+        return None
+    if len(instance_ids) != len(urls):
+        raise ValueError("length of lmcache-instances & static-backends mismatched")
+    return dict(zip(instance_ids, urls))
+
+
 def initialize_all(app: FastAPI, args):
     """
     Initialize all the components of the router with the given arguments.
@@ -206,6 +221,9 @@ def initialize_all(app: FastAPI, args):
         prefill_model_labels=args.prefill_model_labels,
         decode_model_labels=args.decode_model_labels,
         kv_aware_threshold=args.kv_aware_threshold,
+        tokenizer=args.tokenizer,
+        instance_id_to_url=create_instance_id_to_url(args.lmcache_instances,
+                                                     args.static_backends),
     )
 
     # Initialize feature gates
 
@@ -20,6 +20,7 @@
 from vllm_router.parsers.yaml_utils import (
     read_and_process_yaml_config_file,
 )
+from vllm_router.routers.routing_logic import RoutingLogic
 from vllm_router.version import __version__
 
 try:
@@ -203,13 +204,7 @@ def parse_args():
     parser.add_argument(
         "--routing-logic",
         type=str,
-        choices=[
-            "roundrobin",
-            "session",
-            "kvaware",
-            "prefixaware",
-            "disaggregated_prefill",
-        ],
+        choices=[routing for routing in RoutingLogic],
         help="The routing logic to use",
     )
     parser.add_argument(
@@ -218,12 +213,25 @@ def parse_args():
         default=9000,
         help="The port of the LMCache controller.",
     )
+    parser.add_argument(
+        "--lmcache-instances",
+        type=str,
+        default=None,
+        help="The instance id in the lmcache config files, must be with the length of static-backends,"
+             " separated by commas. E.g., instance_0,instance_1",
+    )
     parser.add_argument(
         "--session-key",
         type=str,
         default=None,
         help="The key (in the header) to identify a session.",
     )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        default=None,
+        help="The tokenizer model.",
+    )
     parser.add_argument(
         "--callbacks",
         type=str,