Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions benchmarks/multi-round-qa/multi-round-qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ class WorkloadConfig:
# Whether to include user id in request header
enable_user_id: bool

# Max number of unfinished queries allowed (None means no limit)
max_unfinished_queries: Optional[int]


@dataclass
class UserConfig:
Expand Down Expand Up @@ -419,6 +422,13 @@ def step(self, timestamp: float, executor: RequestExecutor):
if self.start_time is None:
self.start_time = timestamp

pending_queries = len([s for s in self.sessions if s.has_unfinished_request])
# Only check limit if max_unfinished_queries is set
if (self.workload_config.max_unfinished_queries is not None and
pending_queries > self.workload_config.max_unfinished_queries):
logger.info(f"unfinished queries >{self.workload_config.max_unfinished_queries}, waiting")
return

if timestamp - self.last_user_join > self.gap_between_users:
self._create_user_session()
self.last_user_join = timestamp
Expand Down Expand Up @@ -625,6 +635,12 @@ def parse_arguments() -> WorkloadConfig:
parser.add_argument(
"--sharegpt", action="store_true", help="Whether to use ShareGPT dataset"
)
parser.add_argument(
"--max-unfinished-queries",
type=int,
default=None,
help="Maximum number of unfinished queries allowed (default: no limit)",
)
args = parser.parse_args()
return args

Expand Down Expand Up @@ -675,6 +691,7 @@ def main():
qps=args.qps,
model=args.model,
enable_user_id=args.request_with_user_id,
max_unfinished_queries=args.max_unfinished_queries,
)

manager = UserSessionManager(
Expand Down
15 changes: 15 additions & 0 deletions src/vllm_router/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,18 @@ async def lifespan(app: FastAPI):
dyn_cfg_watcher.close()


def create_instance_id_to_url(lmcache_instances, static_backends):
    """Build a mapping from LMCache instance ids to static backend URLs.

    Args:
        lmcache_instances: Comma-separated instance ids, or None. Surrounding
            whitespace is stripped from each id and empty entries are dropped.
        static_backends: Backend specification handed to ``parse_static_urls``,
            or None.

    Returns:
        A dict pairing the i-th instance id with the i-th parsed URL, or None
        when either argument is None or when either parsed list is empty.

    Raises:
        ValueError: If the number of instance ids differs from the number of
            parsed URLs.
    """
    # Nothing to map unless both options were supplied.
    if lmcache_instances is None:
        return None
    if static_backends is None:
        return None

    trimmed = (piece.strip() for piece in lmcache_instances.split(","))
    ids = [name for name in trimmed if name]
    # NOTE(review): parse_static_urls is defined elsewhere; presumably it
    # returns a sequence of URL strings -- confirm against its definition.
    backend_urls = parse_static_urls(static_backends)

    if not (ids and backend_urls):
        return None

    # Pairing is positional, so both sides must have the same length.
    if len(ids) != len(backend_urls):
        raise ValueError("length of lmcache-instances & static-backends mismatched")

    pairs = zip(ids, backend_urls)
    return dict(pairs)


def initialize_all(app: FastAPI, args):
"""
Initialize all the components of the router with the given arguments.
Expand Down Expand Up @@ -206,6 +218,9 @@ def initialize_all(app: FastAPI, args):
prefill_model_labels=args.prefill_model_labels,
decode_model_labels=args.decode_model_labels,
kv_aware_threshold=args.kv_aware_threshold,
tokenizer=args.tokenizer,
instance_id_to_url=create_instance_id_to_url(args.lmcache_instances,
args.static_backends),
)

# Initialize feature gates
Expand Down
22 changes: 15 additions & 7 deletions src/vllm_router/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from vllm_router.parsers.yaml_utils import (
read_and_process_yaml_config_file,
)
from vllm_router.routers.routing_logic import RoutingLogic
from vllm_router.version import __version__

try:
Expand Down Expand Up @@ -203,13 +204,7 @@ def parse_args():
parser.add_argument(
"--routing-logic",
type=str,
choices=[
"roundrobin",
"session",
"kvaware",
"prefixaware",
"disaggregated_prefill",
],
choices=[routing for routing in RoutingLogic],
help="The routing logic to use",
)
parser.add_argument(
Expand All @@ -218,12 +213,25 @@ def parse_args():
default=9000,
help="The port of the LMCache controller.",
)
parser.add_argument(
"--lmcache-instances",
type=str,
default=None,
help="The instance id in the lmcache config files, must be with the length of static-backends,"
" separated by commas. E.g., instance_0,instance_1",
)
parser.add_argument(
"--session-key",
type=str,
default=None,
help="The key (in the header) to identify a session.",
)
parser.add_argument(
"--tokenizer",
type=str,
default=None,
help="The tokenizer model.",
)
parser.add_argument(
"--callbacks",
type=str,
Expand Down
Loading