Skip to content

Commit 2361a17

Browse files
committed
add TtftRouter
Signed-off-by: chickeyton <[email protected]>
1 parent 6b0a04a commit 2361a17

File tree

6 files changed

+416
-55
lines changed

6 files changed

+416
-55
lines changed

benchmarks/multi-round-qa/multi-round-qa.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,9 @@ class WorkloadConfig:
4040
# Whether to include user id in request header
4141
enable_user_id: bool
4242

43+
# Max number of unfinished queries allowed (None means no limit)
44+
max_unfinished_queries: Optional[int]
45+
4346

4447
@dataclass
4548
class UserConfig:
@@ -419,6 +422,13 @@ def step(self, timestamp: float, executor: RequestExecutor):
419422
if self.start_time is None:
420423
self.start_time = timestamp
421424

425+
pending_queries = len([s for s in self.sessions if s.has_unfinished_request])
426+
# Only check limit if max_unfinished_queries is set
427+
if (self.workload_config.max_unfinished_queries is not None and
428+
pending_queries > self.workload_config.max_unfinished_queries):
429+
logger.info(f"unfinished queries >{self.workload_config.max_unfinished_queries}, waiting")
430+
return
431+
422432
if timestamp - self.last_user_join > self.gap_between_users:
423433
self._create_user_session()
424434
self.last_user_join = timestamp
@@ -625,6 +635,12 @@ def parse_arguments() -> WorkloadConfig:
625635
parser.add_argument(
626636
"--sharegpt", action="store_true", help="Whether to use ShareGPT dataset"
627637
)
638+
parser.add_argument(
639+
"--max-unfinished-queries",
640+
type=int,
641+
default=None,
642+
help="Maximum number of unfinished queries allowed (default: no limit)",
643+
)
628644
args = parser.parse_args()
629645
return args
630646

@@ -675,6 +691,7 @@ def main():
675691
qps=args.qps,
676692
model=args.model,
677693
enable_user_id=args.request_with_user_id,
694+
max_unfinished_queries=args.max_unfinished_queries,
678695
)
679696

680697
manager = UserSessionManager(

src/vllm_router/app.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,21 @@ async def lifespan(app: FastAPI):
109109
dyn_cfg_watcher.close()
110110

111111

112+
def create_instance_id_to_url(lmcache_instances, static_backends):
    """Map each LMCache instance id to the backend URL at the same position.

    Args:
        lmcache_instances: Comma-separated instance ids, or None.
        static_backends: Value handed to ``parse_static_urls``
            (presumably a comma-separated URL list -- confirm against
            that helper), or None.

    Returns:
        A dict ``{instance_id: url}`` pairing ids and URLs by position, or
        None when either argument is None, the id string is blank, or no
        URLs were parsed.

    Raises:
        ValueError: If an instance id is empty, or the number of ids does
            not match the number of URLs.
    """
    if lmcache_instances is None or static_backends is None:
        return None
    # str.split(",") never yields an empty list ("" -> [""]), so a blank
    # argument has to be detected explicitly instead of via `not instance_ids`.
    if not lmcache_instances.strip():
        return None
    # Trim every id so "instance_0, instance_1" does not create keys that
    # carry stray whitespace and then never match a real instance id.
    instance_ids = [part.strip() for part in lmcache_instances.split(",")]
    if not all(instance_ids):
        # An empty id (e.g. "a,,b" or a trailing comma) would otherwise be
        # stored silently under the key "".
        raise ValueError("lmcache-instances contains an empty instance id")
    urls = parse_static_urls(static_backends)
    if not urls:
        return None
    if len(instance_ids) != len(urls):
        raise ValueError("length of lmcache-instances & static-backends mismatched")
    return dict(zip(instance_ids, urls))
125+
126+
112127
def initialize_all(app: FastAPI, args):
113128
"""
114129
Initialize all the components of the router with the given arguments.
@@ -206,6 +221,9 @@ def initialize_all(app: FastAPI, args):
206221
prefill_model_labels=args.prefill_model_labels,
207222
decode_model_labels=args.decode_model_labels,
208223
kv_aware_threshold=args.kv_aware_threshold,
224+
tokenizer=args.tokenizer,
225+
instance_id_to_url=create_instance_id_to_url(args.lmcache_instances,
226+
args.static_backends),
209227
)
210228

211229
# Initialize feature gates

src/vllm_router/parsers/parser.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from vllm_router.parsers.yaml_utils import (
2121
read_and_process_yaml_config_file,
2222
)
23+
from vllm_router.routers.routing_logic import RoutingLogic
2324
from vllm_router.version import __version__
2425

2526
try:
@@ -203,13 +204,7 @@ def parse_args():
203204
parser.add_argument(
204205
"--routing-logic",
205206
type=str,
206-
choices=[
207-
"roundrobin",
208-
"session",
209-
"kvaware",
210-
"prefixaware",
211-
"disaggregated_prefill",
212-
],
207+
choices=[routing for routing in RoutingLogic],
213208
help="The routing logic to use",
214209
)
215210
parser.add_argument(
@@ -218,12 +213,25 @@ def parse_args():
218213
default=9000,
219214
help="The port of the LMCache controller.",
220215
)
216+
parser.add_argument(
217+
"--lmcache-instances",
218+
type=str,
219+
default=None,
220+
help="The instance id in the lmcache config files, must be with the length of static-backends,"
221+
" separated by commas. E.g., instance_0,instance_1",
222+
)
221223
parser.add_argument(
222224
"--session-key",
223225
type=str,
224226
default=None,
225227
help="The key (in the header) to identify a session.",
226228
)
229+
parser.add_argument(
230+
"--tokenizer",
231+
type=str,
232+
default=None,
233+
help="The tokenizer model.",
234+
)
227235
parser.add_argument(
228236
"--callbacks",
229237
type=str,

0 commit comments

Comments
 (0)