Skip to content

Commit a956b41

Browse files
make scale-out policy consistent between deployments (#70)
1 parent 0869543 commit a956b41

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

llmserve/backend/server/run.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,12 @@ def llm_server(args: Union[str, LLMApp, List[Union[LLMApp, str]]]):
7878
user_config=user_config,
7979
**deployment_config,
8080
).bind()
81-
# test = []
82-
return RouterDeployment.bind(deployments, model_configs) # pylint:disable=no-member
81+
82+
return RouterDeployment.options(
83+
name=_reverse_prefix(model.model_config.model_id) + "-route",
84+
max_concurrent_queries=max_concurrent_queries,
85+
**deployment_config,
86+
).bind(deployments, model_configs) # pylint:disable=no-member
8387

8488

8589
def llm_experimental(args: Union[str, LLMApp, List[Union[LLMApp, str]]]):

models/text-generation--facebook--opt-125m.yaml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
deployment_config:
2+
max_concurrent_queries: 100
23
autoscaling_config:
34
min_replicas: 1
45
initial_replicas: 1
5-
max_replicas: 8
6-
target_num_ongoing_requests_per_replica: 1.0
6+
max_replicas: 3
7+
target_num_ongoing_requests_per_replica: 10
78
metrics_interval_s: 10.0
89
look_back_period_s: 30.0
910
smoothing_factor: 1.0
1011
downscale_delay_s: 300.0
11-
upscale_delay_s: 90.0
12+
upscale_delay_s: 60.0
1213
ray_actor_options:
1314
num_cpus: 0.1 # for a model deployment, we have 3 actors created; actors 1 and 2 will each cost 0.1 CPU, and the model inference will cost 6 (see the setting at the end of the file)
1415
model_config:
@@ -48,4 +49,4 @@ model_config:
4849
scaling_config:
4950
num_workers: 1
5051
num_gpus_per_worker: 0
51-
num_cpus_per_worker: 3 # for inference
52+
num_cpus_per_worker: 2 # for inference

0 commit comments

Comments (0)