Commit 38ff781

[Serve] Fix OpenAIIngress scale-to-zero when all models have min_replicas=0

1 parent f5a53c4

File tree: 2 files changed, +147 -1 lines changed

python/ray/llm/_internal/serve/core/ingress/ingress.py
(32 additions, 1 deletion)
@@ -97,6 +97,27 @@
     },
 }
 
+
+def _get_min_replicas_from_llm_config(config: LLMConfig) -> Optional[int]:
+    autoscaling_config = config.deployment_config.get("autoscaling_config")
+    if autoscaling_config is None:
+        return None
+    if isinstance(autoscaling_config, dict):
+        return autoscaling_config.get("min_replicas")
+    return getattr(autoscaling_config, "min_replicas", None)
+
+
+def _all_models_scale_to_zero(llm_configs: Optional[List[LLMConfig]]) -> bool:
+    """Check if all models are configured with min_replicas == 0."""
+    if not llm_configs:
+        return False
+    for config in llm_configs:
+        min_replicas = _get_min_replicas_from_llm_config(config)
+        if min_replicas != 0:
+            return False
+    return True
+
+
 # These methods correspond to functions defined in the LLMEngine class in python/ray/llm/_internal/serve/deployments/llm/llm_engine.py
 class CallMethod(Enum):
     CHAT = "chat"
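
For clarity, here is a minimal sketch (not part of this diff) of the two autoscaling_config shapes the new helpers normalize: a plain dict, read via .get(), and an AutoscalingConfig object, read via getattr(). The import paths and model IDs are assumptions based on the test file below; the private helpers are imported from the module shown above.

from ray.serve.config import AutoscalingConfig
from ray.serve.llm import LLMConfig, ModelLoadingConfig

# Private helper added in this commit; path taken from the file shown above.
from ray.llm._internal.serve.core.ingress.ingress import _all_models_scale_to_zero

# Shape 1: autoscaling_config as a plain dict.
cfg_dict = LLMConfig(
    model_loading_config=ModelLoadingConfig(model_id="model_a"),  # hypothetical ID
    accelerator_type="L4",
    deployment_config={
        "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
    },
)

# Shape 2: autoscaling_config as an AutoscalingConfig object.
cfg_obj = LLMConfig(
    model_loading_config=ModelLoadingConfig(model_id="model_b"),  # hypothetical ID
    accelerator_type="L4",
    deployment_config={
        "autoscaling_config": AutoscalingConfig(min_replicas=0, max_replicas=4)
    },
)

assert _all_models_scale_to_zero([cfg_dict, cfg_obj])
# An empty or missing config list is treated conservatively: no scale-to-zero.
assert not _all_models_scale_to_zero([])
assert not _all_models_scale_to_zero(None)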
@@ -763,10 +784,20 @@ def get_deployment_options(
 ) -> Dict[str, Any]:
     """Get the deployment options for the ingress deployment.
 
+    If all models are configured with min_replicas=0 (scale-to-zero),
+    the ingress will also be configured with min_replicas=0 so that
+    the worker node/GPU instance can be fully released when idle.
+
     Args:
         llm_configs: The LLM configs to infer the number of ingress replicas from.
 
     Returns:
         A dictionary containing the deployment options for the ingress deployment.
     """
-    return DEFAULT_INGRESS_OPTIONS
+    options = {
+        k: (v.copy() if isinstance(v, dict) else v)
+        for k, v in DEFAULT_INGRESS_OPTIONS.items()
+    }
+    if _all_models_scale_to_zero(llm_configs):
+        options.setdefault("autoscaling_config", {})["min_replicas"] = 0
+    return options
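
Taken together, the change means an app whose models can all scale to zero now gets an ingress that can scale to zero too, rather than a permanently pinned ingress replica holding the node. A usage sketch based on the tests below; the import paths and the model ID are assumptions, not part of this diff:

from ray.serve.llm import (
    LLMConfig,
    LLMServingArgs,
    ModelLoadingConfig,
    build_openai_app,
)

app = build_openai_app(
    LLMServingArgs(
        llm_configs=[
            LLMConfig(
                model_loading_config=ModelLoadingConfig(model_id="model_a"),  # hypothetical ID
                accelerator_type="L4",
                deployment_config={
                    "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
                },
            )
        ]
    )
)
# Every model has min_replicas=0, so the ingress deployment is also built with
# min_replicas=0 and an idle cluster can release the GPU worker node.

Note that get_deployment_options now copies DEFAULT_INGRESS_OPTIONS (one level deep, via v.copy() for dict values) before mutating it, so the module-level default is never modified across calls.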

python/ray/llm/tests/serve/cpu/deployments/routers/test_builder_ingress.py
(115 additions, 0 deletions)
@@ -372,6 +372,121 @@ def test_user_target_ongoing_requests_respected(
     assert autoscaling_config.target_ongoing_requests == user_target
 
 
+class TestIngressScaleToZero:
+    """Tests for ingress scale-to-zero behavior when all models have min_replicas=0."""
+
+    def test_all_models_scale_to_zero(self, disable_placement_bundles):
+        """When all models have min_replicas=0, ingress should also have min_replicas=0."""
+        llm_cfg_dict_autoscaling = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+        llm_cfg_obj_autoscaling = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_b"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": AutoscalingConfig(
+                    min_replicas=0,
+                    max_replicas=4,
+                )
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg_dict_autoscaling, llm_cfg_obj_autoscaling],
+            )
+        )
+        autoscaling_config = (
+            app._bound_deployment._deployment_config.autoscaling_config
+        )
+        assert autoscaling_config.min_replicas == 0
+
+    def test_mixed_min_replicas_keeps_default(self, disable_placement_bundles):
+        """When some models have min_replicas>0, ingress should keep default min_replicas."""
+        llm_cfg_zero = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+        llm_cfg_nonzero = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_b"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": AutoscalingConfig(
+                    min_replicas=1,
+                    max_replicas=4,
+                )
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg_zero, llm_cfg_nonzero],
+            )
+        )
+        autoscaling_config = (
+            app._bound_deployment._deployment_config.autoscaling_config
+        )
+        # Default min_replicas from AutoscalingConfig is 1
+        assert autoscaling_config.min_replicas == 1
+
+    def test_no_autoscaling_config_keeps_default(self, disable_placement_bundles):
+        """When models don't have autoscaling_config, ingress should keep default."""
+        llm_cfg = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(llm_configs=[llm_cfg]),
+        )
+        autoscaling_config = (
+            app._bound_deployment._deployment_config.autoscaling_config
+        )
+        assert autoscaling_config.min_replicas == 1
+
+    def test_user_override_takes_precedence(self, disable_placement_bundles):
+        """User-specified ingress min_replicas should override scale-to-zero logic."""
+        llm_cfg = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg],
+                ingress_deployment_config={
+                    "autoscaling_config": {
+                        "min_replicas": 3,
+                        "max_replicas": 5,
+                    }
+                },
+            )
+        )
+        autoscaling_config = (
+            app._bound_deployment._deployment_config.autoscaling_config
+        )
+        assert autoscaling_config.min_replicas == 3
+
+
 def extract_applications_from_output(output: bytes) -> dict:
     """
     Extracts the 'applications' block from mixed output and returns it as a dict.
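
As the last test demonstrates, an explicit ingress_deployment_config still takes precedence over the inferred scale-to-zero default. A sketch of that opt-out, with import paths and replica counts as assumptions:

from ray.serve.llm import (
    LLMConfig,
    LLMServingArgs,
    ModelLoadingConfig,
    build_openai_app,
)

scale_to_zero_model = LLMConfig(
    model_loading_config=ModelLoadingConfig(model_id="model_a"),  # hypothetical ID
    accelerator_type="L4",
    deployment_config={
        "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
    },
)

# Pin the ingress at one replica even though every model scales to zero.
app = build_openai_app(
    LLMServingArgs(
        llm_configs=[scale_to_zero_model],
        ingress_deployment_config={
            "autoscaling_config": {"min_replicas": 1, "max_replicas": 5}
        },
    )
)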
