
Commit a324643

[Serve] Fix OpenAIIngress scale-to-zero when all models have min_replicas=0
Signed-off-by: thjung123 <jeothen@gmail.com>
1 parent: f5a53c4

File tree: 2 files changed, +135 -1 lines changed

  python/ray/llm/_internal/serve/core/ingress/ingress.py
  python/ray/llm/tests/serve/cpu/deployments/routers/test_builder_ingress.py
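
In practice, the fix means a deployment whose models all opt into scale-to-zero now gets an ingress that can scale to zero as well. A minimal sketch of the user-facing effect (the ray.serve.llm import path is assumed; the model id, accelerator type, and replica bounds are illustrative placeholders mirroring the new tests below):

    # Hedged sketch: assumes the public ray.serve.llm import path; model id,
    # accelerator type, and replica bounds are placeholders.
    from ray.serve.llm import (
        LLMConfig,
        LLMServingArgs,
        ModelLoadingConfig,
        build_openai_app,
    )

    llm_config = LLMConfig(
        model_loading_config=ModelLoadingConfig(model_id="my_model"),
        accelerator_type="L4",
        deployment_config={
            # Every model opts into scale-to-zero ...
            "autoscaling_config": {"min_replicas": 0, "max_replicas": 2},
        },
    )

    # ... so the OpenAI-compatible ingress is now also built with
    # min_replicas=0, letting the idle worker node/GPU be released entirely.
    app = build_openai_app(LLMServingArgs(llm_configs=[llm_config]))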

python/ray/llm/_internal/serve/core/ingress/ingress.py

Lines changed: 28 additions & 1 deletion

@@ -97,6 +97,23 @@
     },
 }
 
+
+def _get_min_replicas_from_llm_config(config: LLMConfig) -> Optional[int]:
+    autoscaling_config = config.deployment_config.get("autoscaling_config")
+    if autoscaling_config is None:
+        return None
+    if isinstance(autoscaling_config, dict):
+        return autoscaling_config.get("min_replicas")
+    return getattr(autoscaling_config, "min_replicas", None)
+
+
+def _all_models_scale_to_zero(llm_configs: Optional[List[LLMConfig]]) -> bool:
+    """Check if all models are configured with min_replicas == 0."""
+    if not llm_configs:
+        return False
+    return all(_get_min_replicas_from_llm_config(config) == 0 for config in llm_configs)
+
+
 # These methods correspond to functions defined in the LLMEngine class in python/ray/llm/_internal/serve/deployments/llm/llm_engine.py
 class CallMethod(Enum):
     CHAT = "chat"

@@ -763,10 +780,20 @@ def get_deployment_options(
 ) -> Dict[str, Any]:
     """Get the deployment options for the ingress deployment.
 
+    If all models are configured with min_replicas=0 (scale-to-zero),
+    the ingress will also be configured with min_replicas=0 so that
+    the worker node/GPU instance can be fully released when idle.
+
     Args:
         llm_configs: The LLM configs to infer the number of ingress replicas from.
 
     Returns:
         A dictionary containing the deployment options for the ingress deployment.
     """
-    return DEFAULT_INGRESS_OPTIONS
+    options = {
+        k: (v.copy() if isinstance(v, dict) else v)
+        for k, v in DEFAULT_INGRESS_OPTIONS.items()
+    }
+    if _all_models_scale_to_zero(llm_configs):
+        options.setdefault("autoscaling_config", {})["min_replicas"] = 0
+    return options
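
The helper pair accepts autoscaling_config as either a plain dict or an AutoscalingConfig object, and get_deployment_options copies nested dicts so DEFAULT_INGRESS_OPTIONS itself is never mutated. A standalone sketch of that dispatch logic, using a stub config class in place of the real LLMConfig/AutoscalingConfig:

    from typing import Any, Dict, List, Optional

    class _StubAutoscaling:
        # Stand-in for Ray Serve's AutoscalingConfig: the helper only
        # exercises attribute access on it.
        def __init__(self, min_replicas: int) -> None:
            self.min_replicas = min_replicas

    def _min_replicas(deployment_config: Dict[str, Any]) -> Optional[int]:
        # Same three-way handling as _get_min_replicas_from_llm_config:
        # missing config -> None, dict -> .get(), object -> getattr().
        autoscaling = deployment_config.get("autoscaling_config")
        if autoscaling is None:
            return None
        if isinstance(autoscaling, dict):
            return autoscaling.get("min_replicas")
        return getattr(autoscaling, "min_replicas", None)

    configs: List[Dict[str, Any]] = [
        {"autoscaling_config": {"min_replicas": 0}},    # dict form
        {"autoscaling_config": _StubAutoscaling(0)},    # object form
    ]
    # Mirrors _all_models_scale_to_zero: an empty list yields False, and
    # every model must report min_replicas == 0.
    assert bool(configs) and all(_min_replicas(c) == 0 for c in configs)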

python/ray/llm/tests/serve/cpu/deployments/routers/test_builder_ingress.py

Lines changed: 107 additions & 0 deletions

@@ -372,6 +372,113 @@ def test_user_target_ongoing_requests_respected(
     assert autoscaling_config.target_ongoing_requests == user_target
 
 
+class TestIngressScaleToZero:
+    """Tests for ingress scale-to-zero behavior when all models have min_replicas=0."""
+
+    def test_all_models_scale_to_zero(self, disable_placement_bundles):
+        """When all models have min_replicas=0, ingress should also have min_replicas=0."""
+        llm_cfg_dict_autoscaling = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+        llm_cfg_obj_autoscaling = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_b"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": AutoscalingConfig(
+                    min_replicas=0,
+                    max_replicas=4,
+                )
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg_dict_autoscaling, llm_cfg_obj_autoscaling],
+            )
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        assert autoscaling_config.min_replicas == 0
+
+    def test_mixed_min_replicas_keeps_default(self, disable_placement_bundles):
+        """When some models have min_replicas>0, ingress should keep default min_replicas."""
+        llm_cfg_zero = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+        llm_cfg_nonzero = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_b"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": AutoscalingConfig(
+                    min_replicas=1,
+                    max_replicas=4,
+                )
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg_zero, llm_cfg_nonzero],
+            )
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        # Default min_replicas from AutoscalingConfig is 1
+        assert autoscaling_config.min_replicas == 1
+
+    def test_no_autoscaling_config_keeps_default(self, disable_placement_bundles):
+        """When models don't have autoscaling_config, ingress should keep default."""
+        llm_cfg = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(llm_configs=[llm_cfg]),
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        assert autoscaling_config.min_replicas == 1
+
+    def test_user_override_takes_precedence(self, disable_placement_bundles):
+        """User-specified ingress min_replicas should override scale-to-zero logic."""
+        llm_cfg = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg],
+                ingress_deployment_config={
+                    "autoscaling_config": {
+                        "min_replicas": 3,
+                        "max_replicas": 5,
+                    }
+                },
+            )
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        assert autoscaling_config.min_replicas == 3
+
+
 def extract_applications_from_output(output: bytes) -> dict:
     """
     Extracts the 'applications' block from mixed output and returns it as a dict.
