
Commit f084aeb

[Serve] Fix OpenAIIngress scale-to-zero when all models have min_replicas=0
Signed-off-by: thjung123 <jeothen@gmail.com>
1 parent f5a53c4 commit f084aeb

2 files changed: 133 additions, 1 deletion


python/ray/llm/_internal/serve/core/ingress/ingress.py

Lines changed: 26 additions & 1 deletion
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import json
 import sys
 from contextlib import asynccontextmanager
@@ -97,6 +98,23 @@
     },
 }

+
+def _get_min_replicas_from_llm_config(config: LLMConfig) -> Optional[int]:
+    autoscaling_config = config.deployment_config.get("autoscaling_config")
+    if autoscaling_config is None:
+        return None
+    if isinstance(autoscaling_config, dict):
+        return autoscaling_config.get("min_replicas")
+    return getattr(autoscaling_config, "min_replicas", None)
+
+
+def _all_models_scale_to_zero(llm_configs: Optional[List[LLMConfig]]) -> bool:
+    """Check if all models are configured with min_replicas == 0."""
+    if not llm_configs:
+        return False
+    return all(_get_min_replicas_from_llm_config(config) == 0 for config in llm_configs)
+
+
 # These methods correspond to functions defined in the LLMEngine class in python/ray/llm/_internal/serve/deployments/llm/llm_engine.py
 class CallMethod(Enum):
     CHAT = "chat"
@@ -763,10 +781,17 @@ def get_deployment_options(
     ) -> Dict[str, Any]:
         """Get the deployment options for the ingress deployment.

+        If all models are configured with min_replicas=0 (scale-to-zero),
+        the ingress will also be configured with min_replicas=0 so that
+        the worker node/GPU instance can be fully released when idle.
+
         Args:
             llm_configs: The LLM configs to infer the number of ingress replicas from.

         Returns:
             A dictionary containing the deployment options for the ingress deployment.
         """
-        return DEFAULT_INGRESS_OPTIONS
+        options = copy.deepcopy(DEFAULT_INGRESS_OPTIONS)
+        if _all_models_scale_to_zero(llm_configs):
+            options.setdefault("autoscaling_config", {})["min_replicas"] = 0
+        return options
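
For reference, a minimal sketch of how the new helpers evaluate a set of model configs. The import paths below are assumptions: the helper path is inferred from the file location above, and LLMConfig/ModelLoadingConfig are assumed to be importable from ray.serve.llm as in the test file; treat this as an illustration, not part of the commit.

# Illustration only: import paths are assumptions inferred from the commit.
from ray.llm._internal.serve.core.ingress.ingress import (
    _all_models_scale_to_zero,
    _get_min_replicas_from_llm_config,
)
from ray.serve.llm import LLMConfig, ModelLoadingConfig

scale_to_zero_cfg = LLMConfig(
    model_loading_config=ModelLoadingConfig(model_id="model_a"),
    accelerator_type="L4",
    deployment_config={"autoscaling_config": {"min_replicas": 0, "max_replicas": 2}},
)
no_autoscaling_cfg = LLMConfig(
    model_loading_config=ModelLoadingConfig(model_id="model_b"),
    accelerator_type="L4",
)

# A model is "scale-to-zero" only if its autoscaling_config sets min_replicas=0.
assert _get_min_replicas_from_llm_config(scale_to_zero_cfg) == 0
assert _all_models_scale_to_zero([scale_to_zero_cfg]) is True
# One model that cannot scale to zero keeps the ingress at its default options.
assert _all_models_scale_to_zero([scale_to_zero_cfg, no_autoscaling_cfg]) is False
# A missing or empty list also falls back to the default ingress options.
assert _all_models_scale_to_zero(None) is False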

python/ray/llm/tests/serve/cpu/deployments/routers/test_builder_ingress.py

Lines changed: 107 additions & 0 deletions
@@ -372,6 +372,113 @@ def test_user_target_ongoing_requests_respected(
     assert autoscaling_config.target_ongoing_requests == user_target


+class TestIngressScaleToZero:
+    """Tests for ingress scale-to-zero behavior when all models have min_replicas=0."""
+
+    def test_all_models_scale_to_zero(self, disable_placement_bundles):
+        """When all models have min_replicas=0, ingress should also have min_replicas=0."""
+        llm_cfg_dict_autoscaling = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+        llm_cfg_obj_autoscaling = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_b"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": AutoscalingConfig(
+                    min_replicas=0,
+                    max_replicas=4,
+                )
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg_dict_autoscaling, llm_cfg_obj_autoscaling],
+            )
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        assert autoscaling_config.min_replicas == 0
+
+    def test_mixed_min_replicas_keeps_default(self, disable_placement_bundles):
+        """When some models have min_replicas>0, ingress should keep default min_replicas."""
+        llm_cfg_zero = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+        llm_cfg_nonzero = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_b"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": AutoscalingConfig(
+                    min_replicas=1,
+                    max_replicas=4,
+                )
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg_zero, llm_cfg_nonzero],
+            )
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        # Default min_replicas from AutoscalingConfig is 1
+        assert autoscaling_config.min_replicas == 1
+
+    def test_no_autoscaling_config_keeps_default(self, disable_placement_bundles):
+        """When models don't have autoscaling_config, ingress should keep default."""
+        llm_cfg = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(llm_configs=[llm_cfg]),
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        assert autoscaling_config.min_replicas == 1
+
+    def test_user_override_takes_precedence(self, disable_placement_bundles):
+        """User-specified ingress min_replicas should override scale-to-zero logic."""
+        llm_cfg = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="model_a"),
+            accelerator_type="L4",
+            deployment_config={
+                "autoscaling_config": {
+                    "min_replicas": 0,
+                    "max_replicas": 2,
+                }
+            },
+        )
+
+        app = build_openai_app(
+            LLMServingArgs(
+                llm_configs=[llm_cfg],
+                ingress_deployment_config={
+                    "autoscaling_config": {
+                        "min_replicas": 3,
+                        "max_replicas": 5,
+                    }
+                },
+            )
+        )
+        autoscaling_config = app._bound_deployment._deployment_config.autoscaling_config
+        assert autoscaling_config.min_replicas == 3
+
+
 def extract_applications_from_output(output: bytes) -> dict:
     """
     Extracts the 'applications' block from mixed output and returns it as a dict.
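
For completeness, a hedged end-to-end sketch of the user-facing behavior this commit enables. The ray.serve.llm import path and the serve.run call are assumptions about the public API and are not part of the commit; the config shape mirrors the tests above.

# Sketch only: assumes the public ray.serve.llm API exposes these names.
from ray import serve
from ray.serve.llm import LLMConfig, LLMServingArgs, ModelLoadingConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config=ModelLoadingConfig(model_id="model_a"),
    accelerator_type="L4",
    deployment_config={
        # min_replicas=0 lets this model scale to zero when idle.
        "autoscaling_config": {"min_replicas": 0, "max_replicas": 2},
    },
)

# Because every model in llm_configs allows scale-to-zero, the ingress built by
# build_openai_app now also gets min_replicas=0, so idle worker/GPU nodes can be released.
app = build_openai_app(LLMServingArgs(llm_configs=[llm_config]))
serve.run(app)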
