@@ -372,6 +372,113 @@ def test_user_target_ongoing_requests_respected(
372372 assert autoscaling_config .target_ongoing_requests == user_target
373373
374374
class TestIngressScaleToZero:
    """Verify how the ingress deployment's min_replicas is derived from model configs.

    The ingress should scale to zero only when every served model can scale to
    zero; otherwise it keeps the AutoscalingConfig default, and an explicit
    user-provided ingress config always wins.
    """

    def test_all_models_scale_to_zero(self, disable_placement_bundles):
        """Ingress min_replicas becomes 0 when every model's min_replicas is 0."""
        # One model expresses autoscaling as a plain dict, the other as an
        # AutoscalingConfig object — both representations must be honored.
        cfg_from_dict = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
            },
        )
        cfg_from_obj = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_b"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": AutoscalingConfig(min_replicas=0, max_replicas=4)
            },
        )

        app = build_openai_app(
            LLMServingArgs(llm_configs=[cfg_from_dict, cfg_from_obj])
        )

        ingress_autoscaling = (
            app._bound_deployment._deployment_config.autoscaling_config
        )
        assert ingress_autoscaling.min_replicas == 0

    def test_mixed_min_replicas_keeps_default(self, disable_placement_bundles):
        """Ingress keeps the default min_replicas if any model has min_replicas > 0."""
        scale_to_zero_cfg = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
            },
        )
        always_on_cfg = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_b"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": AutoscalingConfig(min_replicas=1, max_replicas=4)
            },
        )

        app = build_openai_app(
            LLMServingArgs(llm_configs=[scale_to_zero_cfg, always_on_cfg])
        )

        ingress_autoscaling = (
            app._bound_deployment._deployment_config.autoscaling_config
        )
        # AutoscalingConfig defaults min_replicas to 1.
        assert ingress_autoscaling.min_replicas == 1

    def test_no_autoscaling_config_keeps_default(self, disable_placement_bundles):
        """Ingress keeps the default min_replicas when models omit autoscaling entirely."""
        plain_cfg = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
        )

        app = build_openai_app(
            LLMServingArgs(llm_configs=[plain_cfg]),
        )

        ingress_autoscaling = (
            app._bound_deployment._deployment_config.autoscaling_config
        )
        assert ingress_autoscaling.min_replicas == 1

    def test_user_override_takes_precedence(self, disable_placement_bundles):
        """An explicit ingress autoscaling config overrides the scale-to-zero inference."""
        scale_to_zero_cfg = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
            },
        )

        app = build_openai_app(
            LLMServingArgs(
                llm_configs=[scale_to_zero_cfg],
                # Even though the model allows min_replicas=0, the user's
                # explicit ingress setting must win.
                ingress_deployment_config={
                    "autoscaling_config": {"min_replicas": 3, "max_replicas": 5}
                },
            )
        )

        ingress_autoscaling = (
            app._bound_deployment._deployment_config.autoscaling_config
        )
        assert ingress_autoscaling.min_replicas == 3
480+
481+
375482def extract_applications_from_output (output : bytes ) -> dict :
376483 """
377484 Extracts the 'applications' block from mixed output and returns it as a dict.
0 commit comments