@@ -372,6 +372,121 @@ def test_user_target_ongoing_requests_respected(
372372 assert autoscaling_config .target_ongoing_requests == user_target
373373
374374
class TestIngressScaleToZero:
    """Tests for ingress scale-to-zero behavior when all models have min_replicas=0."""

    def test_all_models_scale_to_zero(self, disable_placement_bundles):
        """When all models have min_replicas=0, ingress should also have min_replicas=0."""
        # Cover both accepted forms of autoscaling_config: a plain dict and
        # an AutoscalingConfig object.
        model_with_dict_config = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
            },
        )
        model_with_object_config = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_b"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": AutoscalingConfig(min_replicas=0, max_replicas=4)
            },
        )

        serving_args = LLMServingArgs(
            llm_configs=[model_with_dict_config, model_with_object_config],
        )
        app = build_openai_app(serving_args)

        ingress_autoscaling = app._bound_deployment._deployment_config.autoscaling_config
        assert ingress_autoscaling.min_replicas == 0

    def test_mixed_min_replicas_keeps_default(self, disable_placement_bundles):
        """When some models have min_replicas>0, ingress should keep default min_replicas."""
        model_scales_to_zero = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
            },
        )
        model_always_on = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_b"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": AutoscalingConfig(min_replicas=1, max_replicas=4)
            },
        )

        serving_args = LLMServingArgs(
            llm_configs=[model_scales_to_zero, model_always_on],
        )
        app = build_openai_app(serving_args)

        ingress_autoscaling = app._bound_deployment._deployment_config.autoscaling_config
        # Default min_replicas from AutoscalingConfig is 1
        assert ingress_autoscaling.min_replicas == 1

    def test_no_autoscaling_config_keeps_default(self, disable_placement_bundles):
        """When models don't have autoscaling_config, ingress should keep default."""
        model_config = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
        )

        app = build_openai_app(
            LLMServingArgs(llm_configs=[model_config]),
        )

        ingress_autoscaling = app._bound_deployment._deployment_config.autoscaling_config
        assert ingress_autoscaling.min_replicas == 1

    def test_user_override_takes_precedence(self, disable_placement_bundles):
        """User-specified ingress min_replicas should override scale-to-zero logic."""
        model_config = LLMConfig(
            model_loading_config=ModelLoadingConfig(model_id="model_a"),
            accelerator_type="L4",
            deployment_config={
                "autoscaling_config": {"min_replicas": 0, "max_replicas": 2}
            },
        )

        # An explicit ingress autoscaling_config must win over the derived
        # scale-to-zero behavior.
        app = build_openai_app(
            LLMServingArgs(
                llm_configs=[model_config],
                ingress_deployment_config={
                    "autoscaling_config": {"min_replicas": 3, "max_replicas": 5}
                },
            )
        )

        ingress_autoscaling = app._bound_deployment._deployment_config.autoscaling_config
        assert ingress_autoscaling.min_replicas == 3
489+
375490def extract_applications_from_output (output : bytes ) -> dict :
376491 """
377492 Extracts the 'applications' block from mixed output and returns it as a dict.
0 commit comments