1818
1919import  aiodocker 
2020import  arrow 
21+ import  botocore 
2122import  pytest 
2223import  tenacity 
2324from  aws_library .ec2  import  EC2InstanceBootSpecific , EC2InstanceData , Resources 
25+ from  common_library .json_serialization  import  json_dumps 
2426from  fastapi  import  FastAPI 
2527from  models_library .docker  import  (
2628    DockerGenericTag ,
6870    AutoscalingDocker ,
6971    get_docker_client ,
7072)
73+ from  simcore_service_autoscaling .modules .ec2  import  get_ec2_client 
7174from  simcore_service_autoscaling .utils .utils_docker  import  (
7275    _OSPARC_NODE_EMPTY_DATETIME_LABEL_KEY ,
7376    _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY ,
@@ -522,9 +525,9 @@ async def _test_cluster_scaling_up_and_down(  # noqa: PLR0915
522525    all_instances  =  await  ec2_client .describe_instances (Filters = instance_type_filters )
523526    assert  not  all_instances ["Reservations" ]
524527
525-     assert  scale_up_params . expected_num_instances   ==   1 ,  (
526-         "This test is not made to work with more than 1 expected instance. so please adapt if needed" 
527-     )
528+     assert  (
529+         scale_up_params . expected_num_instances   ==   1 
530+     ),  "This test is not made to work with more than 1 expected instance. so please adapt if needed" 
528531
529532    # create the service(s) 
530533    created_docker_services  =  await  create_services_batch (scale_up_params )
@@ -1254,7 +1257,7 @@ async def test_cluster_scaling_up_starts_multiple_instances(
12541257                expected_instance_type = "g4dn.8xlarge" ,  # 32CPUs, 128GiB  
12551258                expected_num_instances = 7 , 
12561259            ), 
1257-             id = "A batch of services requiring g3.4xlarge  and a batch requiring g4dn.8xlarge" , 
1260+             id = "A batch of services requiring g4dn.2xlarge  and a batch requiring g4dn.8xlarge" , 
12581261        ), 
12591262    ], 
12601263) 
@@ -1283,9 +1286,7 @@ async def test_cluster_adapts_machines_on_the_fly(  # noqa: PLR0915
12831286    assert  (
12841287        scale_up_params1 .num_services 
12851288        >=  app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MAX_INSTANCES 
1286-     ), (
1287-         "this test requires to run a first batch of more services than the maximum number of instances allowed" 
1288-     )
1289+     ), "this test requires to run a first batch of more services than the maximum number of instances allowed" 
12891290    # we have nothing running now 
12901291    all_instances  =  await  ec2_client .describe_instances ()
12911292    assert  not  all_instances ["Reservations" ]
@@ -1502,9 +1503,7 @@ async def test_cluster_adapts_machines_on_the_fly(  # noqa: PLR0915
15021503    assert  "Instances"  in  reservation1 
15031504    assert  len (reservation1 ["Instances" ]) ==  (
15041505        app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MAX_INSTANCES 
1505-     ), (
1506-         f"expected { app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MAX_INSTANCES }   EC2 instances, found { len (reservation1 ['Instances' ])}  " 
1507-     )
1506+     ), f"expected { app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_MAX_INSTANCES }   EC2 instances, found { len (reservation1 ['Instances' ])}  " 
15081507    for  instance  in  reservation1 ["Instances" ]:
15091508        assert  "InstanceType"  in  instance 
15101509        assert  instance ["InstanceType" ] ==  scale_up_params1 .expected_instance_type 
@@ -1518,9 +1517,9 @@ async def test_cluster_adapts_machines_on_the_fly(  # noqa: PLR0915
15181517
15191518    reservation2  =  all_instances ["Reservations" ][1 ]
15201519    assert  "Instances"  in  reservation2 
1521-     assert  len ( reservation2 [ "Instances" ])  ==   1 ,  (
1522-         f"expected 1 EC2 instances, found  { len (reservation2 [' Instances' ]) } " 
1523-     )
1520+     assert  (
1521+         len (reservation2 [" Instances" ])  ==   1 
1522+     ),  f"expected 1 EC2 instances, found  { len ( reservation2 [ 'Instances' ]) } " 
15241523    for  instance  in  reservation2 ["Instances" ]:
15251524        assert  "InstanceType"  in  instance 
15261525        assert  instance ["InstanceType" ] ==  scale_up_params2 .expected_instance_type 
@@ -2086,7 +2085,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers(
20862085    ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS" ], 
20872086    indirect = True , 
20882087) 
2089- async  def  test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7071 (
2088+ async  def  test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7071 (   # noqa: PLR0915 
20902089    patch_ec2_client_launch_instances_min_number_of_instances : mock .Mock ,
20912090    minimal_configuration : None ,
20922091    with_instances_machines_hot_buffer : EnvVarsDict ,
@@ -2247,9 +2246,9 @@ async def test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7
22472246    # BUG REPRODUCTION 
22482247    # 
22492248    # start a service that imposes same type as the hot buffer 
2250-     assert  hot_buffer_instance_type   ==   "t2.xlarge" ,  (
2251-         "the test is hard-coded for this type and accordingly resource. If this changed then the resource shall be changed too "
2252-     )
2249+     assert  (
2250+         hot_buffer_instance_type   ==   "t2.xlarge "
2251+     ),  "the test is hard-coded for this type and accordingly resource. If this changed then the resource shall be changed too" 
22532252    scale_up_params  =  _ScaleUpParams (
22542253        imposed_instance_type = hot_buffer_instance_type ,
22552254        service_resources = Resources (
@@ -2335,3 +2334,195 @@ async def _check_autoscaling_is_stable() -> None:
23352334
23362335    with  pytest .raises (tenacity .RetryError ):
23372336        await  _check_autoscaling_is_stable ()
2337+ 
2338+ 
2339+ @pytest .fixture  
2340+ async  def  with_multiple_small_subnet_ids (
2341+     create_aws_subnet_id : Callable [..., Awaitable [str ]], monkeypatch : pytest .MonkeyPatch 
2342+ ) ->  tuple [str , ...]:
2343+     subnet_1  =  await  create_aws_subnet_id ("10.0.200.0/29" )  # 3 usable IPs 
2344+     subnet_2  =  await  create_aws_subnet_id ("10.0.201.0/29" )  # 3 usable IPs 
2345+     monkeypatch .setenv ("EC2_INSTANCES_SUBNET_IDS" , json_dumps ([subnet_1 , subnet_2 ]))
2346+     return  subnet_1 , subnet_2 
2347+ 
2348+ 
2349+ @pytest .mark .parametrize ( 
2350+     "scale_up_params" , 
2351+     [ 
2352+         pytest .param ( 
2353+             _ScaleUpParams ( 
2354+                 imposed_instance_type = None , 
2355+                 service_resources = Resources ( 
2356+                     cpus = 5 , ram = TypeAdapter (ByteSize ).validate_python ("36Gib" ) 
2357+                 ), 
2358+                 num_services = 1 , 
2359+                 expected_instance_type = "r5n.4xlarge" ,  # 1 GPU, 16 CPUs, 128GiB  
2360+                 expected_num_instances = 1 , 
2361+             ), 
2362+         ), 
2363+     ], 
2364+ ) 
2365+ @pytest .mark .parametrize ( 
2366+     # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options  
2367+     "with_docker_join_drained" , 
2368+     ["without_AUTOSCALING_DOCKER_JOIN_DRAINED" ], 
2369+     indirect = True , 
2370+ ) 
2371+ @pytest .mark .parametrize ( 
2372+     # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options  
2373+     "with_drain_nodes_labelled" , 
2374+     ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS" ], 
2375+     indirect = True , 
2376+ ) 
2377+ async  def  test_fresh_instance_is_started_in_second_subnet_if_warm_buffers_used_up_all_ips_in_first_subnet (
2378+     patch_ec2_client_launch_instances_min_number_of_instances : mock .Mock ,
2379+     minimal_configuration : None ,
2380+     with_multiple_small_subnet_ids : tuple [str , ...],
2381+     initialized_app : FastAPI ,
2382+     app_settings : ApplicationSettings ,
2383+     create_buffer_machines : Callable [
2384+         [int , InstanceTypeType , InstanceStateNameType , list [DockerGenericTag ] |  None ],
2385+         Awaitable [list [str ]],
2386+     ],
2387+     ec2_client : EC2Client ,
2388+     scale_up_params : _ScaleUpParams ,
2389+     create_services_batch : Callable [[_ScaleUpParams ], Awaitable [list [Service ]]],
2390+     ec2_instance_custom_tags : dict [str , str ],
2391+     instance_type_filters : Sequence [FilterTypeDef ],
2392+ ):
2393+     # we have nothing running now 
2394+     all_instances  =  await  ec2_client .describe_instances ()
2395+     assert  not  all_instances ["Reservations" ]
2396+ 
2397+     # have warm buffers in the first subnet *fixture uses subnet_1 by default*, this will use all the IPs in the first subnet 
2398+     assert  app_settings .AUTOSCALING_EC2_INSTANCES 
2399+     await  create_buffer_machines (
2400+         3 ,
2401+         cast (
2402+             InstanceTypeType ,
2403+             next (
2404+                 iter (app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_ALLOWED_TYPES )
2405+             ),
2406+         ),
2407+         "stopped" ,
2408+         None ,
2409+     )
2410+ 
2411+     # create several tasks that needs more power 
2412+     await  create_services_batch (scale_up_params )
2413+     # now autoscale shall create machines in the second subnet 
2414+     await  auto_scale_cluster (
2415+         app = initialized_app , auto_scaling_mode = DynamicAutoscalingProvider ()
2416+     )
2417+     # check the instances were started 
2418+     created_instances  =  await  assert_autoscaled_dynamic_ec2_instances (
2419+         ec2_client ,
2420+         expected_num_reservations = 1 ,
2421+         expected_num_instances = scale_up_params .expected_num_instances ,
2422+         expected_instance_type = scale_up_params .expected_instance_type ,
2423+         expected_instance_state = "running" ,
2424+         expected_additional_tag_keys = list (ec2_instance_custom_tags ),
2425+         instance_filters = instance_type_filters ,
2426+     )
2427+     # check the instance is in the second subnet 
2428+     assert  created_instances 
2429+     assert  "SubnetId"  in  created_instances [0 ]
2430+     assert  created_instances [0 ]["SubnetId" ] ==  with_multiple_small_subnet_ids [1 ]
2431+ 
2432+ 
2433+ @pytest .fixture  
2434+ def  mock_start_instances_to_raise_insufficient_capacity_error (
2435+     initialized_app : FastAPI ,
2436+     mocker : MockerFixture ,
2437+ ) ->  mock .Mock :
2438+     async  def  _raise_insufficient_capacity_error (* args : Any , ** kwargs : Any ) ->  None :
2439+         raise  botocore .exceptions .ClientError (
2440+             error_response = {
2441+                 "Error" : {
2442+                     "Code" : "500" ,
2443+                     "Message" : "An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 4): We currently do not have sufficient g4dn.4xlarge capacity in the Availability Zone you requested (us-east-1a). Our system will be working on provisioning additional capacity. You can currently get g4dn.4xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1b, us-east-1c, us-east-1d, us-east-1f" ,
2444+                 }
2445+             },
2446+             operation_name = "StartInstances" ,
2447+         )
2448+ 
2449+     return  mocker .patch .object (
2450+         get_ec2_client (initialized_app ).client ,
2451+         "start_instances" ,
2452+         autospec = True ,
2453+         side_effect = _raise_insufficient_capacity_error ,
2454+     )
2455+ 
2456+ 
2457+ @pytest .mark .xfail ( 
2458+     reason = "bug described in https://github.com/ITISFoundation/osparc-simcore/issues/8273"  
2459+ ) 
2460+ @pytest .mark .parametrize ( 
2461+     # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options  
2462+     "with_docker_join_drained" , 
2463+     ["without_AUTOSCALING_DOCKER_JOIN_DRAINED" ], 
2464+     indirect = True , 
2465+ ) 
2466+ @pytest .mark .parametrize ( 
2467+     # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options  
2468+     "with_drain_nodes_labelled" , 
2469+     ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS" ], 
2470+     indirect = True , 
2471+ ) 
2472+ async  def  test_fresh_instance_is_launched_if_warm_buffers_cannot_start_due_to_insufficient_capacity_error (
2473+     patch_ec2_client_launch_instances_min_number_of_instances : mock .Mock ,
2474+     minimal_configuration : None ,
2475+     with_multiple_small_subnet_ids : tuple [str , ...],
2476+     initialized_app : FastAPI ,
2477+     mock_start_instances_to_raise_insufficient_capacity_error : None ,
2478+     app_settings : ApplicationSettings ,
2479+     create_buffer_machines : Callable [
2480+         [int , InstanceTypeType , InstanceStateNameType , list [DockerGenericTag ] |  None ],
2481+         Awaitable [list [str ]],
2482+     ],
2483+     ec2_client : EC2Client ,
2484+     create_services_batch : Callable [[_ScaleUpParams ], Awaitable [list [Service ]]],
2485+     ec2_instance_custom_tags : dict [str , str ],
2486+     instance_type_filters : Sequence [FilterTypeDef ],
2487+ ):
2488+     # we have nothing running now 
2489+     all_instances  =  await  ec2_client .describe_instances ()
2490+     assert  not  all_instances ["Reservations" ]
2491+ 
2492+     # have warm buffers in the first subnet *fixture uses subnet_1 by default*, this will use all the IPs in the first subnet 
2493+     assert  app_settings .AUTOSCALING_EC2_INSTANCES 
2494+     warm_buffer_instance_type  =  cast (
2495+         InstanceTypeType ,
2496+         next (iter (app_settings .AUTOSCALING_EC2_INSTANCES .EC2_INSTANCES_ALLOWED_TYPES )),
2497+     )
2498+     await  create_buffer_machines (1 , warm_buffer_instance_type , "stopped" , None )
2499+ 
2500+     # create several tasks that needs more power 
2501+     scale_up_params  =  _ScaleUpParams (
2502+         imposed_instance_type = warm_buffer_instance_type ,
2503+         service_resources = Resources (
2504+             cpus = 1 , ram = TypeAdapter (ByteSize ).validate_python ("1Gib" )
2505+         ),
2506+         num_services = 1 ,
2507+         expected_instance_type = warm_buffer_instance_type ,
2508+         expected_num_instances = 1 ,
2509+     )
2510+     await  create_services_batch (scale_up_params )
2511+     # now autoscale shall create machines in the second subnet 
2512+     await  auto_scale_cluster (
2513+         app = initialized_app , auto_scaling_mode = DynamicAutoscalingProvider ()
2514+     )
2515+     # check the instances were started 
2516+     created_instances  =  await  assert_autoscaled_dynamic_ec2_instances (
2517+         ec2_client ,
2518+         expected_num_reservations = 1 ,
2519+         expected_num_instances = scale_up_params .expected_num_instances ,
2520+         expected_instance_type = scale_up_params .expected_instance_type ,
2521+         expected_instance_state = "running" ,
2522+         expected_additional_tag_keys = list (ec2_instance_custom_tags ),
2523+         instance_filters = instance_type_filters ,
2524+     )
2525+     # check the instance is in the second subnet 
2526+     assert  created_instances 
2527+     assert  "SubnetId"  in  created_instances [0 ]
2528+     assert  created_instances [0 ]["SubnetId" ] ==  with_multiple_small_subnet_ids [1 ]
0 commit comments