 
 import aiodocker
 import arrow
+import botocore
 import pytest
 import tenacity
 from aws_library.ec2 import EC2InstanceBootSpecific, EC2InstanceData, Resources
+from common_library.json_serialization import json_dumps
 from fastapi import FastAPI
 from models_library.docker import (
     DockerGenericTag,
...
     AutoscalingDocker,
     get_docker_client,
 )
+from simcore_service_autoscaling.modules.ec2 import get_ec2_client
 from simcore_service_autoscaling.utils.utils_docker import (
     _OSPARC_NODE_EMPTY_DATETIME_LABEL_KEY,
     _OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY,
@@ -522,9 +525,9 @@ async def _test_cluster_scaling_up_and_down(  # noqa: PLR0915
     all_instances = await ec2_client.describe_instances(Filters=instance_type_filters)
     assert not all_instances["Reservations"]
 
-    assert scale_up_params.expected_num_instances == 1, (
-        "This test is not made to work with more than 1 expected instance. so please adapt if needed"
-    )
+    assert (
+        scale_up_params.expected_num_instances == 1
+    ), "This test is not made to work with more than 1 expected instance. so please adapt if needed"
 
     # create the service(s)
     created_docker_services = await create_services_batch(scale_up_params)
@@ -1254,7 +1257,7 @@ async def test_cluster_scaling_up_starts_multiple_instances(
                 expected_instance_type="g4dn.8xlarge",  # 32CPUs, 128GiB
                 expected_num_instances=7,
             ),
-            id="A batch of services requiring g3.4xlarge and a batch requiring g4dn.8xlarge",
+            id="A batch of services requiring g4dn.2xlarge and a batch requiring g4dn.8xlarge",
         ),
     ],
 )
@@ -1283,9 +1286,7 @@ async def test_cluster_adapts_machines_on_the_fly(  # noqa: PLR0915
     assert (
         scale_up_params1.num_services
         >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES
-    ), (
-        "this test requires to run a first batch of more services than the maximum number of instances allowed"
-    )
+    ), "this test requires to run a first batch of more services than the maximum number of instances allowed"
     # we have nothing running now
     all_instances = await ec2_client.describe_instances()
     assert not all_instances["Reservations"]
@@ -1502,9 +1503,7 @@ async def test_cluster_adapts_machines_on_the_fly(  # noqa: PLR0915
     assert "Instances" in reservation1
     assert len(reservation1["Instances"]) == (
         app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES
-    ), (
-        f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES} EC2 instances, found {len(reservation1['Instances'])}"
-    )
+    ), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES} EC2 instances, found {len(reservation1['Instances'])}"
     for instance in reservation1["Instances"]:
         assert "InstanceType" in instance
         assert instance["InstanceType"] == scale_up_params1.expected_instance_type
@@ -1518,9 +1517,9 @@ async def test_cluster_adapts_machines_on_the_fly(  # noqa: PLR0915
 
     reservation2 = all_instances["Reservations"][1]
     assert "Instances" in reservation2
-    assert len(reservation2["Instances"]) == 1, (
-        f"expected 1 EC2 instances, found {len(reservation2['Instances'])}"
-    )
+    assert (
+        len(reservation2["Instances"]) == 1
+    ), f"expected 1 EC2 instances, found {len(reservation2['Instances'])}"
     for instance in reservation2["Instances"]:
         assert "InstanceType" in instance
         assert instance["InstanceType"] == scale_up_params2.expected_instance_type
@@ -2086,7 +2085,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers(
     ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"],
     indirect=True,
 )
-async def test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7071(
+async def test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7071(  # noqa: PLR0915
     patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock,
     minimal_configuration: None,
     with_instances_machines_hot_buffer: EnvVarsDict,
@@ -2247,9 +2246,9 @@ async def test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7071(
     # BUG REPRODUCTION
     #
     # start a service that imposes same type as the hot buffer
-    assert hot_buffer_instance_type == "t2.xlarge", (
-        "the test is hard-coded for this type and accordingly resource. If this changed then the resource shall be changed too"
-    )
+    assert (
+        hot_buffer_instance_type == "t2.xlarge"
+    ), "the test is hard-coded for this type and accordingly resource. If this changed then the resource shall be changed too"
     scale_up_params = _ScaleUpParams(
         imposed_instance_type=hot_buffer_instance_type,
         service_resources=Resources(
@@ -2335,3 +2334,195 @@ async def _check_autoscaling_is_stable() -> None:
 
     with pytest.raises(tenacity.RetryError):
         await _check_autoscaling_is_stable()
+
+
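+# NOTE: a /29 block has 8 addresses and AWS reserves 5 of them per subnet, leaving
+# only 3 usable IPs; starting 3 warm buffers is therefore enough to exhaust the first subnet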
+@pytest.fixture
+async def with_multiple_small_subnet_ids(
+    create_aws_subnet_id: Callable[..., Awaitable[str]], monkeypatch: pytest.MonkeyPatch
+) -> tuple[str, ...]:
+    subnet_1 = await create_aws_subnet_id("10.0.200.0/29")  # 3 usable IPs
+    subnet_2 = await create_aws_subnet_id("10.0.201.0/29")  # 3 usable IPs
+    monkeypatch.setenv("EC2_INSTANCES_SUBNET_IDS", json_dumps([subnet_1, subnet_2]))
+    return subnet_1, subnet_2
+
+
+@pytest.mark.parametrize(
+    "scale_up_params",
+    [
+        pytest.param(
+            _ScaleUpParams(
+                imposed_instance_type=None,
+                service_resources=Resources(
+                    cpus=5, ram=TypeAdapter(ByteSize).validate_python("36Gib")
+                ),
+                num_services=1,
+                expected_instance_type="r5n.4xlarge",  # 16 CPUs, 128GiB
+                expected_num_instances=1,
+            ),
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
+    "with_docker_join_drained",
+    ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"],
+    indirect=True,
+)
+@pytest.mark.parametrize(
+    # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
+    "with_drain_nodes_labelled",
+    ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"],
+    indirect=True,
+)
+async def test_fresh_instance_is_started_in_second_subnet_if_warm_buffers_used_up_all_ips_in_first_subnet(
+    patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock,
+    minimal_configuration: None,
+    with_multiple_small_subnet_ids: tuple[str, ...],
+    initialized_app: FastAPI,
+    app_settings: ApplicationSettings,
+    create_buffer_machines: Callable[
+        [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag] | None],
+        Awaitable[list[str]],
+    ],
+    ec2_client: EC2Client,
+    scale_up_params: _ScaleUpParams,
+    create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]],
+    ec2_instance_custom_tags: dict[str, str],
+    instance_type_filters: Sequence[FilterTypeDef],
+):
+    # we have nothing running now
+    all_instances = await ec2_client.describe_instances()
+    assert not all_instances["Reservations"]
+
+    # have warm buffers in the first subnet (the fixture uses subnet_1 by default); this will use all the IPs in the first subnet
+    assert app_settings.AUTOSCALING_EC2_INSTANCES
+    await create_buffer_machines(
+        3,
+        cast(
+            InstanceTypeType,
+            next(
+                iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES)
+            ),
+        ),
+        "stopped",
+        None,
+    )
+
+    # create a service that needs more power
+    await create_services_batch(scale_up_params)
+    # now autoscaling shall create machines in the second subnet
+    await auto_scale_cluster(
+        app=initialized_app, auto_scaling_mode=DynamicAutoscalingProvider()
+    )
+    # check the instances were started
+    created_instances = await assert_autoscaled_dynamic_ec2_instances(
+        ec2_client,
+        expected_num_reservations=1,
+        expected_num_instances=scale_up_params.expected_num_instances,
+        expected_instance_type=scale_up_params.expected_instance_type,
+        expected_instance_state="running",
+        expected_additional_tag_keys=list(ec2_instance_custom_tags),
+        instance_filters=instance_type_filters,
+    )
+    # check the instance is in the second subnet
+    assert created_instances
+    assert "SubnetId" in created_instances[0]
+    assert created_instances[0]["SubnetId"] == with_multiple_small_subnet_ids[1]
+
+
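+# NOTE: the fixture below patches the application's EC2 client so that start_instances
+# always fails with a ClientError shaped like AWS's InsufficientInstanceCapacity response,
+# i.e. stopped warm buffers can never be started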
+@pytest.fixture
+def mock_start_instances_to_raise_insufficient_capacity_error(
+    initialized_app: FastAPI,
+    mocker: MockerFixture,
+) -> mock.Mock:
+    async def _raise_insufficient_capacity_error(*args: Any, **kwargs: Any) -> None:
+        raise botocore.exceptions.ClientError(
+            error_response={
+                "Error": {
+                    "Code": "500",
+                    "Message": "An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 4): We currently do not have sufficient g4dn.4xlarge capacity in the Availability Zone you requested (us-east-1a). Our system will be working on provisioning additional capacity. You can currently get g4dn.4xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1b, us-east-1c, us-east-1d, us-east-1f",
+                }
+            },
+            operation_name="StartInstances",
+        )
+
+    return mocker.patch.object(
+        get_ec2_client(initialized_app).client,
+        "start_instances",
+        autospec=True,
+        side_effect=_raise_insufficient_capacity_error,
+    )
+
+
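+# NOTE: when a stopped warm buffer cannot be started (insufficient capacity in its AZ),
+# the autoscaler is expected to fall back to launching a fresh instance; this currently
+# does not happen, hence the xfail below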
+@pytest.mark.xfail(
+    reason="bug described in https://github.com/ITISFoundation/osparc-simcore/issues/8273"
+)
+@pytest.mark.parametrize(
+    # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
+    "with_docker_join_drained",
+    ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"],
+    indirect=True,
+)
+@pytest.mark.parametrize(
+    # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
+    "with_drain_nodes_labelled",
+    ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"],
+    indirect=True,
+)
+async def test_fresh_instance_is_launched_if_warm_buffers_cannot_start_due_to_insufficient_capacity_error(
+    patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock,
+    minimal_configuration: None,
+    with_multiple_small_subnet_ids: tuple[str, ...],
+    initialized_app: FastAPI,
+    mock_start_instances_to_raise_insufficient_capacity_error: None,
+    app_settings: ApplicationSettings,
+    create_buffer_machines: Callable[
+        [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag] | None],
+        Awaitable[list[str]],
+    ],
+    ec2_client: EC2Client,
+    create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]],
+    ec2_instance_custom_tags: dict[str, str],
+    instance_type_filters: Sequence[FilterTypeDef],
+):
+    # we have nothing running now
+    all_instances = await ec2_client.describe_instances()
+    assert not all_instances["Reservations"]
+
+    # have a warm buffer in the first subnet (the fixture uses subnet_1 by default)
+    assert app_settings.AUTOSCALING_EC2_INSTANCES
+    warm_buffer_instance_type = cast(
+        InstanceTypeType,
+        next(iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES)),
+    )
+    await create_buffer_machines(1, warm_buffer_instance_type, "stopped", None)
+
+    # create a service that imposes the same instance type as the warm buffer
+    scale_up_params = _ScaleUpParams(
+        imposed_instance_type=warm_buffer_instance_type,
+        service_resources=Resources(
+            cpus=1, ram=TypeAdapter(ByteSize).validate_python("1Gib")
+        ),
+        num_services=1,
+        expected_instance_type=warm_buffer_instance_type,
+        expected_num_instances=1,
+    )
+    await create_services_batch(scale_up_params)
+    # now autoscaling shall create machines in the second subnet
+    await auto_scale_cluster(
+        app=initialized_app, auto_scaling_mode=DynamicAutoscalingProvider()
+    )
+    # check the instances were started
+    created_instances = await assert_autoscaled_dynamic_ec2_instances(
+        ec2_client,
+        expected_num_reservations=1,
+        expected_num_instances=scale_up_params.expected_num_instances,
+        expected_instance_type=scale_up_params.expected_instance_type,
+        expected_instance_state="running",
+        expected_additional_tag_keys=list(ec2_instance_custom_tags),
+        instance_filters=instance_type_filters,
+    )
+    # check the instance is in the second subnet
+    assert created_instances
+    assert "SubnetId" in created_instances[0]
+    assert created_instances[0]["SubnetId"] == with_multiple_small_subnet_ids[1]