@@ -1194,22 +1194,22 @@ async def test_cluster_scaling_up_starts_multiple_instances(
             _ScaleUpParams(
                 imposed_instance_type="g3.4xlarge",  # 1 GPU, 16 CPUs, 122GiB
                 service_resources=Resources(
-                    cpus=5, ram=TypeAdapter(ByteSize).validate_python("30Gib")
+                    cpus=16, ram=TypeAdapter(ByteSize).validate_python("30Gib")
                 ),
-                num_services=10,
+                num_services=12,
                 expected_instance_type="g3.4xlarge",  # 1 GPU, 16 CPUs, 122GiB
-                expected_num_instances=4,
+                expected_num_instances=10,
             ),
             _ScaleUpParams(
                 imposed_instance_type="g4dn.8xlarge",  # 32CPUs, 128GiB
                 service_resources=Resources(
-                    cpus=5, ram=TypeAdapter(ByteSize).validate_python("20480MB")
+                    cpus=32, ram=TypeAdapter(ByteSize).validate_python("20480MB")
                 ),
                 num_services=7,
                 expected_instance_type="g4dn.8xlarge",  # 32CPUs, 128GiB
-                expected_num_instances=2,
+                expected_num_instances=7,
             ),
-            id="Two different instance types are needed",
+            id="A batch of services requiring g3.4xlarge and a batch requiring g4dn.8xlarge",
         ),
     ],
 )
@@ -1218,6 +1218,7 @@ async def test_cluster_adapts_machines_on_the_fly(
     minimal_configuration: None,
     ec2_client: EC2Client,
     initialized_app: FastAPI,
+    app_settings: ApplicationSettings,
     create_service: Callable[
         [dict[str, Any], dict[DockerLabelKey, str], str, list[str]], Awaitable[Service]
     ],
@@ -1230,11 +1231,19 @@ async def test_cluster_adapts_machines_on_the_fly(
     scale_up_params1: _ScaleUpParams,
     scale_up_params2: _ScaleUpParams,
 ):
+    # pre-requisites
+    assert app_settings.AUTOSCALING_EC2_INSTANCES
+    assert app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES > 0
+    assert (
+        scale_up_params1.num_services
+        >= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES
+    ), "this test requires to run a first batch of more services than the maximum number of instances allowed"
     # we have nothing running now
     all_instances = await ec2_client.describe_instances()
     assert not all_instances["Reservations"]
 
-    # create several tasks that needs more power
+    #
+    # 1. create the first batch of services requiring the initial machines
     await asyncio.gather(
         *(
             create_service(
@@ -1257,21 +1266,59 @@ async def test_cluster_adapts_machines_on_the_fly(
             for _ in range(scale_up_params1.num_services)
         )
     )
+    for _ in range(3):
+        # it will only scale once and do nothing else
+        await auto_scale_cluster(
+            app=initialized_app, auto_scaling_mode=DynamicAutoscaling()
+        )
+        await assert_autoscaled_dynamic_ec2_instances(
+            ec2_client,
+            expected_num_reservations=1,
+            expected_num_instances=scale_up_params1.expected_num_instances,
+            expected_instance_type=scale_up_params1.expected_instance_type,
+            expected_instance_state="running",
+            expected_additional_tag_keys=list(ec2_instance_custom_tags),
+            instance_filters=instance_type_filters,
+        )
 
-    await auto_scale_cluster(
-        app=initialized_app, auto_scaling_mode=DynamicAutoscaling()
-    )
-
-    # check the instances were started
-    await assert_autoscaled_dynamic_ec2_instances(
-        ec2_client,
-        expected_num_reservations=1,
-        expected_num_instances=scale_up_params1.expected_num_instances,
-        expected_instance_type=scale_up_params1.expected_instance_type,
-        expected_instance_state="running",
-        expected_additional_tag_keys=list(ec2_instance_custom_tags),
-        instance_filters=instance_type_filters,
+    #
+    # 2. now we start the second batch of services requiring a different type of machines
+    await asyncio.gather(
+        *(
+            create_service(
+                task_template
+                | create_task_reservations(
+                    int(scale_up_params2.service_resources.cpus),
+                    scale_up_params2.service_resources.ram,
+                ),
+                service_monitored_labels
+                | osparc_docker_label_keys.to_simcore_runtime_docker_labels(),
+                "pending",
+                (
+                    [
+                        f"node.labels.{DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY}=={scale_up_params2.imposed_instance_type}"
+                    ]
+                    if scale_up_params2.imposed_instance_type
+                    else []
+                ),
+            )
+            for _ in range(scale_up_params2.num_services)
+        )
     )
+    for _ in range(3):
+        # scaling will do nothing since we have hit the maximum number of machines
+        await auto_scale_cluster(
+            app=initialized_app, auto_scaling_mode=DynamicAutoscaling()
+        )
+        await assert_autoscaled_dynamic_ec2_instances(
+            ec2_client,
+            expected_num_reservations=1,
+            expected_num_instances=scale_up_params1.expected_num_instances,
+            expected_instance_type=scale_up_params1.expected_instance_type,
+            expected_instance_state="running",
+            expected_additional_tag_keys=list(ec2_instance_custom_tags),
+            instance_filters=instance_type_filters,
+        )
 
 
 @pytest.mark.parametrize(
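
For context on the updated parametrization: each service now reserves a full machine's CPUs (16 of 16 on g3.4xlarge, 32 of 32 on g4dn.8xlarge), so one service maps to one instance, and the first batch of 12 services exceeds the fleet cap enforced by EC2_INSTANCES_MAX_INSTANCES (10 in this test setup, inferred from expected_num_instances and the new pre-requisite assertion). Below is a minimal sketch of that arithmetic, assuming a hypothetical helper and a reduced stand-in for _ScaleUpParams (neither exists in the repository; the real autoscaler also weighs RAM and GPU, not only CPUs):

import math
from dataclasses import dataclass


@dataclass(frozen=True)
class _ScaleUpParamsSketch:
    # hypothetical, reduced mirror of the test's _ScaleUpParams (illustration only)
    service_cpus: int
    num_services: int
    instance_cpus: int


def expected_instances(params: _ScaleUpParamsSketch, max_instances: int) -> int:
    # each instance can host floor(instance_cpus / service_cpus) services,
    # and the autoscaler is expected never to exceed max_instances
    services_per_instance = max(params.instance_cpus // params.service_cpus, 1)
    needed = math.ceil(params.num_services / services_per_instance)
    return min(needed, max_instances)


# batch 1: 12 services x 16 CPUs on g3.4xlarge (16 CPUs) -> 12 machines needed, capped at 10
assert expected_instances(_ScaleUpParamsSketch(16, 12, 16), max_instances=10) == 10
# batch 2: 7 services x 32 CPUs on g4dn.8xlarge (32 CPUs) -> 7 machines, once capacity frees up
assert expected_instances(_ScaleUpParamsSketch(32, 7, 32), max_instances=10) == 7

The same formula reproduces the previous expectations (cpus=5 packs 3 services per g3.4xlarge, hence 4 instances for 10 services, and 6 per g4dn.8xlarge, hence 2 instances for 7 services), which is why only the parametrized values needed to change.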