Skip to content

Commit bf9db82

Browse files
committed
added 2 tests with regard to multiple zones
1 parent 5cb6143 commit bf9db82

File tree

1 file changed

+208
-17
lines changed

1 file changed

+208
-17
lines changed

services/autoscaling/tests/unit/test_modules_cluster_scaling_dynamic.py

Lines changed: 208 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818

1919
import aiodocker
2020
import arrow
21+
import botocore
2122
import pytest
2223
import tenacity
2324
from aws_library.ec2 import EC2InstanceBootSpecific, EC2InstanceData, Resources
25+
from common_library.json_serialization import json_dumps
2426
from fastapi import FastAPI
2527
from models_library.docker import (
2628
DockerGenericTag,
@@ -68,6 +70,7 @@
6870
AutoscalingDocker,
6971
get_docker_client,
7072
)
73+
from simcore_service_autoscaling.modules.ec2 import get_ec2_client
7174
from simcore_service_autoscaling.utils.utils_docker import (
7275
_OSPARC_NODE_EMPTY_DATETIME_LABEL_KEY,
7376
_OSPARC_NODE_TERMINATION_PROCESS_LABEL_KEY,
@@ -522,9 +525,9 @@ async def _test_cluster_scaling_up_and_down( # noqa: PLR0915
522525
all_instances = await ec2_client.describe_instances(Filters=instance_type_filters)
523526
assert not all_instances["Reservations"]
524527

525-
assert scale_up_params.expected_num_instances == 1, (
526-
"This test is not made to work with more than 1 expected instance. so please adapt if needed"
527-
)
528+
assert (
529+
scale_up_params.expected_num_instances == 1
530+
), "This test is not made to work with more than 1 expected instance. so please adapt if needed"
528531

529532
# create the service(s)
530533
created_docker_services = await create_services_batch(scale_up_params)
@@ -1254,7 +1257,7 @@ async def test_cluster_scaling_up_starts_multiple_instances(
12541257
expected_instance_type="g4dn.8xlarge", # 32CPUs, 128GiB
12551258
expected_num_instances=7,
12561259
),
1257-
id="A batch of services requiring g3.4xlarge and a batch requiring g4dn.8xlarge",
1260+
id="A batch of services requiring g4dn.2xlarge and a batch requiring g4dn.8xlarge",
12581261
),
12591262
],
12601263
)
@@ -1283,9 +1286,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915
12831286
assert (
12841287
scale_up_params1.num_services
12851288
>= app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES
1286-
), (
1287-
"this test requires to run a first batch of more services than the maximum number of instances allowed"
1288-
)
1289+
), "this test requires to run a first batch of more services than the maximum number of instances allowed"
12891290
# we have nothing running now
12901291
all_instances = await ec2_client.describe_instances()
12911292
assert not all_instances["Reservations"]
@@ -1502,9 +1503,7 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915
15021503
assert "Instances" in reservation1
15031504
assert len(reservation1["Instances"]) == (
15041505
app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES
1505-
), (
1506-
f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES} EC2 instances, found {len(reservation1['Instances'])}"
1507-
)
1506+
), f"expected {app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES} EC2 instances, found {len(reservation1['Instances'])}"
15081507
for instance in reservation1["Instances"]:
15091508
assert "InstanceType" in instance
15101509
assert instance["InstanceType"] == scale_up_params1.expected_instance_type
@@ -1518,9 +1517,9 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915
15181517

15191518
reservation2 = all_instances["Reservations"][1]
15201519
assert "Instances" in reservation2
1521-
assert len(reservation2["Instances"]) == 1, (
1522-
f"expected 1 EC2 instances, found {len(reservation2['Instances'])}"
1523-
)
1520+
assert (
1521+
len(reservation2["Instances"]) == 1
1522+
), f"expected 1 EC2 instances, found {len(reservation2['Instances'])}"
15241523
for instance in reservation2["Instances"]:
15251524
assert "InstanceType" in instance
15261525
assert instance["InstanceType"] == scale_up_params2.expected_instance_type
@@ -2086,7 +2085,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers(
20862085
["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"],
20872086
indirect=True,
20882087
)
2089-
async def test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7071(
2088+
async def test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7071( # noqa: PLR0915
20902089
patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock,
20912090
minimal_configuration: None,
20922091
with_instances_machines_hot_buffer: EnvVarsDict,
@@ -2247,9 +2246,9 @@ async def test_warm_buffers_only_replace_hot_buffer_if_service_is_started_issue7
22472246
# BUG REPRODUCTION
22482247
#
22492248
# start a service that imposes same type as the hot buffer
2250-
assert hot_buffer_instance_type == "t2.xlarge", (
2251-
"the test is hard-coded for this type and accordingly resource. If this changed then the resource shall be changed too"
2252-
)
2249+
assert (
2250+
hot_buffer_instance_type == "t2.xlarge"
2251+
), "the test is hard-coded for this type and accordingly resource. If this changed then the resource shall be changed too"
22532252
scale_up_params = _ScaleUpParams(
22542253
imposed_instance_type=hot_buffer_instance_type,
22552254
service_resources=Resources(
@@ -2335,3 +2334,195 @@ async def _check_autoscaling_is_stable() -> None:
23352334

23362335
with pytest.raises(tenacity.RetryError):
23372336
await _check_autoscaling_is_stable()
2337+
2338+
2339+
@pytest.fixture
2340+
async def with_multiple_small_subnet_ids(
2341+
create_aws_subnet_id: Callable[..., Awaitable[str]], monkeypatch: pytest.MonkeyPatch
2342+
) -> tuple[str, ...]:
2343+
subnet_1 = await create_aws_subnet_id("10.0.200.0/29") # 3 usable IPs
2344+
subnet_2 = await create_aws_subnet_id("10.0.201.0/29") # 3 usable IPs
2345+
monkeypatch.setenv("EC2_INSTANCES_SUBNET_IDS", json_dumps([subnet_1, subnet_2]))
2346+
return subnet_1, subnet_2
2347+
2348+
2349+
@pytest.mark.parametrize(
2350+
"scale_up_params",
2351+
[
2352+
pytest.param(
2353+
_ScaleUpParams(
2354+
imposed_instance_type=None,
2355+
service_resources=Resources(
2356+
cpus=5, ram=TypeAdapter(ByteSize).validate_python("36Gib")
2357+
),
2358+
num_services=1,
2359+
expected_instance_type="r5n.4xlarge",  # 16 CPUs, 128GiB (no GPU; r5n is memory-optimized)
2360+
expected_num_instances=1,
2361+
),
2362+
),
2363+
],
2364+
)
2365+
@pytest.mark.parametrize(
2366+
# NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
2367+
"with_docker_join_drained",
2368+
["without_AUTOSCALING_DOCKER_JOIN_DRAINED"],
2369+
indirect=True,
2370+
)
2371+
@pytest.mark.parametrize(
2372+
# NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
2373+
"with_drain_nodes_labelled",
2374+
["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"],
2375+
indirect=True,
2376+
)
2377+
async def test_fresh_instance_is_started_in_second_subnet_if_warm_buffers_used_up_all_ips_in_first_subnet(
2378+
patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock,
2379+
minimal_configuration: None,
2380+
with_multiple_small_subnet_ids: tuple[str, ...],
2381+
initialized_app: FastAPI,
2382+
app_settings: ApplicationSettings,
2383+
create_buffer_machines: Callable[
2384+
[int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag] | None],
2385+
Awaitable[list[str]],
2386+
],
2387+
ec2_client: EC2Client,
2388+
scale_up_params: _ScaleUpParams,
2389+
create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]],
2390+
ec2_instance_custom_tags: dict[str, str],
2391+
instance_type_filters: Sequence[FilterTypeDef],
2392+
):
2393+
# we have nothing running now
2394+
all_instances = await ec2_client.describe_instances()
2395+
assert not all_instances["Reservations"]
2396+
2397+
# have warm buffers in the first subnet *fixture uses subnet_1 by default*, this will use all the IPs in the first subnet
2398+
assert app_settings.AUTOSCALING_EC2_INSTANCES
2399+
await create_buffer_machines(
2400+
3,
2401+
cast(
2402+
InstanceTypeType,
2403+
next(
2404+
iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES)
2405+
),
2406+
),
2407+
"stopped",
2408+
None,
2409+
)
2410+
2411+
# create several tasks that need more power
2412+
await create_services_batch(scale_up_params)
2413+
# now autoscale shall create machines in the second subnet
2414+
await auto_scale_cluster(
2415+
app=initialized_app, auto_scaling_mode=DynamicAutoscalingProvider()
2416+
)
2417+
# check the instances were started
2418+
created_instances = await assert_autoscaled_dynamic_ec2_instances(
2419+
ec2_client,
2420+
expected_num_reservations=1,
2421+
expected_num_instances=scale_up_params.expected_num_instances,
2422+
expected_instance_type=scale_up_params.expected_instance_type,
2423+
expected_instance_state="running",
2424+
expected_additional_tag_keys=list(ec2_instance_custom_tags),
2425+
instance_filters=instance_type_filters,
2426+
)
2427+
# check the instance is in the second subnet
2428+
assert created_instances
2429+
assert "SubnetId" in created_instances[0]
2430+
assert created_instances[0]["SubnetId"] == with_multiple_small_subnet_ids[1]
2431+
2432+
2433+
@pytest.fixture
2434+
def mock_start_instances_to_raise_insufficient_capacity_error(
2435+
initialized_app: FastAPI,
2436+
mocker: MockerFixture,
2437+
) -> mock.Mock:
2438+
async def _raise_insufficient_capacity_error(*args: Any, **kwargs: Any) -> None:
2439+
raise botocore.exceptions.ClientError(
2440+
error_response={
2441+
"Error": {
2442+
"Code": "500",
2443+
"Message": "An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 4): We currently do not have sufficient g4dn.4xlarge capacity in the Availability Zone you requested (us-east-1a). Our system will be working on provisioning additional capacity. You can currently get g4dn.4xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1b, us-east-1c, us-east-1d, us-east-1f",
2444+
}
2445+
},
2446+
operation_name="StartInstances",
2447+
)
2448+
2449+
return mocker.patch.object(
2450+
get_ec2_client(initialized_app).client,
2451+
"start_instances",
2452+
autospec=True,
2453+
side_effect=_raise_insufficient_capacity_error,
2454+
)
2455+
2456+
2457+
@pytest.mark.xfail(
2458+
reason="bug described in https://github.com/ITISFoundation/osparc-simcore/issues/8273"
2459+
)
2460+
@pytest.mark.parametrize(
2461+
# NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
2462+
"with_docker_join_drained",
2463+
["without_AUTOSCALING_DOCKER_JOIN_DRAINED"],
2464+
indirect=True,
2465+
)
2466+
@pytest.mark.parametrize(
2467+
# NOTE: only the main test test_cluster_scaling_up_and_down is run with all options
2468+
"with_drain_nodes_labelled",
2469+
["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"],
2470+
indirect=True,
2471+
)
2472+
async def test_fresh_instance_is_launched_if_warm_buffers_cannot_start_due_to_insufficient_capacity_error(
2473+
patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock,
2474+
minimal_configuration: None,
2475+
with_multiple_small_subnet_ids: tuple[str, ...],
2476+
initialized_app: FastAPI,
2477+
mock_start_instances_to_raise_insufficient_capacity_error: None,
2478+
app_settings: ApplicationSettings,
2479+
create_buffer_machines: Callable[
2480+
[int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag] | None],
2481+
Awaitable[list[str]],
2482+
],
2483+
ec2_client: EC2Client,
2484+
create_services_batch: Callable[[_ScaleUpParams], Awaitable[list[Service]]],
2485+
ec2_instance_custom_tags: dict[str, str],
2486+
instance_type_filters: Sequence[FilterTypeDef],
2487+
):
2488+
# we have nothing running now
2489+
all_instances = await ec2_client.describe_instances()
2490+
assert not all_instances["Reservations"]
2491+
2492+
# have warm buffers in the first subnet *fixture uses subnet_1 by default*, this will use all the IPs in the first subnet
2493+
assert app_settings.AUTOSCALING_EC2_INSTANCES
2494+
warm_buffer_instance_type = cast(
2495+
InstanceTypeType,
2496+
next(iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES)),
2497+
)
2498+
await create_buffer_machines(1, warm_buffer_instance_type, "stopped", None)
2499+
2500+
# create several tasks that need more power
2501+
scale_up_params = _ScaleUpParams(
2502+
imposed_instance_type=warm_buffer_instance_type,
2503+
service_resources=Resources(
2504+
cpus=1, ram=TypeAdapter(ByteSize).validate_python("1Gib")
2505+
),
2506+
num_services=1,
2507+
expected_instance_type=warm_buffer_instance_type,
2508+
expected_num_instances=1,
2509+
)
2510+
await create_services_batch(scale_up_params)
2511+
# now autoscale shall create machines in the second subnet
2512+
await auto_scale_cluster(
2513+
app=initialized_app, auto_scaling_mode=DynamicAutoscalingProvider()
2514+
)
2515+
# check the instances were started
2516+
created_instances = await assert_autoscaled_dynamic_ec2_instances(
2517+
ec2_client,
2518+
expected_num_reservations=1,
2519+
expected_num_instances=scale_up_params.expected_num_instances,
2520+
expected_instance_type=scale_up_params.expected_instance_type,
2521+
expected_instance_state="running",
2522+
expected_additional_tag_keys=list(ec2_instance_custom_tags),
2523+
instance_filters=instance_type_filters,
2524+
)
2525+
# check the instance is in the second subnet
2526+
assert created_instances
2527+
assert "SubnetId" in created_instances[0]
2528+
assert created_instances[0]["SubnetId"] == with_multiple_small_subnet_ids[1]

0 commit comments

Comments
 (0)