Skip to content

Commit 6953507

Browse files
sanderegg and Copilot
authored
🎨🐛Autoscaling: Allow EC2 launches in multiple AvailabilityZones ⚠️ (DevOPS) 🚨 (#8210)
Co-authored-by: Copilot <[email protected]>
1 parent 3e8d9ed commit 6953507

File tree

26 files changed

+1139
-299
lines changed

26 files changed

+1139
-299
lines changed

packages/aws-library/src/aws_library/ec2/__init__.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
from ._client import SimcoreEC2API
2-
from ._errors import EC2AccessError, EC2NotConnectedError, EC2RuntimeError
2+
from ._errors import (
3+
EC2AccessError,
4+
EC2InsufficientCapacityError,
5+
EC2NotConnectedError,
6+
EC2RuntimeError,
7+
)
38
from ._models import (
49
AWS_TAG_KEY_MAX_LENGTH,
510
AWS_TAG_KEY_MIN_LENGTH,
@@ -16,22 +21,22 @@
1621
)
1722

1823
__all__: tuple[str, ...] = (
19-
"AWSTagKey",
20-
"AWSTagValue",
21-
"AWS_TAG_KEY_MIN_LENGTH",
2224
"AWS_TAG_KEY_MAX_LENGTH",
23-
"AWS_TAG_VALUE_MIN_LENGTH",
25+
"AWS_TAG_KEY_MIN_LENGTH",
2426
"AWS_TAG_VALUE_MAX_LENGTH",
27+
"AWS_TAG_VALUE_MIN_LENGTH",
28+
"AWSTagKey",
29+
"AWSTagValue",
2530
"EC2AccessError",
2631
"EC2InstanceBootSpecific",
2732
"EC2InstanceConfig",
2833
"EC2InstanceData",
2934
"EC2InstanceType",
35+
"EC2InsufficientCapacityError",
3036
"EC2NotConnectedError",
3137
"EC2RuntimeError",
3238
"EC2Tags",
3339
"Resources",
3440
"SimcoreEC2API",
3541
)
36-
3742
# nopycln: file

packages/aws-library/src/aws_library/ec2/_client.py

Lines changed: 131 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,17 @@
1313
from settings_library.ec2 import EC2Settings
1414
from types_aiobotocore_ec2 import EC2Client
1515
from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType
16-
from types_aiobotocore_ec2.type_defs import FilterTypeDef, TagTypeDef
16+
from types_aiobotocore_ec2.type_defs import (
17+
FilterTypeDef,
18+
TagTypeDef,
19+
)
1720

1821
from ._error_handler import ec2_exception_handler
19-
from ._errors import EC2InstanceNotFoundError, EC2TooManyInstancesError
22+
from ._errors import (
23+
EC2InstanceNotFoundError,
24+
EC2InsufficientCapacityError,
25+
EC2SubnetsNotEnoughIPsError,
26+
)
2027
from ._models import (
2128
AWSTagKey,
2229
EC2InstanceConfig,
@@ -25,7 +32,13 @@
2532
EC2Tags,
2633
Resources,
2734
)
28-
from ._utils import compose_user_data, ec2_instance_data_from_aws_instance
35+
from ._utils import (
36+
check_max_number_of_instances_not_exceeded,
37+
compose_user_data,
38+
ec2_instance_data_from_aws_instance,
39+
get_subnet_azs,
40+
get_subnet_capacity,
41+
)
2942

3043
_logger = logging.getLogger(__name__)
3144

@@ -92,6 +105,11 @@ async def get_ec2_instance_capabilities(
92105
list_instances: list[EC2InstanceType] = []
93106
for instance in instance_types.get("InstanceTypes", []):
94107
with contextlib.suppress(KeyError):
108+
assert "InstanceType" in instance # nosec
109+
assert "VCpuInfo" in instance # nosec
110+
assert "DefaultVCpus" in instance["VCpuInfo"] # nosec
111+
assert "MemoryInfo" in instance # nosec
112+
assert "SizeInMiB" in instance["MemoryInfo"] # nosec
95113
list_instances.append(
96114
EC2InstanceType(
97115
name=instance["InstanceType"],
@@ -118,94 +136,145 @@ async def launch_instances(
118136
119137
Arguments:
120138
instance_config -- The EC2 instance configuration
121-
min_number_of_instances -- the minimal number of instances needed (fails if this amount cannot be reached)
139+
min_number_of_instances -- the minimal number of instances required (fails if this amount cannot be reached)
122140
number_of_instances -- the ideal number of instances needed (if it cannot be reached AWS will return a number >=min_number_of_instances)
123-
124-
Keyword Arguments:
125-
max_total_number_of_instances -- The total maximum allowed number of instances for this given instance_config (default: {10})
141+
max_total_number_of_instances -- The total maximum allowed number of instances for this given instance_config
126142
127143
Raises:
128-
EC2TooManyInstancesError:
144+
EC2TooManyInstancesError: max_total_number_of_instances would be exceeded
145+
EC2SubnetsNotEnoughIPsError: not enough IPs in the subnets
146+
EC2InsufficientCapacityError: not enough capacity in the subnets
147+
129148
130149
Returns:
131150
The created instance data infos
132151
"""
152+
133153
with log_context(
134154
_logger,
135155
logging.INFO,
136-
msg=f"launch {number_of_instances} AWS instance(s) {instance_config.type.name} with {instance_config.tags=}",
156+
msg=f"launch {number_of_instances} AWS instance(s) {instance_config.type.name}"
157+
f" with {instance_config.tags=} in {instance_config.subnet_ids=}",
137158
):
138159
# first check the max amount is not already reached
139-
current_instances = await self.get_instances(
140-
key_names=[instance_config.key_name], tags=instance_config.tags
160+
await check_max_number_of_instances_not_exceeded(
161+
self,
162+
instance_config,
163+
required_number_instances=number_of_instances,
164+
max_total_number_of_instances=max_total_number_of_instances,
141165
)
142-
if (
143-
len(current_instances) + number_of_instances
144-
> max_total_number_of_instances
145-
):
146-
raise EC2TooManyInstancesError(
147-
num_instances=max_total_number_of_instances
166+
167+
# NOTE: checking subnets capacity is not strictly needed as AWS will do it for us
168+
# but it gives us a chance to give early feedback to the user
169+
# and avoid trying to launch instances in subnets that are already full
170+
# and also allows us to circumvent a moto bug that does not raise
171+
# InsufficientInstanceCapacity when a subnet is full
172+
subnet_id_to_available_ips = await get_subnet_capacity(
173+
self.client, subnet_ids=instance_config.subnet_ids
174+
)
175+
176+
total_available_ips = sum(subnet_id_to_available_ips.values())
177+
if total_available_ips < min_number_of_instances:
178+
raise EC2SubnetsNotEnoughIPsError(
179+
subnet_ids=instance_config.subnet_ids,
180+
instance_type=instance_config.type.name,
181+
available_ips=total_available_ips,
148182
)
149183

184+
# now let's not try to run instances in subnets that do not have enough IPs
185+
subnet_ids_with_capacity = [
186+
subnet_id
187+
for subnet_id, capacity in subnet_id_to_available_ips.items()
188+
if capacity >= min_number_of_instances
189+
]
190+
150191
resource_tags: list[TagTypeDef] = [
151192
{"Key": tag_key, "Value": tag_value}
152193
for tag_key, tag_value in instance_config.tags.items()
153194
]
154195

155-
instances = await self.client.run_instances(
156-
ImageId=instance_config.ami_id,
157-
MinCount=min_number_of_instances,
158-
MaxCount=number_of_instances,
159-
IamInstanceProfile=(
160-
{"Arn": instance_config.iam_instance_profile}
161-
if instance_config.iam_instance_profile
162-
else {}
163-
),
164-
InstanceType=instance_config.type.name,
165-
InstanceInitiatedShutdownBehavior="terminate",
166-
KeyName=instance_config.key_name,
167-
TagSpecifications=[
168-
{"ResourceType": "instance", "Tags": resource_tags},
169-
{"ResourceType": "volume", "Tags": resource_tags},
170-
{"ResourceType": "network-interface", "Tags": resource_tags},
171-
],
172-
UserData=compose_user_data(instance_config.startup_script),
173-
NetworkInterfaces=[
174-
{
175-
"AssociatePublicIpAddress": True,
176-
"DeviceIndex": 0,
177-
"SubnetId": instance_config.subnet_id,
178-
"Groups": instance_config.security_group_ids,
179-
}
180-
],
181-
)
182-
instance_ids = [i["InstanceId"] for i in instances["Instances"]]
183-
_logger.info(
184-
"%s New instances launched: %s, waiting for them to start now...",
185-
len(instance_ids),
186-
instance_ids,
187-
)
196+
# Try each subnet in order until one succeeds
197+
for subnet_id in subnet_ids_with_capacity:
198+
try:
199+
_logger.debug(
200+
"Attempting to launch instances in subnet %s", subnet_id
201+
)
188202

189-
# wait for the instance to be in a pending state
190-
# NOTE: reference to EC2 states https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
191-
waiter = self.client.get_waiter("instance_exists")
192-
await waiter.wait(InstanceIds=instance_ids)
193-
_logger.debug("instances %s exists now.", instance_ids)
203+
instances = await self.client.run_instances(
204+
ImageId=instance_config.ami_id,
205+
MinCount=min_number_of_instances,
206+
MaxCount=number_of_instances,
207+
IamInstanceProfile=(
208+
{"Arn": instance_config.iam_instance_profile}
209+
if instance_config.iam_instance_profile
210+
else {}
211+
),
212+
InstanceType=instance_config.type.name,
213+
InstanceInitiatedShutdownBehavior="terminate",
214+
KeyName=instance_config.key_name,
215+
TagSpecifications=[
216+
{"ResourceType": "instance", "Tags": resource_tags},
217+
{"ResourceType": "volume", "Tags": resource_tags},
218+
{
219+
"ResourceType": "network-interface",
220+
"Tags": resource_tags,
221+
},
222+
],
223+
UserData=compose_user_data(instance_config.startup_script),
224+
NetworkInterfaces=[
225+
{
226+
"AssociatePublicIpAddress": True,
227+
"DeviceIndex": 0,
228+
"SubnetId": subnet_id,
229+
"Groups": instance_config.security_group_ids,
230+
}
231+
],
232+
)
233+
# If we get here, the launch succeeded
234+
break
235+
except botocore.exceptions.ClientError as exc:
236+
error_code = exc.response.get("Error", {}).get("Code")
237+
if error_code == "InsufficientInstanceCapacity":
238+
_logger.warning(
239+
"Insufficient capacity in subnet %s for instance type %s, trying next subnet",
240+
subnet_id,
241+
instance_config.type.name,
242+
)
243+
continue
244+
# For any other ClientError, re-raise to let the decorator handle it
245+
raise
246+
247+
else:
248+
subnet_zones = await get_subnet_azs(
249+
self.client, subnet_ids=subnet_ids_with_capacity
250+
)
251+
raise EC2InsufficientCapacityError(
252+
availability_zones=subnet_zones,
253+
instance_type=instance_config.type.name,
254+
)
255+
instance_ids = [
256+
i["InstanceId"] # pyright: ignore[reportTypedDictNotRequiredAccess]
257+
for i in instances["Instances"]
258+
]
259+
with log_context(
260+
_logger,
261+
logging.INFO,
262+
msg=f"{len(instance_ids)} instances: {instance_ids=} launched. Wait to reach pending state",
263+
):
264+
# wait for the instance to be in a pending state
265+
# NOTE: reference to EC2 states https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
266+
waiter = self.client.get_waiter("instance_exists")
267+
await waiter.wait(InstanceIds=instance_ids)
194268

195-
# NOTE: waiting for pending ensure we get all the IPs back
269+
# NOTE: waiting for pending ensures we get all the IPs back
196270
described_instances = await self.client.describe_instances(
197271
InstanceIds=instance_ids
198272
)
199273
assert "Instances" in described_instances["Reservations"][0] # nosec
200-
instance_datas = [
274+
return [
201275
await ec2_instance_data_from_aws_instance(self, i)
202276
for i in described_instances["Reservations"][0]["Instances"]
203277
]
204-
_logger.info(
205-
"%s are pending now",
206-
f"{instance_ids=}",
207-
)
208-
return instance_datas
209278

210279
@ec2_exception_handler(_logger)
211280
async def get_instances(

packages/aws-library/src/aws_library/ec2/_error_handler.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,24 @@
11
import functools
22
import logging
3+
import re
34
from collections.abc import Callable, Coroutine
4-
from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar
5+
from typing import (
6+
TYPE_CHECKING,
7+
Any,
8+
Concatenate,
9+
Final,
10+
ParamSpec,
11+
TypeVar,
12+
cast,
13+
)
514

615
from botocore import exceptions as botocore_exc
716

817
from ._errors import (
918
EC2AccessError,
1019
EC2InstanceNotFoundError,
1120
EC2InstanceTypeInvalidError,
21+
EC2InsufficientCapacityError,
1222
EC2NotConnectedError,
1323
EC2RuntimeError,
1424
EC2TimeoutError,
@@ -26,30 +36,46 @@
2636
Self = TypeVar("Self", bound="SimcoreEC2API")
2737

2838

39+
_INSUFFICIENT_CAPACITY_ERROR_MSG_PATTERN: Final[re.Pattern] = re.compile(
40+
r"sufficient (?P<instance_type>\S+) capacity in the Availability Zone you requested "
41+
r"\((?P<failed_az>\S+)\)"
42+
)
43+
44+
2945
def _map_botocore_client_exception(
3046
botocore_error: botocore_exc.ClientError,
3147
*args, # pylint: disable=unused-argument # noqa: ARG001
3248
**kwargs, # pylint: disable=unused-argument # noqa: ARG001
3349
) -> EC2AccessError:
34-
status_code = int(
35-
botocore_error.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
36-
or botocore_error.response.get("Error", {}).get("Code", -1)
50+
# see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html#parsing-error-responses-and-catching-exceptions-from-aws-services
51+
status_code = cast(
52+
int,
53+
botocore_error.response.get("ResponseMetadata", {}).get("HTTPStatusCode", "-1"),
3754
)
55+
error_code = botocore_error.response.get("Error", {}).get("Code", "Unknown")
56+
error_msg = botocore_error.response.get("Error", {}).get("Message", "Unknown")
3857
operation_name = botocore_error.operation_name
39-
match status_code, operation_name:
40-
case 400, "StartInstances":
58+
match error_code:
59+
case "InvalidInstanceID.NotFound":
4160
return EC2InstanceNotFoundError()
42-
case 400, "StopInstances":
43-
return EC2InstanceNotFoundError()
44-
case 400, "TerminateInstances":
45-
return EC2InstanceNotFoundError()
46-
case 400, "DescribeInstanceTypes":
61+
case "InvalidInstanceType":
4762
return EC2InstanceTypeInvalidError()
63+
case "InsufficientInstanceCapacity":
64+
availability_zone = "unknown"
65+
instance_type = "unknown"
66+
if match := re.search(_INSUFFICIENT_CAPACITY_ERROR_MSG_PATTERN, error_msg):
67+
instance_type = match.group("instance_type")
68+
availability_zone = match.group("failed_az")
69+
70+
raise EC2InsufficientCapacityError(
71+
availability_zones=availability_zone, instance_type=instance_type
72+
)
4873
case _:
4974
return EC2AccessError(
75+
status_code=status_code,
5076
operation_name=operation_name,
51-
code=status_code,
52-
error=f"{botocore_error}",
77+
code=error_code,
78+
error=error_msg,
5379
)
5480

5581

0 commit comments

Comments
 (0)