Skip to content

Commit 2aa5080

Browse files
authored
šŸ›Autoscaling: Fix lost stopped EC2 instances and missing error logs (#7493)
1 parent 0dc7846 commit 2aa5080

File tree

4 files changed

+31
-18
lines changed

4 files changed

+31
-18
lines changed

ā€Žpackages/aws-library/src/aws_library/ec2/_error_handler.pyā€Ž

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,8 @@ def ec2_exception_handler(
5959
[Callable[Concatenate[Self, P], Coroutine[Any, Any, R]]],
6060
Callable[Concatenate[Self, P], Coroutine[Any, Any, R]],
6161
]:
62-
"""
63-
Raises:
64-
SSMAccessError:
65-
"""
66-
6762
def decorator(
68-
func: Callable[Concatenate[Self, P], Coroutine[Any, Any, R]]
63+
func: Callable[Concatenate[Self, P], Coroutine[Any, Any, R]],
6964
) -> Callable[Concatenate[Self, P], Coroutine[Any, Any, R]]:
7065
@functools.wraps(func)
7166
async def wrapper(self: Self, *args: P.args, **kwargs: P.kwargs) -> R:

ā€Žpackages/service-library/src/servicelib/background_task.pyā€Ž

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from tenacity.wait import wait_fixed
1111

1212
from .async_utils import cancel_wait_task, delayed_start
13-
from .logging_utils import log_context
13+
from .logging_utils import log_catch, log_context
1414

1515
_logger = logging.getLogger(__name__)
1616

@@ -84,7 +84,8 @@ class _InternalTryAgain(TryAgain):
8484
)
8585
@functools.wraps(func)
8686
async def _wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
87-
await func(*args, **kwargs)
87+
with log_catch(_logger, reraise=True):
88+
await func(*args, **kwargs)
8889
raise _InternalTryAgain
8990

9091
return _wrapper

ā€Žpackages/service-library/tests/test_background_task.pyā€Ž

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import asyncio
88
import datetime
9+
import logging
910
from collections.abc import AsyncIterator, Awaitable, Callable
1011
from typing import Final
1112
from unittest import mock
@@ -204,3 +205,19 @@ async def _func() -> None:
204205
await task
205206

206207
assert mock_func.call_count > 1
208+
209+
210+
async def test_periodic_task_logs_error(
211+
mock_background_task: mock.AsyncMock,
212+
task_interval: datetime.timedelta,
213+
caplog: pytest.LogCaptureFixture,
214+
):
215+
mock_background_task.side_effect = RuntimeError("Test error")
216+
217+
with caplog.at_level(logging.ERROR):
218+
async with periodic_task(
219+
mock_background_task, interval=task_interval, task_name="test_task"
220+
):
221+
await asyncio.sleep(2 * task_interval.total_seconds())
222+
223+
assert "Test error" in caplog.text

ā€Žservices/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.pyā€Ž

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -338,10 +338,10 @@ async def _sorted_allowed_instance_types(app: FastAPI) -> list[EC2InstanceType]:
338338
allowed_instance_type_names
339339
), "EC2_INSTANCES_ALLOWED_TYPES cannot be empty!"
340340

341-
allowed_instance_types: list[
342-
EC2InstanceType
343-
] = await ec2_client.get_ec2_instance_capabilities(
344-
cast(set[InstanceTypeType], set(allowed_instance_type_names))
341+
allowed_instance_types: list[EC2InstanceType] = (
342+
await ec2_client.get_ec2_instance_capabilities(
343+
cast(set[InstanceTypeType], set(allowed_instance_type_names))
344+
)
345345
)
346346

347347
def _as_selection(instance_type: EC2InstanceType) -> int:
@@ -470,14 +470,14 @@ async def _start_warm_buffer_instances(
470470
with log_context(
471471
_logger, logging.INFO, f"start {len(instances_to_start)} buffer machines"
472472
):
473-
await get_ec2_client(app).set_instances_tags(
474-
instances_to_start,
475-
tags=get_activated_buffer_ec2_tags(app, auto_scaling_mode),
476-
)
477-
478473
started_instances = await get_ec2_client(app).start_instances(
479474
instances_to_start
480475
)
476+
# NOTE: first start the instance and then set the tags in case the instance cannot start (e.g. InsufficientInstanceCapacity)
477+
await get_ec2_client(app).set_instances_tags(
478+
started_instances,
479+
tags=get_activated_buffer_ec2_tags(app, auto_scaling_mode),
480+
)
481481
started_instance_ids = [i.id for i in started_instances]
482482

483483
return dataclasses.replace(
@@ -669,7 +669,7 @@ async def _find_needed_instances(
669669
"found following %s needed instances: %s",
670670
len(needed_new_instance_types_for_tasks),
671671
[
672-
f"{i.instance_type.name}:{i.instance_type.resources} takes {len(i.assigned_tasks)} task{'s' if len(i.assigned_tasks)>1 else ''}"
672+
f"{i.instance_type.name}:{i.instance_type.resources} takes {len(i.assigned_tasks)} task{'s' if len(i.assigned_tasks) > 1 else ''}"
673673
for i in needed_new_instance_types_for_tasks
674674
],
675675
)

0 commit comments

Comments
Ā (0)