|
15 | 15 | EC2Tags, |
16 | 16 | Resources, |
17 | 17 | ) |
18 | | -from aws_library.ec2._errors import EC2TooManyInstancesError |
| 18 | +from aws_library.ec2._errors import EC2AccessError, EC2TooManyInstancesError |
19 | 19 | from fastapi import FastAPI |
20 | 20 | from models_library.generated_models.docker_rest_api import Node |
21 | 21 | from models_library.rabbitmq_messages import ProgressType |
@@ -421,7 +421,7 @@ async def _activate_drained_nodes( |
421 | 421 | ) |
422 | 422 |
|
423 | 423 |
|
424 | | -async def _start_warm_buffer_instances( |
| 424 | +async def _try_start_warm_buffer_instances( |
425 | 425 | app: FastAPI, cluster: Cluster, auto_scaling_mode: AutoscalingProvider |
426 | 426 | ) -> Cluster: |
427 | 427 | """starts warm buffer if there are assigned tasks, or if a hot buffer of the same type is needed""" |
@@ -471,9 +471,20 @@ async def _start_warm_buffer_instances( |
471 | 471 | with log_context( |
472 | 472 | _logger, logging.INFO, f"start {len(instances_to_start)} warm buffer machines" |
473 | 473 | ): |
474 | | - started_instances = await get_ec2_client(app).start_instances( |
475 | | - instances_to_start |
476 | | - ) |
| 474 | + try: |
| 475 | + started_instances = await get_ec2_client(app).start_instances( |
| 476 | + instances_to_start |
| 477 | + ) |
| 478 | + except EC2AccessError: |
| 479 | + _logger.warning( |
| 480 | + "Could not start warm buffer instances! " |
| 481 | + "TIP: This can happen in case of Insufficient " |
| 482 | + "Capacity on AWS AZ(s) where the warm buffers were created. " |
| 483 | + "Scaling up will be achieved via launching new EC2 instances instead.", |
| 484 | + exc_info=True, |
| 485 | + ) |
| 486 | + # we need to re-assign the tasks assigned to the warm buffer instances |
| 487 | + return cluster |
477 | 488 | # NOTE: first start the instance and then set the tags in case the instance cannot start (e.g. InsufficientInstanceCapacity) |
478 | 489 | await get_ec2_client(app).set_instances_tags( |
479 | 490 | started_instances, |
@@ -1231,7 +1242,7 @@ async def _autoscale_cluster( |
1231 | 1242 | cluster = await _activate_drained_nodes(app, cluster) |
1232 | 1243 |
|
1233 | 1244 | # 3. start warm buffer instances to cover the remaining tasks |
1234 | | - cluster = await _start_warm_buffer_instances(app, cluster, auto_scaling_mode) |
| 1245 | + cluster = await _try_start_warm_buffer_instances(app, cluster, auto_scaling_mode) |
1235 | 1246 |
|
1236 | 1247 | # 4. scale down unused instances |
1237 | 1248 | cluster = await _scale_down_unused_cluster_instances( |
|
0 commit comments