Skip to content

Commit fea215e

Browse files
authored
🎨Computational backend: performance improvements step2 - autoscaling shall ask dask to retire nodes only if necessary (#8374)
1 parent dd684b7 commit fea215e

File tree

2 files changed

+21
-2
lines changed

2 files changed

+21
-2
lines changed

services/autoscaling/src/simcore_service_autoscaling/modules/cluster_scaling/_auto_scaling_core.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1215,7 +1215,10 @@ async def _scale_down_unused_cluster_instances(
12151215
cluster: Cluster,
12161216
auto_scaling_mode: AutoscalingProvider,
12171217
) -> Cluster:
1218-
await auto_scaling_mode.try_retire_nodes(app)
1218+
if any(not instance.has_assigned_tasks() for instance in cluster.active_nodes):
1219+
# ask the provider to try to retire nodes actively
1220+
with log_catch(_logger, reraise=False):
1221+
await auto_scaling_mode.try_retire_nodes(app)
12191222
cluster = await _deactivate_empty_nodes(app, cluster)
12201223
return await _try_scale_down_cluster(app, cluster)
12211224

services/autoscaling/tests/unit/test_modules_cluster_scaling_computational.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,13 @@ def mock_dask_is_worker_connected(mocker: MockerFixture) -> mock.Mock:
235235
)
236236

237237

238+
@pytest.fixture
239+
def spy_dask_try_retire_nodes(mocker: MockerFixture) -> mock.Mock:
240+
from simcore_service_autoscaling.modules import dask
241+
242+
return mocker.spy(dask, "try_retire_nodes")
243+
244+
238245
async def _create_task_with_resources(
239246
ec2_client: EC2Client,
240247
dask_task_imposed_ec2_type: InstanceTypeType | None,
@@ -491,6 +498,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
491498
mock_dask_get_worker_has_results_in_memory: mock.Mock,
492499
mock_dask_get_worker_used_resources: mock.Mock,
493500
mock_dask_is_worker_connected: mock.Mock,
501+
spy_dask_try_retire_nodes: mock.Mock,
494502
mocker: MockerFixture,
495503
dask_spec_local_cluster: distributed.SpecCluster,
496504
with_drain_nodes_labelled: bool,
@@ -527,6 +535,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
527535
mock_dask_get_worker_has_results_in_memory.assert_not_called()
528536
mock_dask_get_worker_used_resources.assert_not_called()
529537
mock_dask_is_worker_connected.assert_not_called()
538+
spy_dask_try_retire_nodes.assert_not_called()
530539
# check rabbit messages were sent
531540
_assert_rabbit_autoscaling_message_sent(
532541
mock_rabbitmq_post_message,
@@ -547,6 +556,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
547556
mock_dask_get_worker_used_resources.assert_called_once()
548557
mock_dask_get_worker_used_resources.reset_mock()
549558
mock_dask_is_worker_connected.assert_not_called()
559+
spy_dask_try_retire_nodes.assert_not_called()
550560
instances = await assert_autoscaled_computational_ec2_instances(
551561
ec2_client,
552562
expected_num_reservations=1,
@@ -667,6 +677,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
667677
assert mock_docker_tag_node.call_count == num_useless_calls
668678
mock_docker_tag_node.reset_mock()
669679
mock_docker_set_node_availability.assert_not_called()
680+
spy_dask_try_retire_nodes.assert_not_called()
670681
# check the number of instances did not change and is still running
671682
await assert_autoscaled_computational_ec2_instances(
672683
ec2_client,
@@ -697,6 +708,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
697708
mock_dask_get_worker_used_resources.reset_mock()
698709
# the node shall be waiting before draining
699710
mock_docker_set_node_availability.assert_not_called()
711+
spy_dask_try_retire_nodes.assert_called_once()
712+
spy_dask_try_retire_nodes.reset_mock()
700713
mock_docker_tag_node.assert_called_once_with(
701714
get_docker_client(initialized_app),
702715
fake_attached_node,
@@ -731,6 +744,8 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
731744
mock_dask_get_worker_used_resources.reset_mock()
732745
# the node shall be set to drain, but not yet terminated
733746
mock_docker_set_node_availability.assert_not_called()
747+
spy_dask_try_retire_nodes.assert_called_once()
748+
spy_dask_try_retire_nodes.reset_mock()
734749
mock_docker_tag_node.assert_called_once_with(
735750
get_docker_client(initialized_app),
736751
fake_attached_node,
@@ -827,6 +842,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
827842
)
828843
.datetime.isoformat()
829844
)
845+
spy_dask_try_retire_nodes.assert_not_called()
830846
await auto_scale_cluster(app=initialized_app, auto_scaling_mode=auto_scaling_mode)
831847
mocked_docker_remove_node.assert_called_once_with(
832848
mock.ANY, nodes=[fake_attached_node], force=True
@@ -839,7 +855,7 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915
839855
expected_instance_state="terminated",
840856
expected_additional_tag_keys=list(ec2_instance_custom_tags),
841857
)
842-
858+
spy_dask_try_retire_nodes.assert_not_called()
843859
# this call should never be used in computational mode
844860
mock_docker_compute_node_used_resources.assert_not_called()
845861

0 commit comments

Comments
 (0)