Skip to content

Commit d3490b6

Browse files
Rexrexcsn
authored and committed
Add slurm test for head node down case
Signed-off-by: Rex <[email protected]>
1 parent 09bb1ed commit d3490b6

File tree

4 files changed

+85
-1
lines changed

4 files changed

+85
-1
lines changed

tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor
2020
from retrying import retry
2121
from time_utils import minutes, seconds
22-
from utils import InstanceTypesData, get_compute_nodes_instance_ids
22+
from utils import InstanceTypesData, get_compute_nodes_instance_ids, get_head_node_instance_id
2323

2424
from tests.common.assertions import (
2525
assert_errors_in_logs,
@@ -231,6 +231,17 @@ def test_error_handling(scheduler, region, instance, pcluster_config_reader, clu
231231
num_dynamic_nodes=1,
232232
dynamic_instance_type=instance,
233233
)
234+
_test_head_node_down(
235+
remote_command_executor,
236+
scheduler_commands,
237+
cluster.cfn_name,
238+
region,
239+
test_datadir,
240+
partition="ondemand1",
241+
num_static_nodes=1,
242+
num_dynamic_nodes=1,
243+
dynamic_instance_type=instance,
244+
)
234245

235246

236247
def _assert_cluster_initial_conditions(
@@ -582,6 +593,48 @@ def _test_clustermgtd_down_logic(
582593
)
583594

584595

596+
def _test_head_node_down(
    remote_command_executor,
    scheduler_commands,
    cluster_name,
    region,
    test_datadir,
    partition,
    num_static_nodes,
    num_dynamic_nodes,
    dynamic_instance_type,
):
    """
    Test that compute instances are terminated after the head node is stopped.

    The test brings clustermgtd and slurmctld up, submits a never-ending job so that
    dynamic nodes are launched, waits for the cluster to reach the expected initial
    conditions, stops the head node instance and finally waits until no instances
    are left in the cluster.

    :param remote_command_executor: executor used to run the helper scripts on the head node
    :param scheduler_commands: scheduler commands wrapper used to submit jobs and query nodes
    :param cluster_name: name of the cluster, used to look up its instances
    :param region: AWS region the cluster is deployed in
    :param test_datadir: path object pointing to the directory with the helper scripts
    :param partition: Slurm partition the job is submitted to
    :param num_static_nodes: number of static nodes expected in the partition
    :param num_dynamic_nodes: number of dynamic nodes to launch through the job
    :param dynamic_instance_type: instance type requested for the dynamic nodes
    """
    # Make sure clustermgtd and slurmctld are running
    remote_command_executor.run_remote_script(str(test_datadir / "slurm_start_clustermgtd.sh"), run_as_root=True)
    remote_command_executor.run_remote_script(str(test_datadir / "slurm_start_slurmctld.sh"), run_as_root=True)
    # Sleep for 60 seconds to make sure clustermgtd finishes 1 iteration and writes a valid heartbeat
    # Otherwise ResumeProgram will not be able to launch dynamic nodes due to invalid heartbeat
    time.sleep(60)
    submit_initial_job(
        scheduler_commands,
        "sleep infinity",
        partition,
        dynamic_instance_type,
        num_dynamic_nodes,
        other_options="--no-requeue",
    )
    # On slurmctld restart, offline nodes might still show as responding for a short time, breaking assertions
    # Add some retries to avoid failing due to this case
    # The value returned by assert_initial_conditions is not needed here, so it is not bound to any name
    retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))(assert_initial_conditions)(
        scheduler_commands, num_static_nodes, num_dynamic_nodes, partition
    )
    _stop_head_node(cluster_name, region)
    # Default computemgtd clustermgtd_timeout is 10 mins, check that compute instances are terminated around this time
    retry(wait_fixed=seconds(20), stop_max_delay=minutes(15))(assert_num_instances_in_cluster)(cluster_name, region, 0)
629+
630+
631+
def _stop_head_node(cluster_name, region):
    """
    Stop the head node instance of the given cluster.

    :param cluster_name: name of the cluster whose head node must be stopped
    :param region: AWS region the cluster is deployed in
    :raises RuntimeError: if no head node instance can be found for the cluster
    """
    # get_head_node_instance_id returns a list of instance ids, which is the shape InstanceIds expects
    head_node_ids = get_head_node_instance_id(cluster_name, region)
    if not head_node_ids:
        # Fail fast with a clear message instead of sending an empty InstanceIds list to EC2
        raise RuntimeError(f"No head node instance found for cluster {cluster_name} in region {region}")
    ec2_client = boto3.client("ec2", region_name=region)
    ec2_client.stop_instances(InstanceIds=head_node_ids)
636+
637+
585638
def _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes):
586639
"""Wait for static and dynamic nodes to be reset."""
587640
if static_nodes:
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License").
5+
# You may not use this file except in compliance with the License.
6+
# A copy of the License is located at
7+
#
8+
# http://aws.amazon.com/apache2.0/
9+
#
10+
# or in the "LICENSE.txt" file accompanying this file.
11+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
12+
# See the License for the specific language governing permissions and limitations under the License.
13+
# Activate the cookbook virtualenv, then ask supervisorctl to start clustermgtd.
source /opt/parallelcluster/pyenv/versions/cookbook_virtualenv/bin/activate && supervisorctl start clustermgtd
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash
2+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License").
5+
# You may not use this file except in compliance with the License.
6+
# A copy of the License is located at
7+
#
8+
# http://aws.amazon.com/apache2.0/
9+
#
10+
# or in the "LICENSE.txt" file accompanying this file.
11+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
12+
# See the License for the specific language governing permissions and limitations under the License.
13+
# Start the slurmctld unit through systemctl.
systemctl start slurmctld

tests/integration-tests/utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,11 @@ def get_compute_nodes_instance_ids(stack_name, region, instance_types=None):
183183
return get_cluster_nodes_instance_ids(stack_name, region, instance_types, node_type="Compute")
184184

185185

186+
def get_head_node_instance_id(stack_name, region):
    """Return the list of instance ids of the head node of the given stack."""
    # No instance type filter is applied: instance_types is left at its default (None)
    head_node_ids = get_cluster_nodes_instance_ids(stack_name, region, node_type="Master")
    return head_node_ids
189+
190+
186191
def get_cluster_nodes_instance_ids(stack_name, region, instance_types=None, node_type=None):
187192
"""Return a list of cluster Instances Id's."""
188193
try:

0 commit comments

Comments
 (0)