19 | 19 | from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor |
20 | 20 | from retrying import retry |
21 | 21 | from time_utils import minutes, seconds |
22 | | -from utils import InstanceTypesData, get_compute_nodes_instance_ids |
| 22 | +from utils import InstanceTypesData, get_compute_nodes_instance_ids, get_head_node_instance_id |
23 | 23 |
24 | 24 | from tests.common.assertions import ( |
25 | 25 | assert_errors_in_logs, |
@@ -231,6 +231,17 @@ def test_error_handling(scheduler, region, instance, pcluster_config_reader, clu |
231 | 231 | num_dynamic_nodes=1, |
232 | 232 | dynamic_instance_type=instance, |
233 | 233 | ) |
| 234 | + _test_head_node_down( |
| 235 | + remote_command_executor, |
| 236 | + scheduler_commands, |
| 237 | + cluster.cfn_name, |
| 238 | + region, |
| 239 | + test_datadir, |
| 240 | + partition="ondemand1", |
| 241 | + num_static_nodes=1, |
| 242 | + num_dynamic_nodes=1, |
| 243 | + dynamic_instance_type=instance, |
| 244 | + ) |
234 | 245 |
235 | 246 |
236 | 247 | def _assert_cluster_initial_conditions( |
@@ -582,6 +593,48 @@ def _test_clustermgtd_down_logic( |
582 | 593 | ) |
583 | 594 |
584 | 595 |
| 596 | +def _test_head_node_down( |
| 597 | + remote_command_executor, |
| 598 | + scheduler_commands, |
| 599 | + cluster_name, |
| 600 | + region, |
| 601 | + test_datadir, |
| 602 | + partition, |
| 603 | + num_static_nodes, |
| 604 | + num_dynamic_nodes, |
| 605 | + dynamic_instance_type, |
| 606 | +): |
| 607 | + # Make sure clustermgtd and slurmctld are running |
| 608 | + remote_command_executor.run_remote_script(str(test_datadir / "slurm_start_clustermgtd.sh"), run_as_root=True) |
| 609 | + remote_command_executor.run_remote_script(str(test_datadir / "slurm_start_slurmctld.sh"), run_as_root=True) |
| 610 | + # Sleep for 60 seconds to make sure clustermgtd finishes 1 iteration and writes a valid heartbeat
| 611 | + # Otherwise, ResumeProgram will not be able to launch dynamic nodes due to an invalid heartbeat
| 612 | + time.sleep(60) |
| 613 | + submit_initial_job( |
| 614 | + scheduler_commands, |
| 615 | + "sleep infinity", |
| 616 | + partition, |
| 617 | + dynamic_instance_type, |
| 618 | + num_dynamic_nodes, |
| 619 | + other_options="--no-requeue", |
| 620 | + ) |
| 621 | + # On slurmctld restart, offline nodes might still show as responding for a short time, breaking assertions
| 622 | + # Retry for a few minutes to avoid spurious failures in this case
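| | + # assert_initial_conditions presumably returns the (static, dynamic) node lists; they are not needed here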
| 623 | + _, _ = retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))(assert_initial_conditions)( |
| 624 | + scheduler_commands, num_static_nodes, num_dynamic_nodes, partition |
| 625 | + ) |
| 626 | + _stop_head_node(cluster_name, region) |
| 627 | + # The default computemgtd clustermgtd_timeout is 10 minutes; check that compute instances are terminated around this time
| 628 | + retry(wait_fixed=seconds(20), stop_max_delay=minutes(15))(assert_num_instances_in_cluster)(cluster_name, region, 0) |
| 629 | + |
| 630 | + |
| 631 | +def _stop_head_node(cluster_name, region): |
| 632 | + """Stop head node instance.""" |
| 633 | + head_node_id = get_head_node_instance_id(cluster_name, region) |
| 634 | + ec2_client = boto3.client("ec2", region_name=region) |
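| | + # NOTE: get_head_node_instance_id is assumed to return a list of instance ids, since stop_instances requires InstanceIds to be a list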
| 635 | + ec2_client.stop_instances(InstanceIds=head_node_id) |
| 636 | + |
| 637 | + |
585 | 638 | def _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes): |
586 | 639 | """Wait for static and dynamic nodes to be reset.""" |
587 | 640 | if static_nodes: |
|
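For context, the `get_head_node_instance_id` helper imported at the top of this diff lives in `utils.py`, whose implementation is not shown here. Below is a minimal sketch of what such a helper might look like; the EC2 tag names (`Application`, `aws-parallelcluster-node-type`) and the `"Master"` node-type value are illustrative assumptions, not confirmed by this diff. Note that it returns a list, which is why `_stop_head_node` can pass the result straight to `stop_instances`.

```python
import boto3


def get_head_node_instance_id(cluster_name, region):
    """Return the cluster's head node instance id(s) as a one-element list.

    Sketch only: the tag names used in the filters below are assumptions
    about how ParallelCluster tags its instances, not part of this diff.
    """
    ec2_client = boto3.client("ec2", region_name=region)
    response = ec2_client.describe_instances(
        Filters=[
            {"Name": "tag:Application", "Values": [cluster_name]},
            {"Name": "tag:aws-parallelcluster-node-type", "Values": ["Master"]},
            {"Name": "instance-state-name", "Values": ["running"]},
        ]
    )
    # Flatten reservations into a plain list of instance ids
    return [
        instance["InstanceId"]
        for reservation in response["Reservations"]
        for instance in reservation["Instances"]
    ]
```

Returning a list keeps the helper symmetric with `get_compute_nodes_instance_ids`, so both can feed EC2 APIs that expect `InstanceIds` as a list without extra wrapping.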