19 | 19 | from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor |
20 | 20 | from retrying import retry |
21 | 21 | from time_utils import minutes, seconds |
22 | | -from utils import InstanceTypesData, get_compute_nodes_instance_ids |
| 22 | +from utils import InstanceTypesData, get_compute_nodes_instance_ids, get_head_node_instance_id |
23 | 23 |
24 | 24 | from tests.common.assertions import ( |
25 | 25 | assert_errors_in_logs, |
@@ -231,6 +231,17 @@ def test_error_handling(scheduler, region, instance, pcluster_config_reader, clu |
231 | 231 | num_dynamic_nodes=1, |
232 | 232 | dynamic_instance_type=instance, |
233 | 233 | ) |
| 234 | + _test_head_node_down( |
| 235 | + remote_command_executor, |
| 236 | + scheduler_commands, |
| 237 | + cluster.cfn_name, |
| 238 | + region, |
| 239 | + test_datadir, |
| 240 | + partition="ondemand1", |
| 241 | + num_static_nodes=1, |
| 242 | + num_dynamic_nodes=1, |
| 243 | + dynamic_instance_type=instance, |
| 244 | + ) |
234 | 245 |
235 | 246 |
236 | 247 | def _assert_cluster_initial_conditions( |
@@ -582,6 +593,48 @@ def _test_clustermgtd_down_logic( |
582 | 593 | ) |
583 | 594 |
584 | 595 |
| 596 | +def _test_head_node_down( |
| 597 | + remote_command_executor, |
| 598 | + scheduler_commands, |
| 599 | + cluster_name, |
| 600 | + region, |
| 601 | + test_datadir, |
| 602 | + partition, |
| 603 | + num_static_nodes, |
| 604 | + num_dynamic_nodes, |
| 605 | + dynamic_instance_type, |
| 606 | +): |
| 607 | + # Make sure clustermgtd and slurmctld are running |
| 608 | + remote_command_executor.run_remote_script(str(test_datadir / "slurm_start_clustermgtd.sh"), run_as_root=True) |
| 609 | + remote_command_executor.run_remote_script(str(test_datadir / "slurm_start_slurmctld.sh"), run_as_root=True) |
| 610 | + # Sleep for 60 seconds to make sure clustermgtd finishes 1 iteration and writes a valid heartbeat
| 611 | + # Otherwise, ResumeProgram will not be able to launch dynamic nodes due to an invalid heartbeat
| 612 | + time.sleep(60) |
| 613 | + submit_initial_job( |
| 614 | + scheduler_commands, |
| 615 | + "sleep infinity", |
| 616 | + partition, |
| 617 | + dynamic_instance_type, |
| 618 | + num_dynamic_nodes, |
| 619 | + other_options="--no-requeue", |
| 620 | + ) |
| 621 | + # On slurmctld restart, offline nodes might still show as responding for a short time, breaking assertions
| 622 | + # Retry for a few minutes to avoid spurious failures in this case
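| | + # assert_initial_conditions presumably returns the (static, dynamic) node lists; they are not needed here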
| 623 | + _, _ = retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))(assert_initial_conditions)( |
| 624 | + scheduler_commands, num_static_nodes, num_dynamic_nodes, partition |
| 625 | + ) |
| 626 | + _stop_head_node(cluster_name, region) |
| 627 | + # The default computemgtd clustermgtd_timeout is 10 minutes; check that compute instances are terminated around this time
| 628 | + retry(wait_fixed=seconds(20), stop_max_delay=minutes(15))(assert_num_instances_in_cluster)(cluster_name, region, 0) |
| 629 | + |
| 630 | + |
| 631 | +def _stop_head_node(cluster_name, region): |
| 632 | + """Stop head node instance.""" |
| 633 | + head_node_id = get_head_node_instance_id(cluster_name, region) |
| 634 | + ec2_client = boto3.client("ec2", region_name=region) |
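| | + # NOTE: get_head_node_instance_id is assumed to return a list of instance ids, since stop_instances requires InstanceIds to be a list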
| 635 | + ec2_client.stop_instances(InstanceIds=head_node_id) |
| 636 | + |
| 637 | + |
585 | 638 | def _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes): |
586 | 639 | """Wait for static and dynamic nodes to be reset.""" |
587 | 640 | if static_nodes: |
|
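For context, the `get_head_node_instance_id` helper imported at the top of this diff lives in `utils.py`, whose implementation is not shown here. Below is a minimal sketch of what such a helper might look like; the EC2 tag names (`Application`, `aws-parallelcluster-node-type`) and the `"Master"` node-type value are illustrative assumptions, not confirmed by this diff. Note that it returns a list, which is why `_stop_head_node` can pass the result straight to `stop_instances`.

```python
import boto3


def get_head_node_instance_id(cluster_name, region):
    """Return the cluster's head node instance id(s) as a one-element list.

    Sketch only: the tag names used in the filters below are assumptions
    about how ParallelCluster tags its instances, not part of this diff.
    """
    ec2_client = boto3.client("ec2", region_name=region)
    response = ec2_client.describe_instances(
        Filters=[
            {"Name": "tag:Application", "Values": [cluster_name]},
            {"Name": "tag:aws-parallelcluster-node-type", "Values": ["Master"]},
            {"Name": "instance-state-name", "Values": ["running"]},
        ]
    )
    # Flatten reservations into a plain list of instance ids
    return [
        instance["InstanceId"]
        for reservation in response["Reservations"]
        for instance in reservation["Instances"]
    ]
```

Returning a list keeps the helper symmetric with `get_compute_nodes_instance_ids`, so both can feed EC2 APIs that expect `InstanceIds` as a list without extra wrapping.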