Skip to content

Commit cd066cf

Browse files
committed
[Test] Relax test_slurm_scaling for Rocky, increasing the accepted time to replace static nodes from 5min to 6 min. We observed in 3.13.0 an increase in the bootstrap time of Rocky nodes.
1 parent 5893c57 commit cd066cf

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def test_slurm_from_login_nodes_in_private_network(
250250
@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
251251
@pytest.mark.slurm_scaling
252252
def test_slurm_scaling(
253-
scheduler, region, instance, pcluster_config_reader, clusters_factory, test_datadir, scheduler_commands_factory
253+
scheduler, region, os, instance, pcluster_config_reader, clusters_factory, test_datadir, scheduler_commands_factory
254254
):
255255
"""Test that slurm-specific scaling logic is behaving as expected for normal actions and failures."""
256256
cluster_config = pcluster_config_reader(scaledown_idletime=3)
@@ -291,6 +291,7 @@ def test_slurm_scaling(
291291
test_datadir,
292292
cluster.cfn_name,
293293
region,
294+
os,
294295
partition="ondemand1",
295296
num_static_nodes=2,
296297
num_dynamic_nodes=3,
@@ -1171,6 +1172,7 @@ def _test_replace_down_nodes(
11711172
test_datadir,
11721173
cluster_name,
11731174
region,
1175+
os,
11741176
partition,
11751177
num_static_nodes,
11761178
num_dynamic_nodes,
@@ -1194,7 +1196,10 @@ def _test_replace_down_nodes(
11941196
remote_command_executor.run_remote_script(str(test_datadir / "slurm_kill_slurmd_job.sh"), args=[node])
11951197
# set dynamic to down manually
11961198
_set_nodes_to_down_manually(scheduler_commands, dynamic_nodes)
1197-
_wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes)
1199+
# TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
1200+
# We must address it and restore the default wait time to 300s.
1201+
stop_max_delay_secs = 360 if os.starts_with("rocky") else 300
1202+
_wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs)
11981203
assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
11991204

12001205

0 commit comments

Comments
 (0)