Skip to content

Commit 5f12a0e

Browse files
committed
Integ-tests: Add retry for waiting head node np setting complete
Signed-off-by: Yulei Wang <[email protected]>
1 parent b1114db commit 5f12a0e

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

tests/integration-tests/tests/schedulers/test_torque.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
import pytest
1818
from assertpy import assert_that
1919
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor
20-
from time_utils import minutes
20+
from retrying import retry
21+
from time_utils import minutes, seconds
2122

2223
from tests.common.assertions import assert_no_errors_in_logs, assert_scaling_worked
2324
from tests.common.scaling_common import watch_compute_nodes
@@ -228,9 +229,9 @@ def _get_job_state(remote_command_executor, job_id):
228229

229230
def _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size):
230231
compute_nodes_count = torque_commands.compute_nodes_count()
231-
hostname = remote_command_executor.run_remote_command("hostname").stdout
232-
result = remote_command_executor.run_remote_command("pbsnodes {0}".format(hostname)).stdout
233-
assert_that(result).contains("np = {0}\n".format((max_queue_size - compute_nodes_count) * max_slots))
232+
# Retry for up to 1 minute while waiting for the head node np setting: the daemons retrieve the ASG every 3 minutes
233+
# and there is a time gap between the ASG retrieval and the head node np setting, so a 200-second sleep after updating the ASG is not enough for this test
234+
_wait_head_node_np_setting_complete(compute_nodes_count, max_queue_size, max_slots, remote_command_executor)
234235

235236
torque_config = remote_command_executor.run_remote_command("sudo /opt/torque/bin/qmgr -c 'p s'").stdout
236237
assert_that(torque_config).contains("set queue batch resources_max.ncpus = {0}\n".format(max_slots))
@@ -244,6 +245,17 @@ def _assert_scheduler_configuration(remote_command_executor, torque_commands, ma
244245
assert_that(torque_config).contains("set server resources_max.nodect = {0}\n".format(max_queue_size))
245246

246247

248+
@retry(
    stop_max_delay=minutes(1),
    wait_fixed=seconds(5),
    retry_on_result=lambda outcome: outcome is False,
)
def _wait_head_node_np_setting_complete(compute_nodes_count, max_queue_size, max_slots, remote_command_executor):
    """
    Check whether pbsnodes reports the expected np value for the host the commands run on.

    The expected value is ``(max_queue_size - compute_nodes_count) * max_slots``.
    Returns True when the ``pbsnodes <hostname>`` output contains the matching ``np = <value>`` line
    and False otherwise; a False result makes the ``retry`` decorator invoke the function again
    (fixed wait of ``seconds(5)`` between attempts, giving up after ``minutes(1)``).
    """
    # The hostname is looked up on every attempt, exactly as the pbsnodes query is.
    head_node_hostname = remote_command_executor.run_remote_command("hostname").stdout
    pbsnodes_command = "pbsnodes {0}".format(head_node_hostname)
    pbsnodes_output = remote_command_executor.run_remote_command(pbsnodes_command).stdout

    expected_np = (max_queue_size - compute_nodes_count) * max_slots
    expected_line = "np = {0}\n".format(expected_np)
    return expected_line in pbsnodes_output
257+
258+
247259
def _assert_job_completed(remote_command_executor, job_id):
248260
try:
249261
result = remote_command_executor.run_remote_command("qstat -f {0}".format(job_id), log_error=False)

0 commit comments

Comments
 (0)