1717import pytest
1818from assertpy import assert_that
1919from remote_command_executor import RemoteCommandExecutionError , RemoteCommandExecutor
20- from time_utils import minutes
20+ from retrying import retry
21+ from time_utils import minutes , seconds
2122
2223from tests .common .assertions import assert_no_errors_in_logs , assert_scaling_worked
2324from tests .common .scaling_common import watch_compute_nodes
@@ -228,9 +229,9 @@ def _get_job_state(remote_command_executor, job_id):
228229
229230def _assert_scheduler_configuration (remote_command_executor , torque_commands , max_slots , max_queue_size ):
230231 compute_nodes_count = torque_commands .compute_nodes_count ()
231- hostname = remote_command_executor . run_remote_command ( "hostname" ). stdout
232- result = remote_command_executor . run_remote_command ( "pbsnodes {0}" . format ( hostname )). stdout
233- assert_that ( result ). contains ( "np = {0} \n " . format (( max_queue_size - compute_nodes_count ) * max_slots ) )
232+ # Retry for up to 1 minute while waiting for the head node np setting to be applied: the daemons retrieve the ASG
233+ # every 3 minutes and there is a time gap between the ASG retrieval and the head node np update, so the 200-second
233+ # sleep performed after updating the ASG is not enough for this test.
234+ _wait_head_node_np_setting_complete ( compute_nodes_count , max_queue_size , max_slots , remote_command_executor )
234235
235236 torque_config = remote_command_executor .run_remote_command ("sudo /opt/torque/bin/qmgr -c 'p s'" ).stdout
236237 assert_that (torque_config ).contains ("set queue batch resources_max.ncpus = {0}\n " .format (max_slots ))
@@ -244,6 +245,17 @@ def _assert_scheduler_configuration(remote_command_executor, torque_commands, ma
244245 assert_that (torque_config ).contains ("set server resources_max.nodect = {0}\n " .format (max_queue_size ))
245246
246247
@retry(
    retry_on_result=lambda outcome: outcome is False,
    wait_fixed=seconds(5),
    stop_max_delay=minutes(1),
)
def _wait_head_node_np_setting_complete(compute_nodes_count, max_queue_size, max_slots, remote_command_executor):
    """Check whether pbsnodes reports the expected np value for the queried host.

    The expected value is ``(max_queue_size - compute_nodes_count) * max_slots``.
    The function returns a boolean instead of asserting so that the ``retry``
    decorator can call it again whenever the result is ``False``; the decorator
    is configured with ``wait_fixed=seconds(5)`` and ``stop_max_delay=minutes(1)``.

    :param compute_nodes_count: number of compute nodes currently in the cluster
    :param max_queue_size: configured maximum queue size
    :param max_slots: slots per node used to compute the expected np value
    :param remote_command_executor: executor used to run ``hostname`` and ``pbsnodes``
    :return: True when the expected ``np = <value>`` text is found in the pbsnodes output
    """
    # The host name is looked up remotely (presumably the head node, per the function name).
    node_name = remote_command_executor.run_remote_command("hostname").stdout
    pbsnodes_output = remote_command_executor.run_remote_command("pbsnodes {0}".format(node_name)).stdout

    expected_np = (max_queue_size - compute_nodes_count) * max_slots
    # NOTE(review): the literal is kept exactly as shown in this view, including the
    # space after the newline - confirm against the repository version.
    expected_entry = "np = {0}\n ".format(expected_np)
    return expected_entry in pbsnodes_output
257+
258+
247259def _assert_job_completed (remote_command_executor , job_id ):
248260 try :
249261 result = remote_command_executor .run_remote_command ("qstat -f {0}" .format (job_id ), log_error = False )
0 commit comments