@@ -1539,19 +1539,31 @@ def _wait_compute_cloudinit_done(remote_command_executor, compute_node):
15391539 assert_that (compute_cloudinit_status_output ).contains ("status: done" )
15401540
15411541
def _assert_mpi_process_completion(
    remote_command_executor, slurm_commands, num_nodes, after_completion, check_proc_file
):
    """
    Run one probe for MPI processes and assert on the outcome.

    A job is submitted that appends the `ps aux` lines matching "mpiexec.hydra.*sleep" to check_proc_file.
    Once the job has completed, the file is read back and checked.

    :param remote_command_executor: object exposing run_remote_command, used to remove and read the datafile
    :param slurm_commands: object exposing submit_command, assert_job_submitted and wait_job_completed
    :param num_nodes: forwarded as the `nodes` argument of the submitted probe job
    :param after_completion: when truthy assert that no matching process is listed, otherwise assert that one is
    :param check_proc_file: path of the datafile the probe job appends to; it is deleted at the start of each call
    :raises AssertionError: when the datafile content does not satisfy the expected condition
    """
    # This function is the unit the caller retries. Because the probe job appends (>>) to the datafile,
    # leftovers from a previous failed attempt must be dropped first: a stale match would otherwise make
    # every subsequent does_not_match assertion fail, turning the retries into no-ops.
    remote_command_executor.run_remote_command(f"rm -f {check_proc_file}")

    result = slurm_commands.submit_command(
        f'ps aux | grep "mpiexec.hydra.*sleep" | grep -v "grep" >> {check_proc_file}', nodes=num_nodes
    )
    job_id = slurm_commands.assert_job_submitted(result.stdout)
    slurm_commands.wait_job_completed(job_id)
    proc_track_result = remote_command_executor.run_remote_command(f"cat {check_proc_file}")

    # Single pattern shared by both branches so the positive and negative checks cannot drift apart.
    mpi_process_pattern = ".*mpiexec.hydra.*sleep"
    if after_completion:
        assert_that(proc_track_result.stdout).does_not_match(mpi_process_pattern)
    else:
        assert_that(proc_track_result.stdout).matches(mpi_process_pattern)
1555+
1556+
def _check_mpi_process(remote_command_executor, slurm_commands, num_nodes, after_completion):
    """Check for MPI processes via a submitted probe job, retrying the check before giving up."""
    datafile = "/shared/check_proc.out"

    # Remove any datafile left behind by an earlier check before probing
    remote_command_executor.run_remote_command(f"rm -f {datafile}")

    # At most 4 attempts, with a fixed wait of seconds(10) between them
    probe_with_retry = retry(wait_fixed=seconds(10), stop_max_attempt_number=4)(_assert_mpi_process_completion)
    probe_with_retry(remote_command_executor, slurm_commands, num_nodes, after_completion, datafile)
15551567
15561568
15571569def _test_cluster_gpu_limits (slurm_commands , partition , instance_type , max_count , gpu_per_instance , gpu_type ):
0 commit comments