@@ -76,9 +76,9 @@ def setUp(self):
7676 # Cast to 1 decimal place
7777 res_per_trial = float ("{:.1f}" .format (self .num_cpus / self .samples ))
7878 options = ["" , "--resume" ]
79- self .exec = AutoTunerTestUtils .get_exec_cmd ()
79+ self .executable_command = AutoTunerTestUtils .get_exec_cmd ()
8080 self .commands = [
81- f"{ self .exec } "
81+ f"{ self .executable_command } "
8282 f" --design { self .design } "
8383 f" --platform { self .platform } "
8484 f" --config { self .config } "
@@ -90,12 +90,13 @@ def setUp(self):
9090 for c in options
9191 ]
9292
93- def check_trial_times (self , iteration : int = 0 ) -> str :
93+ def check_trial_times (self , iteration : int = 0 ) -> int :
9494 """
9595 Checks the nth iteration time of a trial.
9696
9797 :param iteration: The iteration to check.
9898 :return: The latest modified UNIX time of the nth iteration.
99+ If no folders are found, returns a default value of 9e99.
99100 """
100101 if iteration < 0 or iteration >= self .iterations :
101102 raise ValueError ("Iteration must be between 0 and iterations - 1" )
@@ -114,32 +115,41 @@ def test_tune_resume(self):
114115 # Run the first config asynchronously.
115116 print ("Running the first config" )
116117 latest_modified_time = 0
117- with managed_process (self .commands [0 ], shell = True ) as proc :
118+ with managed_process (self .commands [0 ]. split () ) as proc :
118119 time .sleep (30 )
119120 # Check if first config is complete
120121 while True :
121122 cur_modified_time = self .check_trial_times ()
122123 print (f"Current modified time: { cur_modified_time } " )
123124 print (f"Latest modified time: { latest_modified_time } " )
124- if abs (cur_modified_time - latest_modified_time ) < 1e-6 :
125+ if abs (cur_modified_time - latest_modified_time ) < 1e-3 :
125126 break
126127 latest_modified_time = cur_modified_time
127128 time .sleep (10 )
128129
129130 # Keep trying to stop the ray cluster until it is stopped
130131 while 1 :
131- proc = subprocess .run ("ray status" , shell = True )
132+ proc = subprocess .run (
133+ "ray status" , shell = True , capture_output = True , text = True
134+ )
135+ if proc .returncode != 0 :
136+ print (f"Error running 'ray status': { proc .stderr } " )
132137 no_nodes = proc .returncode != 0
133- proc = subprocess .run ("ray stop" , shell = True )
134- successful = proc .returncode == 0
138+ proc = subprocess .run (
139+ "ray stop" , shell = True , capture_output = True , text = True
140+ )
141+ if proc .returncode != 0 :
142+ print (f"Error running 'ray stop': { proc .stderr } " )
143+ raise RuntimeError ("Failed to stop the ray cluster" )
144+ successful = True
135145
136146 if no_nodes and successful :
137147 break
138148 time .sleep (10 )
139149
140150 # Run the second config to completion
141151 print ("Running the second config" )
142- proc = subprocess .run (self .commands [1 ], shell = True )
152+ proc = subprocess .run (self .commands [1 ], shell = True , check = True )
143153 successful = proc .returncode == 0
144154 self .assertTrue (successful )
145155
0 commit comments