3232## POSSIBILITY OF SUCH DAMAGE.
3333###############################################################################
3434
35+ import glob
3536import unittest
3637import subprocess
3738import os
4142from contextlib import contextmanager
4243
4344cur_dir = os .path .dirname (os .path .abspath (__file__ ))
44- src_dir = os .path .join (cur_dir , "../src" )
45- orfs_dir = os .path .join (cur_dir , "../../../flow" )
46- os .chdir (src_dir )
4745
4846
4947@contextmanager
@@ -65,18 +63,16 @@ class ResumeCheck(unittest.TestCase):
6563 design = "gcd"
6664 samples = 5
6765 iterations = 2
66+ experiment_name = "test-resume"
6867
6968 def setUp (self ):
7069 self .config = os .path .join (
71- orfs_dir , "designs" , self .platform , self .design , "autotuner.json"
70+ cur_dir ,
71+ f"../../../flow/designs/{ self .platform } /{ self .design } /autotuner.json" ,
7272 )
7373 self .jobs = self .samples
7474 self .num_cpus = os .cpu_count ()
7575
76- # How it works: Say we have 5 samples and 5 iterations.
77- # If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
78- # We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)
79-
8076 # Cast to 1 decimal place
8177 res_per_trial = float ("{:.1f}" .format (self .num_cpus / self .samples ))
8278 options = ["" , "--resume" ]
@@ -87,21 +83,48 @@ def setUp(self):
8783 f" --platform { self .platform } "
8884 f" --config { self .config } "
8985 f" --jobs { self .jobs } "
90- f" --experiment test-resume "
86+ f" --experiment { self . experiment_name } "
9187 f" tune --iterations { self .iterations } --samples { self .samples } "
9288 f" --resources_per_trial { res_per_trial } "
9389 f" { c } "
9490 for c in options
9591 ]
9692
def check_trial_times(self, iteration: int = 0) -> float:
    """
    Return the newest modification time among one iteration's trial folders.

    Trial folders are looked up with the glob pattern
    ``variant-*-or-<iteration>`` inside
    ``<cur_dir>/../../../flow/logs/<platform>/<design>/<experiment_name>-tune``.

    :param iteration: Zero-based iteration index to check.
    :return: The largest ``os.path.getmtime`` value (seconds since the
        epoch) among the matching folders, or the sentinel ``9e99`` when
        no folder matches the pattern.
    :raises ValueError: If ``iteration`` is negative or not smaller than
        ``self.iterations``.
    """
    if not 0 <= iteration < self.iterations:
        raise ValueError("Iteration must be between 0 and iterations - 1")

    experiment_dir = os.path.join(
        cur_dir,
        f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune",
    )
    pattern = os.path.join(experiment_dir, f"variant-*-or-{iteration}")
    modified_times = (os.path.getmtime(folder) for folder in glob.glob(pattern))
    # 9e99 is returned when nothing matches; callers that poll this value
    # only compare successive results, so any fixed float works as a marker.
    return max(modified_times, default=9e99)
109+
97110 def test_tune_resume (self ):
98111 # Goal is to first run the first config (without resume) and then run the second config (with resume)
99112 # and check if the run is able to complete.
100113
101114 # Run the first config asynchronously.
102115 print ("Running the first config" )
116+ latest_modified_time = 0
103117 with managed_process (self .commands [0 ], shell = True ) as proc :
104- time .sleep (120 )
118+ time .sleep (30 )
119+ # Check if first config is complete
120+ while True :
121+ cur_modified_time = self .check_trial_times ()
122+ print (f"Current modified time: { cur_modified_time } " )
123+ print (f"Latest modified time: { latest_modified_time } " )
124+ if abs (cur_modified_time - latest_modified_time ) < 1e-6 :
125+ break
126+ latest_modified_time = cur_modified_time
127+ time .sleep (10 )
105128
106129 # Keep trying to stop the ray cluster until it is stopped
107130 while 1 :
0 commit comments