diff --git a/flow/test/test_autotuner.sh b/flow/test/test_autotuner.sh index afd4e6ed8d..1e25d9c1ce 100755 --- a/flow/test/test_autotuner.sh +++ b/flow/test/test_autotuner.sh @@ -30,8 +30,7 @@ if [ "$PLATFORM_WITHOUT_DASHES" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; the python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files echo "Running AutoTuner resume test (only once)" - # Temporarily disable resume check test due to flakiness - #python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume + python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume echo "Running AutoTuner binary check (only once)" openroad_autotuner -h diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py index 69eaec1f24..21604753d7 100644 --- a/tools/AutoTuner/test/resume_check.py +++ b/tools/AutoTuner/test/resume_check.py @@ -32,6 +32,7 @@ ## POSSIBILITY OF SUCH DAMAGE. ############################################################################### +import glob import unittest import subprocess import os @@ -41,9 +42,7 @@ from contextlib import contextmanager cur_dir = os.path.dirname(os.path.abspath(__file__)) -src_dir = os.path.join(cur_dir, "../src") -orfs_dir = os.path.join(cur_dir, "../../../flow") -os.chdir(src_dir) +DEFAULT_MODIFIED_TIME = 0 @contextmanager @@ -65,50 +64,92 @@ class ResumeCheck(unittest.TestCase): design = "gcd" samples = 5 iterations = 2 + experiment_name = "test-resume" def setUp(self): self.config = os.path.join( - orfs_dir, "designs", self.platform, self.design, "autotuner.json" + cur_dir, + f"../../../flow/designs/{self.platform}/{self.design}/autotuner.json", ) self.jobs = self.samples self.num_cpus = os.cpu_count() - # How it works: Say we have 5 samples and 5 iterations. - # If we want to limit to only 5 trials (and avoid any parallelism magic by Ray) - # We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!) - # Cast to 1 decimal place res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples)) options = ["", "--resume"] - self.exec = AutoTunerTestUtils.get_exec_cmd() + self.executable_command = AutoTunerTestUtils.get_exec_cmd() self.commands = [ - f"{self.exec}" + f"{self.executable_command}" f" --design {self.design}" f" --platform {self.platform}" f" --config {self.config}" f" --jobs {self.jobs}" - f" --experiment test-resume" + f" --experiment {self.experiment_name}" f" tune --iterations {self.iterations} --samples {self.samples}" f" --resources_per_trial {res_per_trial}" f" {c}" for c in options ] + def get_last_modified_time(self, iteration: int = 0) -> int: + """ + Returns the nth iteration time of a trial. + + :param iteration: The iteration to check. + :return: The latest modified UNIX time of the nth iteration. + If no folders are found, returns a default value. + """ + if iteration < 0 or iteration >= self.iterations: + raise ValueError("Iteration must be between 0 and (iterations - 1)") + + experiment_dir = os.path.join( + cur_dir, + f"../../../flow/logs/{self.platform}/{self.design}/{self.experiment_name}-tune", + ) + iteration_folders = glob.glob( + os.path.join(experiment_dir, f"variant-*-or-{iteration}") + ) + latest_modified_time = DEFAULT_MODIFIED_TIME + for folder in iteration_folders: + modified_time = os.path.getmtime(folder) + if modified_time > latest_modified_time: + latest_modified_time = modified_time + return latest_modified_time + def test_tune_resume(self): # Goal is to first run the first config (without resume) and then run the second config (with resume) # and check if the run is able to complete. # Run the first config asynchronously. print("Running the first config") - with managed_process(self.commands[0], shell=True) as proc: - time.sleep(120) + latest_modified_time = 0 + with managed_process(self.commands[0].split()) as proc: + time.sleep(30) + # Check if first config is complete + while True: + cur_modified_time = self.get_last_modified_time() + print(f"Current modified time: {cur_modified_time}") + print(f"Latest modified time: {latest_modified_time}") + if abs(cur_modified_time - latest_modified_time) < 1e-3: + break + latest_modified_time = cur_modified_time + time.sleep(10) # Keep trying to stop the ray cluster until it is stopped while 1: - proc = subprocess.run("ray status", shell=True) + proc = subprocess.run( + "ray status", shell=True, capture_output=True, text=True + ) + if proc.returncode != 0: + print(f"Error running 'ray status': {proc.stderr}") no_nodes = proc.returncode != 0 - proc = subprocess.run("ray stop", shell=True) - successful = proc.returncode in accepted_rc + proc = subprocess.run( + "ray stop", shell=True, capture_output=True, text=True + ) + if proc.returncode not in accepted_rc: + print(f"Error running 'ray stop': {proc.stderr}") + raise RuntimeError("Failed to stop the ray cluster") + successful = True if no_nodes and successful: break