[Autotuner] CI Smoke Test - Resume (#2097)

luarss · web-flow · commit 0a48f8f91667 · 2025-01-15T15:25:40.000-03:00
* Resume smoke test

Signed-off-by: Jack Luar <jluar@precisioninno.com>
diff --git a/flow/test/test_autotuner.sh b/flow/test/test_autotuner.sh
@@ -1,4 +1,6 @@
 #!/usr/bin/env bash
+DESIGN_NAME=${1:-gcd}
+PLATFORM=${2:-nangate45}
 
 # run the commands in ORFS root dir
 echo "[INFO FLW-0029] Installing dependencies in virtual environment."
@@ -20,28 +22,12 @@ python3 -m unittest tools.AutoTuner.test.smoke_test_sweep.${PLATFORM}SweepSmokeT
 echo "Running Autotuner smoke tests for --sample and --iteration."
 python3 -m unittest tools.AutoTuner.test.smoke_test_sample_iteration.${PLATFORM}SampleIterationSmokeTest.test_sample_iteration
 
-if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
+if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
   echo "Running Autotuner ref file test (only once)"
   python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
-fi
 
-echo "Running Autotuner smoke algorithm & evaluation test"
-python3 -m unittest tools.AutoTuner.test.smoke_test_algo_eval.${PLATFORM}AlgoEvalSmokeTest.test_algo_eval
-
-# run this test last (because it modifies current path)
-echo "Running Autotuner remote test"
-if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
-  # Get the directory of the current script
-  script_dir="$(dirname "${BASH_SOURCE[0]}")"
-  cd "$script_dir"/../../
-  latest_image=$(./etc/DockerTag.sh -dev)
-  echo "ORFS_VERSION=$latest_image" > ./tools/AutoTuner/.env
-  cd ./tools/AutoTuner
-  docker compose up --wait
-  docker compose exec ray-worker bash -c "cd /OpenROAD-flow-scripts/tools/AutoTuner/src/autotuner && \
-    python3 distributed.py --design gcd --platform asap7 --server 127.0.0.1 --port 10001 \
-    --config ../../../../flow/designs/asap7/gcd/autotuner.json tune --samples 1"
-        docker compose down -v --remove-orphans
+  echo "Running AutoTuner resume test (only once)"
+  python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
 fi
 
 exit $ret
diff --git a/flow/test/test_helper.sh b/flow/test/test_helper.sh
@@ -108,7 +108,7 @@ fi
 if [ "${RUN_AUTOTUNER}" == "true" ]; then
   set +x
   echo "Start AutoTuner test."
-  ./test/test_autotuner.sh
+  ./test/test_autotuner.sh "$DESIGN_NAME" "$PLATFORM"  # quoted so an empty value cannot shift the positional args
   set -x
 fi
 
diff --git a/tools/AutoTuner/src/autotuner/distributed.py b/tools/AutoTuner/src/autotuner/distributed.py
@@ -77,7 +77,7 @@ def setup(self, config):
         """
         # We create the following directory structure:
         #      1/     2/         3/       4/                5/   6/
-        # <repo>/<logs>/<platform>/<design>/<experiment>-DATE/<id>/<cwd>
+        # <repo>/<logs>/<platform>/<design>/<experiment>/<id>/<cwd>
         repo_dir = os.getcwd() + "/../" * 6
         self.repo_dir = os.path.abspath(repo_dir)
         self.parameters = parse_config(config, path=os.getcwd())
@@ -88,7 +88,8 @@ def step(self):
         """
         Run step experiment and compute its score.
         """
-        metrics_file = openroad(self.repo_dir, self.parameters, self.variant)
+        self._variant = f"{self.variant}-{self.step_}"
+        metrics_file = openroad(self.repo_dir, self.parameters, self._variant)
         self.step_ += 1
         (score, effective_clk_period, num_drc) = self.evaluate(
             self.read_metrics(metrics_file)
@@ -709,7 +710,10 @@ def parse_arguments():
         help="Time limit (in hours) for each trial run. Default is no limit.",
     )
     tune_parser.add_argument(
-        "--resume", action="store_true", help="Resume previous run."
+        "--resume",
+        action="store_true",
+        help="Resume previous run. Note that you must also set a unique experiment\
+                name identifier via `--experiment NAME` to be able to resume.",
     )
 
     # Setup
@@ -797,8 +801,8 @@ def parse_arguments():
     )
     tune_parser.add_argument(
         "--resources_per_trial",
-        type=int,
-        metavar="<int>",
+        type=float,
+        metavar="<float>",
         default=1,
         help="Number of CPUs to request for each tuning job.",
     )
@@ -874,7 +878,20 @@ def parse_arguments():
             )
             sys.exit(7)
 
-    arguments.experiment += f"-{arguments.mode}-{DATE}"
+        # Check for experiment name and resume flag.
+        if arguments.resume and arguments.experiment == "test":
+            print(
+                '[ERROR TUN-0031] The flag "--resume"'
+                ' requires that "--experiment NAME" is also given.'
+            )
+            sys.exit(1)
+
+    # If the experiment name is the default, add a UUID to the end.
+    if arguments.experiment == "test":
+        id = str(uuid())[:8]
+        arguments.experiment = f"{arguments.mode}-{id}"
+    else:
+        arguments.experiment += f"-{arguments.mode}"
 
     if arguments.timeout is not None:
         arguments.timeout = round(arguments.timeout * 3600)
@@ -1075,7 +1092,7 @@ def sweep():
             local_dir=LOCAL_DIR,
             resume=args.resume,
             stop={"training_iteration": args.iterations},
-            resources_per_trial={"cpu": args.resources_per_trial},
+            resources_per_trial={"cpu": os.cpu_count() / args.jobs},
             log_to_file=["trail-out.log", "trail-err.log"],
             trial_name_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
             trial_dirname_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
diff --git a/tools/AutoTuner/test/resume_check.py b/tools/AutoTuner/test/resume_check.py
@@ -0,0 +1,89 @@
+import unittest
+import subprocess
+import os
+import time
+
+from contextlib import contextmanager
+
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+src_dir = os.path.join(cur_dir, "../src/autotuner")
+orfs_dir = os.path.join(cur_dir, "../../../flow")
+os.chdir(src_dir)  # the test commands run `python3 distributed.py` relative to this directory
+
+
+@contextmanager
+def managed_process(*args, **kwargs):
+    """Yield a Popen started with the given args; kill and reap it on exit."""
+    proc = subprocess.Popen(*args, **kwargs)
+    try:
+        yield proc
+    finally:
+        if proc.poll() is None:  # still running when the context exits
+            proc.kill()  # forcefully stop it
+        # Reap the child so it is really gone (no zombie) before continuing.
+        proc.wait()
+
+
+class ResumeCheck(unittest.TestCase):
+    """Smoke test: interrupt a `tune` run, then finish it with `--resume`."""
+
+    # only test 1 platform/design.
+    platform = "asap7"
+    design = "gcd"
+    samples = 5
+    iterations = 2
+    # Upper bound (seconds) on waiting for the Ray cluster to shut down.
+    ray_stop_timeout = 600
+
+    def setUp(self):
+        """Build the two command lines: a fresh run and its `--resume` run."""
+        self.config = os.path.join(
+            orfs_dir, "designs", self.platform, self.design, "autotuner.json"
+        )
+        self.jobs = self.samples
+        self.num_cpus = os.cpu_count()
+
+        # Give each trial NUM_CORES / samples CPUs, rounded to 1 decimal place
+        # (fractional CPUs are allowed), so only `samples` trials run at once.
+        res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
+        options = ["", "--resume"]
+        self.commands = [
+            f"python3 distributed.py"
+            f" --design {self.design}"
+            f" --platform {self.platform}"
+            f" --config {self.config}"
+            f" --jobs {self.jobs}"
+            f" --experiment test_resume"
+            f" tune --iterations {self.iterations} --samples {self.samples}"
+            f" --resources_per_trial {res_per_trial}"
+            f" {c}"
+            for c in options
+        ]
+
+    def test_tune_resume(self):
+        """Interrupt a tune run after 120 s, stop Ray, then resume it to completion."""
+        print("Running the first config")
+        with managed_process(self.commands[0], shell=True):
+            time.sleep(120)
+
+        # Keep trying to stop the ray cluster until it is stopped, but give up
+        # after `ray_stop_timeout` seconds instead of hanging CI forever
+        # (e.g. when the `ray` CLI is missing and every call fails).
+        deadline = time.monotonic() + self.ray_stop_timeout
+        while True:
+            no_nodes = subprocess.run("ray status", shell=True).returncode != 0
+            stopped = subprocess.run("ray stop", shell=True).returncode == 0
+            if no_nodes and stopped:
+                break
+            if time.monotonic() >= deadline:
+                self.fail("Timed out waiting for the Ray cluster to stop.")
+            time.sleep(10)
+
+        # Run the second config to completion
+        print("Running the second config")
+        result = subprocess.run(self.commands[1], shell=True)
+        self.assertEqual(result.returncode, 0)
+
+
+if __name__ == "__main__":
+    unittest.main()  # allow running this file directly as a script