Skip to content

Commit 0a48f8f

Browse files
authored
[Autotuner] CI Smoke Test - Resume (#2097)
* Resume smoke test Signed-off-by: Jack Luar <[email protected]>
1 parent 9ab0084 commit 0a48f8f

File tree

4 files changed

+119
-27
lines changed

4 files changed

+119
-27
lines changed

flow/test/test_autotuner.sh

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
#!/usr/bin/env bash
2+
DESIGN_NAME=${1:-gcd}
3+
PLATFORM=${2:-nangate45}
24

35
# run the commands in ORFS root dir
46
echo "[INFO FLW-0029] Installing dependencies in virtual environment."
@@ -20,28 +22,12 @@ python3 -m unittest tools.AutoTuner.test.smoke_test_sweep.${PLATFORM}SweepSmokeT
2022
echo "Running Autotuner smoke tests for --sample and --iteration."
2123
python3 -m unittest tools.AutoTuner.test.smoke_test_sample_iteration.${PLATFORM}SampleIterationSmokeTest.test_sample_iteration
2224

23-
if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
25+
if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
2426
echo "Running Autotuner ref file test (only once)"
2527
python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
26-
fi
2728

28-
echo "Running Autotuner smoke algorithm & evaluation test"
29-
python3 -m unittest tools.AutoTuner.test.smoke_test_algo_eval.${PLATFORM}AlgoEvalSmokeTest.test_algo_eval
30-
31-
# run this test last (because it modifies current path)
32-
echo "Running Autotuner remote test"
33-
if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN" == "gcd" ]; then
34-
# Get the directory of the current script
35-
script_dir="$(dirname "${BASH_SOURCE[0]}")"
36-
cd "$script_dir"/../../
37-
latest_image=$(./etc/DockerTag.sh -dev)
38-
echo "ORFS_VERSION=$latest_image" > ./tools/AutoTuner/.env
39-
cd ./tools/AutoTuner
40-
docker compose up --wait
41-
docker compose exec ray-worker bash -c "cd /OpenROAD-flow-scripts/tools/AutoTuner/src/autotuner && \
42-
python3 distributed.py --design gcd --platform asap7 --server 127.0.0.1 --port 10001 \
43-
--config ../../../../flow/designs/asap7/gcd/autotuner.json tune --samples 1"
44-
docker compose down -v --remove-orphans
29+
echo "Running AutoTuner resume test (only once)"
30+
python3 -m unittest tools.AutoTuner.test.resume_check.ResumeCheck.test_tune_resume
4531
fi
4632

4733
exit $ret

flow/test/test_helper.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ fi
108108
if [ "${RUN_AUTOTUNER}" == "true" ]; then
109109
set +x
110110
echo "Start AutoTuner test."
111-
./test/test_autotuner.sh
111+
./test/test_autotuner.sh $DESIGN_NAME $PLATFORM
112112
set -x
113113
fi
114114

tools/AutoTuner/src/autotuner/distributed.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def setup(self, config):
7777
"""
7878
# We create the following directory structure:
7979
# 1/ 2/ 3/ 4/ 5/ 6/
80-
# <repo>/<logs>/<platform>/<design>/<experiment>-DATE/<id>/<cwd>
80+
# <repo>/<logs>/<platform>/<design>/<experiment>/<id>/<cwd>
8181
repo_dir = os.getcwd() + "/../" * 6
8282
self.repo_dir = os.path.abspath(repo_dir)
8383
self.parameters = parse_config(config, path=os.getcwd())
@@ -88,7 +88,8 @@ def step(self):
8888
"""
8989
Run step experiment and compute its score.
9090
"""
91-
metrics_file = openroad(self.repo_dir, self.parameters, self.variant)
91+
self._variant = f"{self.variant}-{self.step_}"
92+
metrics_file = openroad(self.repo_dir, self.parameters, self._variant)
9293
self.step_ += 1
9394
(score, effective_clk_period, num_drc) = self.evaluate(
9495
self.read_metrics(metrics_file)
@@ -709,7 +710,10 @@ def parse_arguments():
709710
help="Time limit (in hours) for each trial run. Default is no limit.",
710711
)
711712
tune_parser.add_argument(
712-
"--resume", action="store_true", help="Resume previous run."
713+
"--resume",
714+
action="store_true",
715+
help="Resume previous run. Note that you must also set a unique experiment\
716+
name identifier via `--experiment NAME` to be able to resume.",
713717
)
714718

715719
# Setup
@@ -797,8 +801,8 @@ def parse_arguments():
797801
)
798802
tune_parser.add_argument(
799803
"--resources_per_trial",
800-
type=int,
801-
metavar="<int>",
804+
type=float,
805+
metavar="<float>",
802806
default=1,
803807
help="Number of CPUs to request for each tuning job.",
804808
)
@@ -874,7 +878,20 @@ def parse_arguments():
874878
)
875879
sys.exit(7)
876880

877-
arguments.experiment += f"-{arguments.mode}-{DATE}"
881+
# Check for experiment name and resume flag.
882+
if arguments.resume and arguments.experiment == "test":
883+
print(
884+
'[ERROR TUN-0031] The flag "--resume"'
885+
' requires that "--experiment NAME" is also given.'
886+
)
887+
sys.exit(1)
888+
889+
# If the experiment name is the default, replace it with "<mode>-<id>", where <id> is the first 8 characters of a fresh UUID.
890+
if arguments.experiment == "test":
891+
id = str(uuid())[:8]
892+
arguments.experiment = f"{arguments.mode}-{id}"
893+
else:
894+
arguments.experiment += f"-{arguments.mode}"
878895

879896
if arguments.timeout is not None:
880897
arguments.timeout = round(arguments.timeout * 3600)
@@ -1075,7 +1092,7 @@ def sweep():
10751092
local_dir=LOCAL_DIR,
10761093
resume=args.resume,
10771094
stop={"training_iteration": args.iterations},
1078-
resources_per_trial={"cpu": args.resources_per_trial},
1095+
resources_per_trial={"cpu": os.cpu_count() / args.jobs},
10791096
log_to_file=["trail-out.log", "trail-err.log"],
10801097
trial_name_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
10811098
trial_dirname_creator=lambda x: f"variant-{x.trainable_name}-{x.trial_id}-ray",
tools/AutoTuner/test/resume_check.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import unittest
2+
import subprocess
3+
import os
4+
import time
5+
6+
from contextlib import contextmanager
7+
8+
# Resolve every path from this module's own location so the test does not
# depend on the directory it is launched from.
_this_file = os.path.abspath(__file__)
cur_dir = os.path.dirname(_this_file)

# AutoTuner sources and the flow directory, both relative to this module.
src_dir = os.path.join(cur_dir, "../src/autotuner")
orfs_dir = os.path.join(cur_dir, "../../../flow")

# The commands built below call `distributed.py` by its bare file name, so the
# working directory has to be the AutoTuner source directory.
os.chdir(src_dir)
12+
13+
14+
@contextmanager
def managed_process(*args, **kwargs):
    """
    Run a subprocess for the duration of a ``with`` block.

    All positional and keyword arguments are forwarded unchanged to
    ``subprocess.Popen``, and the resulting ``Popen`` object is yielded.

    When the block is left (normally or through an exception) a process that
    is still running is killed. The process is then always waited on, so it
    is reaped instead of lingering as a zombie and ``returncode`` is set by
    the time the context manager returns.
    """
    proc = subprocess.Popen(*args, **kwargs)
    try:
        yield proc
    finally:
        if proc.poll() is None:  # Still running: terminate it forcefully.
            proc.kill()
        # kill() only sends the signal; wait() collects the exit status.
        # For a process that already exited this returns immediately.
        proc.wait()
25+
26+
27+
class ResumeCheck(unittest.TestCase):
28+
# only test 1 platform/design.
29+
platform = "asap7"
30+
design = "gcd"
31+
samples = 5
32+
iterations = 2
33+
34+
def setUp(self):
35+
self.config = os.path.join(
36+
orfs_dir, "designs", self.platform, self.design, "autotuner.json"
37+
)
38+
self.jobs = self.samples
39+
self.num_cpus = os.cpu_count()
40+
41+
# How it works: Say we have 5 samples and 5 iterations.
42+
# If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
43+
# We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)
44+
45+
# Cast to 1 decimal place
46+
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
47+
options = ["", "--resume"]
48+
self.commands = [
49+
f"python3 distributed.py"
50+
f" --design {self.design}"
51+
f" --platform {self.platform}"
52+
f" --config {self.config}"
53+
f" --jobs {self.jobs}"
54+
f" --experiment test_resume"
55+
f" tune --iterations {self.iterations} --samples {self.samples}"
56+
f" --resources_per_trial {res_per_trial}"
57+
f" {c}"
58+
for c in options
59+
]
60+
61+
def test_tune_resume(self):
62+
# Goal is to first run the first config (without resume) and then run the second config (with resume)
63+
# and check if the run is able to complete.
64+
65+
# Run the first config asynchronously.
66+
print("Running the first config")
67+
with managed_process(self.commands[0], shell=True) as proc:
68+
time.sleep(120)
69+
70+
# Keep trying to stop the ray cluster until it is stopped
71+
while 1:
72+
proc = subprocess.run("ray status", shell=True)
73+
no_nodes = proc.returncode != 0
74+
proc = subprocess.run("ray stop", shell=True)
75+
successful = proc.returncode == 0
76+
77+
if no_nodes and successful:
78+
break
79+
time.sleep(10)
80+
81+
# Run the second config to completion
82+
print("Running the second config")
83+
proc = subprocess.run(self.commands[1], shell=True)
84+
successful = proc.returncode == 0
85+
self.assertTrue(successful)
86+
87+
88+
# Allow running this file directly as a script as well as through
# `python3 -m unittest`.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)