Skip to content

Commit 2ec9ebb

Browse files
committed
add initial scaffold for distributed config
Signed-off-by: Jack Luar <[email protected]>
1 parent 48dfc74 commit 2ec9ebb

File tree

6 files changed

+172
-3
lines changed

6 files changed

+172
-3
lines changed

tools/AutoTuner/distributed/README.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,31 +76,41 @@ make dashboard
7676
4. Ray CLI API
7777

7878
```bash
79+
# Run this once the dashboard is up
80+
HEAD_SERVER=$(ray job submit --address http://localhost:8265 -- python3 distributed/scripts/show_main_ip.py | grep "Main IP address" | awk '{print $4}')
81+
echo "HEAD_SERVER is at $HEAD_SERVER"
82+
export RAY_ADDRESS=$HEAD_SERVER:6379
83+
7984
# Commands on machine (assume files/commands are present on cluster)
8085
ray job submit --address http://localhost:8265 ls
8186

8287
# Case 1: 1 job
8388
ray job submit --address http://localhost:8265 -- python3 -m autotuner.distributed --design gcd --platform asap7 --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1
8489

8590
# Case 2A: 2 jobs, with resource spec.
86-
HEAD_SERVER=10.138.0.13
8791
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1
8892
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1
8993

9094
# Case 2B: 2 jobs, with resource spec (sweep)
91-
HEAD_SERVER=10.138.0.13
9295
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ./src/autotuner/distributed-sweep-example.json --cloud_dir gs://autotuner_test sweep
9396
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ./src/autotuner/distributed-sweep-example.json --cloud_dir gs://autotuner_test sweep
9497

9598
# Case 3: Overprovisioned resource spec (should fail because the cluster cannot meet this demand.)
96-
HEAD_SERVER=10.138.0.13
9799
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 4 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1
98100

99101
# Commands on machine (sync local working dir, note the dir is stored as some /tmp dir)
100102
ray job submit --address http://localhost:8265 \
101103
--working-dir scripts -- python3 hello_world.py
102104
```
103105

106+
5. Ray Wrapper
107+
108+
Alternatively, we also provide a convenience script (`ray_wrapper.py`) as an abstraction over the Ray Job Submission API.
109+
110+
```bash
111+
python ray_wrapper.py --ray-dashboard-address localhost --ray-cluster-head-address $HEAD_SERVER
112+
```
113+
104114
## Useful commands
105115

106116
```bash
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#############################################################
2+
# This file is used to configure the distributed autotuner.
3+
#############################################################
4+
# Ray specific variables
5+
entrypoint_num_cpus: 2
6+
7+
# Autotuner specific - common variables
8+
cloud_dir: gs://autotuner_test
9+
10+
# Autotuner specific - tune variables
11+
# note for "config" - please specify a path relative to ./tools/AutoTuner
12+
design: gcd
13+
platform: asap7
14+
config: ../../flow/designs/asap7/gcd/autotuner.json
15+
mode_arg: tune
16+
samples: 1
17+
18+
# Autotuner specific - sweep variables
19+
# config: ./src/autotuner/distributed-sweep-example.json
20+
# mode_arg: sweep
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import argparse
2+
import os
3+
import datetime
4+
from ray.job_submission import JobSubmissionClient
5+
6+
from autotuner.utils.config_helper import parse_config
7+
8+
# Constants
# Directory containing this script; used to locate files that sit next to it.
_THIS_FILE = os.path.abspath(__file__)
FILE_DIR = os.path.dirname(_THIS_FILE)
# Timestamp captured once at import time.
EXPERIMENT_DATETIME = f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}"

# Arguments shared by both autotuner modes; the mode-specific tail is
# appended below. Placeholders are filled in with str.format().
_BASE_COMMAND = (
    "python3 -m autotuner.distributed "
    "--design {design} "
    "--platform {platform} "
    "--config {config} "
    "--server {server} "
    "--cloud_dir {cloud_dir} "
)

# Entrypoint for a "tune" run (takes an extra {samples} placeholder).
TUNE_TEMPLATE = _BASE_COMMAND + "tune --samples {samples}"

# Entrypoint for a "sweep" run.
SWEEP_TEMPLATE = _BASE_COMMAND + "sweep"
28+
29+
# CLI Arguments
# Both options are optional. The help strings report the actual defaults:
# the values are fixed here and are not auto-detected from the machine.
arguments = argparse.ArgumentParser()
arguments.add_argument(
    "--ray-dashboard-address",
    type=str,
    help="Hostname or IP address of the Ray dashboard (default: %(default)s).",
    default="localhost",
)
arguments.add_argument(
    "--ray-cluster-head-address",
    type=str,
    help="IP address of the Ray cluster head (default: %(default)s).",
    default="10.138.0.21",
)
43+
44+
# Parse configs
# TODO: Validations + Schemas
# parse_config hands back whatever yaml.safe_load produced, which is None for
# an empty document; fall back to an empty mapping so the `.get` defaults
# below still apply.
config_args = parse_config(os.path.join(FILE_DIR, "config.yaml")) or {}


if __name__ == "__main__":
    args = arguments.parse_args()
    ray_dashboard_address = args.ray_dashboard_address
    ray_cluster_head_address = args.ray_cluster_head_address

    # Get ray parameters
    entrypoint_num_cpus = config_args.get("entrypoint_num_cpus", 1)

    # Get experiment parameters shared by both modes
    mode_arg = config_args.get("mode_arg", "tune")
    common_params = {
        "design": config_args.get("design", "gcd"),
        "platform": config_args.get("platform", "asap7"),
        "config": config_args.get(
            "config", "../../flow/designs/asap7/gcd/autotuner.json"
        ),
        "server": ray_cluster_head_address,
        "cloud_dir": config_args.get("cloud_dir", "gs://autotuner_test"),
    }

    # Build the entrypoint. Unknown modes are rejected explicitly so that a
    # typo in config.yaml cannot silently launch a sweep.
    if mode_arg == "tune":
        entrypoint = TUNE_TEMPLATE.format(
            samples=config_args.get("samples", 1),
            **common_params,
        )
    elif mode_arg == "sweep":
        entrypoint = SWEEP_TEMPLATE.format(**common_params)
    else:
        raise ValueError(
            f"Unsupported mode_arg {mode_arg!r}; expected 'tune' or 'sweep'."
        )

    # Initialize JobSubmissionClient (only after the config has been validated)
    client = JobSubmissionClient(address=f"http://{ray_dashboard_address}:8265")

    # Submit a job
    job_id = client.submit_job(
        entrypoint=entrypoint,
        entrypoint_num_cpus=entrypoint_num_cpus,
    )
    print(f"Job submitted with ID: {job_id}")
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import subprocess
2+
import re
3+
4+
5+
def get_main_ip(ip_output=None):
    """
    Get the main IP address of the Ray head node.

    Scans ``ip a`` output for IPv4 ``inet`` entries and returns the first one
    that is not loopback (127.0.0.0/8), link-local (169.254.0.0/16) or in the
    172.17.x.x range.

    :param ip_output: Optional pre-captured ``ip a`` output to parse. When
        omitted (None), ``ip a`` is executed on the current machine.
    :return: The first acceptable IPv4 address as a string, or the literal
        string "No valid IP found" when no entry qualifies.
    """
    # Function-scope import so the block does not depend on the file header.
    import ipaddress

    if ip_output is None:
        result = subprocess.run(["ip", "a"], capture_output=True, text=True)
        ip_output = result.stdout

    # The regex collects every "inet <addr>/<prefix>" entry; the filtering
    # of unwanted addresses happens in the loop below.
    for candidate in re.findall(r"inet (\d+\.\d+\.\d+\.\d+)/\d+", ip_output):
        try:
            address = ipaddress.IPv4Address(candidate)
        except ValueError:
            # Four dotted numbers that are not a valid IPv4 address
            # (e.g. an octet above 255): skip.
            continue
        # TODO: Add more checks to filter out unwanted IPs
        # Only 169.254.0.0/16 is link-local; other 169.x.x.x addresses are
        # ordinary routable addresses and must not be discarded.
        if address.is_loopback or address.is_link_local:
            continue
        if candidate.startswith("172.17."):
            continue
        return candidate
    return "No valid IP found"
23+
24+
25+
def main():
    """Print the address returned by get_main_ip() to stdout."""
    main_ip = get_main_ip()
    print(f"Main IP address: {main_ip}")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

tools/AutoTuner/src/autotuner/utils/__init__.py

Whitespace-only changes.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import yaml
2+
3+
4+
def parse_config(file_path: str):
    """
    Parses a YAML configuration file and returns the content as a dictionary.

    :param file_path: Path to the YAML configuration file.
    :return: Dictionary containing the parsed configuration. An empty file
        yields an empty dictionary.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, encoding="utf-8") as file:
        config = yaml.safe_load(file)
    # safe_load gives None for an empty document; normalise that to an empty
    # dictionary so callers can use `.get` as the documented return implies.
    return {} if config is None else config
14+
15+
16+
# Manual check: parse "config.yaml" from the current working directory and
# print the parsed result.
if __name__ == "__main__":
    print(parse_config("config.yaml"))

0 commit comments

Comments
 (0)