Commit c6cf7d1

initial commit for enabling distributed
Signed-off-by: Jack Luar <[email protected]>
1 parent 044183f commit c6cf7d1

File tree

16 files changed: +1819 -28 lines changed

tools/AutoTuner/.dockerignore

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
*
!src
!distributed
!requirements.txt
!requirements-dev.txt
!setup.sh
!pyproject.toml

tools/AutoTuner/.gitignore

Lines changed: 10 additions & 0 deletions
@@ -10,3 +10,13 @@ __pycache__/
# Autotuner env
autotuner_env
.env

# Ray distributed
public.yaml
private.yaml

# Docker build
docker-build.log

# GCP
service_account.json
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
DOCKERHUB_USERNAME={{DOCKERHUB_USERNAME}}
DOCKERHUB_PASSWORD={{DOCKERHUB_PASSWORD}}
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE:-openroad/orfs:latest}

# Customize this based on the user's needs
ENV GOOGLE_APPLICATION_CREDENTIALS=/OpenROAD-flow-scripts/service_account.json

# Install AT required packages
WORKDIR /OpenROAD-flow-scripts/tools/AutoTuner
RUN pip3 install --no-cache-dir --upgrade pip
RUN pip3 install --no-cache-dir -r requirements.txt
RUN pip3 install --no-cache-dir -r requirements-dev.txt

# Install rsync (https://github.com/ray-project/ray/issues/40566)
# ssh: to communicate with other nodes within image.
# sudo: for grpc calls in ray.
RUN apt-get update && \
    apt-get install -y rsync openssh-client sudo && \
    rm -rf /var/lib/apt/lists/*

# Replace pre-existing AT files with local copy
RUN rm -rf /OpenROAD-flow-scripts/tools/AutoTuner
COPY . /OpenROAD-flow-scripts/tools/AutoTuner

# Install AT package in editable mode
RUN pip3 install --no-cache-dir -e .
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
include .env
export

BASE_TAG=$(shell cd ../../../ && ./etc/DockerTag.sh -dev)
ORFS_IMAGE := openroad/orfs:v3.0-2422-g3dc9e665 # update manually (do not use latest)
ORFS_AUTOTUNER_IMAGE := orfs-autotuner

.PHONY: init
init:
	@echo "Setting up environment..."
	@../installer.sh

.PHONY: clean
clean:
	@echo "Cleaning up old images"
	@docker rmi ${ORFS_AUTOTUNER_IMAGE}:latest || true

.PHONY: docker
docker: clean
	@echo "Building docker image..."
	@docker build -t ${ORFS_AUTOTUNER_IMAGE}:latest -f Dockerfile --build-arg BASE_IMAGE=${ORFS_IMAGE} .. | tee docker-build.log
	@docker tag ${ORFS_AUTOTUNER_IMAGE}:latest ${ORFS_AUTOTUNER_IMAGE}:$(BASE_TAG)

.PHONY: upload
upload: docker
	@echo "Uploading docker image..."
	@docker login -u $(DOCKERHUB_USERNAME) -p $(DOCKERHUB_PASSWORD)
	@echo "Base image: $(BASE_TAG)"
	@docker tag ${ORFS_AUTOTUNER_IMAGE}:latest ${DOCKERHUB_USERNAME}/${ORFS_AUTOTUNER_IMAGE}:$(BASE_TAG)
	@docker tag ${ORFS_AUTOTUNER_IMAGE}:latest ${DOCKERHUB_USERNAME}/${ORFS_AUTOTUNER_IMAGE}:latest
	@docker push ${DOCKERHUB_USERNAME}/${ORFS_AUTOTUNER_IMAGE}:$(BASE_TAG)
	@docker push ${DOCKERHUB_USERNAME}/${ORFS_AUTOTUNER_IMAGE}:latest
	@docker logout

.PHONY: up
up:
	@echo "Starting Ray cluster..."
	@ray disable-usage-stats
	@ray up -y public.yaml

.PHONY: down
down:
	@echo "Stopping Ray cluster..."
	@ray down -y public.yaml

.PHONY: dashboard
dashboard:
	@echo "Starting Ray dashboard..."
	@ray dashboard public.yaml

.PHONY: monitor
monitor:
	@echo "Monitoring Ray cluster..."
	@ray monitor public.yaml
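A sketch of one typical invocation sequence, inferred from the target dependencies above (`upload` already triggers `docker`, which in turn triggers `clean`):

```bash
make init      # set up the environment via ../installer.sh
make upload    # clean, rebuild, tag, and push the image
make up        # launch the Ray cluster described by public.yaml
make dashboard # forward the Ray dashboard to localhost:8265
```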
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
1) Set up two AT instances on the same internal network
2) Set up the requirements

```
sudo apt-get install -y python3-pip python3-venv
python3 -m venv .venv
source .venv/bin/activate && pip install "ray[tune]"
```

3) Common setup script
- `at_distributed.sh`

4) Worker script
- `at_worker.py`
- `mkdir -p /tmp/owo && touch /tmp/owo/abc`

5) Benchmark file transfers (do on worker)
- Observation: `sync_dir` just makes sure the files are synced, so a neat feature is that only file diffs are transferred.
- You do not have to create the `dest_dir`; `sync_dir` does that for you.
- `max_size_bytes` is limited to 1 GiB by default, so the restriction has to be lifted manually if needed.
- The bottleneck seems to start at transfers of 1 GiB and above.
- `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=100` - creates a 100 MB file (time taken: 2.21 ± 0.56 s)
- `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=1000` - creates a 1 GB file (time taken: 8.90 ± 0.65 s)
- `dd if=/dev/zero of=/tmp/owo/owo bs=1M count=5000` - creates a 5 GB file (time taken: 54.92 ± 1.05 s)
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
# Ray Cluster Setup on Google Cloud Platform (GCP)

This tutorial covers the setup of Ray Clusters on GCP. Ray Clusters are a way to
run compute-intensive jobs (e.g. AutoTuner) on a distributed set of nodes that are
spawned automatically. For more information on Ray Clusters, see the
[Ray Cluster documentation](https://docs.ray.io/en/latest/cluster/getting-started.html).

To run AutoTuner jobs on a Ray Cluster, we first have to install ORFS onto the
GCP nodes.

How does this differ from the previous Kubernetes approach?
- Support for autoscaling
- Faster startup time using Docker (no need for JIT rebuilds of runtime dependencies)
- Simplified architecture and codebase

There are two ways to set up ORFS on a Ray Cluster:
- [Public](#public-cluster-setup): Upload the Docker image to Docker Hub (or any public Docker registry).
- [Private](#private-cluster-setup): Upload the Docker image to a private registry. Authentication then needs to be handled for Kubernetes.

```note
Currently it looks like the `autoscaler.yaml` file might only be used for `public.yaml`.
For private deployments, we might have to use KubeRay:
1. https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/ray-on-gke
2. https://www.paulsblog.dev/how-to-install-a-private-docker-container-registry-in-kubernetes/
```

## TODO

- Look up how to preserve the cache during pip install.
- Public flow, fixed: via the AutoTuner script
  - Tune
  - Sweep
- Public flow, fixed: via the Ray API
- Public flow, autoscaling
- Test the same flow using a private registry on Docker Hub
- Scaling concerns
  - Increase storage of the head node.
  - Object store memory: does that affect file transfer?

## Prerequisites

Make sure the AutoTuner prerequisites are installed. To do so, refer to the installation script.

```bash
pip install ray[default] google-api-python-client cryptography cloudpathlib
```

## Public cluster setup

0a. Authenticate the necessary GCP account with enough privileges to perform:
- `setIamPolicy`

```bash
gcloud auth application-default login
```

0b. Generate your service account keys for `ray-autoscaler-sa-v1@<project_id>.iam.gserviceaccount.com`
and rename the key file to `service_account.json`.
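One way to generate the key is with the gcloud CLI; a minimal sketch, where `<project_id>` is a placeholder for your own project:

```bash
# Writes the key directly to service_account.json; <project_id> is a placeholder.
gcloud iam service-accounts keys create service_account.json \
  --iam-account="ray-autoscaler-sa-v1@<project_id>.iam.gserviceaccount.com"
```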
1. Set up `.env` with your Docker registry username/password. Also, set up the `public.yaml`
file according to your desired specifications.

```bash
cp .env.sample .env
cp public.yaml.template public.yaml
```

2. Run the following commands to build, tag, and upload the public image:

```bash
make clean
make base
make docker
make upload
```

3. Launch your cluster as follows:

```bash
make up
```
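To sanity-check that the cluster is up, the Ray cluster launcher CLI can attach to the head node or run a one-off command there, for example:

```bash
# Open a shell on the head node defined in public.yaml.
ray attach public.yaml

# Or just query cluster resources from the head node.
ray exec public.yaml "ray status"
```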
4. Ray CLI API

```bash
# Commands on machine (assume files/commands are present on the cluster)
ray job submit --address http://localhost:8265 -- ls

# Case 1: 1 job
ray job submit --address http://localhost:8265 -- python3 -m autotuner.distributed --design gcd --platform asap7 --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1

# Case 2A: 2 jobs, with resource spec
HEAD_SERVER=10.138.0.13
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1

# Case 2B: 2 jobs, with resource spec (sweep)
HEAD_SERVER=10.138.0.13
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ./src/autotuner/distributed-sweep-example.json --cloud_dir gs://autotuner_test sweep
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 2 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ./src/autotuner/distributed-sweep-example.json --cloud_dir gs://autotuner_test sweep

# Case 3: Overprovisioned resource spec (should fail because the cluster cannot meet this demand)
HEAD_SERVER=10.138.0.13
ray job submit --address http://localhost:8265 --entrypoint-num-cpus 4 -- python3 -m autotuner.distributed --design gcd --platform asap7 --server $HEAD_SERVER --config ../../flow/designs/asap7/gcd/autotuner.json --cloud_dir gs://autotuner_test tune --samples 1

# Commands on machine (sync the local working dir; note it is stored under some /tmp dir)
ray job submit --address http://localhost:8265 \
  --working-dir scripts -- python3 hello_world.py
```

## Useful commands

```bash
HEAD_SERVER=10.138.0.13
ray job stop --address $HEAD_SERVER:6379 --no-wait {{ JOB_SUBMIT_ID }}
```
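The Ray Jobs CLI also provides status and log commands that are handy while a tune or sweep is running; for example, assuming the dashboard is reachable on `localhost:8265` and `raysubmit_XXXX` is a placeholder for the real submission ID:

```bash
# List submitted jobs and their current states.
ray job list --address http://localhost:8265

# Check a single job and tail its logs (substitute the real submission ID).
ray job status --address http://localhost:8265 raysubmit_XXXX
ray job logs --address http://localhost:8265 --follow raysubmit_XXXX
```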
## Private cluster setup

Coming soon.
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
#!/bin/bash -eu

# Ray script that starts the Ray HEAD/WORKER node based on the command line arg

IS_HEAD=${1:-false}
RAY_HEAD_IP_ADDRESS=${2:-10.129.0.4}


if [ "$IS_HEAD" = "true" ]; then
    echo "Starting Ray HEAD node"
    ray start --head --port=6379
else
    echo "Starting Ray WORKER node"
    ray start --address=$RAY_HEAD_IP_ADDRESS:6379
fi
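A usage sketch, assuming this is the `at_distributed.sh` script referenced in the notes above (the filename is not shown in this diff) and that `10.129.0.4` is the head node's internal IP:

```bash
# On the head node:
./at_distributed.sh true

# On each worker node, pointing at the head node's internal IP:
./at_distributed.sh false 10.129.0.4
```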
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
import ray
from ray.tune.utils.file_transfer import sync_dir_between_nodes
import time

ray.init()
# Raised from the 1 GiB default so the larger benchmark files can be transferred.
_DEFAULT_MAX_SIZE_BYTES = 5 * 1024 * 1024 * 1024  # 5 GiB

# Benchmark repeated directory transfers between two cluster nodes.
res = []
for i in range(5):
    start = time.time()
    sync_dir_between_nodes(
        source_ip="10.129.0.5",
        source_path="/tmp/owo",
        target_ip="10.129.0.4",
        target_path=f"/tmp/owo{i}",
        max_size_bytes=_DEFAULT_MAX_SIZE_BYTES,
    )
    elapsed = time.time() - start
    print(f"Time taken: {elapsed}")
    res.append(elapsed)

mean = sum(res) / len(res)
std = (sum((x - mean) ** 2 for x in res) / len(res)) ** 0.5
print(f"Time taken: {mean} ± {std}")
