
Commit face046

Merge remote-tracking branch 'upstream/main'
2 parents 01bf472 + 1887640

File tree: 10 files changed, +7 −419 lines

examples/stable-diffusion-dreambooth/Makefile

Lines changed: 1 addition & 50 deletions
@@ -5,32 +5,12 @@ JOB_NAME=git-clone-job
 NAMESPACE=distributed
 WORKER_LABEL_KEY=node-role.kubernetes.io/worker
 
-.PHONY: install-openshift-ai add-gpu-operator add-gpu-machineset setup-kueue-premption setup-ray-distributed-training deploy-odf teardown-ray-distributed-training
-
-install-openshift-ai: add-gpu-operator deploy-oai
+.PHONY: add-gpu-machineset setup-kueue-premption setup-ray-distributed-training deploy-odf teardown-ray-distributed-training
 
 add-gpu-machineset:
 	@mkdir -p $(WORK_DIR)
 	@$(BASE)/scripts/add-gpu.sh $(WORK_DIR)
 
-add-gpu-operator:
-	oc apply -f $(BASE)/yaml/operators/nfd.yaml
-
-	@until oc get crd nodefeaturediscoveries.nfd.openshift.io >/dev/null 2>&1; do \
-		echo "Wait until CRD nodefeaturediscoveries.nfd.openshift.io is ready..."; \
-		sleep 10; \
-	done
-
-	oc apply -f $(BASE)/yaml/operators/nfd-cr.yaml
-	oc apply -f $(BASE)/yaml/operators/nvidia.yaml
-
-	@until oc get crd clusterpolicies.nvidia.com>/dev/null 2>&1; do \
-		echo "Wait until CRD clusterpolicies.nvidia.com is ready..."; \
-		sleep 10; \
-	done
-
-	oc apply -f $(BASE)/yaml/operators/nvidia-cluster-policy.yaml
-
 deploy-odf:
 	@node_count=$$(oc get nodes -l $(WORKER_LABEL_KEY) -o json | jq '[.items[] | select(.spec.taints | not)] | length'); \
 	if [ $$node_count -lt 3 ]; then \
@@ -58,35 +38,6 @@ deploy-odf:
 	oc patch console.operator cluster -n openshift-storage --type json -p '[{"op": "add", "path": "/spec/plugins", "value": ["odf-console"]}]'
 
 	@echo "ODF is ready"
-
-deploy-oai:
-	oc apply -f $(BASE)/yaml/operators/serverless.yaml
-	oc apply -f $(BASE)/yaml/operators/servicemesh.yaml
-
-	@$(BASE)/scripts/install-operator.sh openshift-serverless "Red Hat OpenShift Serverless"
-	@$(BASE)/scripts/install-operator.sh default "Red Hat OpenShift Service Mesh"
-
-	oc apply -f $(BASE)/yaml/operators/oai.yaml
-	@$(BASE)/scripts/install-operator.sh redhat-ods-operator "Red Hat OpenShift AI"
-
-	@until oc get DSCInitialization default-dsci -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' | grep -q "True"; do \
-		echo "Waiting for OpenShift AI DSCInitialization to be ready..."; \
-		sleep 10; \
-	done
-
-	@until oc get crd datascienceclusters.datasciencecluster.opendatahub.io>/dev/null 2>&1; do \
-		echo "Wait until CRD datascienceclusters.datasciencecluster.opendatahub.io is ready..."; \
-		sleep 10; \
-	done
-
-	oc apply -f $(BASE)/yaml/operators/dsc.yaml
-
-	@until oc get datasciencecluster default-dsc -n redhat-ods-applications -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' | grep -q "True"; do \
-		echo "Waiting for OpenShift AI Data Science Cluster to be ready..."; \
-		sleep 60; \
-	done
-
-	@echo "OpenShift AI Data Science Cluster is ready"
 
 teardown-ray-distributed-training:
 	-oc delete -f $(BASE)/yaml/distributed/git-clone.yaml
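
With the `add-gpu-operator` and `deploy-oai` targets removed, the GPU and OpenShift AI operators must already be on the cluster before the remaining targets are run. A minimal pre-flight check might look like the sketch below; it is not part of the repo, and only reuses the CRD names from the removed recipes:

```bash
# Sketch: verify the operators the deleted targets used to install are present,
# using the CRD names taken from the removed Makefile recipes above.
for crd in nodefeaturediscoveries.nfd.openshift.io \
           clusterpolicies.nvidia.com \
           datascienceclusters.datasciencecluster.opendatahub.io; do
  oc get crd "$crd" >/dev/null 2>&1 || echo "Missing prerequisite CRD: $crd"
done
```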

examples/stable-diffusion-dreambooth/README.md

Lines changed: 6 additions & 7 deletions
@@ -24,6 +24,10 @@ In this example, a workbench will be automatically created with a few notebooks
 
 ## Prerequisite
 
+* An OpenShift cluster with OpenShift AI (RHOAI) 2.16+ installed:
+  * The `codeflare`, `dashboard`, `ray`, `kueue` and `workbenches` components enabled
+  * OpenShift AI dependencies installed - Red Hat OpenShift Serverless and Red Hat OpenShift Service Mesh
+
 * Ensure there is at least 1 worker node that has a GPU. On AWS, this can be a p3.8xlarge instance, otherwise you can run the makefile target to add a `machineset` for a single replica of p3.8xlarge.
 
 > [!NOTE]
@@ -41,6 +45,8 @@ In this example, a workbench will be automatically created with a few notebooks
 oc adm taint nodes <gpu-node> nvidia.com/gpu=Exists:NoSchedule
 ```
 
+Install and setup operators needed for GPU to be available in the cluster (Node Feature Discovery Operator, NVIDIA GPU Operator)
+
 * Ensure there is a RWX available storage class, such as `
 ocs-storagecluster-cephfs`. The `stable-diffusion-shared-storage` pvc uses cephfs otherwise update the PVC resource.
 
@@ -64,13 +70,6 @@ ocs-storagecluster-cephfs`. The `stable-diffusion-shared-storage` pvc uses cephf
 effect: NoSchedule
 ```
 
-## Setup
-Install OpenShift AI using the OpenShift AI Operator. This install the latest version from the fast channel.
-
-```bash
-make install-openshift-ai
-```
-
 ## Setting Up the Demo
 
 Run the makefile target. This creates a Data Science Project called `distributed` with the following
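
The new prerequisite expects the `codeflare`, `dashboard`, `ray`, `kueue` and `workbenches` components to be enabled on the RHOAI DataScienceCluster. One way to confirm that is sketched below; the resource name `default-dsc` is assumed from the removed `deploy-oai` recipe, not stated in the new README:

```bash
# Sketch: print the managementState of each component the README now requires.
# Expect "Managed" for all five; "default-dsc" is the name the old Makefile used.
for component in codeflare dashboard ray kueue workbenches; do
  state=$(oc get datasciencecluster default-dsc \
    -o jsonpath="{.spec.components.$component.managementState}")
  echo "$component: ${state:-not set}"
done
```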

examples/stable-diffusion-dreambooth/yaml/operators/dsc.yaml

Lines changed: 0 additions & 30 deletions
This file was deleted.

examples/stable-diffusion-dreambooth/yaml/operators/nfd-cr.yaml

Lines changed: 0 additions & 127 deletions
This file was deleted.

examples/stable-diffusion-dreambooth/yaml/operators/nfd.yaml

Lines changed: 0 additions & 29 deletions
This file was deleted.

examples/stable-diffusion-dreambooth/yaml/operators/nvidia-cluster-policy.yaml

Lines changed: 0 additions & 82 deletions
This file was deleted.
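
With the operator manifests this example used to carry (`nfd.yaml`, `nfd-cr.yaml`, `nvidia-cluster-policy.yaml`, `dsc.yaml`, ...) deleted, the Node Feature Discovery and NVIDIA GPU Operators now have to be installed separately, for example from OperatorHub. For reference, an OLM Subscription for the NFD Operator might look like the sketch below; the channel and catalog names are assumptions, so check OperatorHub on your cluster, and the NVIDIA GPU Operator (package `gpu-operator-certified` from the certified catalog, namespace `nvidia-gpu-operator`) follows the same pattern:

```bash
# Sketch only: install the Node Feature Discovery Operator via OLM.
# Channel and catalog names are assumptions -- verify them in OperatorHub first.
oc create namespace openshift-nfd --dry-run=client -o yaml | oc apply -f -
oc apply -f - <<'EOF'
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
  name: openshift-nfd
  namespace: openshift-nfd
spec:
  targetNamespaces:
    - openshift-nfd
---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  name: nfd
  namespace: openshift-nfd
spec:
  channel: stable
  name: nfd
  source: redhat-operators
  sourceNamespace: openshift-marketplace
EOF
```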
