Skip to content

Commit f07ee65

Browse files
committed
set up for new gpu class, creating notebooks, localqueue config to point to clusterqueues, and observability for jobs through rolebinding
1 parent b59331c commit f07ee65

File tree

7 files changed

+492
-3
lines changed

7 files changed

+492
-3
lines changed

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
repos:
22
- repo: https://github.com/Lucas-C/pre-commit-hooks
3-
rev: v1.5.4
3+
rev: v1.5.5
44
hooks:
55
- id: remove-tabs
66

77
- repo: https://github.com/pre-commit/pre-commit-hooks
8-
rev: v4.5.0
8+
rev: v6.0.0
99
hooks:
1010
- id: trailing-whitespace
1111
- id: check-merge-conflict
@@ -18,7 +18,7 @@ repos:
1818
- id: detect-private-key
1919

2020
- repo: https://github.com/adrienverge/yamllint.git
21-
rev: v1.32.0
21+
rev: v1.37.1
2222
hooks:
2323
- id: yamllint
2424
files: \.(yaml|yml)$

gpu-class/cleanup.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
#
# Delete every Notebook and PVC in each project whose name matches a pattern.
#
# Usage: cleanup.sh [project-name-regex]
#   project-name-regex  grep pattern selecting the projects to clean
#                       (default: ^bu-cs599-pmpp-cuda-). An empty argument
#                       falls back to the default, so it can never match
#                       every project by accident.

pattern="${1:-^bu-cs599-pmpp-cuda-}"

while IFS= read -r proj; do
  [[ -n "$proj" ]] || continue
  echo "deleting notebook + pvc in ${proj}"
  # Best effort on purpose: a failure in one project must not stop the
  # cleanup of the remaining projects.
  oc -n "$proj" delete notebook --as system:admin --all --ignore-not-found --wait=true || true
  oc -n "$proj" delete pvc --as system:admin --all --ignore-not-found --wait=true || true
done < <(
  oc get projects -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
    | grep -- "$pattern"
)

gpu-class/clusterqueue_rb.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# ClusterRole granting read-only access (get/list/watch) to Kueue ClusterQueue objects.
# NOTE(review): only the role is defined here; the binding that grants it to a
# subject is presumably created elsewhere — confirm.
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRole
3+
metadata:
4+
name: kueue-clusterqueue-reader
5+
rules:
6+
# ClusterQueues live in the kueue.x-k8s.io API group.
- apiGroups: ["kueue.x-k8s.io"]
7+
resources: ["clusterqueues"]
8+
# Read-only verbs: no create/update/delete.
verbs: ["get", "list", "watch"]

gpu-class/gpu-class-setup.sh

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#!/bin/bash

# Name prefix of the class namespaces.
CLASS_NAME="bu-cs599-pmpp-cuda"

# Command (kept as an array) that creates resources from a manifest on stdin.
create_resource_command=(oc create -f -)

# Dashboard URL ending in "/projects"; the part before "/projects" is the host.
openshift_url=https://rhods-dashboard-redhat-ods-applications.apps.edu.nerc.mghpcc.org/projects
hub_host="${openshift_url%/projects*}"
host=$hub_host

# Values handed to the notebook template as RUN_NAME and IMAGE_NAME.
run_name="gpu_class_test"
image_name="csw-dev-f25"
12+
13+
#######################################
# Create a workbench from notebook_resource.yaml for the user of a namespace.
#
# The user is taken from the subjects of the namespace's "edit" rolebinding,
# after dropping three fixed accounts. E-mail style names are rewritten
# ("@" -> "-40", "." -> "-2") before being used in the notebook name.
# NOTE(review): the "." -> "-2" rewrite looks like a truncated "-2e" escape;
# it is kept as-is because the exclusion list below relies on it — confirm.
#
# Globals:   create_resource_command, run_name, image_name, openshift_url,
#            hub_host (all read)
# Arguments: $1 - namespace to inspect; also passed as the NAMESPACE parameter
# Outputs:   notebook name on stdout (captured by the caller); the output of
#            the create command is sent to stderr so it cannot pollute it
# Returns:   1 if the namespace is empty or does not resolve to exactly one
#            user, otherwise 0
#######################################
create_wb() {
  local namespace=$1
  local username notebook_name
  local -a params

  if [[ -z "$namespace" ]]; then
    echo "create_wb: namespace argument is required" >&2
    return 1
  fi

  # Query the namespace given as argument, not a variable of the caller.
  username=$(oc -n "$namespace" get rolebinding edit -o json \
    | jq -r '
        (.subjects // [])
        | map(.name
            | if test("@.*\\..*$")
              then sub("@"; "-40") | gsub("\\.";"-2")
              else .
              end)
        | map(select(. != "jappavoo-40bu-2edu"))
        | map(select(. != "sdanni-40redhat-2com"))
        | map(select(. != "istaplet"))
        | .[]
      ') || return 1

  # The name ends up inside a resource name, so it must be one single value.
  if [[ -z "$username" ]]; then
    echo "create_wb: no user found in rolebinding 'edit' of ${namespace}" >&2
    return 1
  fi
  if [[ "$username" == *$'\n'* ]]; then
    echo "create_wb: more than one user found in rolebinding 'edit' of ${namespace}" >&2
    return 1
  fi

  notebook_name="cs599-${username}-wb"

  params=(
    -p NOTEBOOK_NAME="$notebook_name"
    -p RUN_NAME="$run_name"
    -p USERNAME="$username"
    -p NAMESPACE="$namespace"
    -p IMAGE_NAME="$image_name"
    -p OPENSHIFT_URL="$openshift_url"
    -p HUB_HOST="$hub_host"
  )

  oc process -f notebook_resource.yaml --local "${params[@]}" \
    | "${create_resource_command[@]}" --as system:admin 1>&2

  # Printed even when the create step failed, exactly as before, so that the
  # caller still receives the name.
  printf '%s\n' "$notebook_name"
}
50+
51+
#######################################
# Render localqueue.yaml for one namespace and create the result.
# Globals:   create_resource_command (read)
# Arguments: $1 - namespace, passed as the NAMESPACE template parameter
# Outputs:   output of the create command is sent to stderr
# Returns:   1 if the namespace is empty, otherwise the create command status
#######################################
apply_localqueue() {
  local namespace=$1
  local -a local_params

  if [[ -z "$namespace" ]]; then
    echo "apply_localqueue: namespace argument is required" >&2
    return 1
  fi

  local_params=(
    -p NAMESPACE="$namespace"
  )

  oc process -f localqueue.yaml --local "${local_params[@]}" \
    | "${create_resource_command[@]}" --as system:admin 1>&2
}
60+
61+
#######################################
# Render rb.yaml for one notebook and create the result.
# Globals:   create_resource_command (read)
# Arguments: $1 - namespace, passed as the NAMESPACE template parameter
#            $2 - notebook name, passed as SERVICE_ACCOUNT_NB
# Outputs:   output of the create command on stdout
# Returns:   1 if an argument is empty, otherwise the create command status
#######################################
apply_rolebinding() {
  local namespace=$1
  local notebook_name=$2
  local -a rb_params

  if [[ -z "$namespace" || -z "$notebook_name" ]]; then
    echo "apply_rolebinding: namespace and notebook name are required" >&2
    return 1
  fi

  rb_params=(
    -p NAMESPACE="$namespace"
    -p SERVICE_ACCOUNT_NB="$notebook_name"
  )

  oc process -f rb.yaml --local "${rb_params[@]}" \
    | "${create_resource_command[@]}" --as system:admin
}
73+
74+
#######################################
# Apply the manifest clusterqueue_rb.yaml while impersonating system:admin.
# Arguments: none
# Returns:   exit status of the oc invocation
#######################################
apply_clusterq() {
  local -a apply_args=(apply -f clusterqueue_rb.yaml --as system:admin)
  oc "${apply_args[@]}"
}
78+
79+
# Cluster-scoped setup, done once before any namespace is touched.
apply_clusterq

# Provision every namespace whose name starts with "<CLASS_NAME>-".
oc get ns \
  | awk -v prefix="${CLASS_NAME}-" 'index($1, prefix) == 1 { print $1 }' \
  | while IFS= read -r ns; do
      # The notebook template objects carry no namespace of their own, so the
      # project is presumably switched so that they are created in "$ns".
      # If the switch fails, skip: creating anyway would hit the previously
      # selected project.
      if ! oc project "$ns"; then
        echo "skipping ${ns}: could not switch project" >&2
        continue
      fi

      # Create a workbench and keep its name for the rolebinding.
      if ! nb_name="$(create_wb "$ns")" || [[ -z "$nb_name" ]]; then
        echo "skipping ${ns}: workbench was not created" >&2
        continue
      fi

      apply_rolebinding "$ns" "$nb_name"
      apply_localqueue "$ns"
    done

gpu-class/localqueue.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# OpenShift Template that creates three Kueue LocalQueues (v100, a100, h100)
# in the namespace given by NAMESPACE; each one points at the ClusterQueue
# with the matching GPU prefix.
apiVersion: template.openshift.io/v1
2+
kind: Template
3+
metadata:
4+
name: localqueue
5+
parameters:
6+
# Namespace in which all three LocalQueues are created (mandatory).
- name: NAMESPACE
7+
required: true
8+
objects:
9+
# LocalQueue -> v100-clusterqueue
- apiVersion: kueue.x-k8s.io/v1beta1
10+
kind: LocalQueue
11+
metadata:
12+
name: v100-localqueue
13+
namespace: ${NAMESPACE}
14+
spec:
15+
clusterQueue: v100-clusterqueue
16+
# LocalQueue -> a100-clusterqueue
- apiVersion: kueue.x-k8s.io/v1beta1
17+
kind: LocalQueue
18+
metadata:
19+
name: a100-localqueue
20+
namespace: ${NAMESPACE}
21+
spec:
22+
clusterQueue: a100-clusterqueue
23+
# LocalQueue -> h100-clusterqueue
- apiVersion: kueue.x-k8s.io/v1beta1
24+
kind: LocalQueue
25+
metadata:
26+
name: h100-localqueue
27+
namespace: ${NAMESPACE}
28+
spec:
29+
clusterQueue: h100-clusterqueue

gpu-class/notebook_resource.yaml

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
apiVersion: template.openshift.io/v1
2+
kind: Template
3+
parameters:
4+
- name: NOTEBOOK_NAME
5+
required: true
6+
- name: RUN_NAME
7+
required: true
8+
- name: USERNAME
9+
required: true
10+
- name: IMAGE_NAME
11+
required: true
12+
- name: NAMESPACE
13+
required: true
14+
- name: OPENSHIFT_URL
15+
required: true
16+
- name: IMAGE_REPO
17+
required: true
18+
value: "image-registry.openshift-image-registry.svc:5000/redhat-ods-applications"
19+
- name: HUB_HOST
20+
required: true
21+
- name: PVC_SIZE
22+
required: true
23+
value: "20Gi"
24+
- name: TOKEN
25+
required: false
26+
objects:
27+
- apiVersion: kubeflow.org/v1beta1
28+
kind: Notebook
29+
metadata:
30+
annotations:
31+
notebooks.opendatahub.io/inject-oauth: 'true'
32+
notebooks.opendatahub.io/last-image-selection: ${IMAGE_NAME}
33+
notebooks.opendatahub.io/last-size-selection: Small
34+
notebooks.opendatahub.io/oauth-logout-url: >-
35+
${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
36+
opendatahub.io/username: ${USERNAME}
37+
openshift.io/description: ''
38+
openshift.io/display-name: ${NOTEBOOK_NAME}
39+
opendatahub.io/image-display-name: ${IMAGE_NAME}
40+
name: ${NOTEBOOK_NAME}
41+
labels:
42+
ope-run: ${RUN_NAME}
43+
app: ${NOTEBOOK_NAME}
44+
opendatahub.io/dashboard: 'true'
45+
opendatahub.io/odh-managed: 'true'
46+
opendatahub.io/user: ${USERNAME}
47+
spec:
48+
affinity:
49+
nodeAffinity:
50+
preferredDuringSchedulingIgnoredDuringExecution:
51+
- preference:
52+
matchExpressions:
53+
- key: nvidia.com/gpu.present
54+
# set the value to 'true' to use nodes with GPUs
55+
operator: In
56+
values:
57+
- 'false'
58+
weight: 1
59+
template:
60+
spec:
61+
containers:
62+
- resources:
63+
limits:
64+
cpu: '2'
65+
memory: 8Gi
66+
requests:
67+
cpu: '1'
68+
memory: 8Gi
69+
readinessProbe:
70+
failureThreshold: 3
71+
httpGet:
72+
path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
73+
port: notebook-port
74+
scheme: HTTP
75+
initialDelaySeconds: 10
76+
periodSeconds: 5
77+
successThreshold: 1
78+
timeoutSeconds: 1
79+
name: ${NOTEBOOK_NAME}
80+
livenessProbe:
81+
failureThreshold: 3
82+
httpGet:
83+
path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
84+
port: notebook-port
85+
scheme: HTTP
86+
initialDelaySeconds: 10
87+
periodSeconds: 5
88+
successThreshold: 1
89+
timeoutSeconds: 1
90+
env:
91+
- name: NOTEBOOK_ARGS
92+
value: |-
93+
--ServerApp.port=8888
94+
--ServerApp.token=${TOKEN}
95+
--ServerApp.password=''
96+
--ServerApp.base_url=/notebook/${NAMESPACE}/${NOTEBOOK_NAME}
97+
--ServerApp.quit_button=False
98+
--ServerApp.tornado_settings={"user":"${USERNAME}","hub_host":"${HUB_HOST}","hub_prefix":"projects/${NAMESPACE}"}
99+
- name: JUPYTER_IMAGE
100+
value: >-
101+
${IMAGE_REPO}/${IMAGE_NAME}
102+
ports:
103+
- containerPort: 8888
104+
name: notebook-port
105+
protocol: TCP
106+
imagePullPolicy: Always
107+
volumeMounts:
108+
- mountPath: /opt/app-root/src
109+
name: ${NOTEBOOK_NAME}
110+
- mountPath: /dev/shm
111+
name: shm
112+
image: >-
113+
${IMAGE_REPO}/${IMAGE_NAME}
114+
workingDir: /opt/app-root/src
115+
- resources:
116+
limits:
117+
cpu: 100m
118+
memory: 64Mi
119+
requests:
120+
cpu: 100m
121+
memory: 64Mi
122+
readinessProbe:
123+
failureThreshold: 3
124+
httpGet:
125+
path: /oauth/healthz
126+
port: oauth-proxy
127+
scheme: HTTPS
128+
initialDelaySeconds: 5
129+
periodSeconds: 5
130+
successThreshold: 1
131+
timeoutSeconds: 1
132+
name: oauth-proxy
133+
livenessProbe:
134+
failureThreshold: 3
135+
httpGet:
136+
path: /oauth/healthz
137+
port: oauth-proxy
138+
scheme: HTTPS
139+
initialDelaySeconds: 30
140+
periodSeconds: 5
141+
successThreshold: 1
142+
timeoutSeconds: 1
143+
env:
144+
- name: NAMESPACE
145+
valueFrom:
146+
fieldRef:
147+
fieldPath: metadata.namespace
148+
ports:
149+
- containerPort: 8443
150+
name: oauth-proxy
151+
protocol: TCP
152+
imagePullPolicy: Always
153+
volumeMounts:
154+
- mountPath: /etc/oauth/config
155+
name: oauth-config
156+
- mountPath: /etc/tls/private
157+
name: tls-certificates
158+
image: >-
159+
registry.redhat.io/openshift4/ose-oauth-proxy@sha256:4bef31eb993feb6f1096b51b4876c65a6fb1f4401fee97fa4f4542b6b7c9bc46
160+
args:
161+
- '--provider=openshift'
162+
- '--https-address=:8443'
163+
- '--http-address='
164+
- '--openshift-service-account=${NOTEBOOK_NAME}'
165+
- '--cookie-secret-file=/etc/oauth/config/cookie_secret'
166+
- '--cookie-expire=24h0m0s'
167+
- '--tls-cert=/etc/tls/private/tls.crt'
168+
- '--tls-key=/etc/tls/private/tls.key'
169+
- '--upstream=http://localhost:8888'
170+
- '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
171+
- '--email-domain=*'
172+
- '--skip-provider-button'
173+
- >-
174+
--openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"${NOTEBOOK_NAME}","namespace":"$(NAMESPACE)"}
175+
# Must resolve to the same URL as the oauth-logout-url annotation above:
# OPENSHIFT_URL is passed without a trailing slash, so the "/" before the
# namespace is required (otherwise the URL becomes ".../projects<namespace>").
- >-
  --logout-url=${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
177+
enableServiceLinks: false
178+
serviceAccountName: ${NOTEBOOK_NAME}
179+
volumes:
180+
- name: ${NOTEBOOK_NAME}
181+
persistentVolumeClaim:
182+
claimName: ${NOTEBOOK_NAME}
183+
- emptyDir:
184+
medium: Memory
185+
name: shm
186+
- name: oauth-config
187+
secret:
188+
defaultMode: 420
189+
secretName: ${NOTEBOOK_NAME}-oauth-config
190+
- name: tls-certificates
191+
secret:
192+
defaultMode: 420
193+
secretName: ${NOTEBOOK_NAME}-tls
194+
- apiVersion: v1
195+
kind: PersistentVolumeClaim
196+
metadata:
197+
name: ${NOTEBOOK_NAME}
198+
labels:
199+
app: ${NOTEBOOK_NAME}
200+
notebook-name: ${NOTEBOOK_NAME}
201+
ope-run: ${RUN_NAME}
202+
opendatahub.io/dashboard: 'true'
203+
spec:
204+
accessModes:
205+
- ReadWriteOnce
206+
resources:
207+
requests:
208+
storage: "${PVC_SIZE}"

0 commit comments

Comments
 (0)