Skip to content

Commit 45a2bde

Browse files
committed
set up for new gpu class, creating notebooks, localqueue config to point to clusterqueues, and observability for jobs through rolebinding
1 parent b59331c commit 45a2bde

File tree

5 files changed

+446
-3
lines changed

5 files changed

+446
-3
lines changed

.pre-commit-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
repos:
22
- repo: https://github.com/Lucas-C/pre-commit-hooks
3-
rev: v1.5.4
3+
rev: v1.5.5
44
hooks:
55
- id: remove-tabs
66

77
- repo: https://github.com/pre-commit/pre-commit-hooks
8-
rev: v4.5.0
8+
rev: v6.0.0
99
hooks:
1010
- id: trailing-whitespace
1111
- id: check-merge-conflict
@@ -18,7 +18,7 @@ repos:
1818
- id: detect-private-key
1919

2020
- repo: https://github.com/adrienverge/yamllint.git
21-
rev: v1.32.0
21+
rev: v1.37.1
2222
hooks:
2323
- id: yamllint
2424
files: \.(yaml|yml)$

gpu-class/gpu-class-setup.sh

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/bin/bash

# Prefix shared by every namespace that belongs to the class.
CLASS_NAME="csw991"

# Command (kept as an array) that creates resources from a manifest on stdin.
create_resource_command=(oc create -f -)

# Dashboard URL handed to the workbench template.
openshift_url="https://rhods-dashboard-redhat-ods-applications.apps.ocp-test.nerc.mghpcc.org/projects/ope-test?section=workbenches"

# Everything in the URL before "/projects" is passed on as the hub host.
hub_host="${openshift_url%/projects*}"
host="$hub_host"

# Fixed template parameters applied to every workbench.
run_name="gpu_class_test"
image_name="csw-dev-f25"
12+
13+
#######################################
# Create a workbench (notebook) from resource.yaml for one namespace.
# Globals:
#   run_name, image_name, openshift_url, hub_host (read)
#   create_resource_command (read) - command that consumes the manifest
# Arguments:
#   $1 - namespace; the username is taken from its second '-'-separated field
# Outputs:
#   Writes the generated notebook name to stdout. Output of the resource
#   creation command is redirected to stderr so callers can capture the name.
#######################################
create_wb() {
  local namespace=$1
  local random_id username notebook_name
  local -a params

  # short random suffix appended to the notebook name
  random_id=$(openssl rand -hex 3)

  # get student username from the namespace passed in as $1 (previously this
  # read the caller's loop variable instead of the function argument)
  username=$(printf '%s\n' "$namespace" | awk -F'-' '{print $2}')

  # give notebook within namespace a name: lower-cased username + random suffix
  notebook_name=${username,,}-${random_id}

  params=(
    -p NOTEBOOK_NAME="$notebook_name"
    -p RUN_NAME="$run_name"
    -p USERNAME="$username"
    -p NAMESPACE="$namespace"
    -p IMAGE_NAME="$image_name"
    -p OPENSHIFT_URL="$openshift_url"
    -p HUB_HOST="$hub_host"
  )

  # keep stdout clean: only the notebook name below may reach the caller
  oc process -f resource.yaml --local "${params[@]}" | "${create_resource_command[@]}" 1>&2

  echo "$notebook_name"
}
39+
40+
#######################################
# Process localqueue.yaml for a namespace and create the resulting objects.
# Globals:
#   create_resource_command (read) - command that consumes the manifest
# Arguments:
#   $1 - namespace passed to the template as NAMESPACE
# Outputs:
#   Output of the creation command is redirected to stderr.
#######################################
apply_localqueue() {
  local namespace=$1
  local -a local_params=(
    -p NAMESPACE="$namespace"
  )

  # the creation command is run with "--as system:admin" (impersonation)
  oc process -f localqueue.yaml "${local_params[@]}" | "${create_resource_command[@]}" --as system:admin 1>&2
}
49+
50+
#######################################
# Process rb.yaml for a namespace/notebook pair and create the resulting
# RBAC objects.
# Globals:
#   create_resource_command (read) - command that consumes the manifest
# Arguments:
#   $1 - namespace passed to the template as NAMESPACE
#   $2 - notebook name, passed to the template as SERVICE_ACCOUNT
#        (presumably the notebook's service account shares its name;
#        verify against resource.yaml)
#######################################
apply_rolebinding() {
  local namespace=$1
  local notebook_name=$2
  local -a rb_params=(
    -p NAMESPACE="$namespace"
    -p SERVICE_ACCOUNT="$notebook_name"
  )

  # the creation command is run with "--as system:admin" (impersonation)
  oc process -f rb.yaml "${rb_params[@]}" | "${create_resource_command[@]}" --as system:admin
}
63+
64+
65+
# Provision every namespace whose name starts with "<CLASS_NAME>-":
# create a workbench, bind roles to it, then add the local queues.
# The prefix is matched literally (index == 1) rather than as a regex, and
# names are read with IFS=/read -r so they are taken verbatim.
oc get ns \
  | awk -v prefix="${CLASS_NAME}-" 'index($1, prefix) == 1 { print $1 }' \
  | while IFS= read -r ns; do
      # do not create anything if the project switch failed, otherwise
      # resources could be created against the previously selected project
      if ! oc project "$ns"; then
        printf 'warning: could not switch to project %s; skipping\n' "$ns" >&2
        continue
      fi

      # create a workbench and save the name of the notebook to apply rolebindings
      nb_name="$(create_wb "$ns")"
      apply_rolebinding "$ns" "$nb_name"
      apply_localqueue "$ns"
    done

gpu-class/localqueue.yaml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Template that creates one Kueue LocalQueue per GPU type in a namespace.
# Each LocalQueue points at the ClusterQueue named for the same GPU type.
apiVersion: template.openshift.io/v1
kind: Template
metadata:
  name: localqueue
parameters:
  # namespace in which the LocalQueues are created
  - name: NAMESPACE
    required: true
objects:
  # V100 queue
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: v100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: v100-clusterqueue

  # A100 queue
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: a100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: a100-clusterqueue

  # H100 queue
  - apiVersion: kueue.x-k8s.io/v1beta1
    kind: LocalQueue
    metadata:
      name: h100-localqueue
      namespace: ${NAMESPACE}
    spec:
      clusterQueue: h100-clusterqueue

gpu-class/rb.yaml

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# Template that grants a service account in NAMESPACE:
#   - the existing "edit" ClusterRole (namespaced binding),
#   - full access to batch Jobs,
#   - access to pods/exec,
#   - read access to Kueue LocalQueues in the namespace,
#   - read access to Kueue ClusterQueues (cluster-scoped role + binding).
apiVersion: template.openshift.io/v1
kind: Template
metadata:
  name: rbac-default-edit-and-jobs
parameters:
  # namespace that holds the service account and the namespaced objects
  - name: NAMESPACE
    required: true
  # name of the service account that receives the permissions
  - name: SERVICE_ACCOUNT
    required: true
objects:
  # ROLEBINDING FOR EXISTING EDIT ROLE
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: default-edit
      namespace: ${NAMESPACE}
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: edit
    subjects:
      - kind: ServiceAccount
        name: ${SERVICE_ACCOUNT}
        namespace: ${NAMESPACE}

  # CREATE ROLE DEFAULT-EDIT-JOBS
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: Role
    metadata:
      name: default-edit-jobs
      namespace: ${NAMESPACE}
    rules:
      - apiGroups: ["batch"]
        resources: ["jobs"]
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]

  # BIND DEFAULT-EDIT-JOBS TO THE NOTEBOOK SA
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: default-edit-jobs
      namespace: ${NAMESPACE}
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: Role
      name: default-edit-jobs
    subjects:
      - kind: ServiceAccount
        name: ${SERVICE_ACCOUNT}
        namespace: ${NAMESPACE}

  # CREATE ROLE FOR POD-EXEC
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: Role
    metadata:
      name: default-edit-pods-exec
      namespace: ${NAMESPACE}
    rules:
      - apiGroups: [""]
        resources: ["pods/exec"]
        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]

  # BIND POD-EXEC TO THE NOTEBOOK SA
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: default-edit-pods-exec
      namespace: ${NAMESPACE}
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: Role
      name: default-edit-pods-exec
    subjects:
      - kind: ServiceAccount
        name: ${SERVICE_ACCOUNT}
        namespace: ${NAMESPACE}

  # CREATE ROLE FOR LOCAL QUEUE
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: Role
    metadata:
      name: default-kueue-localqueue-reader
      namespace: ${NAMESPACE}
    rules:
      - apiGroups: ["kueue.x-k8s.io"]
        resources: ["localqueues"]
        verbs: ["get", "list", "watch"]

  # BIND LOCAL QUEUE READER TO THE NOTEBOOK SA
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    metadata:
      name: kueue-localqueue-reader
      namespace: ${NAMESPACE}
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: Role
      name: default-kueue-localqueue-reader
    subjects:
      - kind: ServiceAccount
        name: ${SERVICE_ACCOUNT}
        namespace: ${NAMESPACE}

  # CREATE CLUSTERQUEUE ROLE
  # Cluster-scoped, so the name carries the namespace and service account
  # to keep it unique per notebook.
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRole
    metadata:
      name: default-kueue-clusterqueue-reader-${NAMESPACE}-${SERVICE_ACCOUNT}
    rules:
      - apiGroups: ["kueue.x-k8s.io"]
        resources: ["clusterqueues"]
        verbs: ["get", "list", "watch"]

  # BIND CLUSTERQUEUE READER TO THE NOTEBOOK SA
  # roleRef.name must match the ClusterRole created above exactly,
  # including the ${NAMESPACE}-${SERVICE_ACCOUNT} suffix.
  - apiVersion: rbac.authorization.k8s.io/v1
    kind: ClusterRoleBinding
    metadata:
      name: kueue-clusterqueue-reader-${NAMESPACE}-${SERVICE_ACCOUNT}
    roleRef:
      apiGroup: rbac.authorization.k8s.io
      kind: ClusterRole
      name: default-kueue-clusterqueue-reader-${NAMESPACE}-${SERVICE_ACCOUNT}
    subjects:
      - kind: ServiceAccount
        name: ${SERVICE_ACCOUNT}
        namespace: ${NAMESPACE}

0 commit comments

Comments
 (0)