diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 75ebc91..81ef041 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 repos:
   - repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.4
+    rev: v1.5.5
     hooks:
       - id: remove-tabs
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v6.0.0
     hooks:
       - id: trailing-whitespace
       - id: check-merge-conflict
@@ -18,7 +18,7 @@ repos:
       - id: detect-private-key
 
   - repo: https://github.com/adrienverge/yamllint.git
-    rev: v1.32.0
+    rev: v1.37.1
     hooks:
       - id: yamllint
         files: \.(yaml|yml)$
diff --git a/gpu-class/cleanup.sh b/gpu-class/cleanup.sh
new file mode 100755
index 0000000..1e2783d
--- /dev/null
+++ b/gpu-class/cleanup.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# Delete every notebook and PVC in the class projects, i.e. projects whose
+# name starts with "bu-cs599-pmpp-cuda-".
+
+pattern="^bu-cs599-pmpp-cuda-"
+
+# Project names are DNS labels (no whitespace or glob characters), so
+# word-splitting the command substitution is safe here.
+for proj in $(oc get projects -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep "$pattern"); do
+    echo "deleting notebook + pvc in $proj"
+    # Best effort: a failure in one project must not stop the others.
+    oc -n "$proj" delete notebook --as system:admin --all --ignore-not-found --wait=true || true
+    oc -n "$proj" delete pvc --as system:admin --all --ignore-not-found --wait=true || true
+done
diff --git a/gpu-class/cluster_queue_role.yaml b/gpu-class/cluster_queue_role.yaml
new file mode 100644
index 0000000..9990cce
--- /dev/null
+++ b/gpu-class/cluster_queue_role.yaml
@@ -0,0 +1,8 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kueue-clusterqueue-reader
+rules:
+  - apiGroups: ["kueue.x-k8s.io"]
+    resources: ["clusterqueues"]
+    verbs: ["get", "list", "watch"]
diff --git a/gpu-class/gpu-class-setup.sh b/gpu-class/gpu-class-setup.sh
new file mode 100755
index 0000000..098f94f
--- /dev/null
+++ b/gpu-class/gpu-class-setup.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+#
+# Provision the GPU class environment.
+#
+# The ClusterRole in cluster_queue_role.yaml is applied once; then, for every
+# namespace whose name starts with "${CLASS_NAME}-", the script creates:
+#   1. a workbench (Notebook + PVC) from notebook_resource.yaml,
+#   2. the RBAC objects from rb.yaml,
+#   3. the Kueue LocalQueues from localqueue.yaml.
+#
+# Requires oc (with permission to impersonate system:admin) and jq, and must
+# be run from the directory that holds the template files.
+
+# Let a failing `oc get` show up in the exit status of `oc ... | jq`.
+set -o pipefail
+
+CLASS_NAME="bu-cs599-pmpp-cuda"
+
+create_resource_command=(oc create -f -)
+openshift_url=https://rhods-dashboard-redhat-ods-applications.apps.edu.nerc.mghpcc.org/projects
+# split openshift url to provide as parameters
+host="${openshift_url%/projects*}" # get everything before projects
+hub_host=$host
+run_name="gpu_class_test"
+image_name="csw-dev-f25"
+
+#######################################
+# Create the workbench (Notebook + PVC) for the student of a class namespace.
+# The student is the subject of the namespace's "edit" RoleBinding that is
+# left once the hard-coded staff accounts are filtered out.
+# Globals:   run_name, image_name, openshift_url, hub_host,
+#            create_resource_command (all read)
+# Arguments: $1 - namespace to create the workbench in
+# Outputs:   the notebook name on stdout; oc output and errors on stderr
+# Returns:   1 if the RoleBinding cannot be read or does not leave exactly
+#            one student, 0 otherwise
+#######################################
+create_wb() {
+    local namespace=$1
+    local subjects username user notebook_name
+    local -a params
+
+    # One line per remaining subject: raw name, TAB, encoded name.
+    # Staff are matched on the encoded name so that USERNAME and USER always
+    # come from the same subject, whichever form the RoleBinding stores.
+    subjects=$(oc -n "$namespace" get rolebinding edit -o json \
+    | jq -r '
+      (.subjects // [])
+      | map(.name
+          | { raw: .,
+              enc: (if test("@.*\\..*$")
+                    then sub("@"; "-40") | gsub("\\.";"-2")
+                    else .
+                    end) })
+      | map(select(.enc != "jappavoo-40bu-2edu"))
+      | map(select(.enc != "sdanni-40redhat-2com"))
+      | map(select(.enc != "istaplet"))
+      | .[]
+      | [.raw, .enc]
+      | @tsv
+    ') || return 1
+
+    if [[ -z "$subjects" ]]; then
+        echo "create_wb: no student in rolebinding 'edit' of $namespace" >&2
+        return 1
+    fi
+    if [[ "$subjects" == *$'\n'* ]]; then
+        echo "create_wb: several students in rolebinding 'edit' of $namespace" >&2
+        return 1
+    fi
+    username=${subjects%%$'\t'*}
+    user=${subjects#*$'\t'}
+
+    # give notebook within namespace a name
+    notebook_name=cs599-${user}-wb
+
+    params=(
+        -p NOTEBOOK_NAME="$notebook_name"
+        -p RUN_NAME="$run_name"
+        -p USERNAME="$username"
+        -p NAMESPACE="$namespace"
+        -p USER="$user"
+        -p IMAGE_NAME="$image_name"
+        -p OPENSHIFT_URL="$openshift_url"
+        -p HUB_HOST="$hub_host"
+    )
+
+    # oc output goes to stderr so that stdout carries only the notebook name.
+    oc process -f notebook_resource.yaml --local "${params[@]}" | "${create_resource_command[@]}" --as system:admin 1>&2
+
+    echo "$notebook_name"
+}
+
+#######################################
+# Create the Kueue LocalQueues from localqueue.yaml in a namespace.
+# Globals:   create_resource_command (read)
+# Arguments: $1 - target namespace
+#######################################
+apply_localqueue() {
+    local namespace=$1
+    local -a local_params
+
+    local_params=(
+        -p NAMESPACE="$namespace"
+    )
+
+    oc process -f localqueue.yaml --local "${local_params[@]}" | "${create_resource_command[@]}" --as system:admin 1>&2
+}
+
+#######################################
+# Create the RBAC objects from rb.yaml for a workbench.
+# Globals:   create_resource_command (read)
+# Arguments: $1 - target namespace
+#            $2 - notebook name, passed as the service account to bind
+#######################################
+apply_rolebinding() {
+    local namespace=$1
+    local notebook_name=$2
+    local -a rb_params
+
+    rb_params=(
+        -p NAMESPACE="$namespace"
+        -p SERVICE_ACCOUNT_NB="$notebook_name"
+    )
+
+    oc process -f rb.yaml --local "${rb_params[@]}" | "${create_resource_command[@]}" --as system:admin
+}
+
+# Apply the cluster-wide ClusterRole from cluster_queue_role.yaml.
+apply_clusterq() {
+    oc apply -f cluster_queue_role.yaml --as system:admin
+}
+
+apply_clusterq
+
+oc get ns | grep "^${CLASS_NAME}-" | awk '{print $1}' | while IFS= read -r ns; do
+    # The Notebook and PVC in notebook_resource.yaml set no namespace, so they
+    # are created in the current project: never go on in the wrong one.
+    oc project "$ns" || continue
+
+    #create a workbench and save the name of the notebook to apply rolebindings
+    if ! nb_name="$(create_wb "$ns")"; then
+        echo "skipping $ns: no workbench created" >&2
+        continue
+    fi
+    apply_rolebinding "$ns" "$nb_name"
+    apply_localqueue "$ns"
+done
diff --git a/gpu-class/localqueue.yaml b/gpu-class/localqueue.yaml
new file mode 100644
index 0000000..8beabb5
--- /dev/null
+++ b/gpu-class/localqueue.yaml
@@ -0,0 +1,29 @@
+apiVersion: template.openshift.io/v1
+kind: Template
+metadata:
+  name: localqueue
+parameters:
+  - name: NAMESPACE
+    required: true
+objects:
+  - apiVersion: kueue.x-k8s.io/v1beta1
+    kind: LocalQueue
+    metadata:
+      name: v100-localqueue
+      namespace: ${NAMESPACE}
+    spec:
+      clusterQueue: v100-clusterqueue
+  - apiVersion: kueue.x-k8s.io/v1beta1
+    kind: LocalQueue
+    metadata:
+      name: a100-localqueue
+      namespace: ${NAMESPACE}
+    spec:
+      clusterQueue: a100-clusterqueue
+  - apiVersion: kueue.x-k8s.io/v1beta1
+    kind: LocalQueue
+    metadata:
+      name: h100-localqueue
+      namespace: ${NAMESPACE}
+    spec:
+      clusterQueue: h100-clusterqueue
diff --git a/gpu-class/notebook_resource.yaml b/gpu-class/notebook_resource.yaml
new file mode 100644
index 0000000..74557d2
--- /dev/null
+++ b/gpu-class/notebook_resource.yaml
@@ -0,0 +1,213 @@
+apiVersion: template.openshift.io/v1
+kind: Template
+parameters:
+- name: NOTEBOOK_NAME
+  required: true
+- name: RUN_NAME
+  required: true
+- name: USERNAME
+  required: true
+- name: IMAGE_NAME
+  required: true
+- name: NAMESPACE
+  required: true
+- name: OPENSHIFT_URL
+  required: true
+- name: USER
+  required: true
+- name: IMAGE_REPO
+  required: true
+  value: "image-registry.openshift-image-registry.svc:5000/redhat-ods-applications"
+- name: HUB_HOST
+  required: true
+- name: PVC_SIZE
+  required: true
+  value: "20Gi"
+- name: TOKEN
+  required: false
+objects:
+- apiVersion: kubeflow.org/v1beta1
+  kind: Notebook
+  metadata:
+    annotations:
+      notebooks.opendatahub.io/inject-oauth: 'true'
+      notebooks.opendatahub.io/last-image-selection: ${IMAGE_NAME}
+      notebooks.opendatahub.io/last-size-selection: Small
+      notebooks.opendatahub.io/oauth-logout-url: >-
+        ${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
+      opendatahub.io/username: ${USER}
+      openshift.io/description: ''
+      openshift.io/display-name: ${NOTEBOOK_NAME}
+      opendatahub.io/image-display-name: ${IMAGE_NAME}
+    name: ${NOTEBOOK_NAME}
+    labels:
+      ope-run: ${RUN_NAME}
+      app: ${NOTEBOOK_NAME}
+      opendatahub.io/dashboard: 'true'
+      opendatahub.io/odh-managed: 'true'
+      opendatahub.io/user: ${USER}
+  spec:
+    # NOTE(review): the Kubeflow Notebook spec normally carries pod settings
+    # under spec.template.spec; confirm that this top-level affinity is
+    # honoured and not dropped as an unknown field.
+    affinity:
+      nodeAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - preference:
+              matchExpressions:
+                - key: nvidia.com/gpu.present
+                  # set the value to 'true' to use nodes with GPUs
+                  operator: In
+                  values:
+                    - 'false'
+            weight: 1
+    template:
+      spec:
+        containers:
+          - resources:
+              limits:
+                cpu: '2'
+                memory: 8Gi
+              requests:
+                cpu: '1'
+                memory: 8Gi
+            readinessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
+                port: notebook-port
+                scheme: HTTP
+              initialDelaySeconds: 10
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            name: ${NOTEBOOK_NAME}
+            livenessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
+                port: notebook-port
+                scheme: HTTP
+              initialDelaySeconds: 10
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            env:
+              - name: NOTEBOOK_ARGS
+                value: |-
+                  --ServerApp.port=8888
+                  --ServerApp.token=${TOKEN}
+                  --ServerApp.password=''
+                  --ServerApp.base_url=/notebook/${NAMESPACE}/${NOTEBOOK_NAME}
+                  --ServerApp.quit_button=False
+                  --ServerApp.tornado_settings={"user":"${USER}","hub_host":"${HUB_HOST}","hub_prefix":"projects/${NAMESPACE}"}
+              - name: JUPYTER_IMAGE
+                value: >-
+                  ${IMAGE_REPO}/${IMAGE_NAME}
+            ports:
+              - containerPort: 8888
+                name: notebook-port
+                protocol: TCP
+            imagePullPolicy: Always
+            volumeMounts:
+              - mountPath: /opt/app-root/src
+                name: ${NOTEBOOK_NAME}
+              - mountPath: /dev/shm
+                name: shm
+            image: >-
+              ${IMAGE_REPO}/${IMAGE_NAME}
+            workingDir: /opt/app-root/src
+          - resources:
+              limits:
+                cpu: 100m
+                memory: 64Mi
+              requests:
+                cpu: 100m
+                memory: 64Mi
+            readinessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /oauth/healthz
+                port: oauth-proxy
+                scheme: HTTPS
+              initialDelaySeconds: 5
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            name: oauth-proxy
+            livenessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /oauth/healthz
+                port: oauth-proxy
+                scheme: HTTPS
+              initialDelaySeconds: 30
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            env:
+              - name: NAMESPACE
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.namespace
+            ports:
+              - containerPort: 8443
+                name: oauth-proxy
+                protocol: TCP
+            imagePullPolicy: Always
+            volumeMounts:
+              - mountPath: /etc/oauth/config
+                name: oauth-config
+              - mountPath: /etc/tls/private
+                name: tls-certificates
+            image: >-
+              registry.redhat.io/openshift4/ose-oauth-proxy@sha256:4bef31eb993feb6f1096b51b4876c65a6fb1f4401fee97fa4f4542b6b7c9bc46
+            args:
+              - '--provider=openshift'
+              - '--https-address=:8443'
+              - '--http-address='
+              - '--openshift-service-account=${NOTEBOOK_NAME}'
+              - '--cookie-secret-file=/etc/oauth/config/cookie_secret'
+              - '--cookie-expire=24h0m0s'
+              - '--tls-cert=/etc/tls/private/tls.crt'
+              - '--tls-key=/etc/tls/private/tls.key'
+              - '--upstream=http://localhost:8888'
+              - '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
+              - '--email-domain=*'
+              - '--skip-provider-button'
+              - >-
+                --openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"${NOTEBOOK_NAME}","namespace":"$(NAMESPACE)"}
+              - >-
+                --logout-url=${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
+        enableServiceLinks: false
+        serviceAccountName: ${NOTEBOOK_NAME}
+        volumes:
+          - name: ${NOTEBOOK_NAME}
+            persistentVolumeClaim:
+              claimName: ${NOTEBOOK_NAME}
+          - emptyDir:
+              medium: Memory
+            name: shm
+          - name: oauth-config
+            secret:
+              defaultMode: 420
+              secretName: ${NOTEBOOK_NAME}-oauth-config
+          - name: tls-certificates
+            secret:
+              defaultMode: 420
+              secretName: ${NOTEBOOK_NAME}-tls
+- apiVersion: v1
+  kind: PersistentVolumeClaim
+  metadata:
+    name: ${NOTEBOOK_NAME}
+    labels:
+      app: ${NOTEBOOK_NAME}
+      notebook-name: ${NOTEBOOK_NAME}
+      ope-run: ${RUN_NAME}
+      opendatahub.io/dashboard: 'true'
+  spec:
+    accessModes:
+    - ReadWriteOnce
+    resources:
+      requests:
+        storage: "${PVC_SIZE}"
diff --git a/gpu-class/rb.yaml b/gpu-class/rb.yaml
new file mode 100644
index 0000000..cabb081
--- /dev/null
+++ b/gpu-class/rb.yaml
@@ -0,0 +1,148 @@
+apiVersion: template.openshift.io/v1
+kind: Template
+metadata:
+  name: rbac-default-edit-and-jobs
+parameters:
+  - name: NAMESPACE
+    required: true
+  - name: SERVICE_ACCOUNT_NB
+    required: true
+objects:
+  # OC AUTH WB (BINDINGS FOR WORKBENCHES)
+  # ROLEBINDING FOR EXISTING EDIT ROLE
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: RoleBinding
+    metadata:
+      name: ${SERVICE_ACCOUNT_NB}-edit
+      namespace: ${NAMESPACE}
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: ClusterRole
+      name: edit
+    subjects:
+      - kind: ServiceAccount
+        name: ${SERVICE_ACCOUNT_NB}
+        namespace: ${NAMESPACE}
+
+  # CREATE ROLE EDIT-JOBS
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: Role
+    metadata:
+      name: edit-jobs
+      namespace: ${NAMESPACE}
+    rules:
+      - apiGroups: ["batch"]
+        resources: ["jobs"]
+        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+
+  # BIND EDIT-JOBS TO THE NOTEBOOK SA
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: RoleBinding
+    metadata:
+      name: ${SERVICE_ACCOUNT_NB}-edit-jobs
+      namespace: ${NAMESPACE}
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: Role
+      name: edit-jobs
+    subjects:
+      - kind: ServiceAccount
+        name: ${SERVICE_ACCOUNT_NB}
+        namespace: ${NAMESPACE}
+
+  # CREATE ROLE FOR LOCAL QUEUE
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: Role
+    metadata:
+      name: kueue-localqueue-reader
+      namespace: ${NAMESPACE}
+    rules:
+      - apiGroups: ["kueue.x-k8s.io"]
+        resources: ["localqueues"]
+        verbs: ["get", "list", "watch"]
+
+  # BIND LOCAL QUEUE READER TO THE NOTEBOOK SA
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: RoleBinding
+    metadata:
+      name: ${SERVICE_ACCOUNT_NB}-kueue-localqueue-reader
+      namespace: ${NAMESPACE}
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: Role
+      name: kueue-localqueue-reader
+    subjects:
+      - kind: ServiceAccount
+        name: ${SERVICE_ACCOUNT_NB}
+        namespace: ${NAMESPACE}
+
+  # CLUSTER QUEUE ROLEBINDING
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: ClusterRoleBinding
+    metadata:
+      name: ${SERVICE_ACCOUNT_NB}-kueue-clusterqueue-reader
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: ClusterRole
+      name: kueue-clusterqueue-reader
+    subjects:
+      - kind: ServiceAccount
+        name: ${SERVICE_ACCOUNT_NB}
+        namespace: ${NAMESPACE}
+
+  # OC AUTH EXEC (BINDINGS FOR DEFAULT SERVICE ACCOUNT)
+  # BIND TO EXISTING EDIT ROLE
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: RoleBinding
+    metadata:
+      name: default-edit
+      namespace: ${NAMESPACE}
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: ClusterRole
+      name: edit
+    subjects:
+      - kind: ServiceAccount
+        name: default
+        namespace: ${NAMESPACE}
+
+  # BIND TO INTERACTING WITH JOBS ROLE
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: RoleBinding
+    metadata:
+      name: default-edit-jobs
+      namespace: ${NAMESPACE}
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: Role
+      name: edit-jobs
+    subjects:
+      - kind: ServiceAccount
+        name: default
+        namespace: ${NAMESPACE}
+
+  # PODS/EXEC ROLE
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: Role
+    metadata:
+      name: default-edit-pods-exec
+      namespace: ${NAMESPACE}
+    rules:
+      - apiGroups: [""]
+        resources: ["pods/exec"]
+        verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+
+  # BIND PODS/EXEC ROLE
+  - apiVersion: rbac.authorization.k8s.io/v1
+    kind: RoleBinding
+    metadata:
+      name: default-edit-pods-exec
+      namespace: ${NAMESPACE}
+    roleRef:
+      apiGroup: rbac.authorization.k8s.io
+      kind: Role
+      name: default-edit-pods-exec
+    subjects:
+      - kind: ServiceAccount
+        name: default
+        namespace: ${NAMESPACE}