From 3c456bd435ddd595a4a4c2908b6276e69c20c451 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 12 Sep 2023 16:42:37 +0000
Subject: [PATCH 1/5] bump slurm version

---
 image/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/image/Dockerfile b/image/Dockerfile
index 0d00a6a..e0038a1 100644
--- a/image/Dockerfile
+++ b/image/Dockerfile
@@ -6,7 +6,7 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker-
       org.label-schema.docker.cmd="docker-compose up -d" \
       maintainer="StackHPC"

-ARG SLURM_TAG=slurm-23.02
+ARG SLURM_TAG=slurm-23-02-5-1
 ARG GOSU_VERSION=1.11

 COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo

From 4278c2ea51d64844470029d8f1b38404d70bfa0f Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 13 Sep 2023 08:13:47 +0000
Subject: [PATCH 2/5] bump image, and change to slurm-k8s-cluster repo

---
 slurm-cluster-chart/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slurm-cluster-chart/values.yaml b/slurm-cluster-chart/values.yaml
index 086ccf1..c5928ca 100644
--- a/slurm-cluster-chart/values.yaml
+++ b/slurm-cluster-chart/values.yaml
@@ -1,4 +1,4 @@
-slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:1f51003
+slurmImage: ghcr.io/stackhpc/slurm-k8s-cluster:3c456bd

 login:
   # Deployment resource name

From 54026e8768c9132215469b46605e5661c21fc785 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 13 Sep 2023 09:32:17 +0000
Subject: [PATCH 3/5] use Dynamic nodes

---
 slurm-cluster-chart/files/slurm.conf      | 4 ++--
 slurm-cluster-chart/templates/slurmd.yaml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/slurm-cluster-chart/files/slurm.conf b/slurm-cluster-chart/files/slurm.conf
index a10c12b..32e2f99 100644
--- a/slurm-cluster-chart/files/slurm.conf
+++ b/slurm-cluster-chart/files/slurm.conf
@@ -47,12 +47,12 @@ AccountingStorageType=accounting_storage/slurmdbd
 AccountingStorageHost=slurmdbd
 AccountingStoragePort=6819
 #
-SlurmctldParameters=cloud_dns,cloud_reg_addrs
+SlurmctldParameters=cloud_reg_addrs
+TreeWidth=65533
 CommunicationParameters=NoAddrCache

 # NODES
 MaxNodeCount=10
-NodeName=slurmd-[0-9] State=FUTURE CPUs=4

 # PARTITIONS
 PartitionName=all Default=yes Nodes=ALL

diff --git a/slurm-cluster-chart/templates/slurmd.yaml b/slurm-cluster-chart/templates/slurmd.yaml
index bec55ce..09858db 100644
--- a/slurm-cluster-chart/templates/slurmd.yaml
+++ b/slurm-cluster-chart/templates/slurmd.yaml
@@ -23,7 +23,7 @@ spec:
       containers:
         - args:
             - slurmd
-            - -F
+            - -Z
            - -vvv
            - -N
            - "$(POD_NAME)"
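Taken together, the changes in this patch move the cluster onto Slurm's dynamic node mechanism: `MaxNodeCount=10` bounds how many nodes may register, `TreeWidth=65533` makes the `slurmd` daemons communicate directly with the controller (the value suggested in the Slurm dynamic-nodes documentation), and the static `NodeName`/`State=FUTURE` list is no longer needed because each pod registers itself at startup. The effective `slurmd` invocation inside a pod is roughly the sketch below, with `slurmd-0` standing in for the value of `$(POD_NAME)`:

```console
# -Z   : register with slurmctld as a dynamic node (no NodeName entry required)
# -N   : the node name to register under, here taken from the pod name
# -vvv : verbose logging
slurmd -Z -vvv -N slurmd-0
```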
From bd3dc6edd05f76592860d9006474052548ddb162 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 13 Sep 2023 09:32:32 +0000
Subject: [PATCH 4/5] update README

---
 README.md | 53 +++++++++++++++++++++++++----------------------------
 1 file changed, 25 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 46db25a..ec1541b 100644
--- a/README.md
+++ b/README.md
@@ -34,50 +34,38 @@ All config files in `slurm-cluster-chart/files` will be mounted into the contain

 ## Deploying the Cluster

-### Generating Cluster Secrets
+### Storage

-On initial deployment ONLY, run
-```console
-./generate-secrets.sh [<namespace>]
-```
-This generates a set of secrets in the target namespace to be used by the Slurm cluster. If these need to be regenerated, see "Reconfiguring the Cluster".
-
-Be sure to take note of the Open OnDemand credentials, as you will need them to access the cluster through a browser.
-
-### Connecting RWX Volume
-
-A ReadWriteMany (RWX) volume is required for shared storage across cluster nodes. By default, the Rook NFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide a RWX-capable StorageClass for the required shared volume. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. In either case, the storage capacity of the provisioned RWX volume can be configured by setting the value of `storage.capacity`.
+A ReadWriteMany (RWX) volume is required to provision a shared volume across the Slurm pods. By default, a RookNFS Helm chart is installed as a dependency of the Slurm cluster chart in order to provide this capability. If the target Kubernetes cluster has an existing storage class which should be used instead, then `storageClass` in `values.yaml` should be set to the name of this existing class and the RookNFS dependency should be disabled by setting `rooknfs.enabled = false`. In either case, the storage capacity of the provisioned RWX volume can be configured by setting the value of `storage.capacity`.

 See the separate RookNFS chart [values.yaml](./rooknfs/values.yaml) for further configuration options when using RookNFS to provide the shared storage volume.

-### Supplying Public Keys
+### Public Keys

 To access the cluster via `ssh`, you will need to make your public keys available. All your public keys from localhost can be added by running

 ```console
 ./publish-keys.sh [<namespace>]
 ```

-where `<namespace>` is the namespace in which the Slurm cluster chart will be deployed (i.e. using `helm install -n <namespace> ...`). This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace argument will install the secrets in the default namespace.
+where `<namespace>` is the namespace in which the Slurm cluster chart will be deployed. This will create a Kubernetes Secret in the appropriate namespace for the Slurm cluster to use. Omitting the namespace argument will install the secrets in the default namespace.

-### Deploying with Helm
+Alternatively, public keys can be defined in `slurm-cluster-chart/values.yaml:sshPublicKey`.

-After configuring `kubectl` with the appropriate `kubeconfig` file, deploy the cluster using the Helm chart:
-```console
-helm install <deployment-name> slurm-cluster-chart
-```
+### Installation with Helm

-NOTE: If using the RookNFS dependency, then the following must be run before installing the Slurm cluster chart
-```console
-helm dependency update slurm-cluster-chart
-```
+- Configure `kubectl` with the appropriate `kubeconfig` file.

-Subsequent releases can be deployed using:
+- If necessary, change any configuration in `slurm-cluster-chart/values.yaml`, e.g. `openOnDemand.password`.

-```console
-helm upgrade <deployment-name> slurm-cluster-chart
-```
+- If using the RookNFS dependency, then the following must be run before installing the Slurm cluster chart:
+  ```console
+  helm dependency update slurm-cluster-chart
+  ```

-Note: When updating the cluster with `helm upgrade`, a pre-upgrade hook will prevent upgrades if there are running jobs in the Slurm queue. Attempting to upgrade will set all Slurm nodes to `DRAINED` state. If an upgrade fails due to running jobs, you can undrain the nodes either by waiting for running jobs to complete and then retrying the upgrade or by manually undraining them by accessing the cluster as a privileged user. Alternatively you can bypass the hook by running `helm upgrade` with the `--no-hooks` flag (may result in running jobs being lost)
+- Deploy the cluster using the Helm chart:
+  ```console
+  helm install <deployment-name> slurm-cluster-chart
+  ```

 ## Accessing the Cluster
@@ -180,4 +168,13 @@ and then restart the other dependent deployments to propagate changes:
 kubectl rollout restart deployment slurmd slurmctld login slurmdbd
 ```

+## Upgrading the Cluster
+
+Subsequent Helm releases can be deployed using:
+```console
+helm upgrade <deployment-name> slurm-cluster-chart
+```
+
+A pre-upgrade hook will prevent upgrades if there are running jobs in the Slurm queue. Attempting an upgrade will first set all Slurm nodes to a `DRAINED` state. If an upgrade fails due to running jobs, you can either wait for the running jobs to complete and then retry the upgrade, or undrain the nodes manually by accessing the cluster as a privileged user. Alternatively, you can bypass the hook by running `helm upgrade` with the `--no-hooks` flag (this may result in running jobs being lost).
+
 # Known Issues
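As a concrete illustration of the storage options described in the README changes above: a deployment onto a cluster that already provides an RWX-capable StorageClass might override `values.yaml` as in the following sketch, where the class name `nfs-client`, the `100Gi` capacity, and the `overrides.yaml` filename are illustrative choices rather than chart defaults:

```yaml
# overrides.yaml - use an existing RWX-capable StorageClass
# instead of installing the RookNFS subchart
rooknfs:
  enabled: false           # disable the bundled RookNFS dependency
storageClass: nfs-client   # name of the pre-existing RWX StorageClass
storage:
  capacity: 100Gi          # size of the shared volume to provision
```

This would then be applied with `helm install <deployment-name> slurm-cluster-chart -f overrides.yaml`.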
From fb61f584b0439e1ec1038c1a952fbd8991a3efe5 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 13 Sep 2023 09:41:05 +0000
Subject: [PATCH 5/5] add Limitations to README

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ec1541b..29d1464 100644
--- a/README.md
+++ b/README.md
@@ -177,4 +177,7 @@ helm upgrade <deployment-name> slurm-cluster-chart

 A pre-upgrade hook will prevent upgrades if there are running jobs in the Slurm queue. Attempting an upgrade will first set all Slurm nodes to a `DRAINED` state. If an upgrade fails due to running jobs, you can either wait for the running jobs to complete and then retry the upgrade, or undrain the nodes manually by accessing the cluster as a privileged user. Alternatively, you can bypass the hook by running `helm upgrade` with the `--no-hooks` flag (this may result in running jobs being lost).

-# Known Issues
+# Known Issues and Limitations
+- Only a single user (`rocky`) is defined.
+- All nodes are in a single partition, `all`.
+- Scaling down the `slurmd` StatefulSet will not remove nodes from Slurm; they will eventually be marked `DOWN`, but will return to `IDLE` if the StatefulSet is scaled back up.
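The scaling limitation added above can be observed by scaling the `slurmd` StatefulSet directly; a hypothetical session follows, in which the replica counts are arbitrary and node state can be watched with `sinfo` from inside the cluster:

```console
# Scale the workers down: the removed nodes stay in Slurm's node
# list and are eventually marked DOWN rather than being deleted.
kubectl scale statefulset slurmd --replicas=1

# Scale back up: the returning pods re-register with slurmctld
# and the corresponding nodes go back to IDLE.
kubectl scale statefulset slurmd --replicas=2
```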