From c16547f322005a3f93360e77afb8211b5ee83703 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Tue, 27 Jun 2023 16:55:10 +0100 Subject: [PATCH 1/5] Update submodule to use branch with fixes --- .gitmodules | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitmodules b/.gitmodules index a8ef2e03..d56701db 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "vendor/stackhpc/ansible-slurm-appliance"] path = vendor/stackhpc/ansible-slurm-appliance url = https://github.com/stackhpc/ansible-slurm-appliance.git + branch = fix/caas-crd From e770e34a7f06d3c297359b5e3ecd0531f8866423 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Mon, 10 Jul 2023 16:11:36 +0100 Subject: [PATCH 2/5] Slurm appliance changes merged --- .gitmodules | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index d56701db..a8ef2e03 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,3 @@ [submodule "vendor/stackhpc/ansible-slurm-appliance"] path = vendor/stackhpc/ansible-slurm-appliance url = https://github.com/stackhpc/ansible-slurm-appliance.git - branch = fix/caas-crd From 179ed4bb856d74fcd4bf6497b617a941397aab27 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Mon, 10 Jul 2023 16:47:29 +0100 Subject: [PATCH 3/5] Make the resource consumption more obvious --- group_vars/openstack.yml | 3 -- group_vars/prometheus.yml | 3 +- ui-meta/slurm-infra-fast-volume-type.yml | 43 +++++++++++++++++------- ui-meta/slurm-infra.yml | 43 +++++++++++++++++------- 4 files changed, 64 insertions(+), 28 deletions(-) diff --git a/group_vars/openstack.yml b/group_vars/openstack.yml index e8a99007..02f8135f 100644 --- a/group_vars/openstack.yml +++ b/group_vars/openstack.yml @@ -17,9 +17,6 @@ terraform_project_path: "{{ playbook_dir }}/terraform" terraform_state: "{{ cluster_state | default('present') }}" cluster_ssh_user: rocky -# Set the size of the state volume to metrics_db_maximum_size + 10 -state_volume_size: "{{ metrics_db_maximum_size + 10 }}" - # Provision a single 
"standard" compute partition using the supplied # node count and flavor openhpc_slurm_partitions: diff --git a/group_vars/prometheus.yml b/group_vars/prometheus.yml index 3ea28289..50066d6b 100644 --- a/group_vars/prometheus.yml +++ b/group_vars/prometheus.yml @@ -8,4 +8,5 @@ openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if 'ope prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if ( 'openondemand' in groups ) else [] ) }}" # Set Prometheus storage retention size -prometheus_storage_retention_size: "{{ metrics_db_maximum_size }}GB" \ No newline at end of file +# We reserve 10GB of the state volume for cluster state, the rest is for metrics +prometheus_storage_retention_size: "{{ state_volume_size - 10 }}GB" diff --git a/ui-meta/slurm-infra-fast-volume-type.yml b/ui-meta/slurm-infra-fast-volume-type.yml index 899e3b43..6221f802 100644 --- a/ui-meta/slurm-infra-fast-volume-type.yml +++ b/ui-meta/slurm-infra-fast-volume-type.yml @@ -12,6 +12,24 @@ parameters: kind: cloud.ip immutable: true + - name: login_flavor + label: Login node size + description: The size to use for the login node. + kind: cloud.size + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: control_flavor + label: Control node size + description: The size to use for the control node. + kind: cloud.size + immutable: true + options: + min_ram: 2048 + min_disk: 20 + - name: compute_count label: Compute node count description: The number of compute nodes in the cluster. @@ -23,7 +41,7 @@ parameters: - name: compute_flavor label: Compute node size description: The size to use for the compute node. - kind: "cloud.size" + kind: cloud.size immutable: true options: min_ram: 2048 @@ -31,8 +49,8 @@ parameters: - name: home_volume_size label: Home volume size (GB) - description: The size of the cloud volume to use for home directories - kind: integer + description: The size of the cloud volume to use for home directories. 
+ kind: cloud.volume_size immutable: true options: min: 10 @@ -51,19 +69,20 @@ parameters: options: checkboxLabel: Put home directories on high-performance storage? - - name: metrics_db_maximum_size - label: Metrics database size (GB) + - name: state_volume_size + label: State volume size (GB) description: | + The size of the state volume, used to hold and persist important files and data. Of + this volume, 10GB is set aside for cluster state and the remaining space is used + to store cluster metrics. + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be - discarded to ensure that the database does not grow larger than this size. - - **A cloud volume of this size +10GB will be created to hold and persist the metrics - database and important Slurm files.** - kind: integer + discarded to ensure that the database does not grow larger than this volume. + kind: cloud.volume_size immutable: true options: - min: 10 - default: 10 + min: 20 + default: 20 - name: cluster_run_validation label: Post-configuration validation diff --git a/ui-meta/slurm-infra.yml b/ui-meta/slurm-infra.yml index ed953b92..78830234 100644 --- a/ui-meta/slurm-infra.yml +++ b/ui-meta/slurm-infra.yml @@ -12,6 +12,24 @@ parameters: kind: cloud.ip immutable: true + - name: login_flavor + label: Login node size + description: The size to use for the login node. + kind: cloud.size + immutable: true + options: + min_ram: 2048 + min_disk: 20 + + - name: control_flavor + label: Control node size + description: The size to use for the control node. + kind: cloud.size + immutable: true + options: + min_ram: 2048 + min_disk: 20 + - name: compute_count label: Compute node count description: The number of compute nodes in the cluster. @@ -23,7 +41,7 @@ parameters: - name: compute_flavor label: Compute node size description: The size to use for the compute node. 
- kind: "cloud.size" + kind: cloud.size immutable: true options: min_ram: 2048 @@ -31,26 +49,27 @@ parameters: - name: home_volume_size label: Home volume size (GB) - description: The size of the cloud volume to use for home directories - kind: integer + description: The size of the cloud volume to use for home directories. + kind: cloud.volume_size immutable: true options: min: 10 default: 100 - - name: metrics_db_maximum_size - label: Metrics database size (GB) + - name: state_volume_size + label: State volume size (GB) description: | + The size of the state volume, used to hold and persist important files and data. Of + this volume, 10GB is set aside for cluster state and the remaining space is used + to store cluster metrics. + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be - discarded to ensure that the database does not grow larger than this size. - - **A cloud volume of this size +10GB will be created to hold and persist the metrics - database and important Slurm files.** - kind: integer + discarded to ensure that the database does not grow larger than this volume. 
+ kind: cloud.volume_size immutable: true options: - min: 10 - default: 10 + min: 20 + default: 20 - name: cluster_run_validation label: Post-configuration validation From 63e6ec80c20c43d63bdf95cdc7b9622d483c777a Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Wed, 12 Jul 2023 15:13:18 +0100 Subject: [PATCH 4/5] Add count_variable option for compute nodes --- ui-meta/slurm-infra.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ui-meta/slurm-infra.yml b/ui-meta/slurm-infra.yml index 78830234..f876987a 100644 --- a/ui-meta/slurm-infra.yml +++ b/ui-meta/slurm-infra.yml @@ -44,6 +44,7 @@ parameters: kind: cloud.size immutable: true options: + count_variable: compute_count min_ram: 2048 min_disk: 20 From ada2e4e92a97edeeccc30157070f03043e207553 Mon Sep 17 00:00:00 2001 From: Matt Pryor Date: Wed, 12 Jul 2023 15:21:18 +0100 Subject: [PATCH 5/5] Use the proper name --- ui-meta/slurm-infra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui-meta/slurm-infra.yml b/ui-meta/slurm-infra.yml index f876987a..9dbe2587 100644 --- a/ui-meta/slurm-infra.yml +++ b/ui-meta/slurm-infra.yml @@ -44,7 +44,7 @@ parameters: kind: cloud.size immutable: true options: - count_variable: compute_count + count_parameter: compute_count min_ram: 2048 min_disk: 20