
Commit 2855e1e

Support latest ansible-slurm-appliance
* Support Rocky 8.5
* Remove support for Kibana
* Remove ad-hoc Grafana install
* Support updated HPCtests
* Use latest stackhpc.terraform-infra
1 parent ec810a0 commit 2855e1e

9 files changed (+53, -47 lines)

README.md (2 additions, 1 deletion)

@@ -1,4 +1,5 @@
 # caas-slurm-appliance
 
 This repository adapts the [StackHPC Slurm Appliance](https://github.com/stackhpc/ansible-slurm-appliance)
-for use within the Cluster-as-a-Service system of the [Cloud Portal](https://github.com/stackhpc/jasmin-cloud).
+for use within the Cluster-as-a-Service system of the [Azimuth Cloud Portal](https://github.com/stackhpc/azimuth).
+

ansible.cfg (1 addition, 7 deletions)

@@ -6,6 +6,7 @@ host_key_checking = False
 remote_tmp = /tmp
 # Enable our custom vars plugin that parses variables from the current working directory
 vars_plugins_enabled = host_group_vars,cwd_host_group_vars
+roles_path = roles:vendor/stackhpc/ansible-slurm-appliance/ansible/roles
 
 [ssh_connection]
 ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null
@@ -14,10 +15,3 @@ pipelining = True
 # This ensures that if the proxy connection is interrupted, rendering the other hosts
 # unreachable, the connection is retried instead of failing the entire play
 retries = 10
-
-[tags]
-# Temporarily, the grafana_install tag is skipped and we install Grafana manually
-# https://github.com/grafana/grafana/issues/36935
-# TODO: REMOVE THIS
-# By default, we also skip the HPL tests
-skip = grafana_install,hpl-solo,hpl-all
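
With roles_path covering both the local roles/ directory and the vendored appliance checkout, roles shipped with ansible-slurm-appliance resolve by short name from this repository's playbooks. A minimal sketch, assuming a hypothetical role name under the vendored tree:

# Sketch only: 'example_appliance_role' is a placeholder for any role that lives
# under vendor/stackhpc/ansible-slurm-appliance/ansible/roles; with the roles_path
# above it is found without a fully-qualified path.
- hosts: control
  become: yes
  roles:
    - role: example_appliance_role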

group_vars/cluster.yml (6 additions, 6 deletions)

@@ -2,12 +2,12 @@
 update_enable: "{{ cluster_upgrade_system_packages | default('false') | bool }}"
 
 # Read the secrets from the Ansible local facts on the control host
-secrets_openhpc_grafana_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.grafana_admin_password }}"
-secrets_openhpc_elasticsearch_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.elasticsearch_admin_password }}"
-secrets_openhpc_elasticsearch_kibana_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.elasticsearch_kibana_password }}"
-secrets_openhpc_mysql_root_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.mysql_root_password }}"
-secrets_openhpc_mysql_slurm_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.mysql_slurm_password }}"
-secrets_openhpc_mungekey: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.mungekey }}"
+vault_grafana_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_grafana_admin_password }}"
+vault_elasticsearch_admin_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_admin_password }}"
+vault_elasticsearch_kibana_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_elasticsearch_kibana_password }}"
+vault_mysql_root_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_root_password }}"
+vault_mysql_slurm_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_slurm_password }}"
+vault_openhpc_mungekey: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_openhpc_mungekey }}"
 
 # Override this to cope with the case where the podman group just doesn't exist
 appliances_local_users_podman_enable: "{{ groups.get('podman', []) | length > 0 }}"
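
The lookups above read the custom fact written by the persist_openhpc_secrets role (see its template later in this commit). A sketch of what they resolve against on the control host, with placeholder values since the real ones are generated at deploy time:

# Sketch of the control host's local facts consumed above; key names come from
# the openhpc_secrets.fact template in this commit, values are placeholders.
ansible_local:
  openhpc_secrets:
    vault_grafana_admin_password: "<generated>"
    vault_elasticsearch_admin_password: "<generated>"
    vault_elasticsearch_kibana_password: "<generated>"
    vault_mysql_root_password: "<generated>"
    vault_mysql_slurm_password: "<generated>"
    vault_openhpc_mungekey: "<generated>"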

group_vars/openstack.yml (8 additions, 3 deletions)

@@ -1,6 +1,12 @@
 # The default Terraform state key for backends that support it
 terraform_state_key: "cluster/{{ cluster_id }}/tfstate"
 
+terraform_backend_config_defaults:
+  consul:
+    path: "{{ terraform_state_key }}"
+    gzip: "true"
+  local: {}
+
 #####
 ## WARNING
 ##
@@ -24,16 +30,15 @@ cluster_groups_required:
 
 # These are the additional groups required for monitoring (see everything layout)
 cluster_groups_monitoring:
-  podman: [opendistro, kibana, filebeat]
+  podman: [opendistro, filebeat]
   prometheus: [control]
   grafana: [control]
   alertmanager: [control]
   node_exporter: [cluster]
   opendistro: [control]
-  kibana: [control]
   slurm_stats: [control]
   filebeat: [slurm_stats]
 
 # Additional groups for running the cluster validation
 cluster_groups_validation:
-  openhpc_tests: [cluster]
+  hpctests: [login]
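
The backend defaults above are consumed by slurm-infra.yml, which picks the backend type from the environment. A sketch of how that selection resolves when a Consul address is available (otherwise it falls back to the empty local backend config):

# Sketch: with CONSUL_HTTP_ADDR set in the environment, the selection in
# slurm-infra.yml resolves to the consul entry of the defaults above.
terraform_backend_type: consul
terraform_backend_config:
  path: "cluster/{{ cluster_id }}/tfstate"
  gzip: "true"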

requirements.yml (15 additions, 5 deletions)

@@ -1,26 +1,36 @@
 ---
 roles:
   - src: stackhpc.nfs
-  - src: stackhpc.openhpc
-  - src: cloudalchemy.node_exporter
+  - name: stackhpc.openhpc
+    src: https://github.com/stackhpc/ansible-role-openhpc
+    type: git
+    version: v0.10.0
+  #- src: cloudalchemy.node_exporter
+  - src: https://github.com/stackhpc/ansible-node-exporter.git
+    name: cloudalchemy.node_exporter
+    version: support-rhel-clones # Fixes for Rocky
   - src: cloudalchemy.blackbox-exporter
-  - src: cloudalchemy.prometheus
+  #- src: cloudalchemy.prometheus
+  - src: https://github.com/cloudalchemy/ansible-prometheus.git
+    name: cloudalchemy.prometheus
+    version: 0795bdb1a5a424044dc7e8200129f61e53140523 # Fixes for Rocky
   - src: cloudalchemy.alertmanager
   - src: cloudalchemy.grafana
   - src: geerlingguy.mysql
   - src: jriguera.configdrive
   - name: stackhpc.terraform-infra
     src: https://github.com/stackhpc/ansible-role-terraform-infra
     type: git
-    version: b750fd38be4eedf111eff0ed8833295efe3b6422
+    version: 4d9d67b5a1866edf6988a1f7e9e64868df8f65ae
 
 collections:
   - name: ansible.posix
+  - name: ansible.netcommon
   - name: community.general
   - name: community.grafana
   - name: community.mysql
   - name: containers.podman
   - name: openstack.cloud
   - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools
     type: git
-    version: main
+    version: v0.1.0
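
These pins install in the usual way, e.g. ansible-galaxy role install -r requirements.yml -p roles followed by ansible-galaxy collection install -r requirements.yml; the exact invocation depends on how this repository's deployment tooling wraps it. The -p roles path matches the first entry of roles_path in ansible.cfg, so the pinned roles sit alongside the vendored appliance roles.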

roles/persist_openhpc_secrets/tasks/main.yml (4 additions, 0 deletions)

@@ -17,4 +17,8 @@
       template:
         src: openhpc_secrets.fact
         dest: /etc/ansible/facts.d/openhpc_secrets.fact
+
+    - name: Re-read facts after adding custom fact
+      ansible.builtin.setup:
+        filter: ansible_local
   when: "not openhpc_secrets_stat.stat.exists"

roles/persist_openhpc_secrets/templates/openhpc_secrets.fact (6 additions, 6 deletions)

@@ -1,8 +1,8 @@
 {
-"grafana_admin_password": "{{ lookup('password', '/dev/null') }}",
-"elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}",
-"elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}",
-"mysql_root_password": "{{ lookup('password', '/dev/null') }}",
-"mysql_slurm_password": "{{ lookup('password', '/dev/null') }}",
-"mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}"
+"vault_grafana_admin_password": "{{ lookup('password', '/dev/null') }}",
+"vault_elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}",
+"vault_elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}",
+"vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}",
+"vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}",
+"vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}"
 }

slurm-infra.yml (10 additions, 18 deletions)

@@ -14,9 +14,14 @@
   roles:
     - role: stackhpc.terraform-infra
      vars:
+        # Image cloud user, use extraVars.__ALL__.cluster_user in values.yaml
+        # to change if required
+        cluster_ssh_user: "{{ cluster_user | default('rocky') }}"
         # Variables controlling the Terraform provisioning
         terraform_project_path: "{{ playbook_dir }}/terraform"
         terraform_state: "{{ cluster_state | default('present') }}"
+        terraform_backend_type: "{{ ( 'CONSUL_HTTP_ADDR' in ansible_env ) | ternary('consul', 'local') }}"
+        terraform_backend_config: "{{ terraform_backend_config_defaults[terraform_backend_type] }}"
         terraform_variables:
           cluster_name: "{{ cluster_name }}"
           cluster_network: "{{ cluster_network }}"
@@ -77,7 +82,11 @@
 # Configure the hosts as a Slurm cluster
 # Use the playbooks invidually rather than the site playbook as it avoids the
 # need to define the environment variables referencing an environment
+
+# validate.yml asserts presence of a control group which doesn't exist when
+# destroying infra, so only validate when we're not destroying
 - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml
+  when: cluster_state is not defined or (cluster_state is defined and cluster_state != "absent")
 
 # The first task in the bootstrap playbook causes the home directory of the centos user to be moved
 # on the first run
@@ -100,24 +109,7 @@
       meta: reset_connection
 
 - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml
-
 - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml
 - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml
-
-# Temporarily, the grafana_install tag is skipped (see ansible.cfg) and we install Grafana manually here
-# https://github.com/grafana/grafana/issues/36935
-# TODO: REMOVE THIS
-- name: Install Grafana
-  hosts: grafana
-  tags: grafana
-  gather_facts: no
-  become: yes
-  tasks:
-    - name: Install Grafana from RPM
-      yum:
-        name: https://dl.grafana.com/oss/release/grafana-8.0.6-1.x86_64.rpm
-        disable_gpg_check: yes
-        state: present
-
 - import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml
-- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/test.yml
+- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml
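
The comment on cluster_ssh_user above refers to an override in the CaaS values.yaml. A sketch of what that override looks like, assuming a hypothetical image whose default cloud user is not rocky (the key path is the one named in the comment):

# Sketch only: values.yaml override for images whose default cloud user differs
# from 'rocky'; 'cloud-user' is a placeholder value.
extraVars:
  __ALL__:
    cluster_user: cloud-user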
