
Commit 19a7032

Merge pull request #10 from stackhpc/update/2022-9-29
Updated slurm app with fixed monitoring and home, state volumes
2 parents b877bee + 59a7d28 commit 19a7032

File tree

12 files changed: +102 / -26 lines


group_vars/cluster.yml

Lines changed: 3 additions & 8 deletions
@@ -1,11 +1,6 @@
-# Convert the variable supplied by the portal into the one expected by the Slurm appliance
-update_enable: "{{ cluster_upgrade_system_packages | default('false') | bool }}"
-# The update logs are written on the Ansible controller
-# In CaaS, the Ansible controller is an ephemeral AWX pod, so all that matters is that
-# this is a location that is writable by the container user
-update_log_path: "{{ playbook_dir }}/.tmp/logs/{{ inventory_hostname }}-updates.log"
-# Same for the hpctests output directory
-hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests"
+# Account for the fact we are running outside of the expected environment system:
+# NB: this only works for playbooks in ansible/*, not in ansible/adhoc!
+appliances_repository_root: "{{ playbook_dir }}/../"
 
 # Read the secrets from the Ansible local facts on the control host
 vault_azimuth_user_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password }}"
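
The lookup above relies on a custom local fact named openhpc_secrets existing on the control host, i.e. a file under /etc/ansible/facts.d/ that exposes vault_azimuth_user_password. A minimal sketch of how such a fact could be seeded, assuming only the key name from the lookup above; the play, file mode and password generation are illustrative, not the appliance's actual mechanism:

# Hypothetical sketch only: seed an Ansible local fact on the control node so that
# ansible_local.openhpc_secrets.vault_azimuth_user_password resolves.
- hosts: control
  become: true
  tasks:
    - name: Ensure the custom facts directory exists
      ansible.builtin.file:
        path: /etc/ansible/facts.d
        state: directory
        mode: "0755"
    - name: Write openhpc_secrets.fact (read back via ansible_local.openhpc_secrets)
      ansible.builtin.copy:
        dest: /etc/ansible/facts.d/openhpc_secrets.fact
        mode: "0600"
        content: "{{ {'vault_azimuth_user_password': lookup('password', '/dev/null length=20')} | to_json }}"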

group_vars/control.yml

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+# Define path for persistent state on control node volume:
+appliances_state_dir: /var/lib/state
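
appliances_state_dir points at the mount point of the new state volume (see the cloud-init mounts in resources.tf.j2 below), so anything written under it survives a rebuild of the control node. A minimal sketch of how a role might use it; the service directory name is assumed:

# Minimal sketch only: keep a service's data on the persistent state volume.
# The 'prometheus' directory name is assumed; appliances_state_dir comes from
# the group_vars change above.
- hosts: control
  become: true
  tasks:
    - name: Create a per-service directory on the state volume
      ansible.builtin.file:
        path: "{{ appliances_state_dir }}/prometheus"
        state: directory
        mode: "0750"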

group_vars/filebeat.yml

Lines changed: 0 additions & 2 deletions
This file was deleted.

group_vars/hpctests.yml

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+# Skip plotting pingpong as matplotlib not in runner environment
+hpctests_pingpong_plot: false
+
+# In CaaS, the Ansible controller is an ephemeral AWX pod, so all that matters is that
+# this is a location that is writable by the container user
+hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests"

group_vars/opendistro.yml

Lines changed: 0 additions & 2 deletions
This file was deleted.

group_vars/openstack.yml

Lines changed: 5 additions & 0 deletions
@@ -16,3 +16,8 @@ terraform_project_path: "{{ playbook_dir }}/terraform"
 
 terraform_state: "{{ cluster_state | default('present') }}"
 cluster_ssh_user: rocky
+
+state_volume_size: 150 # GB
+
+state_volume_device_path: "{{ cluster_state_volume_device_path | default('/dev/vdb') }}"
+home_volume_device_path: "{{ cluster_home_volume_device_path | default('/dev/vdc') }}"
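
Both device paths are wrapped in Jinja's default() filter, so a deployment can override them where the target cloud attaches volumes under different device names. A hypothetical override (variable names from the defaults above, values invented), e.g. supplied as portal/AWX extra vars:

# Hypothetical override; without these, /dev/vdb and /dev/vdc are used.
cluster_state_volume_device_path: /dev/sdb
cluster_home_volume_device_path: /dev/sdc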

requirements.yml

Lines changed: 14 additions & 11 deletions
@@ -3,10 +3,10 @@ roles:
   - src: stackhpc.nfs
     version: v21.2.1
   - src: https://github.com/stackhpc/ansible-role-openhpc.git
-    version: v0.12.0
+    version: v0.16.0
     name: stackhpc.openhpc
   - src: https://github.com/stackhpc/ansible-node-exporter.git
-    version: support-rhel-clones
+    version: feature/no-install
     name: cloudalchemy.node_exporter
   - src: cloudalchemy.blackbox-exporter
     version: 1.0.0
@@ -15,27 +15,30 @@ roles:
     name: cloudalchemy.prometheus
   - src: cloudalchemy.alertmanager
     version: 0.19.1
-  - src: cloudalchemy.grafana
-    version: 0.18.0
-  - src: geerlingguy.mysql
-    version: 3.3.2
+  - src: https://github.com/stackhpc/ansible-grafana.git
+    name: cloudalchemy.grafana
+    version: service-state
   - src: jriguera.configdrive
+    # No versions available
   - src: https://github.com/OSC/ood-ansible.git
     name: osc.ood
     version: v2.0.5
 
 collections:
+  - name: containers.podman
+  - name: community.grafana
+  - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools
+    type: git
+    version: v0.2.0
   - name: ansible.posix
   - name: ansible.netcommon
   - name: community.general
     version: 4.5.0 # https://github.com/ansible-collections/community.general/pull/4281
-  - name: community.grafana
+
   - name: community.mysql
-  - name: containers.podman
+
   - name: openstack.cloud
   - name: https://github.com/stackhpc/ansible-collection-terraform
     type: git
     version: ae1dc46a9d266bcdc6e79a6e290edbb080596f7f
-  - name: https://github.com/stackhpc/ansible_collection_slurm_openstack_tools
-    type: git
-    version: v0.1.0
+

roles/cluster_infra/defaults/main.yml

Lines changed: 6 additions & 0 deletions
@@ -18,6 +18,12 @@ cluster_groups_required:
   mysql: [control]
   update: [cluster]
   basic_users: [cluster]
+  fail2ban: [login]
+  firewalld: [fail2ban]
+  # ignore these for the moment:
+  #etc_hosts: []
+  # cloud_init: [etc_hosts]
+  systemd: [opendistro, grafana, control, prometheus]
 
 # These are the additional groups required for monitoring (see everything layout)
 cluster_groups_monitoring:
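
Each entry above maps an appliance inventory group to the groups that populate it as children, so fail2ban runs on the login nodes and firewalld on whatever ends up in fail2ban. An illustrative sketch of the equivalent children-based inventory structure (group names from the diff; the YAML inventory layout itself is assumed, not taken from the repo):

# Illustrative sketch only: how the new mappings read as inventory group children.
all:
  children:
    fail2ban:
      children:
        login: {}
    firewalld:
      children:
        fail2ban: {}
    systemd:
      children:
        opendistro: {}
        grafana: {}
        control: {}
        prometheus: {}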

roles/cluster_infra/templates/resources.tf.j2

Lines changed: 55 additions & 1 deletion
@@ -69,6 +69,22 @@ resource "openstack_networking_secgroup_rule_v2" "secgroup_slurm_login_rule_ingr
   security_group_id = "${openstack_networking_secgroup_v2.secgroup_slurm_login.id}"
 }
 
+#####
+##### Volumes
+#####
+resource "openstack_blockstorage_volume_v3" "state" {
+  name        = "{{ cluster_name }}-state"
+  description = "State for control node"
+  size        = "{{ state_volume_size }}"
+}
+
+resource "openstack_blockstorage_volume_v3" "home" {
+  name        = "{{ cluster_name }}-home"
+  description = "Home for control node"
+  size        = "{{ home_volume_size }}"
+}
+
+
 #####
 ##### Cluster nodes
 #####
@@ -111,12 +127,50 @@ resource "openstack_compute_instance_v2" "control" {
     name = "{{ cluster_network }}"
   }
   security_groups = ["${openstack_networking_secgroup_v2.secgroup_slurm_cluster.name}"]
-  # Use cloud-init to inject the SSH keys
+
+  # root device:
+  block_device {
+    uuid = "{{ cluster_image }}"
+    source_type = "image"
+    destination_type = "local"
+    boot_index = 0
+    delete_on_termination = true
+  }
+
+  # state volume:
+  block_device {
+    destination_type = "volume"
+    source_type = "volume"
+    boot_index = -1
+    uuid = openstack_blockstorage_volume_v3.state.id
+  }
+
+  # home volume:
+  block_device {
+    destination_type = "volume"
+    source_type = "volume"
+    boot_index = -1
+    uuid = openstack_blockstorage_volume_v3.home.id
+  }
+
+  # Use cloud-init to a) inject SSH keys b) configure volumes
   user_data = <<-EOF
     #cloud-config
     ssh_authorized_keys:
       - {{ cluster_deploy_ssh_public_key }}
       - {{ cluster_user_ssh_public_key }}
+    fs_setup:
+      - label: state
+        filesystem: ext4
+        device: {{ state_volume_device_path }}
+        partition: auto
+      - label: home
+        filesystem: ext4
+        device: {{ home_volume_device_path }}
+        partition: auto
+    mounts:
+      - [LABEL=state, /var/lib/state]
+      - [LABEL=home, /exports/home, auto, "x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service"]
   EOF
 }
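
For reference, with the defaults added in group_vars/openstack.yml the heredoc above renders to the cloud-config below (the SSH keys are placeholders). The x-systemd options make the home mount a hard dependency of nfs-server.service and order it before the NFS server starts, so the export is never served from an empty directory:

# Rendered user_data for the control node with /dev/vdb and /dev/vdc defaults;
# the ssh_authorized_keys values are placeholders for the templated public keys.
#cloud-config
ssh_authorized_keys:
  - ssh-ed25519 AAAA... # {{ cluster_deploy_ssh_public_key }}
  - ssh-ed25519 AAAA... # {{ cluster_user_ssh_public_key }}
fs_setup:
  - label: state
    filesystem: ext4
    device: /dev/vdb
    partition: auto
  - label: home
    filesystem: ext4
    device: /dev/vdc
    partition: auto
mounts:
  - [LABEL=state, /var/lib/state]
  - [LABEL=home, /exports/home, auto, "x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service"]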

slurm-infra.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
   gather_facts: false
   tasks:
     - name: Set up Ansible user
-      user: "{{ appliances_local_users_ansible_user }}"
+      user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}"
       become_method: "sudo"
       # Need to change working directory otherwise we try to switch back to non-existent directory.
       become_flags: '-i'
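
The new expression picks the Ansible user's definition out of the appliance's shared user list rather than relying on a CaaS-specific variable. A hypothetical sketch of the data shape it assumes (all field values invented):

# Hypothetical data shape only: selectattr('user.name', 'eq',
# appliances_local_users_ansible_user_name) keeps the matching entry, and
# [0]['user'] hands its 'user' dict straight to the user module as parameters.
appliances_local_users_ansible_user_name: ansible
appliances_local_users_default:
  - user:
      name: ansible
      home: /var/lib/ansible
      shell: /bin/bash
  - user:
      name: rocky
      shell: /bin/bash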
