Skip to content

Commit 1462904

Browse files
committed
add nhc
1 parent 986a6bc commit 1462904

File tree

18 files changed

+245
-16
lines changed

18 files changed

+245
-16
lines changed

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,5 @@ roles/*
9090
!roles/gateway/**
9191
!roles/alertmanager/
9292
!roles/alertmanager/**
93+
!roles/nhc/
94+
!roles/nhc/**

ansible/bootstrap.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,7 @@
134134

135135
- hosts: dnf_repos
136136
become: yes
137+
tags: dnf_repos
137138
tasks:
138139
- name: Check that creds won't be leaked to users
139140
ansible.builtin.assert:

ansible/extras.yml

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,6 @@
5858
- import_role:
5959
name: persist_hostkeys
6060

61-
62-
- name: Setup NFS export for compute node configuration
63-
hosts: compute_init:!builder
64-
# NB: has to be after eeesi and os-manila-mount
65-
tags: compute_init
66-
become: yes
67-
tasks:
68-
- include_role:
69-
name: compute_init
70-
tasks_from: export.yml
71-
7261
- name: Install k9s
7362
become: yes
7463
hosts: k9s
@@ -85,3 +74,21 @@
8574
- name: Install additional packages
8675
dnf:
8776
name: "{{ appliances_extra_packages }}"
77+
78+
- hosts: nhc
79+
become: yes
80+
tags: nhc
81+
tasks:
82+
- name: Configure node health checks
83+
import_role:
84+
name: nhc
85+
86+
- name: Setup NFS export for compute_init
87+
hosts: compute_init:!builder
88+
# NB: done last so other roles can prepare configuration etc
89+
tags: compute_init
90+
become: yes
91+
tasks:
92+
- include_role:
93+
name: compute_init
94+
tasks_from: export.yml

ansible/roles/compute_init/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ it also requires an image build with the role name added to the
8484
| slurm.yml | openhpc [10] | All slurmd functionality | No |
8585
| slurm.yml | (set memory limits) | Fully supported | No |
8686
| slurm.yml | (block ssh) | Fully supported | No |
87+
| slurm.yml | nhc | Fully supported | No |
8788
| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a |
8889
| portal.yml | (openondemand vnc desktop) | None required - use image build | No |
8990
| portal.yml | (openondemand jupyter server) | None required - use image build | No |

ansible/roles/compute_init/files/compute-init.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
2020
enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
2121
enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
22+
enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"
2223

2324
# TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2425
resolv_conf_nameservers: []
@@ -350,6 +351,11 @@
350351
enabled: true
351352
state: started
352353

354+
- name: Provide NHC configuration
355+
ansible.builtin.include_role:
356+
name: nhc
357+
tasks_from: import.yml
358+
when: enable_nhc
353359

354360
- name: Ensure node is resumed
355361
# TODO: consider if this is always safe for all job states?

ansible/roles/compute_init/tasks/export.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,9 @@
9898
name: sshd
9999
tasks_from: export.yml
100100
when: "'sshd' in group_names"
101+
102+
- name: Retrieve generated NHC config
103+
import_role:
104+
name: nhc
105+
tasks_from: export.yml
106+
when: "'nhc' in group_names"

ansible/roles/compute_init/tasks/install.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
dest: roles/
5050
- src: ../../lustre
5151
dest: roles/
52+
- src: ../../nhc
53+
dest: roles/
5254

5355
- name: Add filter_plugins to ansible.cfg
5456
lineinfile:

ansible/roles/nhc/README.md

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Node Health Checks (nhc)
2+
3+
Deploys and configures the LBNL [Node Health Check](https://github.com/mej/nhc)
4+
(NHC) which will put nodes in `DOWN` state if they fail periodic checks on
5+
various aspects.
6+
7+
Due to the integration with Slurm this is tightly linked to the configuration
8+
for the [stackhpc.openhpc](../stackhpc.openhpc/README.md) role.
9+
10+
## Enabling
11+
12+
By [default](../../../environments/common/inventory/group_vars/all/openhpc.yml)
13+
the required `nhc-ohpc` packages are installed in all images.
14+
15+
To enable node health checks, ensure the `nhc` group contains the `compute` group:
16+
17+
```yaml
18+
# environments/site/inventory/groups:
19+
[nhc:children]
20+
# Hosts to configure for node health checks
21+
compute
22+
```
23+
24+
This will:
25+
1. Add NHC-related configuration to the `slurm.conf` Slurm configuration file.
26+
The default configuration is defined in `openhpc_config_nhc`
27+
(see [environments/common/inventory/group_vars/all/openhpc.yml](../../../environments/common/inventory/group_vars/all/openhpc.yml)).
28+
It will run healthchecks on all `IDLE` nodes which are not `DRAINED` or `NOT_RESPONDING`
29+
every 300 seconds. See [slurm.conf parameters](https://slurm.schedmd.com/slurm.conf.html)
30+
`HealthCheckInterval`, `HealthCheckNodeState`, `HealthCheckProgram`. These may
31+
be overridden if required by redefining `openhpc_config_nhc` in e.g.
32+
`environments/site/inventory/group_vars/nhc/yml`.
33+
34+
2. Define a default configuration for health checks for each compute node
35+
individually using [nhc-genconf](https://github.com/mej/nhc?tab=readme-ov-file#config-file-auto-generation).
36+
The generated checks include:
37+
- Filesystem mounts
38+
- Filesystem space
39+
- CPU info
40+
- Memory and swap
41+
- Network interfaces
42+
- Various processes
43+
44+
See `/etc/nhc/nhc.conf` on a compute node for the full configuration.
45+
46+
The automatically generated checks may be modified or disabled using the
47+
`nhc_replacements` role variable described below.
48+
49+
If a node healthcheck run fails, Slurm will mark the node `DOWN`. With the
50+
default [alerting configuration](../../../docs/alerting.md) this will trigger
51+
an alert.
52+
53+
## Updating Health Checks
54+
55+
The above approach assumes that when the `site.yml` playbook is run all nodes
56+
are functioning correctly. Therefore if changes are made to aspects covered by
57+
the healthchecks (see above) without re-running this playbook, use the following
58+
to update the autogenerated health checks:
59+
60+
```shell
61+
ansible-playbook ansible/extras.yml --tags nhc
62+
```
63+
64+
## Role Variables
65+
66+
- `nhc_replacements`: Optional, default empty list. A list of mappings
67+
defining replacements in the autogenerated health checks. Items must have
68+
keys `regexp` and `replace` which are as for [ansible.builtin.replace](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/replace_module.html).
69+
Note that the NHC [configuration line format](https://github.com/mej/nhc?tab=readme-ov-file#configuration-file-syntax) is:
70+
71+
TARGET || CHECK
72+
73+
where for autogenerated checks `TARGET` is the hostname. So a regex like:
74+
75+
'^(\s+\S+\s+)\|\|(\s+.*)$'
76+
77+
captures the `TARGET` and `||` separator as `\1` and the actual check as `\2`.
78+
Hence the following item would comment-out checks on a particular interface
79+
on all nodes:
80+
81+
- regexp: '^(\s+\S+\s+\|\|\s+)(check_hw_eth eth0)$'
82+
replace: '#\1\2'
83+
84+
See documentation for `ansible.builtin.replace` for more information. This is
85+
an example only - for this actual case removing the line entirely with
86+
`replace: ''` might be better. Using https://regex101.com/ (in Python
87+
mode) or similar may be useful during development.
88+
89+
- `nhc_replacements_default`: Optional. As above, but by default includes a
90+
mapping to remove the autogenerated timestamp line from the check configuration
91+
file for idempotency.
92+
93+
## Structure
94+
95+
This role contains three task files, which run at different times:
96+
- `main.yml`: Runs from `site.yml` -> `slurm.yml`. Generates health check
97+
configuration.
98+
- `export.yml`: Runs from `site.yml` -> `extras.yml` via role `compute_init`
99+
tasks `export.yml`. Copies the generated health check configuration to the
100+
control node NFS share for compute-init.
101+
- `import.yml`: Runs on boot via `compute_init/files/compute-init.yml` and
102+
copies the node's generated health check configuration from the control node
103+
NFS share to local disk.
104+
105+
Note that the `stackhpc.openhpc` role:
106+
- Installs the required package
107+
- Configures slurm
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
nhc_replacements_default:
2+
# note this matches a multiline string
3+
- regexp: '# This file was automatically generated by nhc-genconf\n#.*'
4+
replace: '# This file was automatically generated by nhc-genconf\n# (timestamp removed for idempotency)'
5+
6+
nhc_replacements: []

ansible/roles/nhc/tasks/export.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
- name: Fetch generated NHC config to control node storage
2+
ansible.builtin.fetch:
3+
src: /etc/nhc/nhc.conf
4+
flat: true
5+
dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/nhc.conf"

0 commit comments

Comments
 (0)