Merged

32 commits
1462904
add nhc
sjpb Apr 23, 2025
98d6f81
change nhc configuration from replace to lineinfile
sjpb Apr 23, 2025
fe4c25a
move things around
sjpb Apr 23, 2025
42d6124
Merge branch 'main' into feat/nhc-v2
sjpb Jun 18, 2025
b63bb2a
revert doca move
sjpb Jun 18, 2025
531b3fc
revert slurm changes
sjpb Jun 18, 2025
39257e3
tweak NHC docs
sjpb Jun 18, 2025
1740599
remove debugging stop
sjpb Jun 18, 2025
91da774
rename final playbook and let dnf repos work for post-hook in both si…
sjpb Jun 18, 2025
4127848
note returntoservice bug
sjpb Jun 18, 2025
4695dfc
revert slurm playbook changes
sjpb Jun 18, 2025
e6148ce
revert ordering change for repo disable - see issue 708
sjpb Jun 18, 2025
e2dc409
bump CI image
sjpb Jun 18, 2025
a62ca70
fix exporting NHC config
sjpb Jun 18, 2025
983fca1
remember compute-init config is synced to /var/tmp
sjpb Jun 18, 2025
9e2fa05
enable NHC during rebuild for stackhpc
sjpb Jun 18, 2025
9322e0e
fix path to shared compute-init files for NHC
sjpb Jun 18, 2025
6e6058d
don't write compute-init share into fstab for reliability -is unmount…
sjpb Jun 18, 2025
bf632f1
bump CI image
sjpb Jun 18, 2025
aff28f7
change NHC to use templating instead of autoconfiguration
sjpb Jun 18, 2025
761031a
fix nhc task file from compute-init
sjpb Jun 18, 2025
f71fc6e
bump CI image
sjpb Jun 18, 2025
8bf8d04
fix NHC configuration directory
sjpb Jun 19, 2025
4209e40
Merge branch 'main' into feat/nhc-v2
sjpb Jun 19, 2025
a94f3e3
bump CI image
sjpb Jun 19, 2025
2d95b7d
fix NHC nodename/hostname mismatch
sjpb Jun 20, 2025
c38f9fd
remove un-needed nhc conf dir tasks
sjpb Jun 20, 2025
4def7a5
bump CI image
sjpb Jun 20, 2025
3ec4868
Revert "bump CI image" 4def7a5
sjpb Jun 20, 2025
c9ca0d2
Revert "remove un-needed nhc conf dir tasks" c38f9fd
sjpb Jun 20, 2025
d606a45
fix nhc mid-upgrade
sjpb Jun 20, 2025
2cec9d7
bump CI image
sjpb Jun 20, 2025
2 changes: 2 additions & 0 deletions ansible/.gitignore
@@ -90,3 +90,5 @@ roles/*
!roles/gateway/**
!roles/alertmanager/
!roles/alertmanager/**
!roles/nhc/
!roles/nhc/**
1 change: 1 addition & 0 deletions ansible/bootstrap.yml
@@ -134,6 +134,7 @@

- hosts: dnf_repos
  become: yes
  tags: dnf_repos
  tasks:
    - name: Check that creds won't be leaked to users
      ansible.builtin.assert:
8 changes: 0 additions & 8 deletions ansible/disable-repos.yml

This file was deleted.

11 changes: 0 additions & 11 deletions ansible/extras.yml
@@ -58,17 +58,6 @@
    - import_role:
        name: persist_hostkeys


- name: Setup NFS export for compute node configuration
  hosts: compute_init:!builder
  # NB: has to be after eeesi and os-manila-mount
  tags: compute_init
  become: yes
  tasks:
    - include_role:
        name: compute_init
        tasks_from: export.yml

- name: Install k9s
  become: yes
  hosts: k9s
2 changes: 1 addition & 1 deletion ansible/fatimage.yml
@@ -257,7 +257,7 @@
  import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
  when: hook_path | exists

- import_playbook: disable-repos.yml
- import_playbook: final.yml

- hosts: builder
  become: yes
18 changes: 18 additions & 0 deletions ansible/final.yml
@@ -0,0 +1,18 @@
- hosts: dnf_repos
  become: yes
  tags: dnf_repos
  tasks:
    - name: Disable pulp repos
      ansible.builtin.include_role:
        name: dnf_repos
        tasks_from: disable_repos.yml

- name: Setup NFS export for compute_init
  hosts: compute_init:!builder
  # NB: done last so other roles can prepare configuration etc
  tags: compute_init
  become: yes
  tasks:
    - include_role:
        name: compute_init
        tasks_from: export.yml
1 change: 1 addition & 0 deletions ansible/roles/compute_init/README.md
@@ -84,6 +84,7 @@ it also requires an image build with the role name added to the
| slurm.yml | openhpc [10] | All slurmd functionality | No |
| slurm.yml | (set memory limits) | Fully supported | No |
| slurm.yml | (block ssh) | Fully supported | No |
| slurm.yml | nhc | Fully supported | No |
| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a |
| portal.yml | (openondemand vnc desktop) | None required - use image build | No |
| portal.yml | (openondemand jupyter server) | None required - use image build | No |
10 changes: 8 additions & 2 deletions ansible/roles/compute_init/files/compute-init.yml
@@ -19,6 +19,7 @@
    enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
    enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
    enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
    enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"

    # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
    resolv_conf_nameservers: []
@@ -63,12 +64,12 @@
        mode: u=rX,g=rwX,o=

    - name: Mount /mnt/cluster
      mount:
      ansible.posix.mount:
        path: /mnt/cluster
        src: "{{ server_node_ip }}:/exports/cluster"
        fstype: nfs
        opts: ro,sync
        state: mounted
        state: ephemeral # will be unmounted after sync, don't want it in fstab
      register: _mount_mnt_cluster
      ignore_errors: true
      # exits from playbook if this failed below, allowing ansible-init to
@@ -350,6 +351,11 @@
        enabled: true
        state: started

    - name: Provide NHC configuration
      ansible.builtin.include_role:
        name: nhc
        tasks_from: boot.yml
      when: enable_nhc

    - name: Ensure node is resumed
      # TODO: consider if this is always safe for all job states?
6 changes: 6 additions & 0 deletions ansible/roles/compute_init/tasks/export.yml
@@ -98,3 +98,9 @@
    name: sshd
    tasks_from: export.yml
  when: "'sshd' in group_names"

- name: Export generated NHC config
  import_role:
    name: nhc
    tasks_from: export.yml
  when: "'nhc' in group_names"
2 changes: 2 additions & 0 deletions ansible/roles/compute_init/tasks/install.yml
@@ -49,6 +49,8 @@
      dest: roles/
    - src: ../../lustre
      dest: roles/
    - src: ../../nhc
      dest: roles/

- name: Add filter_plugins to ansible.cfg
  lineinfile:
66 changes: 66 additions & 0 deletions ansible/roles/nhc/README.md
@@ -0,0 +1,66 @@
# Node Health Checks (nhc)

Deploys and configures the LBNL [Node Health Check](https://github.com/mej/nhc)
(NHC), which puts nodes into the `DOWN` state if they fail periodic checks
such as those on filesystem mounts and network interfaces.

Due to the integration with Slurm, this is tightly linked to the configuration
for the [stackhpc.openhpc](../stackhpc.openhpc/README.md) role.

## Enabling

By [default](../../../environments/common/inventory/group_vars/all/openhpc.yml)
the required `nhc-ohpc` packages are installed in all images.

To enable node health checks, ensure the `nhc` group contains the `compute` group:

```yaml
# environments/site/inventory/groups:
[nhc:children]
# Hosts to configure for node health checks
compute
```

When the `ansible/site.yml` playbook is run, this will automatically:
1. Add NHC-related configuration to the `slurm.conf` Slurm configuration file.
The default configuration is defined in `openhpc_config_nhc`
(see [environments/common/inventory/group_vars/all/openhpc.yml](../../../environments/common/inventory/group_vars/all/openhpc.yml)).
It will run health checks on all `IDLE` nodes which are not `DRAINED` or
`NOT_RESPONDING` every 300 seconds. See the [slurm.conf parameters](https://slurm.schedmd.com/slurm.conf.html)
`HealthCheckInterval`, `HealthCheckNodeState` and `HealthCheckProgram`. These
may be overridden if required by redefining `openhpc_config_nhc` in e.g.
`environments/site/inventory/group_vars/nhc.yml` (see the sketch after this
list).

2. Template out node health check rules using Ansible facts for each compute
node. Currently these check:
- Filesystem mounts
- Ethernet interfaces

See `/etc/nhc/nhc.conf` on a compute node for the full configuration.
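
As noted in step 1, a minimal sketch of overriding these parameters, assuming
a site environment (the filename and the 600-second interval are
illustrative; the parameter names are the standard `slurm.conf` ones used by
the default `openhpc_config_nhc`):

```yaml
# environments/site/inventory/group_vars/nhc.yml (illustrative path)
openhpc_config_nhc:
  HealthCheckProgram: /usr/sbin/nhc
  HealthCheckInterval: 600 # run checks every 10 minutes instead of the 300s default
  HealthCheckNodeState: NONDRAINED_IDLE
```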

If a node health check run fails, Slurm will mark the node `DOWN`. With the
default [alerting configuration](../../../docs/alerting.md) this will trigger
an alert.

## Role Variables

- `nhc_config_template`: Template to use. Default is the in-role template
providing rules described above.
- `nhc_config_extra`: Possibly multiline string defining [additional rules](https://github.com/mej/nhc/blob/master/README.md) to
  add. Jinja templating may be used. Default is an empty string. See the
  sketch below.
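
For example, a minimal sketch of adding a site-specific rule through
`nhc_config_extra`, using NHC's standard `check_ps_service` check (the
group_vars path and the rule itself are illustrative):

```yaml
# environments/site/inventory/group_vars/nhc.yml (illustrative path)
nhc_config_extra: |
  # check sshd is running as root on all nodes, attempting to start it if not
  * || check_ps_service -u root -S sshd
```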

## Structure

This role contains three task files, which run at different times:
- `main.yml`: Runs from `site.yml` -> `slurm.yml`. Templates health check
configuration to nodes.
- `export.yml`: Runs from `site.yml` -> `final.yml` via role `compute_init`
tasks `export.yml`. Templates health check configuration to the cluster NFS
share for compute-init.
- `boot.yml`: Runs on boot via `compute_init/files/compute-init.yml`. Copies
the node's generated health check configuration from the cluster share to
local disk.

Note that the `stackhpc.openhpc` role:
- Installs the required `nhc-ohpc` package
- Configures the relevant `slurm.conf` parameters
2 changes: 2 additions & 0 deletions ansible/roles/nhc/defaults/main.yml
@@ -0,0 +1,2 @@
nhc_config_template: nhc.conf.j2
nhc_config_extra: ''
8 changes: 8 additions & 0 deletions ansible/roles/nhc/tasks/boot.yml
@@ -0,0 +1,8 @@
- name: Copy stored NHC configuration to active location
  ansible.builtin.copy:
    remote_src: true
    src: "/var/tmp/cluster/hostconfig/{{ ansible_hostname }}/nhc.conf"
    dest: /etc/nhc/nhc.conf
    owner: root
    group: root
    mode: u=rw,go=
6 changes: 6 additions & 0 deletions ansible/roles/nhc/tasks/export.yml
@@ -0,0 +1,6 @@
# Used for compute-init
- name: Template out host specific NHC config
  ansible.builtin.template:
    src: "{{ nhc_config_template }}"
    dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/nhc.conf"
  delegate_to: "{{ groups['control'] | first }}"
20 changes: 20 additions & 0 deletions ansible/roles/nhc/tasks/main.yml
@@ -0,0 +1,20 @@

- name: Ensure NHC configuration directory exists
  # When running site.yml after login/control upgrade, nhc group might be
  # enabled in repo, but as the compute nodes have not yet been upgraded they
  # do not have the package and hence this directory
  ansible.builtin.file:
    path: /etc/nhc
    state: directory
    # to match nhc-ohpc install:
    owner: root
    group: root
    mode: u=rwX,go=

- name: Template out NHC configuration
  ansible.builtin.template:
    src: "{{ nhc_config_template }}"
    dest: /etc/nhc/nhc.conf
    owner: root
    group: root
    mode: u=rw,go= # HealthCheckProgram is run by root
18 changes: 18 additions & 0 deletions ansible/roles/nhc/templates/nhc.conf.j2
@@ -0,0 +1,18 @@
# {{ ansible_managed }}

# Nodes have long FQDN hostnames but short nodenames
* || HOSTNAME="$HOSTNAME_S"

## Filesystem checks
{% for mount in ansible_mounts %}
{% set mount_mode = 'rw' if 'rw' in mount.options.split(',') else 'ro' %}
{{ ansible_fqdn }} || check_fs_mount_{{ mount_mode }} -t "{{ mount.fstype }}" -s "{{ mount.device }}" -f "{{ mount.mount }}"
{% endfor %}

## Ethernet interface checks
{% for iface in ansible_interfaces | select('match', 'eth') %}
{{ ansible_fqdn }} || check_hw_eth {{ iface }}
{% endfor %}

## Site-specific checks
{{ nhc_config_extra }}
10 changes: 1 addition & 9 deletions ansible/site.yml
@@ -27,7 +27,7 @@
- import_playbook: slurm.yml
- import_playbook: portal.yml
- import_playbook: monitoring.yml
- import_playbook: disable-repos.yml
- import_playbook: final.yml

- name: Run post.yml hook
  vars:
@@ -37,12 +37,4 @@
  import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
  when: hook_path | exists

- name: Clean up and shutdown Packer VM
  hosts: builder
  gather_facts: no
  become: yes
  tasks:
    - import_tasks: cleanup.yml
    - community.general.shutdown:
...
9 changes: 9 additions & 0 deletions ansible/slurm.yml
@@ -63,3 +63,12 @@
    - include_role:
        name: stackhpc.openhpc
        tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"

- name: Setup Node Health Checks
  # Has to be done here as it requires openhpc repos etc for installation
  hosts: nhc:!builder
  become: yes
  tags: nhc
  tasks:
    - include_role:
        name: nhc
2 changes: 2 additions & 0 deletions docs/production.md
@@ -166,3 +166,5 @@ and referenced from the `site` and `production` environments, e.g.:
is properly tuned, again, demonstrated through testing.

- Enable alertmanager if Slack is available - see [docs/alerting.md](./alerting.md).

- Enable node health checks - see [ansible/roles/nhc/README.md](../ansible/roles/nhc/README.md).
4 changes: 2 additions & 2 deletions environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
{
  "cluster_image": {
    "RL8": "openhpc-RL8-250617-1558-2065722e",
    "RL9": "openhpc-RL9-250617-1557-2065722e"
    "RL8": "openhpc-RL8-250620-1251-d606a45c",
    "RL9": "openhpc-RL9-250620-1251-d606a45c"
  }
}
2 changes: 1 addition & 1 deletion environments/.stackhpc/tofu/main.tf
@@ -78,7 +78,7 @@ module "cluster" {
  standard = { # NB: can't call this default!
    nodes = ["compute-0", "compute-1"]
    flavor = var.other_node_flavor
    compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts"]
    compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts", "nhc"]
    ignore_image_changes = true
  }
  # Normally-empty partition for testing:
33 changes: 29 additions & 4 deletions environments/common/inventory/group_vars/all/openhpc.yml
@@ -37,26 +37,51 @@ openhpc_packages_default:
  - slurm-libpmi-ohpc # to allow intel mpi to work properly
  - ohpc-gnu12-openmpi4-perf-tools # for hpctests
  - openblas-gnu12-ohpc # for hpctests (HPL)
  - nhc-ohpc # node health checks
  # EPEL packages:
  - apptainer
  - podman-compose
openhpc_packages_extra: []
openhpc_packages: "{{ (openhpc_packages_default + openhpc_packages_extra) | select | list }}"
openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}"
openhpc_login_only_nodes: login
openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"

# additional site/environment-specific slurm.conf parameters:
# NB: override in environments/site/inventory/group_vars/all/openhpc.yml, not here:
openhpc_config_extra: {}

# default additional slurm.conf parameters for the appliance:
openhpc_config_default:
  SlurmctldParameters:
    - enable_configless
  TaskPlugin: task/cgroup,task/affinity
  ReturnToService: 2 # is stackhpc.openhpc default, but templating bug means it is needed here too
  ReturnToService: 2 # workaround for templating bug TODO: Remove once on stackhpc.openhpc v1.2.0

# default additional slurm.conf parameters when "rebuild" enabled:
openhpc_config_rebuild:
  RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild
  SlurmctldParameters:
    - reboot_from_controller
  ResumeTimeout: 300
openhpc_config_extra: {}
openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_rebuild if groups['rebuild'] | length > 0 else {}, openhpc_config_extra, list_merge='append') }}"
openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"

# default additional slurm.conf parameters when "nhc" enabled:
openhpc_config_nhc:
  HealthCheckProgram: /usr/sbin/nhc
  HealthCheckInterval: 300
  HealthCheckNodeState: NONDRAINED_IDLE

# indirection to allow automatic construction of slurm.conf parameters:
openhpc_config_groups:
  - enabled: "{{ groups['rebuild'] | length > 0 }}"
    config: "{{ openhpc_config_rebuild }}"
  - enabled: "{{ groups['nhc'] | length > 0 }}"
    config: "{{ openhpc_config_nhc }}"
  - enabled: true
    config: "{{ openhpc_config_extra }}"

# constructed slurm.conf parameters for stackhpc.openhpc role var:
openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_groups | selectattr('enabled') | map(attribute='config'), list_merge='append') }}"

openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhpc/ansible-slurm-appliance/pull/326

3 changes: 3 additions & 0 deletions environments/common/inventory/groups
@@ -182,3 +182,6 @@ extra_packages

[gateway]
# Add builder to this group to install gateway ansible-init playbook into image

[nhc]
# Hosts to configure for node health checks - either entire 'compute' group or empty
4 changes: 4 additions & 0 deletions environments/common/layouts/everything
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,7 @@ builder
[gateway:children]
# Add builder to this group to install gateway ansible-init playbook into image
builder

[nhc:children]
# Hosts to configure for node health checks
compute