diff --git a/ansible/.gitignore b/ansible/.gitignore
index 58c4c3511..d7f3e99b1 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -90,3 +90,5 @@ roles/*
 !roles/gateway/**
 !roles/alertmanager/
 !roles/alertmanager/**
+!roles/nhc/
+!roles/nhc/**
diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml
index 3fd7f267d..30a8abafa 100644
--- a/ansible/bootstrap.yml
+++ b/ansible/bootstrap.yml
@@ -134,6 +134,7 @@
 
 - hosts: dnf_repos
   become: yes
+  tags: dnf_repos
   tasks:
     - name: Check that creds won't be leaked to users
       ansible.builtin.assert:
diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml
deleted file mode 100644
index 3b68aee68..000000000
--- a/ansible/disable-repos.yml
+++ /dev/null
@@ -1,8 +0,0 @@
-- hosts: dnf_repos
-  become: yes
-  tasks:
-    - name: Disable pulp repos
-      ansible.builtin.include_role:
-        name: dnf_repos
-        tasks_from: disable_repos.yml
-      when: not dnf_repos_enabled | default(false) | bool
diff --git a/ansible/extras.yml b/ansible/extras.yml
index c7cacb877..8e3248d3f 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -58,17 +58,6 @@
     - import_role:
         name: persist_hostkeys
 
-
-- name: Setup NFS export for compute node configuration
-  hosts: compute_init:!builder
-  # NB: has to be after eeesi and os-manila-mount
-  tags: compute_init
-  become: yes
-  tasks:
-    - include_role:
-        name: compute_init
-        tasks_from: export.yml
-
 - name: Install k9s
   become: yes
   hosts: k9s
diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index 0b4335b14..422cc07c4 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -257,7 +257,7 @@
   import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
   when: hook_path | exists
 
-- import_playbook: disable-repos.yml
+- import_playbook: final.yml
 
 - hosts: builder
   become: yes
diff --git a/ansible/final.yml b/ansible/final.yml
new file mode 100644
index 000000000..57c8fce8b
--- /dev/null
+++ b/ansible/final.yml
@@ -0,0 +1,18 @@
+- hosts: dnf_repos
+  become: yes
+  tags: dnf_repos
+  tasks:
+    - name: Disable pulp repos
+      ansible.builtin.include_role:
+        name: dnf_repos
+        tasks_from: disable_repos.yml
+
+- name: Setup NFS export for compute_init
+  hosts: compute_init:!builder
+  # NB: done last so other roles can prepare configuration etc.
+  tags: compute_init
+  become: yes
+  tasks:
+    - include_role:
+        name: compute_init
+        tasks_from: export.yml
diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md
index 81a62bade..d1a7e854d 100644
--- a/ansible/roles/compute_init/README.md
+++ b/ansible/roles/compute_init/README.md
@@ -84,6 +84,7 @@ it also requires an image build with the role name added to the
 | slurm.yml | openhpc [10] | All slurmd functionality | No |
 | slurm.yml | (set memory limits) | Fully supported | No |
 | slurm.yml | (block ssh) | Fully supported | No |
+| slurm.yml | nhc | Fully supported | No |
 | portal.yml | (openondemand server) | Not relevant for compute nodes | n/a |
 | portal.yml | (openondemand vnc desktop) | None required - use image build | No |
 | portal.yml | (openondemand jupyter server) | None required - use image build | No |
diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml
index 086585a8d..96722e95c 100644
--- a/ansible/roles/compute_init/files/compute-init.yml
+++ b/ansible/roles/compute_init/files/compute-init.yml
@@ -19,6 +19,7 @@
     enable_basic_users: "{{ os_metadata.meta.basic_users | default(false) | bool }}"
     enable_eessi: "{{ os_metadata.meta.eessi | default(false) | bool }}"
     enable_chrony: "{{ os_metadata.meta.chrony | default(false) | bool }}"
+    enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"
 
     # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
     resolv_conf_nameservers: []
@@ -63,12 +64,12 @@
         mode: u=rX,g=rwX,o=
 
     - name: Mount /mnt/cluster
-      mount:
+      ansible.posix.mount:
         path: /mnt/cluster
         src: "{{ server_node_ip }}:/exports/cluster"
         fstype: nfs
         opts: ro,sync
-        state: mounted
+        state: ephemeral # will be unmounted after sync, don't want it in fstab
       register: _mount_mnt_cluster
       ignore_errors: true
      # exits from playbook if this failed below, allowing ansible-init to
@@ -350,6 +351,11 @@
         enabled: true
         state: started
 
+    - name: Provide NHC configuration
+      ansible.builtin.include_role:
+        name: nhc
+        tasks_from: boot.yml
+      when: enable_nhc
 
     - name: Ensure node is resumed
       # TODO: consider if this is always safe for all job states?
diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml
index 3226e13b8..f5c594cbc 100644
--- a/ansible/roles/compute_init/tasks/export.yml
+++ b/ansible/roles/compute_init/tasks/export.yml
@@ -98,3 +98,9 @@
       name: sshd
       tasks_from: export.yml
   when: "'sshd' in group_names"
+
+- name: Export generated NHC config
+  import_role:
+    name: nhc
+    tasks_from: export.yml
+  when: "'nhc' in group_names"
diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml
index 0638f7011..67f339c33 100644
--- a/ansible/roles/compute_init/tasks/install.yml
+++ b/ansible/roles/compute_init/tasks/install.yml
@@ -49,6 +49,8 @@
       dest: roles/
     - src: ../../lustre
       dest: roles/
+    - src: ../../nhc
+      dest: roles/
 
 - name: Add filter_plugins to ansible.cfg
   lineinfile:
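To make the boot-time wiring concrete: a minimal sketch, assuming the appliance's existing mechanism where each entry in a node group's `compute_init_enable` list (see the `main.tf` change later in this diff) becomes a truthy key in the instance metadata. The node name and metadata values here are hypothetical:

```yaml
# Hypothetical metadata as seen by ansible-init on compute-0:
os_metadata:
  meta:
    compute: true
    nhc: true

# compute-init.yml above then evaluates:
#   enable_nhc: "{{ os_metadata.meta.nhc | default(false) | bool }}"   # -> true
# so the nhc role's boot.yml runs and copies the node's exported nhc.conf into place.
```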
diff --git a/ansible/roles/nhc/README.md b/ansible/roles/nhc/README.md
new file mode 100644
index 000000000..8831e0eee
--- /dev/null
+++ b/ansible/roles/nhc/README.md
@@ -0,0 +1,66 @@
+# Node Health Checks (nhc)
+
+Deploys and configures the LBNL [Node Health Check](https://github.com/mej/nhc)
+(NHC), which puts nodes into the `DOWN` state if they fail periodic health
+checks.
+
+Due to the integration with Slurm, this is tightly linked to the configuration
+for the [stackhpc.openhpc](../stackhpc.openhpc/README.md) role.
+
+## Enabling
+
+By [default](../../../environments/common/inventory/group_vars/all/openhpc.yml)
+the required `nhc-ohpc` package is installed in all images.
+
+To enable node health checks, ensure the `nhc` group contains the `compute` group:
+
+```ini
+# environments/site/inventory/groups:
+[nhc:children]
+# Hosts to configure for node health checks
+compute
+```
+
+When the `ansible/site.yml` playbook is run this will automatically:
+1. Add NHC-related configuration to the `slurm.conf` Slurm configuration file.
+   The default configuration is defined in `openhpc_config_nhc`
+   (see [environments/common/inventory/group_vars/all/openhpc.yml](../../../environments/common/inventory/group_vars/all/openhpc.yml)).
+   It runs health checks every 300 seconds on all `IDLE` nodes which are not
+   `DRAINED` or `NOT_RESPONDING`. See the [slurm.conf parameters](https://slurm.schedmd.com/slurm.conf.html)
+   `HealthCheckInterval`, `HealthCheckNodeState` and `HealthCheckProgram`. These
+   may be overridden if required by redefining `openhpc_config_nhc` in e.g.
+   `environments/site/inventory/group_vars/nhc.yml`.
+
+2. Template out node health check rules using Ansible facts for each compute
+   node. Currently these check:
+   - Filesystem mounts
+   - Ethernet interfaces
+
+   See `/etc/nhc/nhc.conf` on a compute node for the full configuration.
+
+If a node health check run fails, Slurm will mark the node `DOWN`. With the
+default [alerting configuration](../../../docs/alerting.md) this will trigger
+an alert.
+
+## Role Variables
+
+- `nhc_config_template`: Template to use. Default is the in-role template
+  providing the rules described above.
+- `nhc_config_extra`: A possibly-multiline string defining [additional rules](https://github.com/mej/nhc/blob/master/README.md)
+  to add. Jinja templating may be used. Default is an empty string.
+
+## Structure
+
+This role contains three task files, which run at different times:
+- `main.yml`: Runs from `site.yml` -> `slurm.yml`. Templates health check
+  configuration to nodes.
+- `export.yml`: Runs from `site.yml` -> `final.yml` via the `compute_init`
+  role's `export.yml` tasks. Templates health check configuration to the
+  cluster NFS share for compute-init.
+- `boot.yml`: Runs on boot via `compute_init/files/compute-init.yml`. Copies
+  the node's generated health check configuration from the cluster share to
+  local disk.
+
+Note that the `stackhpc.openhpc` role:
+- Installs the required package
+- Configures the relevant `slurm.conf` parameters
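As an illustrative sketch of adding site-specific rules via the `nhc_config_extra` variable described above: the checks used here are standard NHC built-ins from the upstream documentation, but the file path and the `gpu` group are assumptions for illustration only.

```yaml
# e.g. environments/site/inventory/group_vars/nhc/overrides.yml (hypothetical path):
nhc_config_extra: |
  # ensure sshd is running as root, (re)starting it if not:
  * || check_ps_service -u root -S sshd
  # ensure /tmp is mounted read-write:
  * || check_fs_mount_rw /tmp
  {% if 'gpu' in group_names %}
  # Jinja templating may be used, e.g. gating a check on a hypothetical 'gpu' group:
  * || check_nv_healthmon
  {% endif %}
```

Because the variable is rendered per-host when the role templates `nhc.conf`, group- and host-specific conditions like the one above resolve against each target node.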
diff --git a/ansible/roles/nhc/defaults/main.yml b/ansible/roles/nhc/defaults/main.yml
new file mode 100644
index 000000000..d6ea0a0d5
--- /dev/null
+++ b/ansible/roles/nhc/defaults/main.yml
@@ -0,0 +1,2 @@
+nhc_config_template: nhc.conf.j2
+nhc_config_extra: ''
diff --git a/ansible/roles/nhc/tasks/boot.yml b/ansible/roles/nhc/tasks/boot.yml
new file mode 100644
index 000000000..b00da79b1
--- /dev/null
+++ b/ansible/roles/nhc/tasks/boot.yml
@@ -0,0 +1,8 @@
+- name: Copy stored NHC configuration to active location
+  ansible.builtin.copy:
+    remote_src: true
+    src: "/var/tmp/cluster/hostconfig/{{ ansible_hostname }}/nhc.conf"
+    dest: /etc/nhc/nhc.conf
+    owner: root
+    group: root
+    mode: u=rw,go=
diff --git a/ansible/roles/nhc/tasks/export.yml b/ansible/roles/nhc/tasks/export.yml
new file mode 100644
index 000000000..afa440ffb
--- /dev/null
+++ b/ansible/roles/nhc/tasks/export.yml
@@ -0,0 +1,6 @@
+# Used for compute-init
+- name: Template out host-specific NHC config
+  ansible.builtin.template:
+    src: "{{ nhc_config_template }}"
+    dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/nhc.conf"
+  delegate_to: "{{ groups['control'] | first }}"
diff --git a/ansible/roles/nhc/tasks/main.yml b/ansible/roles/nhc/tasks/main.yml
new file mode 100644
index 000000000..5f6034f7d
--- /dev/null
+++ b/ansible/roles/nhc/tasks/main.yml
@@ -0,0 +1,20 @@
+
+- name: Ensure NHC configuration directory exists
+  # When running site.yml after a login/control upgrade, the nhc group might
+  # be enabled in the repo, but as the compute nodes have not yet been
+  # upgraded they do not have the package and hence this directory
+  ansible.builtin.file:
+    path: /etc/nhc
+    state: directory
+    # to match nhc-ohpc install:
+    owner: root
+    group: root
+    mode: u=rwX,go=
+
+- name: Template out NHC configuration
+  ansible.builtin.template:
+    src: "{{ nhc_config_template }}"
+    dest: /etc/nhc/nhc.conf
+    owner: root
+    group: root
+    mode: u=rw,go= # HealthCheckProgram is run by root
diff --git a/ansible/roles/nhc/templates/nhc.conf.j2 b/ansible/roles/nhc/templates/nhc.conf.j2
new file mode 100644
index 000000000..cefad76a0
--- /dev/null
+++ b/ansible/roles/nhc/templates/nhc.conf.j2
@@ -0,0 +1,18 @@
+# {{ ansible_managed }}
+
+# Nodes have long FQDN hostnames but short nodenames
+* || HOSTNAME="$HOSTNAME_S"
+
+## Filesystem checks
+{% for mount in ansible_mounts %}
+{% set mount_mode = 'rw' if 'rw' in mount.options.split(',') else 'ro' %}
+{{ ansible_fqdn }} || check_fs_mount_{{ mount_mode }} -t "{{ mount.fstype }}" -s "{{ mount.device }}" -f "{{ mount.mount }}"
+{% endfor %}
+
+## Ethernet interface checks
+{% for iface in ansible_interfaces | select('match', 'eth') %}
+{{ ansible_fqdn }} || check_hw_eth {{ iface }}
+{% endfor %}
+
+## Site-specific checks
+{{ nhc_config_extra }}
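For illustration, on a hypothetical node with FQDN `compute-0.mycluster.internal`, a local root filesystem, one NFS mount and a single `eth0` interface, the template above would render roughly as follows (device names and filesystem types are assumptions):

```
# Ansible managed

# Nodes have long FQDN hostnames but short nodenames
* || HOSTNAME="$HOSTNAME_S"

## Filesystem checks
compute-0.mycluster.internal || check_fs_mount_rw -t "xfs" -s "/dev/vda4" -f "/"
compute-0.mycluster.internal || check_fs_mount_rw -t "nfs" -s "control:/exports/home" -f "/home"

## Ethernet interface checks
compute-0.mycluster.internal || check_hw_eth eth0

## Site-specific checks
```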
diff --git a/ansible/site.yml b/ansible/site.yml
index d973d9cb3..a211da17b 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -27,7 +27,7 @@
 - import_playbook: slurm.yml
 - import_playbook: portal.yml
 - import_playbook: monitoring.yml
-- import_playbook: disable-repos.yml
+- import_playbook: final.yml
 
 - name: Run post.yml hook
   vars:
@@ -37,12 +37,4 @@
   import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
   when: hook_path | exists
 
-- name: Clean up and shutdown Packer VM
-  hosts: builder
-  gather_facts: no
-  become: yes
-  tasks:
-    - import_tasks: cleanup.yml
-    - community.general.shutdown:
-
-...
\ No newline at end of file
diff --git a/ansible/slurm.yml b/ansible/slurm.yml
index d1bb93a9f..1583f97ba 100644
--- a/ansible/slurm.yml
+++ b/ansible/slurm.yml
@@ -63,3 +63,12 @@
       - include_role:
           name: stackhpc.openhpc
          tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}"
+
+- name: Setup Node Health Checks
+  # Has to be done here as it requires openhpc repos etc. for installation
+  hosts: nhc:!builder
+  become: yes
+  tags: nhc
+  tasks:
+    - include_role:
+        name: nhc
diff --git a/docs/production.md b/docs/production.md
index e52e9d180..cf36779cb 100644
--- a/docs/production.md
+++ b/docs/production.md
@@ -166,3 +166,5 @@ and referenced from the `site` and `production` environments, e.g.:
   is properly tuned, again, demonstrated through testing.
 
 - Enable alertmanager if Slack is available - see [docs/alerting.md](./alerting.md).
+
+- Enable node health checks - see [ansible/roles/nhc/README.md](../ansible/roles/nhc/README.md).
diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
index e2913c3f2..181aa7aa7 100644
--- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-250617-1558-2065722e",
-        "RL9": "openhpc-RL9-250617-1557-2065722e"
+        "RL8": "openhpc-RL8-250620-1251-d606a45c",
+        "RL9": "openhpc-RL9-250620-1251-d606a45c"
     }
 }
diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf
index ad1549164..c58fb3fc5 100644
--- a/environments/.stackhpc/tofu/main.tf
+++ b/environments/.stackhpc/tofu/main.tf
@@ -78,7 +78,7 @@ module "cluster" {
     standard = { # NB: can't call this default!
       nodes = ["compute-0", "compute-1"]
       flavor = var.other_node_flavor
-      compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts"]
+      compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts", "nhc"]
       ignore_image_changes = true
     }
     # Normally-empty partition for testing:
diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml
index 005bdf04d..9c6aca272 100644
--- a/environments/common/inventory/group_vars/all/openhpc.yml
+++ b/environments/common/inventory/group_vars/all/openhpc.yml
@@ -37,6 +37,7 @@ openhpc_packages_default:
   - slurm-libpmi-ohpc # to allow intel mpi to work properly
   - ohpc-gnu12-openmpi4-perf-tools # for hpctests
   - openblas-gnu12-ohpc # for hpctests (HPL)
+  - nhc-ohpc # node health checks
   # EPEL packages:
   - apptainer
   - podman-compose
@@ -44,19 +45,43 @@ openhpc_packages_extra: []
 openhpc_packages: "{{ (openhpc_packages_default + openhpc_packages_extra) | select | list }}"
 openhpc_munge_key: "{{ vault_openhpc_mungekey | b64decode }}"
 openhpc_login_only_nodes: login
+openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"
+
+# additional site/environment-specific slurm.conf parameters:
+# NB: override in environments/site/inventory/group_vars/all/openhpc.yml, not here:
+openhpc_config_extra: {}
+
+# default additional slurm.conf parameters for the appliance:
 openhpc_config_default:
   SlurmctldParameters:
     - enable_configless
   TaskPlugin: task/cgroup,task/affinity
-  ReturnToService: 2 # is stackhpc.openhpc default, but templating bug means it is needed here too
+  ReturnToService: 2 # workaround for templating bug; TODO: remove once on stackhpc.openhpc v1.2.0
+
+# default additional slurm.conf parameters when "rebuild" enabled:
 openhpc_config_rebuild:
   RebootProgram: /opt/slurm-tools/bin/slurm-openstack-rebuild
   SlurmctldParameters:
     - reboot_from_controller
   ResumeTimeout: 300
-openhpc_config_extra: {}
-openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_rebuild if groups['rebuild'] | length > 0 else {}, openhpc_config_extra, list_merge='append') }}"
-openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}"
+
+# default additional slurm.conf parameters when "nhc" enabled:
+openhpc_config_nhc:
+  HealthCheckProgram: /usr/sbin/nhc
+  HealthCheckInterval: 300
+  HealthCheckNodeState: NONDRAINED_IDLE
+
+# indirection to allow automatic construction of slurm.conf parameters:
+openhpc_config_groups:
+  - enabled: "{{ groups['rebuild'] | length > 0 }}"
+    config: "{{ openhpc_config_rebuild }}"
+  - enabled: "{{ groups['nhc'] | length > 0 }}"
+    config: "{{ openhpc_config_nhc }}"
+  - enabled: true
+    config: "{{ openhpc_config_extra }}"
+
+# constructed slurm.conf parameters for stackhpc.openhpc role var:
+openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_groups | selectattr('enabled') | map(attribute='config'), list_merge='append') }}"
 
 openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhpc/ansible-slurm-appliance/pull/326
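As a sketch of how the `openhpc_config_groups` indirection above resolves (assuming the `nhc` group is populated, `rebuild` is empty, and `openhpc_config_extra` is unset), the `combine(..., list_merge='append')` produces:

```yaml
openhpc_config:
  SlurmctldParameters:
    - enable_configless
  TaskPlugin: task/cgroup,task/affinity
  ReturnToService: 2
  HealthCheckProgram: /usr/sbin/nhc
  HealthCheckInterval: 300
  HealthCheckNodeState: NONDRAINED_IDLE
```

The NHC parameters therefore only appear in `slurm.conf` when the `nhc` group is non-empty, without sites having to restate the appliance defaults.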
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index b6c216ef6..5bf38aaa9 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -182,3 +182,6 @@ extra_packages
 
 [gateway]
 # Add builder to this group to install gateway ansible-init playbook into image
+
+[nhc]
+# Hosts to configure for node health checks - either the entire 'compute' group or empty
diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything
index 9f2c7c706..5590c8bb6 100644
--- a/environments/common/layouts/everything
+++ b/environments/common/layouts/everything
@@ -125,3 +125,7 @@ builder
 [gateway:children]
 # Add builder to this group to install gateway ansible-init playbook into image
 builder
+
+[nhc:children]
+# Hosts to configure for node health checks
+compute