diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 348edff35..9d8de76d7 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -109,7 +109,6 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook ansible/adhoc/generate-passwords.yml echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} @@ -135,6 +134,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible all -m wait_for_connection + ansible-playbook ansible/adhoc/generate-passwords.yml ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml @@ -170,6 +170,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible all -m wait_for_connection + ansible-playbook ansible/adhoc/generate-passwords.yml ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index 93dbd9502..58c4c3511 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -88,3 +88,5 @@ roles/* !roles/slurm_tools/** !roles/gateway/ !roles/gateway/** +!roles/alertmanager/ +!roles/alertmanager/** diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 5e515614a..0b4335b14 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -178,6 +178,12 @@ slurm_exporter_state: stopped when: "'slurm_exporter' in group_names" + - name: Install alertmanager + include_role: + name: alertmanager + tasks_from: install.yml + when: "'alertmanager' in group_names" + - hosts: prometheus become: yes gather_facts: yes diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 508f794cc..b5b92ed7e 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -11,16 +11,26 @@ import os.path import re -def prometheus_node_exporter_targets(hosts, env): +def prometheus_node_exporter_targets(hosts, hostvars, env_key, group): + """ Return a mapping in cloudalchemy.nodeexporter prometheus_targets + format. 
+ + hosts: list of inventory_hostnames + hostvars: Ansible hostvars variable + env_key: key to lookup in each host's hostvars to add as label 'env' (default: 'ungrouped') + group: string to add as label 'group' + """ result = [] per_env = defaultdict(list) for host in hosts: - per_env[env].append(host) + host_env = hostvars[host].get(env_key, 'ungrouped') + per_env[host_env].append(host) for env, hosts in per_env.items(): target = { - "targets": ["{target}:9100".format(target=target) for target in hosts], + "targets": [f"{target}:9100" for target in hosts], "labels": { - "env": env + 'env': env, + 'group': group } } result.append(target) diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index 44cbcf749..e97946212 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -86,3 +86,14 @@ grafana_dashboards: [] - import_role: # done in same play so it can use handlers from cloudalchemy.grafana name: grafana-dashboards + +- name: Deploy alertmanager + hosts: alertmanager + tags: alertmanager + become: yes + gather_facts: false + tasks: + - name: Configure alertmanager + include_role: + name: alertmanager + tasks_from: configure.yml diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md new file mode 100644 index 000000000..612761731 --- /dev/null +++ b/ansible/roles/alertmanager/README.md @@ -0,0 +1,97 @@ +# alertmanager + +Deploy [alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/) +to route Prometheus alerts to a receiver. Currently Slack is the only supported +receiver. + +Note that: +- HA configuration is not supported +- Alertmanager state is not preserved when the node it runs on (by default, + control node) is reimaged, so any alerts silenced via the GUI will reoccur. +- No Grafana dashboard for alerts is currently provided. + +Alertmanager is enabled by default on the `control` node in the +[everything](../../../environments/common/layouts/everything) template which +`cookiecutter` uses for a new environment's `inventory/groups` file. + +In general usage may only require: +- Adding the `control` node into the `alertmanager` group in `environments/site/groups` + if upgrading an existing environment. +- Enabling the Slack integration (see section below). +- Possibly setting `alertmanager_web_external_url`. + +The web UI is available on `alertmanager_web_external_url`. + +## Role variables + +All variables are optional. See [defaults/main.yml](defaults/main.yml) for +all default values. + +General variables: +- `alertmanager_version`: String, version (no leading 'v') +- `alertmanager_download_checksum`: String, checksum for relevant version from + [prometheus.io download page](https://prometheus.io/download/), in format + `type:value`. +- `alertmanager_download_dest`: String, path of temporary directory used for + download. Must exist. +- `alertmanager_binary_dir`: String, path of directory to install alertmanager + binary to. Must exist. +- `alertmanager_started`: Bool, whether the alertmanager service should be started. +- `alertmanager_enabled`: Bool, whether the alertmanager service should be enabled. +- `alertmanager_system_user`: String, name of user to run alertmanager as. Will be created. +- `alertmanager_system_group`: String, name of group of alertmanager user. +- `alertmanager_port`: Port to listen on. + +The following variables are equivalent to similarly-named arguments to the +`alertmanager` binary. 
See `man alertmanager` for more info:
+
+- `alertmanager_config_file`: String, path the main alertmanager config file
+  will be written to. Parent directory will be created if necessary.
+- `alertmanager_web_config_file`: String, path the alertmanager web config file
+  will be written to. Parent directory will be created if necessary.
+- `alertmanager_storage_path`: String, base path for data storage.
+- `alertmanager_web_listen_addresses`: List of strings, defining addresses to listen on.
+- `alertmanager_web_external_url`: String, the URL under which Alertmanager is
+  externally reachable - defaults to host IP address and `alertmanager_port`.
+  See man page for more details if proxying alertmanager.
+- `alertmanager_data_retention`: String, how long to keep data for.
+- `alertmanager_data_maintenance_interval`: String, interval between garbage
+  collection and snapshotting to disk of the silences and the notification logs.
+- `alertmanager_config_flags`: Mapping. Keys/values in here are written to the
+  alertmanager command line as `--{{ key }}={{ value }}`.
+- `alertmanager_default_receivers`: List of default receiver mappings: the
+  `null` receiver, plus the Slack receiver if Slack integration is enabled (see below).
+
+The following variables are templated into the alertmanager [main configuration](https://prometheus.io/docs/alerting/latest/configuration/):
+- `alertmanager_config_template`: String, path to configuration template. The default
+  is to template in `alertmanager_config_default` and `alertmanager_config_extra`.
+- `alertmanager_config_default`: Mapping with default configuration for the
+  top-level `route` and `receivers` keys. The default is to send all alerts to
+  the Slack receiver, if that has been enabled (see below).
+- `alertmanager_receivers`: A list of [receiver](https://prometheus.io/docs/alerting/)
+  mappings to define under the top-level `receivers` configuration key. This
+  will contain the Slack receiver if that has been enabled (see below).
+- `alertmanager_extra_receivers`: A list of additional [receiver](https://prometheus.io/docs/alerting/)
+  mappings to add; empty by default (see the example at the end of this README).
+- `alertmanager_slack_receiver`: Mapping defining the [Slack receiver](https://prometheus.io/docs/alerting/latest/configuration/#slack_config). Note the default configuration for this is in
+`environments/common/inventory/group_vars/all/alertmanager.yml`.
+- `alertmanager_slack_receiver_name`: String, name for the above Slack receiver.
+- `alertmanager_slack_receiver_send_resolved`: Bool, whether to send resolved alerts via the above Slack receiver.
+- `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined.
+- `alertmanager_config_extra`: Mapping with additional configuration. Keys in
+  this become top-level keys in the configuration. E.g. this might be:
+  ```yaml
+  alertmanager_config_extra:
+    global:
+      smtp_smarthost: smtp.example.org:587
+    time_intervals:
+      - name: monday-to-friday
+        time_intervals:
+          - weekdays: ['monday:friday']
+  ```
+  Note that `route` and `receivers` keys should not be added here.
+
+The following variables are templated into the alertmanager [web configuration](https://prometheus.io/docs/alerting/latest/https/):
+- `alertmanager_web_config_default`: Mapping with default configuration for
+  `basic_auth_users` providing the default web user.
+- `alertmanager_alertmanager_web_config_extra`: Mapping with additional web
+  configuration. Keys in this become top-level keys in the web configuration.
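For illustration, an additional email receiver could be added alongside the defaults via `alertmanager_extra_receivers`. This is only a sketch: the receiver name, addresses and smarthost below are placeholder values, not appliance defaults.

```yaml
# e.g. environments/site/inventory/group_vars/all/alertmanager.yml (illustrative location)
alertmanager_extra_receivers:
  - name: email-oncall                   # arbitrary receiver name
    email_configs:
      - to: oncall@example.org           # placeholder address
        from: alertmanager@example.org   # placeholder address
        smarthost: smtp.example.org:587  # placeholder SMTP relay
        send_resolved: true
```

A receiver defined this way would also need to be referenced from a route (e.g. by overriding `alertmanager_config_default`) before it actually receives alerts.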
diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml new file mode 100644 index 000000000..b30301739 --- /dev/null +++ b/ansible/roles/alertmanager/defaults/main.yml @@ -0,0 +1,50 @@ +alertmanager_version: '0.28.1' +alertmanager_download_checksum: 'sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311' +alertmanager_download_dest: /tmp/alertmanager.tar.gz +alertmanager_binary_dir: /usr/local/bin +alertmanager_started: true +alertmanager_enabled: true + +alertmanager_system_user: alertmanager +alertmanager_system_group: "{{ alertmanager_system_user }}" +alertmanager_config_file: /etc/alertmanager/alertmanager.yml +alertmanager_web_config_file: /etc/alertmanager/alertmanager-web.yml +alertmanager_storage_path: /var/lib/alertmanager + +alertmanager_port: '9093' +alertmanager_web_listen_addresses: + - ":{{ alertmanager_port }}" +alertmanager_web_external_url: '' # defined in environments/common/inventory/group_vars/all/alertmanager.yml for visibility + +alertmanager_data_retention: '120h' +alertmanager_data_maintenance_interval: '15m' +alertmanager_config_flags: {} # other command-line parameters as shown by `man alertmanager` +alertmanager_config_template: alertmanager.yml.j2 +alertmanager_web_config_template: alertmanager-web.yml.j2 + +alertmanager_web_config_default: + basic_auth_users: + alertmanager: "{{ vault_alertmanager_admin_password | password_hash('bcrypt', '1234567890123456789012', ident='2b') }}" +alertmanager_alertmanager_web_config_extra: {} # top-level only + +# Variables below are interpolated into alertmanager_config_default: + +# Uncomment below and add Slack bot app creds for Slack integration +# alertmanager_slack_integration: +# channel: '#alerts' +# app_creds: + +alertmanager_null_receiver: + name: 'null' +alertmanager_slack_receiver: {} # defined in environments/common/inventory/group_vars/all/alertmanager.yml as it needs prometheus_address +alertmanager_extra_receivers: [] +alertmanager_default_receivers: "{{ [alertmanager_null_receiver] + ([alertmanager_slack_receiver] if alertmanager_slack_integration is defined else []) }}" +alertmanager_receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}" + +alertmanager_config_default: + route: + group_by: ['...'] + receiver: "{{ alertmanager_slack_receiver_name if alertmanager_slack_integration is defined else 'null' }}" + receivers: "{{ alertmanager_receivers }}" + +alertmanager_config_extra: {} # top-level only diff --git a/ansible/roles/alertmanager/handlers/main.yml b/ansible/roles/alertmanager/handlers/main.yml new file mode 100644 index 000000000..ee87e1e3b --- /dev/null +++ b/ansible/roles/alertmanager/handlers/main.yml @@ -0,0 +1,6 @@ +- name: Restart alertmanager + systemd: + name: alertmanager + state: restarted + daemon_reload: "{{ _alertmanager_service.changed | default(false) }}" + when: alertmanager_started | bool diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml new file mode 100644 index 000000000..a43ec2041 --- /dev/null +++ b/ansible/roles/alertmanager/tasks/configure.yml @@ -0,0 +1,47 @@ +- name: Create alertmanager directories + ansible.builtin.file: + path: "{{ item }}" + state: directory + owner: "{{ alertmanager_system_user }}" + group: "{{ alertmanager_system_group }}" + mode: u=rwX,go=rX + loop: + - "{{ alertmanager_config_file | dirname }}" + - "{{ alertmanager_web_config_file | dirname }}" + - "{{ alertmanager_storage_path }}" + +- name: Create 
alertmanager service file with immutable options + template: + src: alertmanager.service.j2 + dest: /usr/lib/systemd/system/alertmanager.service + owner: root + group: root + mode: u=rw,go=r + register: _alertmanager_service + notify: Restart alertmanager + +- name: Template alertmanager config + ansible.builtin.template: + src: "{{ alertmanager_config_template }}" + dest: "{{ alertmanager_config_file }}" + owner: "{{ alertmanager_system_user }}" + group: "{{ alertmanager_system_group }}" + mode: u=rw,go= + notify: Restart alertmanager + +- name: Template alertmanager web config + ansible.builtin.template: + src: "{{ alertmanager_web_config_template }}" + dest: "{{ alertmanager_web_config_file }}" + owner: "{{ alertmanager_system_user }}" + group: "{{ alertmanager_system_group }}" + mode: u=rw,go= + notify: Restart alertmanager + +- meta: flush_handlers + +- name: Ensure alertmanager service state + systemd: + name: alertmanager + state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}" + enabled: "{{ alertmanager_enabled | bool }}" diff --git a/ansible/roles/alertmanager/tasks/install.yml b/ansible/roles/alertmanager/tasks/install.yml new file mode 100644 index 000000000..0f655da3d --- /dev/null +++ b/ansible/roles/alertmanager/tasks/install.yml @@ -0,0 +1,25 @@ +- name: Create alertmanager system user + ansible.builtin.user: + name: "{{ alertmanager_system_user }}" + system: true + create_home: false + +- name: Download alertmanager binary + ansible.builtin.get_url: + url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-amd64.tar.gz" + dest: "{{ alertmanager_download_dest }}" + owner: root + group: root + mode: u=rw,go= + checksum: "{{ alertmanager_download_checksum }}" + +- name: Unpack alertmanager binary + ansible.builtin.unarchive: + src: "{{ alertmanager_download_dest }}" + include: "alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager" + dest: "{{ alertmanager_binary_dir }}" + owner: root + group: root + mode: u=rwx,go=rx + remote_src: true + extra_opts: ['--strip-components=1', '--show-stored-names'] diff --git a/ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 b/ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 new file mode 100644 index 000000000..ba69f1694 --- /dev/null +++ b/ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 @@ -0,0 +1,4 @@ +{{ ansible_managed | comment }} + +{{ alertmanager_web_config_default | to_nice_yaml }} +{{ alertmanager_alertmanager_web_config_extra | to_nice_yaml if alertmanager_alertmanager_web_config_extra | length > 0 else '' }} diff --git a/ansible/roles/alertmanager/templates/alertmanager.service.j2 b/ansible/roles/alertmanager/templates/alertmanager.service.j2 new file mode 100644 index 000000000..ac457aaea --- /dev/null +++ b/ansible/roles/alertmanager/templates/alertmanager.service.j2 @@ -0,0 +1,53 @@ + + + +{{ ansible_managed | comment }} +[Unit] +Description=Prometheus Alertmanager +After=network-online.target +StartLimitInterval=0 +StartLimitIntervalSec=0 + +[Service] +Type=simple +PIDFile=/run/alertmanager.pid +User={{ alertmanager_system_user }} +Group={{ alertmanager_system_group }} +ExecReload=/bin/kill -HUP $MAINPID +ExecStart={{ alertmanager_binary_dir }}/alertmanager \ + --cluster.listen-address='' \ + --config.file={{ alertmanager_config_file }} \ + --storage.path={{ alertmanager_storage_path }} \ + --data.retention={{ alertmanager_data_retention }} \ + --data.maintenance-interval={{ 
alertmanager_data_maintenance_interval }} \ +{% for address in alertmanager_web_listen_addresses %} + --web.listen-address={{ address }} \ +{% endfor %} + --web.external-url={{ alertmanager_web_external_url }} \ + --web.config.file={{ alertmanager_web_config_file }} \ +{% for flag, flag_value in alertmanager_config_flags.items() %} + --{{ flag }}={{ flag_value }} \ +{% endfor %} + +SyslogIdentifier=alertmanager +Restart=always +RestartSec=5 + +CapabilityBoundingSet=CAP_SET_UID +LockPersonality=true +NoNewPrivileges=true +MemoryDenyWriteExecute=true +PrivateTmp=true +ProtectHome=true +ReadWriteDirectories={{ alertmanager_storage_path }} +RemoveIPC=true +RestrictSUIDSGID=true + +PrivateUsers=true +ProtectControlGroups=true +ProtectKernelModules=true +ProtectKernelTunables=yes +ProtectSystem=strict + +[Install] +WantedBy=multi-user.target diff --git a/ansible/roles/alertmanager/templates/alertmanager.yml.j2 b/ansible/roles/alertmanager/templates/alertmanager.yml.j2 new file mode 100644 index 000000000..6f0c1d126 --- /dev/null +++ b/ansible/roles/alertmanager/templates/alertmanager.yml.j2 @@ -0,0 +1,4 @@ +{{ ansible_managed | comment }} + +{{ alertmanager_config_default | to_nice_yaml }} +{{ alertmanager_config_extra | to_nice_yaml if alertmanager_config_extra | length > 0 else '' }} diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 139726c83..086585a8d 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -299,7 +299,9 @@ # if not the case - name: Write Munge key copy: - content: "{{ openhpc_munge_key }}" + # NB: openhpc_munge_key is *binary* and may not survive json encoding + # so do same as environments/common/inventory/group_vars/all/openhpc.yml + content: "{{ vault_openhpc_mungekey | b64decode }}" dest: "/etc/munge/munge.key" owner: munge group: munge diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index 6ec340249..8b6f6cdec 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -35,6 +35,13 @@ - python3.9 - dbus-x11 +- name: Stop turbovnc service + # This is not actually required + systemd: + name: tvncserver + state: stopped + enabled: false + - name: Replace OFED-installed init scripts ansible.builtin.copy: src: /etc/init.d.orig/ # trailing / to get contents diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 0dbe66dd8..95e3b6aca 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -11,6 +11,7 @@ slurm_appliance_secrets: vault_k3s_node_password: "{{ vault_k3s_node_password | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}" vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}" + vault_alertmanager_admin_password: "{{ vault_alertmanager_admin_password | default(lookup('password', '/dev/null')) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" diff --git a/docs/alerting.md b/docs/alerting.md new file mode 100644 index 000000000..b53c0fa40 --- /dev/null +++ b/docs/alerting.md @@ -0,0 +1,142 @@ +# Alerting + +The [prometheus.io 
docs](https://prometheus.io/docs/alerting/latest/overview/)
+describe the overall alerting process:
+
+> Alerting with Prometheus is separated into two parts. Alerting rules in
+  Prometheus servers send alerts to an Alertmanager. The Alertmanager then
+  manages those alerts, including silencing, inhibition, aggregation and
+  sending out notifications via methods such as email, on-call notification
+  systems, and chat platforms.
+
+The general Prometheus configuration is described in
+[monitoring-and-logging.md](./monitoring-and-logging.md#defaults-3); note that it
+specifies some role variables which commonly need modification.
+
+The alertmanager server is deployed by the [ansible/roles/alertmanager](../ansible/roles/alertmanager/README.md)
+role; all the configuration options and defaults are documented there. The defaults
+are fully functional, except that a [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver)
+must be configured to generate notifications.
+
+## Enabling alertmanager
+
+1. Ensure both the `prometheus` and `alertmanager` servers are deployed on the
+control node - for new environments the `cookiecutter` tool will have done
+this:
+
+    ```ini
+    # environments/site/groups:
+    [prometheus:children]
+    control
+
+    [alertmanager:children]
+    control
+    ```
+
+2. If the appliance was deployed before the alertmanager functionality was included,
+generate a password for the alertmanager UI user:
+
+    ```shell
+    ansible-playbook ansible/adhoc/generate-passwords.yml
+    ```
+
+3. Configure a receiver to generate notifications from alerts. Currently a Slack
+integration is provided (see below) but alternative receivers could be defined
+by overriding role defaults.
+
+4. If desired, any other [role defaults](../ansible/roles/alertmanager/README.md)
+may be overridden in e.g. `environments/site/inventory/group_vars/all/alertmanager.yml`.
+
+5. Run the `monitoring.yml` playbook (if the cluster is already up) to configure
+both alertmanager and prometheus:
+
+    ```shell
+    ansible-playbook ansible/monitoring.yml
+    ```
+
+## Access
+
+There is a web interface provided by the alertmanager server. The default
+address can be seen using:
+
+```shell
+ansible localhost -m debug -a var=alertmanager_web_external_url
+```
+
+The user is `alertmanager` and the autogenerated password can be seen using:
+
+```shell
+ansible localhost -m debug -a var=vault_alertmanager_admin_password
+```
+
+## Slack receiver
+
+This section describes how to enable the Slack receiver to provide notifications
+of alerts via Slack.
+
+1. Create an app with a bot token:
+
+- Go to https://api.slack.com/apps
+- select "Create an App"
+- select "From scratch"
+- Set app name and workspace fields, select "Create"
+- Fill out "Short description" and "Background color" fields, select "Save changes"
+- Select "OAuth & Permissions" on left menu
+- Under "Scopes : Bot Token Scopes", select "Add an OAuth Scope", add
+  `chat:write` and select "Save changes"
+- Select "Install App" on left menu, select "Install to your-workspace", select Allow
+- Copy the Bot User OAuth token shown
+
+2.
Add the bot token into the config and enable Slack integration: + +- Open `environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml` +- Uncomment `vault_alertmanager_slack_integration_app_creds` and add the token +- Vault-encrypt that file: + + ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml + +- Open `environments/$ENV/inventory/group_vars/all/alertmanager.yml` +- Uncomment the `alertmanager_slack_integration` mapping and set your alert channel name + +3. Invite the bot to your alerts channel +- In the appropriate Slack channel type: + + /invite @YOUR_BOT_NAME + + +## Alerting Rules + +These are part of [Prometheus configuration](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) +which is defined for the appliance at +[environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). + +Two [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) +role variables are relevant: +- `prometheus_alert_rules_files`: Paths to check for files providing rules. + Note these are copied to Prometheus config directly, so jinja expressions for + Prometheus do not need escaping. +- `prometheus_alert_rules`: Yaml-format rules. Jinja templating here will be +interpolated by Ansible, so templating intended for Prometheus must be escaped +using `{% raw %}`/`{% endraw %}` tags. + +By default, `prometheus_alert_rules_files` is set so that any `*.rules` files +in a directory `files/prometheus/rules` in the current environment or *any* +parent environment are loaded. So usually, site-specific alerts should be added +by creating additional rules files in `environments/site/files/prometheus/rules`. +If the same file exists in more than one environment, the "child" file will take +precedence and any rules in the "parent" file will be ignored. + +A set of default alert rule files is provided at `environments/common/files/prometheus/rules/`. +These cover: +- Some node-exporter metrics for disk, filesystems, memory and clock. Note + no alerts are triggered on memory for compute nodes due to the intended use + of those nodes. +- Slurm nodes in DOWN or FAIL states, or the Slurm DBD message queue being too + large, usually indicating a database problem. + +When defining additional rules, note the [labels defined](./monitoring-and-logging.md#prometheus_node_exporter_targets) for node-exporter targets. + +In future more alerts may be added for: +- smartctl-exporter-based rules for baremetal nodes where there is no + infrastructure-level smart monitoring +- loss of "up" network interfaces diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index db228d410..6913c285f 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -215,6 +215,12 @@ Internally, we use the [cloudalchemy.prometheus](https://github.com/cloudalchemy > [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) +Prometheus will be functional by default but the following variables should +commonly be modified: +- `prometheus_web_external_url` +- `prometheus_storage_retention` +- `prometheus_storage_retention_size` + ### Placement The `prometheus` group determines the placement of the prometheus service. Load balancing is currently unsupported so it is important that you only assign one host to this group. 
@@ -240,12 +246,7 @@ This appliance provides a default set of recording rules which can be found here The intended purpose is to pre-compute some expensive queries that are used in the reference set of grafana dashboards. -To add new, or to remove rules you will be to adjust the `prometheus_alert_rules_files` variable. The default value can be found in: - -> [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) - -You can extend this variable in your environment specific configuration to reference extra files or to remove the defaults. The reference set of dashboards expect these variables to be defined, so if you remove them, you -will also have to update your dashboards. +For information on configuring alerting rules see [docs/alerting.md#alerting-rules](./alerting.md#alerting-rules). ### node_exporter @@ -273,7 +274,14 @@ Variables in this file should *not* be customised directly, but should be overri #### prometheus_node_exporter_targets -Groups prometheus targets into per environment groups. The ansible variable, `env` is used to determine the grouping. The metrics for each target in the group are given the prometheus label, `env: $env`, where `$env` is the value of the `env` variable for that host. +Groups prometheus targets. Metrics from `node_exporter` hosts have two labels +applied: + - `env`: This is set from the Ansible variable `prometheus_env` if present + (e.g. from hostvars or groupvars), defaulting to `ungrouped`. This can be + used to group metrics by some arbitrary "environment", e.g. rack. + - `group`: This refers to the "top-level" inventory group for the host and + is one of `control`, `login`, `compute` or `other`. This can be used to + define rules for specific host functionalities. ## slurm-stats diff --git a/docs/production.md b/docs/production.md index c15298887..7fcff1d7e 100644 --- a/docs/production.md +++ b/docs/production.md @@ -149,3 +149,5 @@ and referenced from the `site` and `production` environments, e.g.: raised using [shards](https://specs.openstack.org/openstack/nova-specs/specs/2024.1/implemented/ironic-shards.html). In general it should be possible to raise this value to 50-100 if the cloud is properly tuned, again, demonstrated through testing. + +- Enable alertmanager if Slack is available - see [docs/alerting.md](./alerting.md). 
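As a sketch of the site-specific alerting rules described in `docs/alerting.md` above, a file such as `environments/site/files/prometheus/rules/site.rules` might contain something like the following; the file name, alert name, threshold and severity are illustrative only:

```yaml
# Illustrative site-specific rule file, picked up via prometheus_alert_rules_files.
# Prometheus {{ ... }} templating can be used directly as these files are not
# templated by Ansible.
groups:
- name: site
  rules:
  - alert: LoginNodeHighLoad
    # uses the 'group' label applied to node-exporter targets (see monitoring docs)
    expr: node_load15{group="login"} > 50
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: Sustained high load on login node (instance {{ $labels.instance }})
```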
diff --git a/environments/.caas/inventory/everything
deleted file mode 120000
index dc66b9576..000000000
--- a/environments/.caas/inventory/everything
+++ /dev/null
@@ -1 +0,0 @@
-../../../environments/common/layouts/everything
\ No newline at end of file
diff --git a/environments/.caas/inventory/groups
new file mode 100644
index 000000000..f5665790f
--- /dev/null
+++ b/environments/.caas/inventory/groups
@@ -0,0 +1,127 @@
+[nfs:children]
+openhpc
+
+[mysql:children]
+control
+
+[prometheus:children]
+control
+
+[grafana:children]
+control
+
+[alertmanager]
+# Don't want this for caas
+
+[node_exporter:children]
+cluster
+
+[opensearch:children]
+control
+
+[slurm_stats:children]
+control
+
+[filebeat:children]
+slurm_stats
+
+# NB: [rebuild] not defined here as likely to need features not currently supported
+
+[update:children]
+
+[fail2ban:children]
+# Hosts to install fail2ban on to protect SSH
+login
+
+[block_devices:children]
+# Environment-specific so not defined here
+
+[basic_users:children]
+# Add `openhpc` group to add Slurm users via creation of users on each node.
+openhpc
+
+[openondemand:children]
+# Host to run Open Ondemand server on - subset of login
+login
+
+[openondemand_desktop:children]
+# Subset of compute to run interactive desktops on via Open Ondemand
+compute
+
+[openondemand_jupyter:children]
+# Subset of compute to run Jupyter Notebook servers on via Open Ondemand
+compute
+
+[etc_hosts:children]
+# Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md
+cluster
+
+[cuda]
+# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
+
+[eessi:children]
+# Hosts on which EESSI stack should be configured
+openhpc
+
+[resolv_conf]
+# Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md
+
+[proxy]
+# Hosts to configure http/s proxies - see ansible/roles/proxy/README.md
+
+[manila]
+# Hosts to configure for manila fileshares
+
+[persist_hostkeys:children]
+# Hosts to use common set of hostkeys which persist across reimaging.
+login
+openondemand
+
+[squid]
+# Hosts to run squid proxy
+
+[tuned:children]
+# Hosts to run TuneD configuration
+
+[ansible_init:children]
+# Hosts to run linux-ansible-init
+cluster
+
+[sssd]
+# Hosts to configure sssd on
+
+[sshd]
+# Hosts where the OpenSSH server daemon should be configured
+
+[compute_init]
+# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
+
+[k3s_server:children]
+# Hosts to run k3s server (should only be a single node, i.e. the control node)
+#control
+
+[k3s_agent:children]
+# Hosts to run k3s agent
+#compute
+#login
+
+[k9s:children]
+# Hosts to install k9s on
+#control
+
+[lustre]
+# Hosts to run lustre client
+
+[extra_packages:children]
+# Hosts to install specified additional packages on
+builder
+
+[cacerts]
+# Hosts to configure CA certificates and trusts on
+
+[chrony]
+# Hosts where chrony configuration is applied. See docs/chrony.md for more details.
+ +[gateway:children] +# Add builder to this group to install gateway ansible-init playbook into image +builder diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 3f449a8bf..56d3e237c 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250409-0953-f5aefb1e", - "RL9": "openhpc-RL9-250409-0953-f5aefb1e" + "RL8": "openhpc-RL8-250422-1328-1a6eff86", + "RL9": "openhpc-RL9-250422-1328-1a6eff86" } } diff --git a/environments/common/files/prometheus/rules/node-exporter.rules b/environments/common/files/prometheus/rules/node-exporter.rules new file mode 100644 index 000000000..95c3ea9c9 --- /dev/null +++ b/environments/common/files/prometheus/rules/node-exporter.rules @@ -0,0 +1,177 @@ +# Mostly taken from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware +# If modified, this is noted in a comment. +# +# In general have ignored lack of resources (memory, cpu) on compute nodes as +# this is expected, and ignored things which will be hard to threshold due to +# the nature of a Slurm cluster. + +groups: +- name: node-exporter + rules: + + # Modified: ignore compute nodes + - alert: HostOutOfMemory + expr: ( + node_memory_MemAvailable_bytes{group!~"compute"} / + node_memory_MemTotal_bytes{group!~"compute"} + < .10 + ) + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
+ - alert: HostOutOfDiskSpace + expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0) + for: 2m + labels: + severity: critical + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: 'Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: HostOutOfInodes + expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) + for: 2m + labels: + severity: critical + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # Modified: ignore /run/credentials paths + - alert: HostFilesystemDeviceError + expr: node_filesystem_device_error{ + fstype!~"^(fuse.*|tmpfs|cifs|nfs)", + mountpoint!~"/run/credentials/.*" + } == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # TODO: make tunable + - alert: HostUnusualDiskWriteLatency + expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10 + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. 
Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: (node_systemd_unit_state{state="failed"} == 1) + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSoftwareRaidDiskFailure + expr: (node_md_disks{state="failed"} > 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host software RAID disk failure (instance {{ $labels.instance }}) + description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # Modified: ignore compute nodes + - alert: HostOomKillDetected + expr: (increase(node_vmstat_oom_kill{group!~"compute"}[1m]) > 0) + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: (node_edac_uncorrectable_errors_total > 0) + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: ((node_bonding_active - node_bonding_slaves) != 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: 
"Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: (node_reboot_required > 0) + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/environments/common/files/prometheus/rules/slurm.rules b/environments/common/files/prometheus/rules/slurm.rules new file mode 100644 index 000000000..be17d66a2 --- /dev/null +++ b/environments/common/files/prometheus/rules/slurm.rules @@ -0,0 +1,16 @@ + +groups: +- name: Slurm + rules: + - alert: SlurmNodeDown + annotations: + description: '{{ $value }} Slurm nodes are in down status' + summary: 'At least one Slurm node is down.' + expr: "slurm_nodes_down > 0\n" + labels: + severity: critical + - alert: SlurmNodeFail + annotations: + description: '{{ $value }} Slurm nodes are in fail status' + summary: 'At least one Slurm node is failed.' + expr: "slurm_nodes_fail > 0\n" diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml new file mode 100644 index 000000000..c677aaa29 --- /dev/null +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -0,0 +1,18 @@ + +alertmanager_port: '9093' # defined here as required for prometheus + +alertmanager_slack_receiver_name: slack-receiver +alertmanager_slack_receiver_send_resolved: true +alertmanager_slack_receiver: # defined here as needs prometheus address + name: "{{ alertmanager_slack_receiver_name }}" + slack_configs: + - channel: "{{ alertmanager_slack_integration.channel | default('none') }}" + api_url: https://slack.com/api/chat.postMessage + http_config: + authorization: + credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}" + text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" + title_link: "{{ prometheus_web_external_url }}/alerts?receiver={{ alertmanager_slack_receiver_name }}" + send_resolved: "{{ alertmanager_slack_receiver_send_resolved }}" + +alertmanager_web_external_url: "http://{{ hostvars[groups['alertmanager'].0].ansible_host }}:{{ alertmanager_port}}/" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 27a4ee0e6..1809d3485 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -22,7 +22,7 @@ prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}" openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}" grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}" k3s_server_name: "{{ hostvars[groups['k3s_server'] | first].ansible_host }}" - +alertmanager_address: "{{ hostvars[groups['alertmanager'].0].api_address }}" ############################# bootstrap: local user 
configuration ######################### # Note RockyLinux 8.5 defines system user/groups in range 201-999 diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 87da90e4a..6c40e66bb 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -4,22 +4,42 @@ # for variable definitions prometheus_version: 2.27.0 # default from ansible/roles/cloudalchemy.prometheus/defaults/main.yml -prometheus_web_external_url: "http://{{ prometheus_address }}:9090" +prometheus_web_external_url: "http://{{ hostvars[groups['prometheus'].0].ansible_host }}:9090/" # default to host IP address prometheus_storage_retention: "31d" prometheus_storage_retention_size: "100GB" prometheus_db_dir: "{{ appliances_state_dir | default('/var/lib') }}/prometheus" -prometheus_alertmanager_config: [] +prometheus_alertmanager_config_default: + - static_configs: + - targets: + - "{{ alertmanager_address }}:{{ alertmanager_port }}" + basic_auth: + username: alertmanager + password: "{{ vault_alertmanager_admin_password }}" + +prometheus_alertmanager_config_extra: [] +prometheus_alertmanager_config: "{{ (prometheus_alertmanager_config_default if groups['alertmanager'] else []) + prometheus_alertmanager_config_extra }}" -prometheus_alert_rules_files: -- "{{ appliances_repository_root }}/environments/common/files/prometheus/rules/*.rules" +# By default, find rule files from the following path relative to current and all parent environment inventory directories: +# Note: If the same file exists in parent and child environments, only the file in the latter has any effect. +prometheus_alert_rules_files_inventory_glob: ../files/prometheus/rules/*.rules +prometheus_alert_rules_files: "{{ ansible_inventory_sources | product([prometheus_alert_rules_files_inventory_glob]) | map('join', '/') | map('realpath') }}" -prometheus_alert_rules: [] +prometheus_alert_rules: + - alert: SlurmDBDQueueLarge + # NB: {{ templates }} in annotations.description are interpolated by prometheus, in expr by ansible + annotations: + description: '{% raw %}Slurm DBD message queue size {{ $value }} is larger than half Slurm parameter MaxDBDMsgs - check database health{% endraw %}' + summary: 'Slurm DBD message queue is large.' + expr: "slurm_scheduler_dbd_queue_size > {{ hostvars[groups['control'].0].ansible_local.slurm.MaxDBDMsgs | int }}" -# Can set a hostvar 'env' to an arbitrary string to group prometheus targets, e.g. rack. -# env: location-1 +# Can set a hostvar 'prometheus_env' to an arbitrary string to group prometheus targets, e.g. by rack. 
 prometheus_targets:
-  node: "{{ groups.get('node_exporter', []) | reject('equalto', 'localhost') | prometheus_node_exporter_targets(env | default('ungrouped')) }}"
+  control: "{{ groups.get('node_exporter', []) | intersect(groups['control']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'control') }}"
+  login: "{{ groups.get('node_exporter', []) | intersect(groups['login']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'login') }}"
+  compute: "{{ groups.get('node_exporter', []) | intersect(groups['compute']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'compute') }}"
+  # openhpc is defined as control+login+compute so this gets any other node exporter targets:
+  other: "{{ groups.get('node_exporter', []) | difference(groups['openhpc']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'other') }}"
 
 prometheus_scrape_configs_default:
   - job_name: "prometheus"
@@ -34,7 +54,10 @@ prometheus_scrape_configs_default:
   - job_name: "node"
     file_sd_configs:
       - files:
-      - "/etc/prometheus/file_sd/node.yml"
+      - /etc/prometheus/file_sd/control.yml
+      - /etc/prometheus/file_sd/login.yml
+      - /etc/prometheus/file_sd/compute.yml
+      - /etc/prometheus/file_sd/other.yml
     relabel_configs:
     # strip off port
    - source_labels: ['__address__']
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 2b1d0ce81..1fc2a8424 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -37,7 +37,7 @@ mysql
 # Single node to host monitoring dashboards.
 
 [alertmanager]
-# TODO:
+# Single node to host alertmanager
 
 [opensearch]
 # Single node to host ElasticSearch search engine for Slurm monitoring.
diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml
new file mode 100644
index 000000000..4a46b7976
--- /dev/null
+++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml
@@ -0,0 +1,6 @@
+# Uncomment below and add Slack bot app creds in the adjacent file
+# vault_alertmanager.yml for Slack integration:
+#
+# alertmanager_slack_integration:
+#   channel: '#alerts'
+#   app_creds: "{{ vault_alertmanager_slack_integration_app_creds }}"
diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml
new file mode 100644
index 000000000..4375ed725
--- /dev/null
+++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml
@@ -0,0 +1,3 @@
+# Add a bot token here THEN VAULT-ENCRYPT this file!
+
+#vault_alertmanager_slack_integration_app_creds: ''
diff --git a/requirements.yml b/requirements.yml
index 15a6e5c4b..87b2a6263 100644
--- a/requirements.yml
+++ b/requirements.yml
@@ -4,7 +4,7 @@ roles:
     version: v25.3.2
     name: stackhpc.nfs
   - src: https://github.com/stackhpc/ansible-role-openhpc.git
-    version: v0.27.0
+    version: v0.28.0
    name: stackhpc.openhpc
  - src: https://github.com/stackhpc/ansible-node-exporter.git
    version: stackhpc
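As a usage sketch for the `prometheus_env` grouping introduced in the prometheus_targets change above, a group or host variable can be set so node-exporter targets for those hosts are labelled accordingly; the group name, rack label and hostnames below are illustrative only:

```yaml
# e.g. environments/site/inventory/group_vars/rack_a1/prometheus.yml (illustrative path)
prometheus_env: rack-a1

# The prometheus_node_exporter_targets filter then emits file_sd entries like
# (hostnames illustrative):
# - targets: ['compute-0:9100', 'compute-1:9100']
#   labels:
#     env: rack-a1
#     group: compute
```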