From 52e4daa74ec3586b9fc13ac5e62a16c4893e4c01 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 9 Apr 2025 16:11:09 +0000
Subject: [PATCH 01/33] WIP: add alertmanager

---
 ansible/.gitignore                            |  2 +
 ansible/monitoring.yml                        | 16 ++++++
 ansible/roles/alertmanager/README.md          |  6 +++
 ansible/roles/alertmanager/defaults/main.yml  | 45 +++++++++++++++++
 ansible/roles/alertmanager/handlers/main.yml  |  6 +++
 .../roles/alertmanager/tasks/configure.yml    | 40 +++++++++++++++
 ansible/roles/alertmanager/tasks/install.yml  | 25 ++++++++++
 ansible/roles/alertmanager/tasks/main.yml     |  0
 .../templates/alertmanager.service.j2         | 50 +++++++++++++++++++
 .../templates/alertmanager.yml.j2             |  4 ++
 .../inventory/group_vars/all/alertmanager.yml | 11 ++++
 environments/common/inventory/groups          |  2 +-
 12 files changed, 206 insertions(+), 1 deletion(-)
 create mode 100644 ansible/roles/alertmanager/README.md
 create mode 100644 ansible/roles/alertmanager/defaults/main.yml
 create mode 100644 ansible/roles/alertmanager/handlers/main.yml
 create mode 100644 ansible/roles/alertmanager/tasks/configure.yml
 create mode 100644 ansible/roles/alertmanager/tasks/install.yml
 create mode 100644 ansible/roles/alertmanager/tasks/main.yml
 create mode 100644 ansible/roles/alertmanager/templates/alertmanager.service.j2
 create mode 100644 ansible/roles/alertmanager/templates/alertmanager.yml.j2
 create mode 100644 environments/common/inventory/group_vars/all/alertmanager.yml

diff --git a/ansible/.gitignore b/ansible/.gitignore
index 93dbd9502..58c4c3511 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -88,3 +88,5 @@ roles/*
 !roles/slurm_tools/**
 !roles/gateway/
 !roles/gateway/**
+!roles/alertmanager/
+!roles/alertmanager/**
diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml
index 44cbcf749..f7ca2edb1 100644
--- a/ansible/monitoring.yml
+++ b/ansible/monitoring.yml
@@ -86,3 +86,19 @@ grafana_dashboards: []
     - import_role: # done in same play so it can use handlers from cloudalchemy.grafana
         name: grafana-dashboards
+
+- name: Deploy alertmanager
+  hosts: alertmanager
+  tags: alertmanager
+  become: yes
+  gather_facts: false
+  tasks:
+    # TODO: move elsewhere, still needs become
+    - name: Install alertmanager
+      include_role:
+        name: alertmanager
+        tasks_from: install.yml
+    - name: Configure alertmanager
+      include_role:
+        name: alertmanager
+        tasks_from: configure.yml
diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md
new file mode 100644
index 000000000..323bebdef
--- /dev/null
+++ b/ansible/roles/alertmanager/README.md
@@ -0,0 +1,6 @@
+# alertmanager
+
+
+notes:
+- HA is not supported
+- state ("notification state and configured silences") is not preserved across rebuild
diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml
new file mode 100644
index 000000000..9ba8f919c
--- /dev/null
+++ b/ansible/roles/alertmanager/defaults/main.yml
@@ -0,0 +1,45 @@
+alertmanager_version: '0.28.1'
+alertmanager_download_checksum: 'sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311'
+alertmanager_download_dest: /tmp/alertmanager.tar.gz
+alertmanager_binary_dir: /usr/local/bin
+alertmanager_started: true
+alertmanager_enabled: true
+
+alertmanager_system_user: alertmanager
+alertmanager_system_group: alertmanager
+alertmanager_config_path: /etc/alertmanager/alertmanager.yml
+alertmanager_storage_dir: /var/lib/alertmanager
+alertmanager_web_listen_addresses:
+  - ':9100'
+alertmanager_web_external_url: http://localhost:9093/
+alertmanager_config_flags: {}
+# TODO: data retention?
+alertmanager_config_template: alertmanager.yml.j2
+
+
+# everything below here is interpolated into alertmanager_config_default:
+
+# Uncomment below and add Slack bot app creds for Slack integration
+# alertmanager_slack_integration:
+#   channel: '#alerts'
+#   app_creds:
+
+
+alertmanager_default_receivers:
+  - name: 'null'
+
+alertmanager_slack_receiver: {} # really defined in common as it needs prometheus_address
+
+alertmanager_extra_receivers: "{{ [alertmanager_slack_receiver] if alertmanager_slack_integration is defined else [] }}"
+
+alertmanager_config_default:
+  route:
+    group_by: ['...']
+    receiver: "{{ 'slack-receiver' if alertmanager_slack_integration is defined else 'null' }}"
+  receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}"
+
+alertmanager_config_extra: {} # top-level only
+
+
+# TODO: routes??
+# TODO: see PR with additional alerts
diff --git a/ansible/roles/alertmanager/handlers/main.yml b/ansible/roles/alertmanager/handlers/main.yml
new file mode 100644
index 000000000..ee87e1e3b
--- /dev/null
+++ b/ansible/roles/alertmanager/handlers/main.yml
@@ -0,0 +1,6 @@
+- name: Restart alertmanager
+  systemd:
+    name: alertmanager
+    state: restarted
+    daemon_reload: "{{ _alertmanager_service.changed | default(false) }}"
+  when: alertmanager_started | bool
diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml
new file mode 100644
index 000000000..5c01f3f98
--- /dev/null
+++ b/ansible/roles/alertmanager/tasks/configure.yml
@@ -0,0 +1,40 @@
+- name: Create alertmanager directories
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    owner: "{{ alertmanager_system_user }}"
+    group: "{{ alertmanager_system_group }}"
+    mode: u=rwX,go=rX
+  loop:
+    - "{{ alertmanager_config_path | dirname }}"
+    - "{{ alertmanager_storage_dir }}"
+
+# TODO: selinux?
+
+- name: Create alertmanager service file with immutable options
+  template:
+    src: alertmanager.service.j2
+    dest: /usr/lib/systemd/system/alertmanager.service
+    owner: root
+    group: root
+    mode: u=rw,go=r
+  register: _alertmanager_service
+  notify: Restart alertmanager
+  # TODO: how do we cope with the binary changing?
+
+- name: Template alertmanager config
+  ansible.builtin.template:
+    src: "{{ alertmanager_config_template }}"
+    dest: "{{ alertmanager_config_path }}"
+    owner: "{{ alertmanager_system_user }}"
+    group: "{{ alertmanager_system_group }}"
+    mode: u=rw,go=r # TODO: check there are no sensitive things in here!
+  notify: Restart alertmanager
+
+- meta: flush_handlers
+
+- name: Ensure alertmanager service state
+  systemd:
+    name: alertmanager
+    state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}"
+    enabled: "{{ alertmanager_enabled | bool }}"
diff --git a/ansible/roles/alertmanager/tasks/install.yml b/ansible/roles/alertmanager/tasks/install.yml
new file mode 100644
index 000000000..0f655da3d
--- /dev/null
+++ b/ansible/roles/alertmanager/tasks/install.yml
@@ -0,0 +1,25 @@
+- name: Create alertmanager system user
+  ansible.builtin.user:
+    name: "{{ alertmanager_system_user }}"
+    system: true
+    create_home: false
+
+- name: Download alertmanager binary
+  ansible.builtin.get_url:
+    url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-amd64.tar.gz"
+    dest: "{{ alertmanager_download_dest }}"
+    owner: root
+    group: root
+    mode: u=rw,go=
+    checksum: "{{ alertmanager_download_checksum }}"
+
+- name: Unpack alertmanager binary
+  ansible.builtin.unarchive:
+    src: "{{ alertmanager_download_dest }}"
+    include: "alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager"
+    dest: "{{ alertmanager_binary_dir }}"
+    owner: root
+    group: root
+    mode: u=rwx,go=rx
+    remote_src: true
+    extra_opts: ['--strip-components=1', '--show-stored-names']
diff --git a/ansible/roles/alertmanager/tasks/main.yml b/ansible/roles/alertmanager/tasks/main.yml
new file mode 100644
index 000000000..e69de29bb
diff --git a/ansible/roles/alertmanager/templates/alertmanager.service.j2 b/ansible/roles/alertmanager/templates/alertmanager.service.j2
new file mode 100644
index 000000000..b62b4764a
--- /dev/null
+++ b/ansible/roles/alertmanager/templates/alertmanager.service.j2
@@ -0,0 +1,50 @@
+
+
+
+{{ ansible_managed | comment }}
+[Unit]
+Description=Prometheus Alertmanager
+After=network-online.target
+StartLimitInterval=0
+StartLimitIntervalSec=0
+
+[Service]
+Type=simple
+PIDFile=/run/alertmanager.pid
+User={{ alertmanager_system_user }}
+Group={{ alertmanager_system_group }}
+ExecReload=/bin/kill -HUP $MAINPID
+ExecStart={{ alertmanager_binary_dir }}/alertmanager \
+  --cluster.listen-address='' \
+  --config.file={{ alertmanager_config_path }} \
+  --storage.path={{ alertmanager_storage_dir }} \
+{% for address in alertmanager_web_listen_addresses %}
+  --web.listen-address={{ address }} \
+{% endfor %}
+  --web.external-url={{ alertmanager_web_external_url }} \
+{% for flag, flag_value in alertmanager_config_flags.items() %}
+  --{{ flag }}={{ flag_value }} \
+{% endfor %}
+
+SyslogIdentifier=alertmanager
+Restart=always
+RestartSec=5
+
+CapabilityBoundingSet=CAP_SET_UID
+LockPersonality=true
+NoNewPrivileges=true
+MemoryDenyWriteExecute=true
+PrivateTmp=true
+ProtectHome=true
+ReadWriteDirectories={{ alertmanager_storage_dir }}
+RemoveIPC=true
+RestrictSUIDSGID=true
+
+PrivateUsers=true
+ProtectControlGroups=true
+ProtectKernelModules=true
+ProtectKernelTunables=yes
+ProtectSystem=strict
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/alertmanager/templates/alertmanager.yml.j2 b/ansible/roles/alertmanager/templates/alertmanager.yml.j2
new file mode 100644
index 000000000..539bbbf9a
--- /dev/null
+++ b/ansible/roles/alertmanager/templates/alertmanager.yml.j2
@@ -0,0 +1,4 @@
+{{ ansible_managed | comment }}
+
+{{ alertmanager_config_default }}
+{{ alertmanager_config_extra }}
diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml
new file mode 100644
index 000000000..588efe988
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/alertmanager.yml
@@ -0,0 +1,11 @@
+alertmanager_slack_receiver:
+  name: slack-receiver
+  slack_configs:
+    - channel: "{{ alertmanager_slack_integration.channel | default('none') }}"
+      api_url: https://slack.com/api/chat.postMessage
+      http_config:
+        authorization:
+          credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}"
+      text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}"
+      title_link: "http://{{ prometheus_address }}:9090/alertmanager/#/alerts?receiver=slack-receiver"
+      send_resolved: true
diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups
index 2b1d0ce81..1fc2a8424 100644
--- a/environments/common/inventory/groups
+++ b/environments/common/inventory/groups
@@ -37,7 +37,7 @@ mysql
 # Single node to host monitoring dashboards.
 
 [alertmanager]
-# TODO:
+# Single node to host alertmanager
 
 [opensearch]
 # Single node to host ElasticSearch search engine for Slurm monitoring.

From 63409476969bceedb11a311db7eb7aad24f1d538 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Thu, 10 Apr 2025 08:01:22 +0000
Subject: [PATCH 02/33] disable alertmanager for caas

---
 environments/.caas/inventory/everything |   1 -
 environments/.caas/inventory/groups     | 127 ++++++++++++++++++++++++
 2 files changed, 127 insertions(+), 1 deletion(-)
 delete mode 120000 environments/.caas/inventory/everything
 create mode 100644 environments/.caas/inventory/groups

diff --git a/environments/.caas/inventory/everything b/environments/.caas/inventory/everything
deleted file mode 120000
index dc66b9576..000000000
--- a/environments/.caas/inventory/everything
+++ /dev/null
@@ -1 +0,0 @@
-../../../environments/common/layouts/everything
\ No newline at end of file
diff --git a/environments/.caas/inventory/groups b/environments/.caas/inventory/groups
new file mode 100644
index 000000000..f5665790f
--- /dev/null
+++ b/environments/.caas/inventory/groups
@@ -0,0 +1,127 @@
+[nfs:children]
+openhpc
+
+[mysql:children]
+control
+
+[prometheus:children]
+control
+
+[grafana:children]
+control
+
+[alertmanager]
+# Don't want this for caas
+
+[node_exporter:children]
+cluster
+
+[opensearch:children]
+control
+
+[slurm_stats:children]
+control
+
+[filebeat:children]
+slurm_stats
+
+# NB: [rebuild] not defined here as likely to need features not currently supported
+
+[update:children]
+
+[fail2ban:children]
+# Hosts to install fail2ban on to protect SSH
+login
+
+[block_devices:children]
+# Environment-specific so not defined here
+
+[basic_users:children]
+# Add `openhpc` group to add Slurm users via creation of users on each node.
+openhpc
+
+[openondemand:children]
+# Host to run Open Ondemand server on - subset of login
+login
+
+[openondemand_desktop:children]
+# Subset of compute to run interactive desktops on via Open Ondemand
+compute
+
+[openondemand_jupyter:children]
+# Subset of compute to run Jupyter Notebook servers on via Open Ondemand
+compute
+
+[etc_hosts:children]
+# Hosts to manage /etc/hosts e.g. if no internal DNS.
+# See ansible/roles/etc_hosts/README.md
+cluster
+
+[cuda]
+# Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md
+
+[eessi:children]
+# Hosts on which EESSI stack should be configured
+openhpc
+
+[resolv_conf]
+# Allows defining nameservers in /etc/resolv.conf - see ansible/roles/resolv_conf/README.md
+
+[proxy]
+# Hosts to configure http/s proxies - see ansible/roles/proxy/README.md
+
+[manila]
+# Hosts to configure for manila fileshares
+
+[persist_hostkeys:children]
+# Hosts to use common set of hostkeys which persist across reimaging.
+login
+openondemand
+
+[squid]
+# Hosts to run squid proxy
+
+[tuned:children]
+# Hosts to run TuneD configuration
+
+[ansible_init:children]
+# Hosts to run linux-ansible-init
+cluster
+
+[sssd]
+# Hosts to configure sssd on
+
+[sshd]
+# Hosts where the OpenSSH server daemon should be configured
+
+[compute_init]
+# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on
+
+[k3s_server:children]
+# Hosts to run k3s server (should only be single node i.e. control node)
+#control
+
+[k3s_agent:children]
+# Hosts to run k3s agent
+#compute
+#login
+
+[k9s:children]
+# Hosts to install k9s on
+#control
+
+[lustre]
+# Hosts to run lustre client
+
+[extra_packages:children]
+# Hosts to install specified additional packages on
+builder
+
+[cacerts]
+# Hosts to configure CA certificates and trusts on
+
+[chrony]
+# Hosts where chrony configuration is applied. See docs/chrony.md for more details.
+
+[gateway:children]
+# Add builder to this group to install gateway ansible-init playbook into image
+builder

From a1cc078df3b64cb5d2e0fc45f47cf0d0e338049c Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Thu, 10 Apr 2025 11:50:58 +0000
Subject: [PATCH 03/33] get slack integration working, with node down alert

---
 ansible/roles/alertmanager/README.md          | 68 +++++++++++++++++++
 ansible/roles/alertmanager/defaults/main.yml  | 20 ++++--
 .../roles/alertmanager/tasks/configure.yml    |  6 +-
 .../templates/alertmanager.service.j2         |  8 ++-
 .../templates/alertmanager.yml.j2             |  4 +-
 .../common/files/prometheus/rules/slurm.rules | 11 +++
 .../inventory/group_vars/all/alertmanager.yml |  5 +-
 .../inventory/group_vars/all/defaults.yml     |  2 +-
 .../inventory/group_vars/all/prometheus.yml   | 14 ++--
 .../inventory/group_vars/all/alertmanager.yml |  6 ++
 .../group_vars/all/vault_alertmanager.yml     |  3 +
 11 files changed, 126 insertions(+), 21 deletions(-)
 create mode 100644 environments/common/files/prometheus/rules/slurm.rules
 create mode 100644 environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml
 create mode 100644 environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml

diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md
index 323bebdef..93d538aff 100644
--- a/ansible/roles/alertmanager/README.md
+++ b/ansible/roles/alertmanager/README.md
@@ -4,3 +4,71 @@
 notes:
 - HA is not supported
 - state ("notification state and configured silences") is not preserved across rebuild
+- not used for caas
+- no dashboard
+
+
+
+## Role variables
+
+The following variables are equivalent to similarly-named arguments to the
+`alertmanager` binary.
+See `man alertmanager` for more info:
+
+- TODO:
+
+The following variables are templated into the alertmanager configuration file:
+
+- TODO:
+
+Other variables:
+- TODO:
+
+
+## TODO
+
+memory usage looks a bit close:
+
+```
+[root@RL9-control rocky]# free -h
+               total        used        free      shared  buff/cache   available
+Mem:           3.6Gi       2.4Gi       168Mi        11Mi       1.5Gi       1.2Gi
+Swap:             0B          0B          0B
+```
+
+
+
+## Slack Integration
+
+1. Create an app with a bot token:
+
+- Go to https://api.slack.com/apps
+- select "Create an App"
+- select "From scratch"
+- Set app name and workspace fields, select "Create"
+- Fill out "Short description" and "Background color" fields, select "Save changes"
+- Select "OAuth & Permissions" on left menu
+- Under "Scopes : Bot Token Scopes", select "Add an OAuth Scope", add
+  `chat:write` and select "Save changes"
+- Select "Install App" on left menu, select "Install to your-workspace", select Allow
+- Copy the Bot User OAuth token shown
+
+2. Add the bot token into the config and enable Slack integration
+
+- Open `environments/site/inventory/group_vars/all/vault_alertmanager.yml`
+- Uncomment `vault_alertmanager_slack_integration_app_creds` and add the token
+- Vault-encrypt that file:
+
+      ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml
+
+- Open `environments/site/inventory/group_vars/all/alertmanager.yml`
+- Uncomment the config and set your alert channel name
+
+3. Invite the bot to your alerts channel
+- In the appropriate Slack channel type:
+
+      /invite @YOUR_BOT_NAME
+
+
+## Adding Rules
+
+TODO: describe how prom config works
\ No newline at end of file
diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml
index 9ba8f919c..bc2d3a934 100644
--- a/ansible/roles/alertmanager/defaults/main.yml
+++ b/ansible/roles/alertmanager/defaults/main.yml
@@ -7,12 +7,18 @@ alertmanager_enabled: true
 
 alertmanager_system_user: alertmanager
 alertmanager_system_group: alertmanager
-alertmanager_config_path: /etc/alertmanager/alertmanager.yml
-alertmanager_storage_dir: /var/lib/alertmanager
-alertmanager_web_listen_addresses:
-  - ':9100'
-alertmanager_web_external_url: http://localhost:9093/
-alertmanager_config_flags: {}
+alertmanager_config_file: /etc/alertmanager/alertmanager.yml # --config.file: Alertmanager configuration file name
+alertmanager_storage_path: /var/lib/alertmanager # --storage.path: Base path for data storage
+
+alertmanager_port: '9093'
+alertmanager_web_listen_addresses: # elements of --web.listen-address
+  - ":{{ alertmanager_port }}"
+alertmanager_web_external_url: "http://localhost:{{ alertmanager_port}}/" # --web.external-url: The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager. If omitted, relevant URL components will be derived automatically
+# TODO: work out how we proxy this through ondemand
+
+alertmanager_data_retention: '120h' # --data.retention # How long to keep data for
+alertmanager_data_maintenance_interval: '15m' # --data.maintenance-interval: Interval between garbage collection and snapshotting to disk of the silences and the notification logs
+alertmanager_config_flags: {} # other command-line parameters as shown by `man alertmanager`
 # TODO: data retention?
 alertmanager_config_template: alertmanager.yml.j2
 
@@ -28,7 +34,7 @@
 alertmanager_default_receivers:
   - name: 'null'
 
-alertmanager_slack_receiver: {} # really defined in common as it needs prometheus_address
+alertmanager_slack_receiver: {} # defined in common env as it needs prometheus_address
 
 alertmanager_extra_receivers: "{{ [alertmanager_slack_receiver] if alertmanager_slack_integration is defined else [] }}"
 
diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml
index 5c01f3f98..5c3d651cd 100644
--- a/ansible/roles/alertmanager/tasks/configure.yml
+++ b/ansible/roles/alertmanager/tasks/configure.yml
@@ -6,8 +6,8 @@
     group: "{{ alertmanager_system_group }}"
     mode: u=rwX,go=rX
   loop:
-    - "{{ alertmanager_config_path | dirname }}"
-    - "{{ alertmanager_storage_dir }}"
+    - "{{ alertmanager_config_file | dirname }}"
+    - "{{ alertmanager_storage_path }}"
 
 # TODO: selinux?
 
@@ -25,7 +25,7 @@
 - name: Template alertmanager config
   ansible.builtin.template:
     src: "{{ alertmanager_config_template }}"
-    dest: "{{ alertmanager_config_path }}"
+    dest: "{{ alertmanager_config_file }}"
     owner: "{{ alertmanager_system_user }}"
     group: "{{ alertmanager_system_group }}"
     mode: u=rw,go=r # TODO: check there are no sensitive things in here!
diff --git a/ansible/roles/alertmanager/templates/alertmanager.service.j2 b/ansible/roles/alertmanager/templates/alertmanager.service.j2
index b62b4764a..e58382cc0 100644
--- a/ansible/roles/alertmanager/templates/alertmanager.service.j2
+++ b/ansible/roles/alertmanager/templates/alertmanager.service.j2
@@ -16,8 +16,10 @@ Group={{ alertmanager_system_group }}
 ExecReload=/bin/kill -HUP $MAINPID
 ExecStart={{ alertmanager_binary_dir }}/alertmanager \
   --cluster.listen-address='' \
-  --config.file={{ alertmanager_config_path }} \
-  --storage.path={{ alertmanager_storage_dir }} \
+  --config.file={{ alertmanager_config_file }} \
+  --storage.path={{ alertmanager_storage_path }} \
+  --data.retention={{ alertmanager_data_retention }} \
+  --data.maintenance-interval={{ alertmanager_data_maintenance_interval }} \
 {% for address in alertmanager_web_listen_addresses %}
   --web.listen-address={{ address }} \
 {% endfor %}
@@ -36,7 +38,7 @@ NoNewPrivileges=true
 MemoryDenyWriteExecute=true
 PrivateTmp=true
 ProtectHome=true
-ReadWriteDirectories={{ alertmanager_storage_dir }}
+ReadWriteDirectories={{ alertmanager_storage_path }}
 RemoveIPC=true
 RestrictSUIDSGID=true
 
diff --git a/ansible/roles/alertmanager/templates/alertmanager.yml.j2 b/ansible/roles/alertmanager/templates/alertmanager.yml.j2
index 539bbbf9a..6f0c1d126 100644
--- a/ansible/roles/alertmanager/templates/alertmanager.yml.j2
+++ b/ansible/roles/alertmanager/templates/alertmanager.yml.j2
@@ -1,4 +1,4 @@
 {{ ansible_managed | comment }}
 
-{{ alertmanager_config_default }}
-{{ alertmanager_config_extra }}
+{{ alertmanager_config_default | to_nice_yaml }}
+{{ alertmanager_config_extra | to_nice_yaml if alertmanager_config_extra | length > 0 else '' }}
diff --git a/environments/common/files/prometheus/rules/slurm.rules b/environments/common/files/prometheus/rules/slurm.rules
new file mode 100644
index 000000000..396db0d4b
--- /dev/null
+++ b/environments/common/files/prometheus/rules/slurm.rules
@@ -0,0 +1,11 @@
+
+groups:
+- name: Slurm
+  rules:
+  - alert: SlurmNodeDown
+    annotations:
+      description: '{{ $value }} Slurm nodes are in down status'
+      summary: 'At least one Slurm node is down.'
+    expr: "slurm_nodes_down > 0\n"
+    labels:
+      severity: critical
diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml
index 588efe988..3c02598c6 100644
--- a/environments/common/inventory/group_vars/all/alertmanager.yml
+++ b/environments/common/inventory/group_vars/all/alertmanager.yml
@@ -1,4 +1,7 @@
-alertmanager_slack_receiver:
+
+alertmanager_port: '9093' # defined here as required for prometheus
+
+alertmanager_slack_receiver: # defined here as needs prometheus address
   name: slack-receiver
   slack_configs:
     - channel: "{{ alertmanager_slack_integration.channel | default('none') }}"
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index 27a4ee0e6..1809d3485 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -22,7 +22,7 @@ prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
 openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}"
 grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}"
 k3s_server_name: "{{ hostvars[groups['k3s_server'] | first].ansible_host }}"
-
+alertmanager_address: "{{ hostvars[groups['alertmanager'].0].api_address }}"
 ############################# bootstrap: local user configuration #########################
 
 # Note RockyLinux 8.5 defines system user/groups in range 201-999
diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml
index 87da90e4a..9a4349942 100644
--- a/environments/common/inventory/group_vars/all/prometheus.yml
+++ b/environments/common/inventory/group_vars/all/prometheus.yml
@@ -9,10 +9,16 @@ prometheus_storage_retention: "31d"
 prometheus_storage_retention_size: "100GB"
 prometheus_db_dir: "{{ appliances_state_dir | default('/var/lib') }}/prometheus"
 
-prometheus_alertmanager_config: []
-
-prometheus_alert_rules_files:
-- "{{ appliances_repository_root }}/environments/common/files/prometheus/rules/*.rules"
+prometheus_alertmanager_config_default:
+  - static_configs:
+      - targets:
+          - "{{ alertmanager_address }}:{{ alertmanager_port }}"
+prometheus_alertmanager_config: "{{ prometheus_alertmanager_config_default if groups['alertmanager'] else {} }}"
+
+# by default, use rule files from the following path relative to current and all parent environment inventory directories:
+prometheus_alert_rules_files_inventory_glob: ../files/prometheus/rules/*.rules
+prometheus_alert_rules_files: "{{ ansible_inventory_sources | product([prometheus_alert_rules_files_inventory_glob]) | map('join', '/') | map('realpath') }}"
+# TODO: find a way to include/exclude files?
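(For orientation: assuming an appliance checkout with inventories at `environments/site/inventory` and `environments/common/inventory` — paths here are illustrative — the glob expression above would evaluate to roughly:)

```yaml
prometheus_alert_rules_files:
  - /opt/slurm-app/environments/site/files/prometheus/rules/*.rules
  - /opt/slurm-app/environments/common/files/prometheus/rules/*.rules
```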
 prometheus_alert_rules: []
 
diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml
new file mode 100644
index 000000000..4a46b7976
--- /dev/null
+++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/alertmanager.yml
@@ -0,0 +1,6 @@
+# Uncomment below and add Slack bot app creds in the adjacent file
+# vault_alertmanager.yml for Slack integration:
+#
+# alertmanager_slack_integration:
+#   channel: '#alerts'
+#   app_creds: "{{ vault_alertmanager_slack_integration_app_creds }}"
diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml
new file mode 100644
index 000000000..4375ed725
--- /dev/null
+++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml
@@ -0,0 +1,3 @@
+# Add a bot token here THEN VAULT-ENCRYPT this file!
+
+#vault_alertmanager_slack_integration_app_creds: ''

From a44f777dbd9aa712e9d10d0a9aff5c1511ddd4ac Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Thu, 10 Apr 2025 12:14:31 +0000
Subject: [PATCH 04/33] add node-exporter disk space alert

---
 .../files/prometheus/rules/node-exporter.rules | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 environments/common/files/prometheus/rules/node-exporter.rules

diff --git a/environments/common/files/prometheus/rules/node-exporter.rules b/environments/common/files/prometheus/rules/node-exporter.rules
new file mode 100644
index 000000000..60e40313e
--- /dev/null
+++ b/environments/common/files/prometheus/rules/node-exporter.rules
@@ -0,0 +1,14 @@
+groups:
+- name: node-exporter
+  rules:
+  # Please add ignored mountpoints in node_exporter parameters like
+  # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
+  # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
+ - alert: HostOutOfDiskSpace + expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0) + for: 2m + labels: + severity: critical + annotations: + summary: Host out of disk space (instance {{ $labels.instance }}) + description: 'Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' From e9ffefdcc8f26eb59d49cfc422d552b0efab9d23 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 10 Apr 2025 12:18:29 +0000 Subject: [PATCH 05/33] setup fatimage/site --- ansible/fatimage.yml | 5 +++++ ansible/monitoring.yml | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 5e515614a..983aa5f10 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -178,6 +178,11 @@ slurm_exporter_state: stopped when: "'slurm_exporter' in group_names" + - name: Install alertmanager + include_role: + name: alertmanager + tasks_from: install.yml + - hosts: prometheus become: yes gather_facts: yes diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index f7ca2edb1..e97946212 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -93,11 +93,6 @@ become: yes gather_facts: false tasks: - # TODO: move elsewhere, still needs become - - name: Install alertmanager - include_role: - name: alertmanager - tasks_from: install.yml - name: Configure alertmanager include_role: name: alertmanager From 4b437aaf863d6af5a57bbf7e093a2637bb2d814f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 10 Apr 2025 13:27:44 +0000 Subject: [PATCH 06/33] fix bug where prometheus environments didn't work --- ansible/filter_plugins/utils.py | 5 +++-- .../common/inventory/group_vars/all/prometheus.yml | 7 +++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 508f794cc..999fc763c 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -11,11 +11,12 @@ import os.path import re -def prometheus_node_exporter_targets(hosts, env): +def prometheus_node_exporter_targets(hosts, hostvars, env_key): result = [] per_env = defaultdict(list) for host in hosts: - per_env[env].append(host) + host_env = hostvars[host].get(env_key, 'ungrouped') + per_env[host_env].append(host) for env, hosts in per_env.items(): target = { "targets": ["{target}:9100".format(target=target) for target in hosts], diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 9a4349942..adfdcaa43 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -22,10 +22,9 @@ prometheus_alert_rules_files: "{{ ansible_inventory_sources | product([prometheu prometheus_alert_rules: [] -# Can set a hostvar 'env' to an arbitrary string to group prometheus targets, e.g. rack. -# env: location-1 +# Can set a hostvar 'prometheus_env' to an arbitrary string to group prometheus targets, e.g. by rack. 
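For example, a deployment could group its Prometheus targets by rack by setting this in an inventory group_vars file (the group name, path and value below are illustrative):

```yaml
# environments/site/inventory/group_vars/rack1/overrides.yml (hypothetical path)
prometheus_env: rack1
```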
prometheus_targets: - node: "{{ groups.get('node_exporter', []) | reject('equalto', 'localhost') | prometheus_node_exporter_targets(env | default('ungrouped')) }}" + node: "{{ groups.get('node_exporter', []) | prometheus_node_exporter_targets(hostvars, 'prometheus_env') }}" prometheus_scrape_configs_default: - job_name: "prometheus" @@ -40,7 +39,7 @@ prometheus_scrape_configs_default: - job_name: "node" file_sd_configs: - files: - - "/etc/prometheus/file_sd/node.yml" + - /etc/prometheus/file_sd/node.yml relabel_configs: # strip off port - source_labels: ['__address__'] From 16b3a9bf3c0e644b99ebc78433f537890812132c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 10 Apr 2025 14:07:51 +0000 Subject: [PATCH 07/33] add group label (login,control,compute) to prom targets --- ansible/filter_plugins/utils.py | 15 ++++++++++++--- .../inventory/group_vars/all/prometheus.yml | 8 ++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 999fc763c..b5b92ed7e 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -11,7 +11,15 @@ import os.path import re -def prometheus_node_exporter_targets(hosts, hostvars, env_key): +def prometheus_node_exporter_targets(hosts, hostvars, env_key, group): + """ Return a mapping in cloudalchemy.nodeexporter prometheus_targets + format. + + hosts: list of inventory_hostnames + hostvars: Ansible hostvars variable + env_key: key to lookup in each host's hostvars to add as label 'env' (default: 'ungrouped') + group: string to add as label 'group' + """ result = [] per_env = defaultdict(list) for host in hosts: @@ -19,9 +27,10 @@ def prometheus_node_exporter_targets(hosts, hostvars, env_key): per_env[host_env].append(host) for env, hosts in per_env.items(): target = { - "targets": ["{target}:9100".format(target=target) for target in hosts], + "targets": [f"{target}:9100" for target in hosts], "labels": { - "env": env + 'env': env, + 'group': group } } result.append(target) diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index adfdcaa43..dc616de53 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -24,7 +24,9 @@ prometheus_alert_rules: [] # Can set a hostvar 'prometheus_env' to an arbitrary string to group prometheus targets, e.g. by rack. 
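With the filter change above, each entry produced for `prometheus_targets` is a mapping in the usual file_sd format; for two hypothetical compute hosts it would come out roughly as:

```yaml
- targets: ['compute-0:9100', 'compute-1:9100']
  labels:
    env: rack1       # from the prometheus_env hostvar, or 'ungrouped' if unset
    group: compute   # from the group argument passed to the filter
```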
prometheus_targets: - node: "{{ groups.get('node_exporter', []) | prometheus_node_exporter_targets(hostvars, 'prometheus_env') }}" + control: "{{ groups.get('node_exporter', []) | intersect(groups['control']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'control') }}" + login: "{{ groups.get('node_exporter', []) | intersect(groups['login']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'login') }}" + compute: "{{ groups.get('node_exporter', []) | intersect(groups['compute']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'compute') }}" prometheus_scrape_configs_default: - job_name: "prometheus" @@ -39,7 +41,9 @@ prometheus_scrape_configs_default: - job_name: "node" file_sd_configs: - files: - - /etc/prometheus/file_sd/node.yml + - /etc/prometheus/file_sd/control.yml + - /etc/prometheus/file_sd/login.yml + - /etc/prometheus/file_sd/compute.yml relabel_configs: # strip off port - source_labels: ['__address__'] From a9cd55e5d5e01a40673ae2ba83243cd0649664d2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 10 Apr 2025 14:15:04 +0000 Subject: [PATCH 08/33] add node-exporter rules --- .../prometheus/rules/node-exporter.rules | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/environments/common/files/prometheus/rules/node-exporter.rules b/environments/common/files/prometheus/rules/node-exporter.rules index 60e40313e..6bee723b1 100644 --- a/environments/common/files/prometheus/rules/node-exporter.rules +++ b/environments/common/files/prometheus/rules/node-exporter.rules @@ -1,6 +1,26 @@ +# Mostly derived from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware +# In general have ignored lack of resources (memory, cpu) on compute nodes as +# this is expected, and ignored things which will be hard to threshold due to +# the nature of a Slurm cluster. + groups: - name: node-exporter rules: + + # Modified: only on login/control nodes + - alert: HostOutOfMemory + expr: ( + node_memory_MemAvailable_bytes{group=~"login|control"} / + node_memory_MemTotal_bytes{group=~"login|control"} + < .10 + ) + for: 2m + labels: + severity: warning + annotations: + summary: Host out of memory (instance {{ $labels.instance }}) + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Please add ignored mountpoints in node_exporter parameters like # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
@@ -12,3 +32,140 @@ groups: annotations: summary: Host out of disk space (instance {{ $labels.instance }}) description: 'Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: HostOutOfInodes + expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) + for: 2m + labels: + severity: critical + annotations: + summary: Host out of inodes (instance {{ $labels.instance }}) + description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostFilesystemDeviceError + expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # TODO: make tunable + - alert: HostUnusualDiskWriteLatency + expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host unusual disk write latency (instance {{ $labels.instance }}) + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostCpuHighIowait + expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10 + for: 0m + labels: + severity: warning + annotations: + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSystemdServiceCrashed + expr: (node_systemd_unit_state{state="failed"} == 1) + for: 0m + labels: + severity: warning + annotations: + summary: Host systemd service crashed (instance {{ $labels.instance }}) + description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostSoftwareRaidDiskFailure + expr: (node_md_disks{state="failed"} > 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host software RAID disk failure (instance {{ $labels.instance }}) + description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # Modified: only on login/control nodes + - alert: HostOomKillDetected + expr: (increase(node_vmstat_oom_kill{group=~"login|control"}[1m]) > 0) + for: 0m + labels: + severity: warning + annotations: + summary: Host OOM kill detected (instance {{ $labels.instance }}) + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: (node_edac_uncorrectable_errors_total > 0) + for: 0m + labels: + severity: warning + annotations: + summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Receive Errors (instance {{ $labels.instance }}) + 
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Transmit Errors (instance {{ $labels.instance }}) + description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostNetworkBondDegraded + expr: ((node_bonding_active - node_bonding_slaves) != 0) + for: 2m + labels: + severity: warning + annotations: + summary: Host Network Bond Degraded (instance {{ $labels.instance }}) + description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostConntrackLimit + expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) + for: 5m + labels: + severity: warning + annotations: + summary: Host conntrack limit (instance {{ $labels.instance }}) + description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockSkew + expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) + for: 10m + labels: + severity: warning + annotations: + summary: Host clock skew (instance {{ $labels.instance }}) + description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostClockNotSynchronising + expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) + for: 2m + labels: + severity: warning + annotations: + summary: Host clock not synchronising (instance {{ $labels.instance }}) + description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostRequiresReboot + expr: (node_reboot_required > 0) + for: 4h + labels: + severity: info + annotations: + summary: Host requires reboot (instance {{ $labels.instance }}) + description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From d806dee96d337f475cb146d3efdab1b50ce394f3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 10 Apr 2025 15:21:38 +0000 Subject: [PATCH 09/33] alertmanager docs/defaults --- ansible/roles/alertmanager/README.md | 125 +++++++++++++++--- ansible/roles/alertmanager/defaults/main.yml | 20 ++- docs/production.md | 3 + .../inventory/group_vars/all/prometheus.yml | 4 + 4 files changed, 120 insertions(+), 32 deletions(-) diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index 93d538aff..f361f2c94 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -1,28 +1,90 @@ # alertmanager +Deploy [alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/) +to route Prometheus alerts to a receiver. Currently Slack is the only supported +receiver. 
-notes: -- HA is not supported -- state ("notification state and configured silences") is not preserved across rebuild -- not used for caas -- no dashboard +Note that: +- HA configuration is not supported +- Alertmanager state is not preserved when the node it runs on (by default, + control node) is reimaged, so any alerts silenced via the GUI will reoccur. +- No Grafana dashboard for alerts is currently provided. +- not used for caas - todo - maybe we should disable by default, unless everyone has slack? +- have fixed bug with `env` hostvar for prom, now called `prometheus_env` +- have added label "group" to prom for control,compute,login +Alertmanager is enabled by default on the `control` node in the +[everything](../../../environments/common/layouts/everything) template which +`cookiecutter` uses for a new environment's `inventory/groups` file. + +In general usage may only require: +- Adding the `control` node into the `alertmanager` group in `environments/site/groups` + if upgrading an existing environment. +- Enabling the Slack integration (see below). ## Role variables +All variables are optional. See [defaults/main.yml](defaults/main.yml) for +all default values. + +General variables: +- `alertmanager_version`: String, version (no leading 'v') +- `alertmanager_download_checksum`: String, checksum for relevant version from + [prometheus.io download page](https://prometheus.io/download/), in format + `type:value`. +- `alertmanager_download_dest`: String, path of temporary directory used for + download. Must exist. +- `alertmanager_binary_dir`: String, path of directory to install alertmanager + binary to. Must exist. +- `alertmanager_started`: Bool, whether the alertmanager service should be started. +- `alertmanager_enabled`: Bool, whether the alertmanager service should be enabled. +- `alertmanager_system_user`: String, name of user to run alertmanager as. Will be created. +- `alertmanager_system_group`: String, name of group of alertmanager user. +- `alertmanager_port`: Port to listen on. + The following variables are equivalent to similarly-named arguments to the `alertmanager` binary. See `man alertmanager` for more info: -- TODO: - -The following variables are templated into the alertmanager configuration file: - -- TODO: - -Other variables: -- TODO: - +- `alertmanager_config_file`: String, path alertmanager config file will be + written to. Parent directory will be created if necessary. +- `alertmanager_storage_path`: String, base path for data storage. +- `alertmanager_web_listen_addresses`: List of strings, defining addresses to listeen on. +- `alertmanager_web_external_url`: String, the URL under which Alertmanager is + externally reachable. See man page for more details if proxying alertmanager. +- `alertmanager_data_retention`: String, how long to keep data for +- `alertmanager_data_maintenance_interval`: String, interval between garbage + collection and snapshotting to disk of the silences and the notification logs. +- `alertmanager_config_flags`: Mapping. Keys/values in here are written to the + alertmanager commandline as `--{{ key }}={{ value }}`. +- `alertmanager_default_receivers`: + +The following variables are templated into the [alertmanager configuration](https://prometheus.io/docs/alerting/latest/configuration/): +- `alertmanager_config_template`: String, path to configuration template. The default + is to template in `alertmanager_config_default` and `alertmanager_config_extra`. 
+- `alertmanager_config_default`: Mapping with default configuration for the + top-level `route` and `receivers` keys. The default is to send all alerts to + the Slack receiver, if that has been enabled (see below). +- `alertmanager_receivers`: A list of [receiver](https://prometheus.io/docs/alerting/) + mappings to define under the top-level `receivers` configuration key. This + will contain the Slack receiver if that has been enabled (see below). +- `alertmanager_extra_receivers`: A list of additional [receiver](https://prometheus.io/docs/alerting/), + mappings to add, by default empty. +- `alertmanager_slack_receiver`: Mapping defining the [Slack receiver](https://prometheus.io/docs/alerting/latest/configuration/#slack_config). Note the default configuration for this is in +`environments/common/inventory/group_vars/all/alertmanager.yml`. +- `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined. +- `alertmanager_config_extra`: Mapping with additional configuration. Keys in + this become top-level keys in the configuration. E.g this might be: + ```yaml + alertmanager_config_extra: + global: + smtp_from: smtp.example.org:587 + time_intervals: + - name: monday-to-friday + time_intervals: + - weekdays: ['monday:friday'] + ``` + Note that `route` and `receivers` keys should not be added here. ## TODO @@ -54,21 +116,42 @@ Swap: 0B 0B 0B 2. Add the bot token into the config and enable Slurm integration -- Open `environments/site/inventory/group_vars/all/vault_alertmanager.yml` +- Open `environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml` - Uncomment `vault_alertmanager_slack_integration_app_creds` and add the token - Vault-encrypt that file: - ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml + ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml -- Open `environments/site/inventory/group_vars/all/alertmanager.yml` -- Uncomment the config and set your alert channel name +- Open `environments/$ENV/inventory/group_vars/all/alertmanager.yml` +- Uncomment the `alertmanager_slack_integration` mapping and set your alert channel name 3. Invite the bot to your alerts channel - In the appropriate Slack channel type: - /invite @YOUR_BOT_NAME + /invite @YOUR_BOT_NAME + + +## Alert Rules + +These are part of [Prometheus configuration](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/), defined in the appliance at +[environments/common/inventory/group_vars/all/prometheus.yml](../../../environments/common/inventory/group_vars/all/prometheus.yml). + +A fairly-minimal set of default alert rule files is provided at +`environments/common/files/prometheus/rules/`. Because compute nodes are expected +to operate with heavy CPU and memory load, no alerting on those parameters is +defined for those nodes. +By default `prometheus_alert_rules_files` is set such that any `*.rules` files +in a directory `files/prometheus/rules` in the current environment or *any* +parent environment are loaded. So usually, site-specific alerts should be added +by creating additional rules files in `environments/site/files/prometheus/rules`. 
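As a sketch, a minimal site-specific rules file under `environments/site/files/prometheus/rules/` (the file name, alert name and threshold are illustrative) might contain:

```yaml
# environments/site/files/prometheus/rules/site.rules
groups:
- name: site
  rules:
  - alert: NodeExporterDown
    expr: up{job="node"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: node_exporter target down (instance {{ $labels.instance }})
```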
-## Adding Rules +Note that the Prometheus targets are defined such that each node will have labels: + - `env`: `ungrouped`, by default, unless a group/host var `prometheus_env` is set + - `group`: One of `login`, `control`, `compute` or `other` +These may be used to limit alerts to specific sets of nodes. -TODO: describe how prom config works \ No newline at end of file +Some ideas for future alerts which could be useful: +- smartctl-exporter-based rules for baremetal nodes where the is no + infrastructure-level smart monitoring +- loss of "up" network interfaces diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml index bc2d3a934..897ce51a5 100644 --- a/ansible/roles/alertmanager/defaults/main.yml +++ b/ansible/roles/alertmanager/defaults/main.yml @@ -6,15 +6,14 @@ alertmanager_started: true alertmanager_enabled: true alertmanager_system_user: alertmanager -alertmanager_system_group: alertmanager +alertmanager_system_group: "{{ alertmanager_system_user }}" alertmanager_config_file: /etc/alertmanager/alertmanager.yml # --config.file: Alertmanager configuration file name alertmanager_storage_path: /var/lib/alertmanager # --storage.path: Base path for data storage alertmanager_port: '9093' -alertmanager_web_listen_addresses: # elements of --web.listen-address +alertmanager_web_listen_addresses: - ":{{ alertmanager_port }}" -alertmanager_web_external_url: "http://localhost:{{ alertmanager_port}}/" # --web.external-url: The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy). Used for generating relative and absolute links back to Alertmanager itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Alertmanager. If omitted, relevant URL components will be derived automatically -# TODO: work out how we proxy this through ondemand +alertmanager_web_external_url: "http://localhost:{{ alertmanager_port}}/" # TODO: is this right?? 
alertmanager_data_retention: '120h' # --data.retention # How long to keep data for alertmanager_data_maintenance_interval: '15m' # --data.maintenance-interval: Interval between garbage collection and snapshotting to disk of the silences and the notification logs @@ -30,19 +29,18 @@ alertmanager_config_template: alertmanager.yml.j2 # channel: '#alerts' # app_creds: - -alertmanager_default_receivers: - - name: 'null' - +alertmanager_null_receiver: + name: 'null' alertmanager_slack_receiver: {} # defined in common env as it needs prometheus_address - -alertmanager_extra_receivers: "{{ [alertmanager_slack_receiver] if alertmanager_slack_integration is defined else [] }}" +alertmanager_extra_receivers: [] +alertmanager_default_receivers: "{{ [alertmanager_null_receiver] + ([alertmanager_slack_receiver] if alertmanager_slack_integration is defined else []) }}" +alertmanager_receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}" alertmanager_config_default: route: group_by: ['...'] receiver: "{{ 'slack-receiver' if alertmanager_slack_integration is defined else 'null' }}" - receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}" + receivers: "{{ alertmanager_receivers }}" alertmanager_config_extra: {} # top-level only diff --git a/docs/production.md b/docs/production.md index c15298887..7e8130de3 100644 --- a/docs/production.md +++ b/docs/production.md @@ -149,3 +149,6 @@ and referenced from the `site` and `production` environments, e.g.: raised using [shards](https://specs.openstack.org/openstack/nova-specs/specs/2024.1/implemented/ironic-shards.html). In general it should be possible to raise this value to 50-100 if the cloud is properly tuned, again, demonstrated through testing. + +- Enable alertmanager following the [role docs](../ansible/roles/alertmanager/README.md) + if Slack is available. diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index dc616de53..3381f22a7 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -27,6 +27,9 @@ prometheus_targets: control: "{{ groups.get('node_exporter', []) | intersect(groups['control']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'control') }}" login: "{{ groups.get('node_exporter', []) | intersect(groups['login']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'login') }}" compute: "{{ groups.get('node_exporter', []) | intersect(groups['compute']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'compute') }}" + # openhpc is defined as control+login+compute so this gets anything else: + other: "{{ groups.get('node_exporter', []) | difference(groups['openhpc']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'other') }}" + # TODO: check empty list gets coped with correctly! 
prometheus_scrape_configs_default: - job_name: "prometheus" @@ -44,6 +47,7 @@ prometheus_scrape_configs_default: - /etc/prometheus/file_sd/control.yml - /etc/prometheus/file_sd/login.yml - /etc/prometheus/file_sd/compute.yml + - /etc/prometheus/file_sd/other.yml relabel_configs: # strip off port - source_labels: ['__address__'] From ce9ed5bf03d9931a537baa76bb0f7a4e81c3b5e1 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 10 Apr 2025 15:45:04 +0000 Subject: [PATCH 10/33] add node failure alert --- environments/common/files/prometheus/rules/slurm.rules | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/environments/common/files/prometheus/rules/slurm.rules b/environments/common/files/prometheus/rules/slurm.rules index 396db0d4b..be17d66a2 100644 --- a/environments/common/files/prometheus/rules/slurm.rules +++ b/environments/common/files/prometheus/rules/slurm.rules @@ -9,3 +9,8 @@ groups: expr: "slurm_nodes_down > 0\n" labels: severity: critical + - alert: SlurmNodeFail + annotations: + description: '{{ $value }} Slurm nodes are in fail status' + summary: 'At least one Slurm node is failed.' + expr: "slurm_nodes_fail > 0\n" From 9c2469fd15631f5246abc99d7d0d376f742c3813 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 08:07:50 +0000 Subject: [PATCH 11/33] update alertmanager comments --- ansible/roles/alertmanager/README.md | 3 ++- ansible/roles/alertmanager/defaults/main.yml | 10 ++-------- ansible/roles/alertmanager/tasks/configure.yml | 4 +--- environments/common/files/prometheus/rules/slurm.rules | 3 +++ 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index f361f2c94..c4ca25536 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -21,7 +21,8 @@ Alertmanager is enabled by default on the `control` node in the In general usage may only require: - Adding the `control` node into the `alertmanager` group in `environments/site/groups` if upgrading an existing environment. -- Enabling the Slack integration (see below). +- Enabling the Slack integration (see section below). +- Possibly setting `alertmanager_web_external_url`. ## Role variables diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml index 897ce51a5..9f5e30ba3 100644 --- a/ansible/roles/alertmanager/defaults/main.yml +++ b/ansible/roles/alertmanager/defaults/main.yml @@ -15,13 +15,11 @@ alertmanager_web_listen_addresses: - ":{{ alertmanager_port }}" alertmanager_web_external_url: "http://localhost:{{ alertmanager_port}}/" # TODO: is this right?? -alertmanager_data_retention: '120h' # --data.retention # How long to keep data for -alertmanager_data_maintenance_interval: '15m' # --data.maintenance-interval: Interval between garbage collection and snapshotting to disk of the silences and the notification logs +alertmanager_data_retention: '120h' +alertmanager_data_maintenance_interval: '15m' alertmanager_config_flags: {} # other command-line parameters as shown by `man alertmanager` -# TODO: data retention? alertmanager_config_template: alertmanager.yml.j2 - # everything below here is interpolated into alertmanager_config_default: # Uncomment below and add Slack bot app creds for Slack integration @@ -43,7 +41,3 @@ alertmanager_config_default: receivers: "{{ alertmanager_receivers }}" alertmanager_config_extra: {} # top-level only - - -# TODO: routes?? 
-# TODO: see PR with additional alerts diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml index 5c3d651cd..14e47d505 100644 --- a/ansible/roles/alertmanager/tasks/configure.yml +++ b/ansible/roles/alertmanager/tasks/configure.yml @@ -9,8 +9,6 @@ - "{{ alertmanager_config_file | dirname }}" - "{{ alertmanager_storage_path }}" -# TODO: selinux? - - name: Create alertmanager service file with immutable options template: src: alertmanager.service.j2 @@ -20,7 +18,7 @@ mode: u=rw,go=r register: _alertmanager_service notify: Restart alertmanager - # TODO: how do we cope with the binary changing? + - name: Template alertmanager config ansible.builtin.template: diff --git a/environments/common/files/prometheus/rules/slurm.rules b/environments/common/files/prometheus/rules/slurm.rules index be17d66a2..474e5a19a 100644 --- a/environments/common/files/prometheus/rules/slurm.rules +++ b/environments/common/files/prometheus/rules/slurm.rules @@ -14,3 +14,6 @@ groups: description: '{{ $value }} Slurm nodes are in fail status' summary: 'At least one Slurm node is failed.' expr: "slurm_nodes_fail > 0\n" + +# TODO: alert on slurm_scheduler_dbd_queue_size - see vpenso exporter, man sdiag, and MaxDBDMsgs +# but node its dynamic From 72245f9a67a4ce6d9d89356c129be7e64322c228 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 08:13:44 +0000 Subject: [PATCH 12/33] fix slack creds being exposed in alertmanager config --- ansible/roles/alertmanager/tasks/configure.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml index 14e47d505..7028147d5 100644 --- a/ansible/roles/alertmanager/tasks/configure.yml +++ b/ansible/roles/alertmanager/tasks/configure.yml @@ -26,7 +26,7 @@ dest: "{{ alertmanager_config_file }}" owner: "{{ alertmanager_system_user }}" group: "{{ alertmanager_system_group }}" - mode: u=rw,go=r # TODO: check there are no sensitive things in here! + mode: u=rw,go= notify: Restart alertmanager - meta: flush_handlers From 43f4a8740bc7b9f8274b2ae14c597ec744f68d44 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 08:19:04 +0000 Subject: [PATCH 13/33] change alerts to ignore compute --- .../files/prometheus/rules/node-exporter.rules | 14 ++++++++------ .../common/inventory/group_vars/all/prometheus.yml | 3 +-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/environments/common/files/prometheus/rules/node-exporter.rules b/environments/common/files/prometheus/rules/node-exporter.rules index 6bee723b1..7fa4c66ca 100644 --- a/environments/common/files/prometheus/rules/node-exporter.rules +++ b/environments/common/files/prometheus/rules/node-exporter.rules @@ -1,4 +1,6 @@ -# Mostly derived from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware +# Mostly taken from https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware +# If modified, this is noted in a comment. +# # In general have ignored lack of resources (memory, cpu) on compute nodes as # this is expected, and ignored things which will be hard to threshold due to # the nature of a Slurm cluster. 
@@ -7,11 +9,11 @@ groups: - name: node-exporter rules: - # Modified: only on login/control nodes + # Modified: ignore compute nodes - alert: HostOutOfMemory expr: ( - node_memory_MemAvailable_bytes{group=~"login|control"} / - node_memory_MemTotal_bytes{group=~"login|control"} + node_memory_MemAvailable_bytes{group!~"compute"} / + node_memory_MemTotal_bytes{group!~"compute"} < .10 ) for: 2m @@ -88,9 +90,9 @@ groups: summary: Host software RAID disk failure (instance {{ $labels.instance }}) description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Modified: only on login/control nodes + # Modified: ignore compute nodes - alert: HostOomKillDetected - expr: (increase(node_vmstat_oom_kill{group=~"login|control"}[1m]) > 0) + expr: (increase(node_vmstat_oom_kill{group!~"compute"}[1m]) > 0) for: 0m labels: severity: warning diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 3381f22a7..0e7839601 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -27,9 +27,8 @@ prometheus_targets: control: "{{ groups.get('node_exporter', []) | intersect(groups['control']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'control') }}" login: "{{ groups.get('node_exporter', []) | intersect(groups['login']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'login') }}" compute: "{{ groups.get('node_exporter', []) | intersect(groups['compute']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'compute') }}" - # openhpc is defined as control+login+compute so this gets anything else: + # openhpc is defined as control+login+compute so this gets any other node exporter targets: other: "{{ groups.get('node_exporter', []) | difference(groups['openhpc']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'other') }}" - # TODO: check empty list gets coped with correctly! prometheus_scrape_configs_default: - job_name: "prometheus" From 4fee13c30e93dea1b9b62e4afc502071bc66fa0b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 08:27:33 +0000 Subject: [PATCH 14/33] update rules comments --- ansible/roles/alertmanager/README.md | 6 ++++++ environments/common/inventory/group_vars/all/prometheus.yml | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index c4ca25536..8798a7903 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -24,6 +24,8 @@ In general usage may only require: - Enabling the Slack integration (see section below). - Possibly setting `alertmanager_web_external_url`. +TODO: explain where alertmanager web GUI is + ## Role variables All variables are optional. 
See [defaults/main.yml](defaults/main.yml) for @@ -156,3 +158,7 @@ Some ideas for future alerts which could be useful: - smartctl-exporter-based rules for baremetal nodes where the is no infrastructure-level smart monitoring - loss of "up" network interfaces + + +TODO: suggest awesome alerts +TODO: note that child env rule files override parent envs diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 0e7839601..ba363018c 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -15,10 +15,10 @@ prometheus_alertmanager_config_default: - "{{ alertmanager_address }}:{{ alertmanager_port }}" prometheus_alertmanager_config: "{{ prometheus_alertmanager_config_default if groups['alertmanager'] else {} }}" -# by default, use rule files from the following path relative to current and all parent environment inventory directories: +# By default, find rule files from the following path relative to current and all parent environment inventory directories: +# Note: If the same file exists in parent and child environments, only the file in the latter has any effect. prometheus_alert_rules_files_inventory_glob: ../files/prometheus/rules/*.rules prometheus_alert_rules_files: "{{ ansible_inventory_sources | product([prometheus_alert_rules_files_inventory_glob]) | map('join', '/') | map('realpath') }}" -# TODO: find a way to include/exclude files? prometheus_alert_rules: [] From ccd3014fd10c533412e5159ca9b8a92f0c442c18 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 08:35:08 +0000 Subject: [PATCH 15/33] change alertmanager external web url to use host IP --- ansible/roles/alertmanager/README.md | 3 ++- ansible/roles/alertmanager/defaults/main.yml | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index 8798a7903..4558f3910 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -54,7 +54,8 @@ The following variables are equivalent to similarly-named arguments to the - `alertmanager_storage_path`: String, base path for data storage. - `alertmanager_web_listen_addresses`: List of strings, defining addresses to listeen on. - `alertmanager_web_external_url`: String, the URL under which Alertmanager is - externally reachable. See man page for more details if proxying alertmanager. + externally reachable - defaults to host IP address and `alertmanager_port`. + See man page for more details if proxying alertmanager. - `alertmanager_data_retention`: String, how long to keep data for - `alertmanager_data_maintenance_interval`: String, interval between garbage collection and snapshotting to disk of the silences and the notification logs. 
diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml index 9f5e30ba3..7ff8a9841 100644 --- a/ansible/roles/alertmanager/defaults/main.yml +++ b/ansible/roles/alertmanager/defaults/main.yml @@ -7,13 +7,13 @@ alertmanager_enabled: true alertmanager_system_user: alertmanager alertmanager_system_group: "{{ alertmanager_system_user }}" -alertmanager_config_file: /etc/alertmanager/alertmanager.yml # --config.file: Alertmanager configuration file name -alertmanager_storage_path: /var/lib/alertmanager # --storage.path: Base path for data storage +alertmanager_config_file: /etc/alertmanager/alertmanager.yml +alertmanager_storage_path: /var/lib/alertmanager alertmanager_port: '9093' alertmanager_web_listen_addresses: - ":{{ alertmanager_port }}" -alertmanager_web_external_url: "http://localhost:{{ alertmanager_port}}/" # TODO: is this right?? +alertmanager_web_external_url: "http://{{ hostvars[groups['alertmanager'].0].ansible_host }}:{{ alertmanager_port}}/" alertmanager_data_retention: '120h' alertmanager_data_maintenance_interval: '15m' @@ -29,7 +29,7 @@ alertmanager_config_template: alertmanager.yml.j2 alertmanager_null_receiver: name: 'null' -alertmanager_slack_receiver: {} # defined in common env as it needs prometheus_address +alertmanager_slack_receiver: {} # defined in environments/common/inventory/group_vars/all/alertmanager.yml as it needs prometheus_address alertmanager_extra_receivers: [] alertmanager_default_receivers: "{{ [alertmanager_null_receiver] + ([alertmanager_slack_receiver] if alertmanager_slack_integration is defined else []) }}" alertmanager_receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}" From a2c07e0b873a69a4c47872837b5665ee2e3e08fe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 08:46:14 +0000 Subject: [PATCH 16/33] fix up prom address in slack alert links --- ansible/roles/alertmanager/README.md | 1 + environments/common/inventory/group_vars/all/alertmanager.yml | 2 +- environments/common/inventory/group_vars/all/prometheus.yml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index 4558f3910..ded761db0 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -134,6 +134,7 @@ Swap: 0B 0B 0B /invite @YOUR_BOT_NAME +TODO: note that `prometheus_web_external_url` might need overriding too. 
## Alert Rules diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index 3c02598c6..616fac640 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -10,5 +10,5 @@ alertmanager_slack_receiver: # defined here as needs prometheus address authorization: credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}" text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" - title_link: "http://{{ prometheus_address }}:9090/alertmanager/#/alerts?receiver=slack-receiver" + title_link: "{{ prometheus_web_external_url }}/alerts?receiver=slack-receiver" send_resolved: true diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index ba363018c..2458c64ea 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -4,7 +4,7 @@ # for variable definitions prometheus_version: 2.27.0 # default from ansible/roles/cloudalchemy.prometheus/defaults/main.yml -prometheus_web_external_url: "http://{{ prometheus_address }}:9090" +prometheus_web_external_url: "http://{{ hostvars[groups['prometheus'].0].ansible_host }}:9090/" # default to host IP address prometheus_storage_retention: "31d" prometheus_storage_retention_size: "100GB" prometheus_db_dir: "{{ appliances_state_dir | default('/var/lib') }}/prometheus" From 99df07fb94cc60f5655fe00e6ec8bc583c3880a4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 10:56:59 +0000 Subject: [PATCH 17/33] alert on large Slurmdbd queue --- environments/common/files/prometheus/rules/slurm.rules | 3 --- .../common/inventory/group_vars/all/prometheus.yml | 8 +++++++- requirements.yml | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/environments/common/files/prometheus/rules/slurm.rules b/environments/common/files/prometheus/rules/slurm.rules index 474e5a19a..be17d66a2 100644 --- a/environments/common/files/prometheus/rules/slurm.rules +++ b/environments/common/files/prometheus/rules/slurm.rules @@ -14,6 +14,3 @@ groups: description: '{{ $value }} Slurm nodes are in fail status' summary: 'At least one Slurm node is failed.' 
expr: "slurm_nodes_fail > 0\n" - -# TODO: alert on slurm_scheduler_dbd_queue_size - see vpenso exporter, man sdiag, and MaxDBDMsgs -# but node its dynamic diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 2458c64ea..f81584f9a 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -20,7 +20,13 @@ prometheus_alertmanager_config: "{{ prometheus_alertmanager_config_default if gr prometheus_alert_rules_files_inventory_glob: ../files/prometheus/rules/*.rules prometheus_alert_rules_files: "{{ ansible_inventory_sources | product([prometheus_alert_rules_files_inventory_glob]) | map('join', '/') | map('realpath') }}" -prometheus_alert_rules: [] +prometheus_alert_rules: + - alert: SlurmDBDQueueLarge + # NB: {{ templates }} in annotations.description are interpolated by prometheus, in expr by ansible + annotations: + description: '{% raw %}Slurm DBD message queue size {{ $value }} is larger than half Slurm parameter MaxDBDMsgs - check database health{% endraw %}' + summary: 'Slurm DBD message queue is large.' + expr: "slurm_scheduler_dbd_queue_size > {{ hostvars[groups['control'].0].ansible_local.slurm.MaxDBDMsgs | int }}" # Can set a hostvar 'prometheus_env' to an arbitrary string to group prometheus targets, e.g. by rack. prometheus_targets: diff --git a/requirements.yml b/requirements.yml index 15a6e5c4b..4a181bec4 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.27.0 + version: feat/facts # TODO: bump to release name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 167d37eb8416fd445745cb9c3a479ed65b43371d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 12:58:53 +0000 Subject: [PATCH 18/33] don't alert on /run/credentials/systemd fs problems --- .../common/files/prometheus/rules/node-exporter.rules | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/environments/common/files/prometheus/rules/node-exporter.rules b/environments/common/files/prometheus/rules/node-exporter.rules index 7fa4c66ca..95c3ea9c9 100644 --- a/environments/common/files/prometheus/rules/node-exporter.rules +++ b/environments/common/files/prometheus/rules/node-exporter.rules @@ -44,8 +44,12 @@ groups: summary: Host out of inodes (instance {{ $labels.instance }}) description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Modified: ignore /run/credentials paths - alert: HostFilesystemDeviceError - expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1 + expr: node_filesystem_device_error{ + fstype!~"^(fuse.*|tmpfs|cifs|nfs)", + mountpoint!~"/run/credentials/.*" + } == 1 for: 2m labels: severity: critical From e6a4d3ca26752cabaf5e59c87c791f3c76c26c93 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 11 Apr 2025 12:59:09 +0000 Subject: [PATCH 19/33] add alertmanager docs --- ansible/roles/alertmanager/README.md | 81 +------------------- docs/alerting.md | 106 +++++++++++++++++++++++++++ docs/monitoring-and-logging.md | 22 ++++-- docs/production.md | 3 +- 4 files changed, 123 insertions(+), 89 deletions(-) create mode 100644 docs/alerting.md diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index 
ded761db0..e11d0693d 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -10,10 +10,6 @@ Note that: control node) is reimaged, so any alerts silenced via the GUI will reoccur. - No Grafana dashboard for alerts is currently provided. -- not used for caas - todo - maybe we should disable by default, unless everyone has slack? -- have fixed bug with `env` hostvar for prom, now called `prometheus_env` -- have added label "group" to prom for control,compute,login - Alertmanager is enabled by default on the `control` node in the [everything](../../../environments/common/layouts/everything) template which `cookiecutter` uses for a new environment's `inventory/groups` file. @@ -24,7 +20,7 @@ In general usage may only require: - Enabling the Slack integration (see section below). - Possibly setting `alertmanager_web_external_url`. -TODO: explain where alertmanager web GUI is +The web UI is available on `alertmanager_web_external_url`. ## Role variables @@ -89,78 +85,3 @@ The following variables are templated into the [alertmanager configuration](http - weekdays: ['monday:friday'] ``` Note that `route` and `receivers` keys should not be added here. - -## TODO - -memory usage looks a bit close: - -``` -[root@RL9-control rocky]# free -h - total used free shared buff/cache available -Mem: 3.6Gi 2.4Gi 168Mi 11Mi 1.5Gi 1.2Gi -Swap: 0B 0B 0B -``` - - - -## Slack Integration - -1. Create an app with a bot token: - -- Go to https://api.slack.com/apps -- select "Create an App" -- select "From scratch" -- Set app name and workspacef fields, select "Create" -- Fill out "Short description" and "Background color" fields, select "Save changes" -- Select "OAuth & Permissions" on left menu -- Under "Scopes : Bot Token Scopes", select "Add an OAuth Scope", add - `chat:write` and select "Save changes" -- Select "Install App" on left menu, select "Install to your-workspace", select Allow -- Copy the Bot User OAuth token shown - -2. Add the bot token into the config and enable Slurm integration - -- Open `environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml` -- Uncomment `vault_alertmanager_slack_integration_app_creds` and add the token -- Vault-encrypt that file: - - ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml - -- Open `environments/$ENV/inventory/group_vars/all/alertmanager.yml` -- Uncomment the `alertmanager_slack_integration` mapping and set your alert channel name - -3. Invite the bot to your alerts channel -- In the appropriate Slack channel type: - - /invite @YOUR_BOT_NAME - -TODO: note that `prometheus_web_external_url` might need overriding too. - -## Alert Rules - -These are part of [Prometheus configuration](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/), defined in the appliance at -[environments/common/inventory/group_vars/all/prometheus.yml](../../../environments/common/inventory/group_vars/all/prometheus.yml). - -A fairly-minimal set of default alert rule files is provided at -`environments/common/files/prometheus/rules/`. Because compute nodes are expected -to operate with heavy CPU and memory load, no alerting on those parameters is -defined for those nodes. - -By default `prometheus_alert_rules_files` is set such that any `*.rules` files -in a directory `files/prometheus/rules` in the current environment or *any* -parent environment are loaded. 
So usually, site-specific alerts should be added -by creating additional rules files in `environments/site/files/prometheus/rules`. - -Note that the Prometheus targets are defined such that each node will have labels: - - `env`: `ungrouped`, by default, unless a group/host var `prometheus_env` is set - - `group`: One of `login`, `control`, `compute` or `other` -These may be used to limit alerts to specific sets of nodes. - -Some ideas for future alerts which could be useful: -- smartctl-exporter-based rules for baremetal nodes where the is no - infrastructure-level smart monitoring -- loss of "up" network interfaces - - -TODO: suggest awesome alerts -TODO: note that child env rule files override parent envs diff --git a/docs/alerting.md b/docs/alerting.md new file mode 100644 index 000000000..3c0401e4c --- /dev/null +++ b/docs/alerting.md @@ -0,0 +1,106 @@ +# Alerting + +The [prometheus.io docs](https://prometheus.io/docs/alerting/latest/overview/) +describe the overall alerting process: + +> Alerting with Prometheus is separated into two parts. Alerting rules in + Prometheus servers send alerts to an Alertmanager. The Alertmanager then + manages those alerts, including silencing, inhibition, aggregation and + sending out notifications via methods such as email, on-call notification + systems, and chat platforms. + +By default, both a `prometheus` server and an `alertmanager` server are +deployed on the control node for new environments: + +```ini +# environments/site/groups: +[prometheus:children] +control + +[alertmanager:children] +control +``` + +The general Prometheus configuration is described in +[monitoring-and-logging.md](./monitoring-and-logging.md#defaults-3) - note this +section specifies some role variables which commonly need modification. + +The alertmanager server is defined by the [ansible/roles/alertmanager](../ansible/roles/alertmanager/README.md), +and all the configuration options and defaults are defined there. By default +it will be fully functional but: +- `alertmanager_web_external_url` is likely to require modification. +- A [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) + must be defined to actually provide notifications. Currently a Slack receiver + integration is provided (see below) but alternative receivers + could be defined using the provided role variables. + +## Slack receiver + +This section describes how to enable the Slack receiver to provide notifications +of alerts via Slack. + +1. Create an app with a bot token: + +- Go to https://api.slack.com/apps +- select "Create an App" +- select "From scratch" +- Set app name and workspace fields, select "Create" +- Fill out "Short description" and "Background color" fields, select "Save changes" +- Select "OAuth & Permissions" on left menu +- Under "Scopes : Bot Token Scopes", select "Add an OAuth Scope", add + `chat:write` and select "Save changes" +- Select "Install App" on left menu, select "Install to your-workspace", select Allow +- Copy the Bot User OAuth token shown + +2. 
Add the bot token into the config and enable Slack integration: + +- Open `environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml` +- Uncomment `vault_alertmanager_slack_integration_app_creds` and add the token +- Vault-encrypt that file: + + ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml + +- Open `environments/$ENV/inventory/group_vars/all/alertmanager.yml` +- Uncomment the `alertmanager_slack_integration` mapping and set your alert channel name + +3. Invite the bot to your alerts channel +- In the appropriate Slack channel type: + + /invite @YOUR_BOT_NAME + + +## Alerting Rules + +These are part of [Prometheus configuration](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) +which is defined appliance at +[environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). + +Two `cloudalchemy.prometheus` role variables are relevant: +- `prometheus_alert_rules_files`: Paths to check for files providing rules. + Note these are copied to Prometheus config directly, so jinja expressions for + Prometheus do not need escaping. +- `prometheus_alert_rules`: Yaml-format rules. Jinja templating here will be +interpolated by Ansible, so templating intended for Prometheus must be escaped +using `{% raw %}`/`{% endraw %}` tags. + +By default, `prometheus_alert_rules_files` is set so that any `*.rules` files +in a directory `files/prometheus/rules` in the current environment or *any* +parent environment are loaded. So usually, site-specific alerts should be added +by creating additional rules files in `environments/site/files/prometheus/rules`. +If the same file exists in more than one environment, the "child" file will take +precedence and any rules in the "parent" file will be ignored. + +A set of default alert rule files is provided at `environments/common/files/prometheus/rules/`. +These cover: +- Some node-exporter metrics for disk, filesystems, memory and clock. Note + no alerts are triggered on memory for compute nodes due to the intended use + of those nodes. +- Slurm nodes in DOWN or FAIL states, or the Slurm DBD message queue being too + large, usually indicating a database problem. + +When defining additional rules, note the [labels defined](./monitoring-and-logging.md#prometheus_node_exporter_targets) for node-exporter targets. + +In future more alerts may be added for: +- smartctl-exporter-based rules for baremetal nodes where there is no + infrastructure-level smart monitoring +- loss of "up" network interfaces diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index db228d410..6913c285f 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -215,6 +215,12 @@ Internally, we use the [cloudalchemy.prometheus](https://github.com/cloudalchemy > [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) +Prometheus will be functional by default but the following variables should +commonly be modified: +- `prometheus_web_external_url` +- `prometheus_storage_retention` +- `prometheus_storage_retention_size` + ### Placement The `prometheus` group determines the placement of the prometheus service. Load balancing is currently unsupported so it is important that you only assign one host to this group. 
@@ -240,12 +246,7 @@ This appliance provides a default set of recording rules which can be found here The intended purpose is to pre-compute some expensive queries that are used in the reference set of grafana dashboards. -To add new, or to remove rules you will be to adjust the `prometheus_alert_rules_files` variable. The default value can be found in: - -> [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) - -You can extend this variable in your environment specific configuration to reference extra files or to remove the defaults. The reference set of dashboards expect these variables to be defined, so if you remove them, you -will also have to update your dashboards. +For information on configuring alerting rules see [docs/alerting.md#alerting-rules](./alerting.md#alerting-rules). ### node_exporter @@ -273,7 +274,14 @@ Variables in this file should *not* be customised directly, but should be overri #### prometheus_node_exporter_targets -Groups prometheus targets into per environment groups. The ansible variable, `env` is used to determine the grouping. The metrics for each target in the group are given the prometheus label, `env: $env`, where `$env` is the value of the `env` variable for that host. +Groups prometheus targets. Metrics from `node_exporter` hosts have two labels +applied: + - `env`: This is set from the Ansible variable `prometheus_env` if present + (e.g. from hostvars or groupvars), defaulting to `ungrouped`. This can be + used to group metrics by some arbitrary "environment", e.g. rack. + - `group`: This refers to the "top-level" inventory group for the host and + is one of `control`, `login`, `compute` or `other`. This can be used to + define rules for specific host functionalities. ## slurm-stats diff --git a/docs/production.md b/docs/production.md index 7e8130de3..7fcff1d7e 100644 --- a/docs/production.md +++ b/docs/production.md @@ -150,5 +150,4 @@ and referenced from the `site` and `production` environments, e.g.: In general it should be possible to raise this value to 50-100 if the cloud is properly tuned, again, demonstrated through testing. -- Enable alertmanager following the [role docs](../ansible/roles/alertmanager/README.md) - if Slack is available. +- Enable alertmanager if Slack is available - see [docs/alerting.md](./alerting.md). 
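As an illustration of the site-specific rules mechanism described in the new `docs/alerting.md`, a file such as `environments/site/files/prometheus/rules/site.rules` might contain something like the following (the file name, metric and threshold are examples only, not part of this change):

```yaml
# Example site-specific alert rule; adjust the metric and threshold to suit.
groups:
  - name: site
    rules:
      - alert: LoginNodeHighLoad
        # 'group' is one of the labels the appliance applies to node_exporter targets
        expr: node_load15{group="login"} > 10
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: High load on login node (instance {{ $labels.instance }})
          description: "15m load average is {{ $value }}"
```

Because rule files are copied to the Prometheus configuration directly, the `{{ ... }}` templating above does not need escaping from Ansible.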
From 8d04c50508f29d94a3bad7f37c3152a6783551f9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 15 Apr 2025 07:45:17 +0000 Subject: [PATCH 20/33] guard alertmanager install --- ansible/fatimage.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 983aa5f10..0b4335b14 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -182,6 +182,7 @@ include_role: name: alertmanager tasks_from: install.yml + when: "'alertmanager' in group_names" - hosts: prometheus become: yes From c8d761cad94c2f956e54a40d9186a418ed49a59c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 15 Apr 2025 09:04:02 +0000 Subject: [PATCH 21/33] fix unused turbovcn service crashing --- ansible/roles/openondemand/tasks/vnc_compute.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index 6ec340249..8b6f6cdec 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -35,6 +35,13 @@ - python3.9 - dbus-x11 +- name: Stop turbovnc service + # This is not actually required + systemd: + name: tvncserver + state: stopped + enabled: false + - name: Replace OFED-installed init scripts ansible.builtin.copy: src: /etc/init.d.orig/ # trailing / to get contents From ba1a95e02be3c323ffcbaaacc576bd6c954b7371 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 15 Apr 2025 10:00:34 +0000 Subject: [PATCH 22/33] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 3f449a8bf..073854533 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250409-0953-f5aefb1e", - "RL9": "openhpc-RL9-250409-0953-f5aefb1e" + "RL8": "openhpc-RL8-250415-0906-c8d761ca", + "RL9": "openhpc-RL9-250415-0906-c8d761ca" } } From 5c3e93c07952289400af190dd90cc04fdf8ff0fe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 15 Apr 2025 11:12:52 +0000 Subject: [PATCH 23/33] add basic auth with default user for alertmanager --- ansible/roles/alertmanager/README.md | 14 +++- ansible/roles/alertmanager/defaults/main.yml | 11 ++- .../roles/alertmanager/tasks/configure.yml | 11 ++- .../templates/alertmanager.service.j2 | 1 + ansible/roles/passwords/defaults/main.yml | 1 + docs/alerting.md | 78 ++++++++++++++----- .../inventory/group_vars/all/alertmanager.yml | 2 + 7 files changed, 91 insertions(+), 27 deletions(-) diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index e11d0693d..4cd6a6062 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -45,8 +45,10 @@ General variables: The following variables are equivalent to similarly-named arguments to the `alertmanager` binary. See `man alertmanager` for more info: -- `alertmanager_config_file`: String, path alertmanager config file will be - written to. Parent directory will be created if necessary. +- `alertmanager_config_file`: String, path the main alertmanager config file + will be written to. Parent directory will be created if necessary. +- `alertmanager_web_config_file`: String, path alertmanager web config file + will be written to. 
Parent directory will be created if necessary. - `alertmanager_storage_path`: String, base path for data storage. - `alertmanager_web_listen_addresses`: List of strings, defining addresses to listeen on. - `alertmanager_web_external_url`: String, the URL under which Alertmanager is @@ -59,7 +61,7 @@ The following variables are equivalent to similarly-named arguments to the alertmanager commandline as `--{{ key }}={{ value }}`. - `alertmanager_default_receivers`: -The following variables are templated into the [alertmanager configuration](https://prometheus.io/docs/alerting/latest/configuration/): +The following variables are templated into the alertmanager [main configuration](https://prometheus.io/docs/alerting/latest/configuration/): - `alertmanager_config_template`: String, path to configuration template. The default is to template in `alertmanager_config_default` and `alertmanager_config_extra`. - `alertmanager_config_default`: Mapping with default configuration for the @@ -85,3 +87,9 @@ The following variables are templated into the [alertmanager configuration](http - weekdays: ['monday:friday'] ``` Note that `route` and `receivers` keys should not be added here. + +The following variables are templated into the alertmanager [web configuration](https://prometheus.io/docs/alerting/latest/https/): +- `alertmanager_web_config_default`: Mapping with default configuration for + `basic_auth_users` providing the default web user. +- `alertmanager_alertmanager_web_config_extra`: Mapping with additional web + configuration. Keys in this become top-level keys in the web configuration. diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml index 7ff8a9841..cde4c355f 100644 --- a/ansible/roles/alertmanager/defaults/main.yml +++ b/ansible/roles/alertmanager/defaults/main.yml @@ -8,19 +8,26 @@ alertmanager_enabled: true alertmanager_system_user: alertmanager alertmanager_system_group: "{{ alertmanager_system_user }}" alertmanager_config_file: /etc/alertmanager/alertmanager.yml +alertmanager_web_config_file: /etc/alertmanager/alertmanager-web.yml alertmanager_storage_path: /var/lib/alertmanager alertmanager_port: '9093' alertmanager_web_listen_addresses: - ":{{ alertmanager_port }}" -alertmanager_web_external_url: "http://{{ hostvars[groups['alertmanager'].0].ansible_host }}:{{ alertmanager_port}}/" +alertmanager_web_external_url: '' # defined in environments/common/inventory/group_vars/all/alertmanager.yml for visibility alertmanager_data_retention: '120h' alertmanager_data_maintenance_interval: '15m' alertmanager_config_flags: {} # other command-line parameters as shown by `man alertmanager` alertmanager_config_template: alertmanager.yml.j2 +alertmanager_web_config_template: alertmanager-web.yml.j2 -# everything below here is interpolated into alertmanager_config_default: +alertmanager_web_config_default: + basic_auth_users: + alertmanager: "{{ vault_alertmanager_admin_password | password_hash('bcrypt', '1234567890123456789012', ident='2b') }}" +alertmanager_alertmanager_web_config_extra: {} # top-level only + +# Variables below are interpolated into alertmanager_config_default: # Uncomment below and add Slack bot app creds for Slack integration # alertmanager_slack_integration: diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml index 7028147d5..a43ec2041 100644 --- a/ansible/roles/alertmanager/tasks/configure.yml +++ b/ansible/roles/alertmanager/tasks/configure.yml @@ -7,6 +7,7 @@ mode: 
u=rwX,go=rX loop: - "{{ alertmanager_config_file | dirname }}" + - "{{ alertmanager_web_config_file | dirname }}" - "{{ alertmanager_storage_path }}" - name: Create alertmanager service file with immutable options @@ -19,7 +20,6 @@ register: _alertmanager_service notify: Restart alertmanager - - name: Template alertmanager config ansible.builtin.template: src: "{{ alertmanager_config_template }}" @@ -29,6 +29,15 @@ mode: u=rw,go= notify: Restart alertmanager +- name: Template alertmanager web config + ansible.builtin.template: + src: "{{ alertmanager_web_config_template }}" + dest: "{{ alertmanager_web_config_file }}" + owner: "{{ alertmanager_system_user }}" + group: "{{ alertmanager_system_group }}" + mode: u=rw,go= + notify: Restart alertmanager + - meta: flush_handlers - name: Ensure alertmanager service state diff --git a/ansible/roles/alertmanager/templates/alertmanager.service.j2 b/ansible/roles/alertmanager/templates/alertmanager.service.j2 index e58382cc0..ac457aaea 100644 --- a/ansible/roles/alertmanager/templates/alertmanager.service.j2 +++ b/ansible/roles/alertmanager/templates/alertmanager.service.j2 @@ -24,6 +24,7 @@ ExecStart={{ alertmanager_binary_dir }}/alertmanager \ --web.listen-address={{ address }} \ {% endfor %} --web.external-url={{ alertmanager_web_external_url }} \ + --web.config.file={{ alertmanager_web_config_file }} \ {% for flag, flag_value in alertmanager_config_flags.items() %} --{{ flag }}={{ flag_value }} \ {% endfor %} diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 0dbe66dd8..95e3b6aca 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -11,6 +11,7 @@ slurm_appliance_secrets: vault_k3s_node_password: "{{ vault_k3s_node_password | default(lookup('ansible.builtin.password', '/dev/null', length=64)) }}" vault_pulp_admin_password: "{{ vault_pulp_admin_password | default(lookup('password', '/dev/null', chars=['ascii_letters', 'digits'])) }}" vault_demo_user_password: "{{ vault_demo_user_password | default(lookup('password', '/dev/null')) }}" + vault_alertmanager_admin_password: "{{ vault_alertmanager_admin_password | default(lookup('password', '/dev/null')) }}" secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" diff --git a/docs/alerting.md b/docs/alerting.md index 3c0401e4c..b53c0fa40 100644 --- a/docs/alerting.md +++ b/docs/alerting.md @@ -9,30 +9,65 @@ describe the overall alerting process: sending out notifications via methods such as email, on-call notification systems, and chat platforms. -By default, both a `prometheus` server and an `alertmanager` server are -deployed on the control node for new environments: +The general Prometheus configuration is described in +[monitoring-and-logging.md](./monitoring-and-logging.md#defaults-3) - note that +section specifies some role variables which commonly need modification. + +The alertmanager server is defined by the [ansible/roles/alertmanager](../ansible/roles/alertmanager/README.md), +and all the configuration options and defaults are defined there. The defaults +are fully functional, except that a [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) +must be configured to generate notifications. + +## Enabling alertmanager + +1. 
Ensure both the `prometheus` and `alertmanager` servers are deployed on the +control node - for new environments the `cookiecutter` tool will have done +this: + + ```ini + # environments/site/groups: + [prometheus:children] + control + + [alertmanager:children] + control + ``` + +2. If the appliance was deployed before the alertmanager functionality was included, +generate a password for the alertmanager UI user: -```ini -# environments/site/groups: -[prometheus:children] -control + ```shell + ansible-playbook ansible/adhoc/generate-passwords.yml + ``` -[alertmanager:children] -control +3. Configure a receiver to generate notifications from alerts. Currently a Slack +integration is provided (see below) but alternative receivers could be defined +via overriding role defaults. + +4. If desired, any other [role defaults](../ansible/roles/alertmanager/README.md) +may be overriden in e.g. `environments/site/inventory/group_vars/all/alertmanager.yml`. + +5. Run the `monitoring.yml` playbook (if the cluster is already up) to configure +both alertmanager and prometheus: + + ```shell + ansible-playbook ansible/monitoring.yml + ``` + +## Access + +There is a web interface provided by the alertmanager server. The default +address can be seen using: + +```shell +ansible localhost -m debug -a var=alertmanager_web_external_url ``` -The general Prometheus configuration is described in -[monitoring-and-logging.md](./monitoring-and-logging.md#defaults-3) - note this -section specifies some role variables which commonly need modification. +The user is `alertmanager` and the autogenerated password can be seen using: -The alertmanager server is defined by the [ansible/roles/alertmanager](../ansible/roles/alertmanager/README.md), -and all the configuration options and defaults are defined there. By default -it will be fully functional but: -- `alertmanager_web_external_url` is likely to require modification. -- A [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) - must be defined to actually provide notifications. Currently a Slack receiver - integration is provided (see below) but alternative receivers - could be defined using the provided role variables. +```shell +ansible localhost -m debug -a var=vault_alertmanager_admin_password +``` ## Slack receiver @@ -72,10 +107,11 @@ of alerts via Slack. ## Alerting Rules These are part of [Prometheus configuration](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -which is defined appliance at +which is defined for the appliance at [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). -Two `cloudalchemy.prometheus` role variables are relevant: +Two [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) +role variables are relevant: - `prometheus_alert_rules_files`: Paths to check for files providing rules. Note these are copied to Prometheus config directly, so jinja expressions for Prometheus do not need escaping. 
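For instance, a rule supplied via `prometheus_alert_rules` would need its Prometheus templating escaped as below (an illustrative rule, not one added by this series):

```yaml
prometheus_alert_rules:
  - alert: ExporterDown
    expr: up == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Prometheus scrape target down'
      description: '{% raw %}{{ $labels.job }} target {{ $labels.instance }} has been unreachable for 5 minutes{% endraw %}'
```

Without the `{% raw %}` tags, Ansible would try to interpolate `{{ $labels.job }}` itself when templating the Prometheus configuration.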
diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index 616fac640..55bba998b 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -12,3 +12,5 @@ alertmanager_slack_receiver: # defined here as needs prometheus address text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" title_link: "{{ prometheus_web_external_url }}/alerts?receiver=slack-receiver" send_resolved: true + +alertmanager_web_external_url: "http://{{ hostvars[groups['alertmanager'].0].ansible_host }}:{{ alertmanager_port}}/" From d876471a5e7a6b8fd49d5bae8de462e649641b77 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 15 Apr 2025 13:45:51 +0000 Subject: [PATCH 24/33] add missing alertmanager web config template --- ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 diff --git a/ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 b/ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 new file mode 100644 index 000000000..ba69f1694 --- /dev/null +++ b/ansible/roles/alertmanager/templates/alertmanager-web.yml.j2 @@ -0,0 +1,4 @@ +{{ ansible_managed | comment }} + +{{ alertmanager_web_config_default | to_nice_yaml }} +{{ alertmanager_alertmanager_web_config_extra | to_nice_yaml if alertmanager_alertmanager_web_config_extra | length > 0 else '' }} From 86ae3098416cfbc602d3149b4773c7653a1709a7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 15 Apr 2025 15:03:34 +0000 Subject: [PATCH 25/33] fix CI for secrets changing between PRs --- .github/workflows/stackhpc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 348edff35..9d8de76d7 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -109,7 +109,6 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook ansible/adhoc/generate-passwords.yml echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} @@ -135,6 +134,7 @@ jobs: . venv/bin/activate . environments/.stackhpc/activate ansible all -m wait_for_connection + ansible-playbook ansible/adhoc/generate-passwords.yml ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml @@ -170,6 +170,7 @@ jobs: . venv/bin/activate . 
environments/.stackhpc/activate ansible all -m wait_for_connection + ansible-playbook ansible/adhoc/generate-passwords.yml ansible-playbook -v ansible/site.yml ansible-playbook -v ansible/ci/check_slurm.yml From d7efaf6fadc602545e13337f6f6de30526a87ee9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 16 Apr 2025 11:40:04 +0000 Subject: [PATCH 26/33] fix bug with json-encoded munge key in compute-init playbook --- ansible/roles/compute_init/files/compute-init.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 139726c83..086585a8d 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -299,7 +299,9 @@ # if not the case - name: Write Munge key copy: - content: "{{ openhpc_munge_key }}" + # NB: openhpc_munge_key is *binary* and may not survive json encoding + # so do same as environments/common/inventory/group_vars/all/openhpc.yml + content: "{{ vault_openhpc_mungekey | b64decode }}" dest: "/etc/munge/munge.key" owner: munge group: munge From 2ccf04140e9a7d9b967abd39799ed91724e5ab95 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 16 Apr 2025 12:23:09 +0000 Subject: [PATCH 27/33] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 073854533..b6b367fcd 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250415-0906-c8d761ca", - "RL9": "openhpc-RL9-250415-0906-c8d761ca" + "RL8": "openhpc-RL8-250416-1144-d7efaf6f", + "RL9": "openhpc-RL9-250416-1145-d7efaf6f" } } From 3bbb02f1e2a2cb51bbdabb70e4eac8ca911bba16 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Apr 2025 11:18:02 +0000 Subject: [PATCH 28/33] add extra prom alertmanager config + fix bug in same --- environments/common/inventory/group_vars/all/prometheus.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index f81584f9a..eccf4d6d3 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -13,7 +13,8 @@ prometheus_alertmanager_config_default: - static_configs: - targets: - "{{ alertmanager_address }}:{{ alertmanager_port }}" -prometheus_alertmanager_config: "{{ prometheus_alertmanager_config_default if groups['alertmanager'] else {} }}" +prometheus_alertmanager_config_extra: [] +prometheus_alertmanager_config: "{{ (prometheus_alertmanager_config_default if groups['alertmanager'] else []) + prometheus_alertmanager_config_extra }}" # By default, find rule files from the following path relative to current and all parent environment inventory directories: # Note: If the same file exists in parent and child environments, only the file in the latter has any effect. 
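A sketch of how the `prometheus_alertmanager_config_extra` hook added above might be used, for example to have Prometheus send alerts to an additional alertmanager as well as the one deployed by the appliance (the target address is a placeholder):

```yaml
# e.g. in a site environment's group_vars for the prometheus group
prometheus_alertmanager_config_extra:
  - static_configs:
    - targets:
      - 'alertmanager.example.org:9093'
```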
From 28ccabf69ea2bee612d1b6d82955ac6b6bd6d31b Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 22 Apr 2025 11:24:52 +0000
Subject: [PATCH 29/33] make slack alertmanager receiver more configurable

---
 ansible/roles/alertmanager/README.md                  | 2 ++
 ansible/roles/alertmanager/defaults/main.yml          | 2 +-
 .../common/inventory/group_vars/all/alertmanager.yml  | 8 +++++---
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md
index 4cd6a6062..612761731 100644
--- a/ansible/roles/alertmanager/README.md
+++ b/ansible/roles/alertmanager/README.md
@@ -74,6 +74,8 @@ The following variables are templated into the alertmanager [main configuration]
   mappings to add, by default empty.
 - `alertmanager_slack_receiver`: Mapping defining the [Slack receiver](https://prometheus.io/docs/alerting/latest/configuration/#slack_config).
   Note the default configuration for this is in `environments/common/inventory/group_vars/all/alertmanager.yml`.
+- `alertmanager_slack_receiver_name`: String, name for the above Slack receiver.
+- `alertmanager_slack_receiver_send_resolved`: Bool, whether to send resolved alerts via the above Slack receiver.
 - `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined.
 - `alertmanager_config_extra`: Mapping with additional configuration. Keys in
   this become top-level keys in the configuration. E.g this might be:
diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml
index cde4c355f..b30301739 100644
--- a/ansible/roles/alertmanager/defaults/main.yml
+++ b/ansible/roles/alertmanager/defaults/main.yml
@@ -44,7 +44,7 @@ alertmanager_receivers: "{{ alertmanager_default_receivers + alertmanager_extra
 alertmanager_config_default:
   route:
     group_by: ['...']
-    receiver: "{{ 'slack-receiver' if alertmanager_slack_integration is defined else 'null' }}"
+    receiver: "{{ alertmanager_slack_receiver_name if alertmanager_slack_integration is defined else 'null' }}"
   receivers: "{{ alertmanager_receivers }}"
 
 alertmanager_config_extra: {} # top-level only
diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml
index 55bba998b..c677aaa29 100644
--- a/environments/common/inventory/group_vars/all/alertmanager.yml
+++ b/environments/common/inventory/group_vars/all/alertmanager.yml
@@ -1,8 +1,10 @@
 alertmanager_port: '9093' # defined here as required for prometheus
 
+alertmanager_slack_receiver_name: slack-receiver
+alertmanager_slack_receiver_send_resolved: true
 alertmanager_slack_receiver: # defined here as needs prometheus address
-  name: slack-receiver
+  name: "{{ alertmanager_slack_receiver_name }}"
   slack_configs:
     - channel: "{{ alertmanager_slack_integration.channel | default('none') }}"
       api_url: https://slack.com/api/chat.postMessage
@@ -10,7 +12,7 @@ alertmanager_slack_receiver: # defined here as needs prometheus address
       authorization:
         credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}"
       text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}"
-      title_link: "{{ prometheus_web_external_url }}/alerts?receiver=slack-receiver"
-      send_resolved: true
+      title_link: "{{ prometheus_web_external_url }}/alerts?receiver={{ alertmanager_slack_receiver_name }}"
+      send_resolved: "{{
alertmanager_slack_receiver_send_resolved }}" alertmanager_web_external_url: "http://{{ hostvars[groups['alertmanager'].0].ansible_host }}:{{ alertmanager_port}}/" From 903353299504a641c29ef6765645ec965db0c155 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Apr 2025 13:22:12 +0000 Subject: [PATCH 30/33] bump openhpc role to get facts for alert config --- requirements.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.yml b/requirements.yml index 4a181bec4..87b2a6263 100644 --- a/requirements.yml +++ b/requirements.yml @@ -4,7 +4,7 @@ roles: version: v25.3.2 name: stackhpc.nfs - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: feat/facts # TODO: bump to release + version: v0.28.0 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc From 6820a37eaeee1da7434370961644ca5ab59dd08f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Apr 2025 13:24:28 +0000 Subject: [PATCH 31/33] remove empty alertmanager tasks file --- ansible/roles/alertmanager/tasks/main.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 ansible/roles/alertmanager/tasks/main.yml diff --git a/ansible/roles/alertmanager/tasks/main.yml b/ansible/roles/alertmanager/tasks/main.yml deleted file mode 100644 index e69de29bb..000000000 From 41c7331be442db6b09b54b13ade526fb2e9c3cca Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Apr 2025 14:05:48 +0000 Subject: [PATCH 32/33] bump CI image --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index b6b367fcd..56d3e237c 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250416-1144-d7efaf6f", - "RL9": "openhpc-RL9-250416-1145-d7efaf6f" + "RL8": "openhpc-RL8-250422-1328-1a6eff86", + "RL9": "openhpc-RL9-250422-1328-1a6eff86" } } From cad147f87862ebee0e91b6ea73d6e510488bf4ab Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 22 Apr 2025 16:00:25 +0000 Subject: [PATCH 33/33] fix promethes auth to alertmanager --- .../common/inventory/group_vars/all/prometheus.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index eccf4d6d3..6c40e66bb 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -10,9 +10,13 @@ prometheus_storage_retention_size: "100GB" prometheus_db_dir: "{{ appliances_state_dir | default('/var/lib') }}/prometheus" prometheus_alertmanager_config_default: - - static_configs: - - targets: - - "{{ alertmanager_address }}:{{ alertmanager_port }}" + - static_configs: + - targets: + - "{{ alertmanager_address }}:{{ alertmanager_port }}" + basic_auth: + username: alertmanager + password: "{{ vault_alertmanager_admin_password }}" + prometheus_alertmanager_config_extra: [] prometheus_alertmanager_config: "{{ (prometheus_alertmanager_config_default if groups['alertmanager'] else []) + prometheus_alertmanager_config_extra }}"
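Taken together, enabling Slack notifications for a site environment then comes down to overrides along these lines (the channel is an example, and the vault variable is assumed to have been populated and encrypted as described in `docs/alerting.md`):

```yaml
# environments/site/inventory/group_vars/all/alertmanager.yml (illustrative)
alertmanager_slack_integration:
  channel: '#alerts'
  app_creds: "{{ vault_alertmanager_slack_integration_app_creds }}"

# optional tuning via the variables added in this series
alertmanager_slack_receiver_send_resolved: false
```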