Skip to content

Commit 52e4daa

Browse files
committed
WIP: add alertmanager
1 parent 6bf1dbb commit 52e4daa

File tree

12 files changed

+206
-1
lines changed

12 files changed

+206
-1
lines changed

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,5 @@ roles/*
8888
!roles/slurm_tools/**
8989
!roles/gateway/
9090
!roles/gateway/**
91+
!roles/alertmanager/
92+
!roles/alertmanager/**

ansible/monitoring.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,19 @@
8686
grafana_dashboards: []
8787
- import_role: # done in same play so it can use handlers from cloudalchemy.grafana
8888
name: grafana-dashboards
89+
90+
# Play: install, then configure, alertmanager on hosts in the `alertmanager`
# group, using the install.yml and configure.yml task files of the role.
- name: Deploy alertmanager
  hosts: alertmanager
  gather_facts: false
  become: true
  tags: alertmanager
  tasks:
    # TODO: move elsewhere, still needs become
    - name: Install alertmanager
      include_role:
        name: alertmanager
        tasks_from: install.yml

    - name: Configure alertmanager
      include_role:
        name: alertmanager
        tasks_from: configure.yml
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# alertmanager
2+
3+
4+
notes:
5+
- HA is not supported
6+
- state ("notification state and configured silences") is not preserved across rebuild
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Release to download, and the checksum the downloaded tarball must match.
alertmanager_version: '0.28.1'
alertmanager_download_checksum: 'sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311'
alertmanager_download_dest: /tmp/alertmanager.tar.gz
alertmanager_binary_dir: /usr/local/bin
# Desired state of the systemd service.
alertmanager_started: true
alertmanager_enabled: true

# Account owning the config and storage directories and running the service.
alertmanager_system_user: alertmanager
alertmanager_system_group: alertmanager
alertmanager_config_path: /etc/alertmanager/alertmanager.yml
alertmanager_storage_dir: /var/lib/alertmanager
# One --web.listen-address flag is emitted per entry. The port must agree with
# alertmanager_web_external_url; 9093 is alertmanager's default port, whereas
# 9100 is node_exporter's default and would clash on hosts running both.
alertmanager_web_listen_addresses:
  - ':9093'
alertmanager_web_external_url: http://localhost:9093/
# Extra command-line flags, rendered into the unit file as --<key>=<value>.
alertmanager_config_flags: {}
# TODO: data retention?
alertmanager_config_template: alertmanager.yml.j2
18+
19+
20+
# All variables from here down are interpolated into
# alertmanager_config_default.

# For Slack integration, uncomment the lines below and add the Slack bot app
# creds:
# alertmanager_slack_integration:
#   channel: '#alerts'
#   app_creds:

alertmanager_default_receivers:
  - name: 'null'

# Placeholder: really defined in common as it needs prometheus_address
alertmanager_slack_receiver: {}

alertmanager_extra_receivers: "{{ [alertmanager_slack_receiver] if alertmanager_slack_integration is defined else [] }}"

alertmanager_config_default:
  route:
    group_by:
      - '...'
    receiver: "{{ 'slack-receiver' if alertmanager_slack_integration is defined else 'null' }}"
  receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}"

# top-level only
alertmanager_config_extra: {}

# TODO: routes??
# TODO: see PR with additional alerts
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Handler: restart alertmanager after its unit file or config has changed.
# daemon_reload is requested only when the task registered as
# _alertmanager_service reported a change (false if that variable is unset).
# The handler is skipped when alertmanager_started is false.
- name: Restart alertmanager
  ansible.builtin.systemd:
    name: alertmanager
    state: restarted
    daemon_reload: "{{ _alertmanager_service.changed | default(false) }}"
  when: alertmanager_started | bool
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Configure alertmanager: create its directories, template the systemd unit
# and the alertmanager config, then bring the service to the requested state.

- name: Create alertmanager directories
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: "{{ alertmanager_system_user }}"
    group: "{{ alertmanager_system_group }}"
    mode: u=rwX,go=rX
  loop:
    - "{{ alertmanager_config_path | dirname }}"
    - "{{ alertmanager_storage_dir }}"

# TODO: selinux?

- name: Create alertmanager service file with immutable options
  ansible.builtin.template:
    src: alertmanager.service.j2
    dest: /usr/lib/systemd/system/alertmanager.service
    owner: root
    group: root
    mode: u=rw,go=r
  # Registered so later tasks/handlers know whether a daemon-reload is needed.
  register: _alertmanager_service
  notify: Restart alertmanager
  # TODO: how do we cope with the binary changing?

- name: Template alertmanager config
  ansible.builtin.template:
    src: "{{ alertmanager_config_template }}"
    dest: "{{ alertmanager_config_path }}"
    owner: "{{ alertmanager_system_user }}"
    group: "{{ alertmanager_system_group }}"
    # Not readable by other users: the rendered config may carry receiver
    # credentials (e.g. Slack app creds). The service reads it as the owner.
    mode: u=rw,go=
  notify: Restart alertmanager

# Run any pending restart now, before the final state is enforced below.
- name: Flush handlers
  ansible.builtin.meta: flush_handlers

- name: Ensure alertmanager service state
  ansible.builtin.systemd:
    name: alertmanager
    # The restart handler is skipped when alertmanager_started is false, so
    # reload systemd here too if the unit file changed.
    daemon_reload: "{{ _alertmanager_service.changed | default(false) }}"
    state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}"
    enabled: "{{ alertmanager_enabled | bool }}"
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Install alertmanager: create the service account, download the release
# tarball (verified against alertmanager_download_checksum) and unpack only
# the alertmanager binary into alertmanager_binary_dir.

# The group is created explicitly because configure.yml assigns file ownership
# to alertmanager_system_group, which need not share the user's name.
- name: Create alertmanager system group
  ansible.builtin.group:
    name: "{{ alertmanager_system_group }}"
    system: true

- name: Create alertmanager system user
  ansible.builtin.user:
    name: "{{ alertmanager_system_user }}"
    group: "{{ alertmanager_system_group }}"
    system: true
    create_home: false

- name: Download alertmanager binary
  ansible.builtin.get_url:
    url: "https://github.com/prometheus/alertmanager/releases/download/v{{ alertmanager_version }}/alertmanager-{{ alertmanager_version }}.linux-amd64.tar.gz"
    dest: "{{ alertmanager_download_dest }}"
    owner: root
    group: root
    mode: u=rw,go=
    checksum: "{{ alertmanager_download_checksum }}"

- name: Unpack alertmanager binary
  ansible.builtin.unarchive:
    src: "{{ alertmanager_download_dest }}"
    # Extract just the binary; --strip-components=1 drops the versioned
    # top-level directory so it lands directly in alertmanager_binary_dir.
    include: "alertmanager-{{ alertmanager_version }}.linux-amd64/alertmanager"
    dest: "{{ alertmanager_binary_dir }}"
    owner: root
    group: root
    mode: u=rwx,go=rx
    remote_src: true
    extra_opts: ['--strip-components=1', '--show-stored-names']

ansible/roles/alertmanager/tasks/main.yml

Whitespace-only changes.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
3+
4+
{{ ansible_managed | comment }}
# systemd unit for Prometheus Alertmanager, rendered from role variables.
[Unit]
Description=Prometheus Alertmanager
After=network-online.target
StartLimitInterval=0
StartLimitIntervalSec=0

[Service]
Type=simple
PIDFile=/run/alertmanager.pid
User={{ alertmanager_system_user }}
Group={{ alertmanager_system_group }}
ExecReload=/bin/kill -HUP $MAINPID
ExecStart={{ alertmanager_binary_dir }}/alertmanager \
  --cluster.listen-address='' \
  --config.file={{ alertmanager_config_path }} \
  --storage.path={{ alertmanager_storage_dir }} \
{% for address in alertmanager_web_listen_addresses %}
  --web.listen-address={{ address }} \
{% endfor %}
  --web.external-url={{ alertmanager_web_external_url }} \
{% for flag, flag_value in alertmanager_config_flags.items() %}
  --{{ flag }}={{ flag_value }} \
{% endfor %}

SyslogIdentifier=alertmanager
Restart=always
RestartSec=5

# Sandboxing. capabilities(7) spells the capability CAP_SETUID; the previous
# CAP_SET_UID is not a valid name.
# NOTE(review): the service runs as an unprivileged user, so it may need no
# capabilities at all - consider an empty bounding set; confirm.
CapabilityBoundingSet=CAP_SETUID
LockPersonality=true
NoNewPrivileges=true
MemoryDenyWriteExecute=true
PrivateTmp=true
ProtectHome=true
ReadWriteDirectories={{ alertmanager_storage_dir }}
RemoveIPC=true
RestrictSUIDSGID=true

PrivateUsers=true
ProtectControlGroups=true
ProtectKernelModules=true
ProtectKernelTunables=yes
ProtectSystem=strict

[Install]
WantedBy=multi-user.target
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{{ ansible_managed | comment }}
{# Merge the top-level keys of alertmanager_config_extra over
   alertmanager_config_default and emit a single YAML document. Rendering the
   two dicts directly would print Python reprs as two separate root nodes,
   which is not a valid single YAML document. #}

{{ alertmanager_config_default | combine(alertmanager_config_extra) | to_nice_yaml(indent=2) }}

0 commit comments

Comments
 (0)