Skip to content

Commit 9a40bf4

Browse files
committed
Merge remote-tracking branch 'origin/main' into HEAD
Conflicts: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
2 parents fd7e600 + c902fe1 commit 9a40bf4

File tree

39 files changed

+895
-80
lines changed

39 files changed

+895
-80
lines changed

.github/workflows/nightly-cleanup.yml

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -56,42 +56,13 @@ jobs:
5656
fi
5757
shell: bash
5858

59-
- name: Delete clusters if control node not tagged with keep
59+
- name: Delete CI clusters
6060
run: |
6161
. venv/bin/activate
62-
if [[ -z ${ci_clusters} ]]; then
62+
if [[ -z "${ci_clusters}" ]]; then
6363
echo "No clusters to delete."
6464
exit 0
6565
fi
66-
67-
for cluster_prefix in ${ci_clusters}
68-
do
69-
echo "Processing cluster: $cluster_prefix"
70-
# Get all servers with the matching name for control node
71-
CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
72-
SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length)
73-
74-
if [[ $SERVER_COUNT -gt 1 ]]; then
75-
echo "Multiple servers found for control node '${cluster_prefix}-control'. Checking tags for each..."
76-
77-
for server in $(echo "$CONTROL_SERVERS" | jq -r '.[].ID'); do
78-
# Get tags for each control node
79-
TAGS=$(openstack server show "$server" --column tags --format value)
80-
81-
if [[ $TAGS =~ "keep" ]]; then
82-
echo "Skipping ${cluster_prefix} (server ${server}) - control instance is tagged as keep"
83-
else
84-
./dev/delete-cluster.py ${cluster_prefix} --force
85-
fi
86-
done
87-
else
88-
# If only one server, extract its tags and proceed
89-
TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags')
90-
if [[ $TAGS =~ "keep" ]]; then
91-
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
92-
else
93-
./dev/delete-cluster.py ${cluster_prefix} --force
94-
fi
95-
fi
96-
done
66+
echo "Deleting clusters: ${ci_clusters}"
67+
./dev/delete-cluster.py ${ci_clusters} --force
9768
shell: bash

.github/workflows/stackhpc.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ jobs:
109109
run: |
110110
. venv/bin/activate
111111
. environments/.stackhpc/activate
112-
ansible-playbook ansible/adhoc/generate-passwords.yml
113112
echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
114113
env:
115114
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
@@ -135,6 +134,7 @@ jobs:
135134
. venv/bin/activate
136135
. environments/.stackhpc/activate
137136
ansible all -m wait_for_connection
137+
ansible-playbook ansible/adhoc/generate-passwords.yml
138138
ansible-playbook -v ansible/site.yml
139139
ansible-playbook -v ansible/ci/check_slurm.yml
140140
@@ -170,6 +170,7 @@ jobs:
170170
. venv/bin/activate
171171
. environments/.stackhpc/activate
172172
ansible all -m wait_for_connection
173+
ansible-playbook ansible/adhoc/generate-passwords.yml
173174
ansible-playbook -v ansible/site.yml
174175
ansible-playbook -v ansible/ci/check_slurm.yml
175176

ansible/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,5 @@ roles/*
8888
!roles/slurm_tools/**
8989
!roles/gateway/
9090
!roles/gateway/**
91+
!roles/alertmanager/
92+
!roles/alertmanager/**

ansible/fatimage.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,12 @@
178178
slurm_exporter_state: stopped
179179
when: "'slurm_exporter' in group_names"
180180

181+
- name: Install alertmanager
182+
include_role:
183+
name: alertmanager
184+
tasks_from: install.yml
185+
when: "'alertmanager' in group_names"
186+
181187
- hosts: prometheus
182188
become: yes
183189
gather_facts: yes

ansible/filter_plugins/utils.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,26 @@
1111
import os.path
1212
import re
1313

14-
def prometheus_node_exporter_targets(hosts, env):
14+
def prometheus_node_exporter_targets(hosts, hostvars, env_key, group):
15+
""" Return a mapping in cloudalchemy.nodeexporter prometheus_targets
16+
format.
17+
18+
hosts: list of inventory_hostnames
19+
hostvars: Ansible hostvars variable
20+
env_key: key to lookup in each host's hostvars to add as label 'env' (default: 'ungrouped')
21+
group: string to add as label 'group'
22+
"""
1523
result = []
1624
per_env = defaultdict(list)
1725
for host in hosts:
18-
per_env[env].append(host)
26+
host_env = hostvars[host].get(env_key, 'ungrouped')
27+
per_env[host_env].append(host)
1928
for env, hosts in per_env.items():
2029
target = {
21-
"targets": ["{target}:9100".format(target=target) for target in hosts],
30+
"targets": [f"{target}:9100" for target in hosts],
2231
"labels": {
23-
"env": env
32+
'env': env,
33+
'group': group
2434
}
2535
}
2636
result.append(target)

ansible/monitoring.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,14 @@
8686
grafana_dashboards: []
8787
- import_role: # done in same play so it can use handlers from cloudalchemy.grafana
8888
name: grafana-dashboards
89+
90+
- name: Deploy alertmanager
91+
hosts: alertmanager
92+
tags: alertmanager
93+
become: yes
94+
gather_facts: false
95+
tasks:
96+
- name: Configure alertmanager
97+
include_role:
98+
name: alertmanager
99+
tasks_from: configure.yml
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# alertmanager
2+
3+
Deploy [alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/)
4+
to route Prometheus alerts to a receiver. Currently Slack is the only supported
5+
receiver.
6+
7+
Note that:
8+
- HA configuration is not supported
9+
- Alertmanager state is not preserved when the node it runs on (by default,
10+
control node) is reimaged, so any alerts silenced via the GUI will reoccur.
11+
- No Grafana dashboard for alerts is currently provided.
12+
13+
Alertmanager is enabled by default on the `control` node in the
14+
[everything](../../../environments/common/layouts/everything) template which
15+
`cookiecutter` uses for a new environment's `inventory/groups` file.
16+
17+
In general, usage may only require:
18+
- Adding the `control` node into the `alertmanager` group in `environments/site/groups`
19+
if upgrading an existing environment.
20+
- Enabling the Slack integration (see section below).
21+
- Possibly setting `alertmanager_web_external_url`.
22+
23+
The web UI is available on `alertmanager_web_external_url`.
24+
25+
## Role variables
26+
27+
All variables are optional. See [defaults/main.yml](defaults/main.yml) for
28+
all default values.
29+
30+
General variables:
31+
- `alertmanager_version`: String, version (no leading 'v')
32+
- `alertmanager_download_checksum`: String, checksum for relevant version from
33+
[prometheus.io download page](https://prometheus.io/download/), in format
34+
`type:value`.
35+
- `alertmanager_download_dest`: String, path the release archive is downloaded
36+
to (a file, by default under `/tmp`). Parent directory must exist.
37+
- `alertmanager_binary_dir`: String, path of directory to install alertmanager
38+
binary to. Must exist.
39+
- `alertmanager_started`: Bool, whether the alertmanager service should be started.
40+
- `alertmanager_enabled`: Bool, whether the alertmanager service should be enabled.
41+
- `alertmanager_system_user`: String, name of user to run alertmanager as. Will be created.
42+
- `alertmanager_system_group`: String, name of group of alertmanager user.
43+
- `alertmanager_port`: Port to listen on.
44+
45+
The following variables are equivalent to similarly-named arguments to the
46+
`alertmanager` binary. See `man alertmanager` for more info:
47+
48+
- `alertmanager_config_file`: String, path the main alertmanager config file
49+
will be written to. Parent directory will be created if necessary.
50+
- `alertmanager_web_config_file`: String, path alertmanager web config file
51+
will be written to. Parent directory will be created if necessary.
52+
- `alertmanager_storage_path`: String, base path for data storage.
53+
- `alertmanager_web_listen_addresses`: List of strings, defining addresses to listen on.
54+
- `alertmanager_web_external_url`: String, the URL under which Alertmanager is
55+
externally reachable - defaults to host IP address and `alertmanager_port`.
56+
See man page for more details if proxying alertmanager.
57+
- `alertmanager_data_retention`: String, how long to keep data for
58+
- `alertmanager_data_maintenance_interval`: String, interval between garbage
59+
collection and snapshotting to disk of the silences and the notification logs.
60+
- `alertmanager_config_flags`: Mapping. Keys/values in here are written to the
61+
alertmanager commandline as `--{{ key }}={{ value }}`.
62+
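- `alertmanager_default_receivers`: List of receiver mappings enabled by default: the `null` receiver, plus the Slack receiver if `alertmanager_slack_integration` is defined.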
63+
64+
The following variables are templated into the alertmanager [main configuration](https://prometheus.io/docs/alerting/latest/configuration/):
65+
- `alertmanager_config_template`: String, path to configuration template. The default
66+
is to template in `alertmanager_config_default` and `alertmanager_config_extra`.
67+
- `alertmanager_config_default`: Mapping with default configuration for the
68+
top-level `route` and `receivers` keys. The default is to send all alerts to
69+
the Slack receiver, if that has been enabled (see below).
70+
- `alertmanager_receivers`: A list of [receiver](https://prometheus.io/docs/alerting/)
71+
mappings to define under the top-level `receivers` configuration key. This
72+
will contain the Slack receiver if that has been enabled (see below).
73+
- `alertmanager_extra_receivers`: A list of additional [receiver](https://prometheus.io/docs/alerting/)
74+
mappings to add, by default empty.
75+
- `alertmanager_slack_receiver`: Mapping defining the [Slack receiver](https://prometheus.io/docs/alerting/latest/configuration/#slack_config). Note the default configuration for this is in
76+
`environments/common/inventory/group_vars/all/alertmanager.yml`.
77+
- `alertmanager_slack_receiver_name`: String, name for the above Slack receiver.
78+
- `alertmanager_slack_receiver_send_resolved`: Bool, whether to send resolved alerts via the above Slack receiver.
79+
- `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined.
80+
- `alertmanager_config_extra`: Mapping with additional configuration. Keys in
81+
this become top-level keys in the configuration. E.g. this might be:
82+
```yaml
83+
alertmanager_config_extra:
84+
global:
85+
smtp_smarthost: smtp.example.org:587
86+
time_intervals:
87+
- name: monday-to-friday
88+
time_intervals:
89+
- weekdays: ['monday:friday']
90+
```
91+
Note that `route` and `receivers` keys should not be added here.
92+
93+
The following variables are templated into the alertmanager [web configuration](https://prometheus.io/docs/alerting/latest/https/):
94+
- `alertmanager_web_config_default`: Mapping with default configuration for
95+
`basic_auth_users` providing the default web user.
96+
- `alertmanager_alertmanager_web_config_extra`: Mapping with additional web
97+
configuration. Keys in this become top-level keys in the web configuration.
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
alertmanager_version: '0.28.1'
2+
alertmanager_download_checksum: 'sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311'
3+
alertmanager_download_dest: /tmp/alertmanager.tar.gz
4+
alertmanager_binary_dir: /usr/local/bin
5+
alertmanager_started: true
6+
alertmanager_enabled: true
7+
8+
alertmanager_system_user: alertmanager
9+
alertmanager_system_group: "{{ alertmanager_system_user }}"
10+
alertmanager_config_file: /etc/alertmanager/alertmanager.yml
11+
alertmanager_web_config_file: /etc/alertmanager/alertmanager-web.yml
12+
alertmanager_storage_path: /var/lib/alertmanager
13+
14+
alertmanager_port: '9093'
15+
alertmanager_web_listen_addresses:
16+
- ":{{ alertmanager_port }}"
17+
alertmanager_web_external_url: '' # defined in environments/common/inventory/group_vars/all/alertmanager.yml for visibility
18+
19+
alertmanager_data_retention: '120h'
20+
alertmanager_data_maintenance_interval: '15m'
21+
alertmanager_config_flags: {} # other command-line parameters as shown by `man alertmanager`
22+
alertmanager_config_template: alertmanager.yml.j2
23+
alertmanager_web_config_template: alertmanager-web.yml.j2
24+
25+
alertmanager_web_config_default:
26+
basic_auth_users:
27+
alertmanager: "{{ vault_alertmanager_admin_password | password_hash('bcrypt', '1234567890123456789012', ident='2b') }}"
28+
alertmanager_alertmanager_web_config_extra: {} # top-level only
29+
30+
# Variables below are interpolated into alertmanager_config_default:
31+
32+
# Uncomment below and add Slack bot app creds for Slack integration
33+
# alertmanager_slack_integration:
34+
# channel: '#alerts'
35+
# app_creds:
36+
37+
alertmanager_null_receiver:
38+
name: 'null'
39+
alertmanager_slack_receiver: {} # defined in environments/common/inventory/group_vars/all/alertmanager.yml as it needs prometheus_address
40+
alertmanager_extra_receivers: []
41+
alertmanager_default_receivers: "{{ [alertmanager_null_receiver] + ([alertmanager_slack_receiver] if alertmanager_slack_integration is defined else []) }}"
42+
alertmanager_receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}"
43+
44+
alertmanager_config_default:
45+
route:
46+
group_by: ['...']
47+
receiver: "{{ alertmanager_slack_receiver_name if alertmanager_slack_integration is defined else 'null' }}"
48+
receivers: "{{ alertmanager_receivers }}"
49+
50+
alertmanager_config_extra: {} # top-level only
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
- name: Restart alertmanager
2+
systemd:
3+
name: alertmanager
4+
state: restarted
5+
daemon_reload: "{{ _alertmanager_service.changed | default(false) }}"
6+
when: alertmanager_started | bool
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
- name: Create alertmanager directories
2+
ansible.builtin.file:
3+
path: "{{ item }}"
4+
state: directory
5+
owner: "{{ alertmanager_system_user }}"
6+
group: "{{ alertmanager_system_group }}"
7+
mode: u=rwX,go=rX
8+
loop:
9+
- "{{ alertmanager_config_file | dirname }}"
10+
- "{{ alertmanager_web_config_file | dirname }}"
11+
- "{{ alertmanager_storage_path }}"
12+
13+
- name: Create alertmanager service file with immutable options
14+
template:
15+
src: alertmanager.service.j2
16+
dest: /usr/lib/systemd/system/alertmanager.service
17+
owner: root
18+
group: root
19+
mode: u=rw,go=r
20+
register: _alertmanager_service
21+
notify: Restart alertmanager
22+
23+
- name: Template alertmanager config
24+
ansible.builtin.template:
25+
src: "{{ alertmanager_config_template }}"
26+
dest: "{{ alertmanager_config_file }}"
27+
owner: "{{ alertmanager_system_user }}"
28+
group: "{{ alertmanager_system_group }}"
29+
mode: u=rw,go=
30+
notify: Restart alertmanager
31+
32+
- name: Template alertmanager web config
33+
ansible.builtin.template:
34+
src: "{{ alertmanager_web_config_template }}"
35+
dest: "{{ alertmanager_web_config_file }}"
36+
owner: "{{ alertmanager_system_user }}"
37+
group: "{{ alertmanager_system_group }}"
38+
mode: u=rw,go=
39+
notify: Restart alertmanager
40+
41+
- meta: flush_handlers
42+
43+
- name: Ensure alertmanager service state
44+
systemd:
45+
name: alertmanager
46+
state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}"
47+
enabled: "{{ alertmanager_enabled | bool }}"

0 commit comments

Comments
 (0)