Commit 2d1dab5

Refactored and fixed slack integration

1 parent 5f89be8 commit 2d1dab5

File tree: 3 files changed (+24, -23 lines)
Lines changed: 21 additions & 20 deletions

@@ -1,26 +1,27 @@
-alertmanager_replicas: 1
-alertmanager_port: 30002 # Must be within K3s' reserved port range (default 30000-32767)
-
-# Add receivers here, uncomment below and add Slack bot app creds for Slack integration
 alertmanager_config:
   route:
     group_by: ['...']
-    # receiver: slack-receiver
-  global:
-    resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}"
-  receivers:
-    - name: 'null'
-    # - name: slack-receiver
-    #   slack_configs:
-    #     - channel: "{{ slack_integration.channel }}"
-    #       api_url: https://slack.com/api/chat.postMessage
-    #       http_config:
-    #         authorization:
-    #           credentials: "{{ slack_integration.app_creds }}"
-    #       text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}"
-    #       title_link: "http://{{ control_ip }}/alertmanager/#/alerts?receiver=slack-receiver"
-    #       send_resolved: true
+    receiver: "{{ 'slack-receiver' if alertmanager_slack_integration is defined else 'null' }}"
+  receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}"
+
+alertmanager_default_receivers:
+  - name: 'null'
+
+alertmanager_extra_receivers: "{{ [alertmanager_slack_receiver] if alertmanager_slack_integration is defined else [] }}"
+
+alertmanager_slack_receiver:
+  name: slack-receiver
+  slack_configs:
+    - channel: "{{ alertmanager_slack_integration.channel | default('none') }}"
+      api_url: https://slack.com/api/chat.postMessage
+      http_config:
+        authorization:
+          credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}"
+      text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}"
+      title_link: "http://{{ prometheus_address }}/alertmanager/#/alerts?receiver=slack-receiver"
+      send_resolved: true
 
-# slack_integration:
+# Uncomment below and add Slack bot app creds for Slack integration
+# alertmanager_slack_integration:
 #   channel: '#alerts'
 #   app_creds:
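Enabling the integration is now just a matter of defining the alertmanager_slack_integration mapping in group_vars; the templated receiver/receivers keys above then switch from the 'null' receiver to slack-receiver automatically. A minimal sketch, assuming a site environment's override file (the path and token value are illustrative, not part of this commit):

# e.g. in a site environment's group_vars file (hypothetical path):
alertmanager_slack_integration:
  channel: '#alerts'
  app_creds: "xoxb-1234-example-token"  # hypothetical bot token; keep it in Ansible Vault in practice

With this defined, receivers renders to the 'null' receiver plus alertmanager_slack_receiver, and route.receiver renders to slack-receiver; without it, everything falls back to the 'null' receiver. The default('none') filters on channel and app_creds presumably keep templating from failing if the mapping is only partially filled in.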

environments/common/inventory/group_vars/all/defaults.yml

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ api_address: "{{ inventory_hostname }}"
 
 # Service endpoints
 opensearch_address: "127.0.0.1"
-prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
+prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}:{{ prometheus_port }}"
 openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}"
 grafana_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
 
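Appending prometheus_port here means every consumer of prometheus_address, including the Slack receiver's title_link above, now gets a usable URL. A quick sketch of the rendering, assuming the first prometheus host's api_address resolves to 10.0.0.5 and prometheus_port is 9090 (both values illustrative; neither is set in this commit):

# prometheus_address -> "10.0.0.5:9090"
# title_link         -> "http://10.0.0.5:9090/alertmanager/#/alerts?receiver=slack-receiver"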

environments/common/inventory/group_vars/all/prometheus.yml

Lines changed: 2 additions & 2 deletions

@@ -26,7 +26,7 @@ prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand
 prometheus_extra_rules:
   - alert: SlurmNodeDown
     annotations:
-      message: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}'
+      description: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}'
       summary: 'At least one Slurm node is down.'
     expr: "slurm_nodes_down > 0\n"
     labels:
@@ -39,7 +39,7 @@ prometheus_extra_rules:
     expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="iowait",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
   - record: node_cpu_other_seconds:record
     expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode!="idle",mode!="user",mode!="system",mode!="iowait",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
-  - record: node_cpu_scaling_frequency_hertz_avg:record # frequency rules aren't working
+  - record: node_cpu_scaling_frequency_hertz_avg:record # Warning: frequency rules will not work when deploying appliance on VMs
     expr: avg by (instance) (node_cpu_scaling_frequency_hertz)
   - record: node_cpu_scaling_frequency_hertz_min:record
     expr: min by (instance) (node_cpu_scaling_frequency_hertz)
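The message -> description rename is what actually fixes the Slack notification body: the receiver's text template reads .CommonAnnotations.description, so under the old message key that part of the message rendered empty. A sketch of the rendered Slack text for this alert, assuming two nodes are down (the count is illustrative):

# text template: {{ .GroupLabels.alertname }} : {{ .CommonAnnotations.description }}
# renders as:    "SlurmNodeDown : 2 Slurm nodes are in down status."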
