
Commit d1e8c0a

Refactored monitoring config and removed redundant groups
1 parent: 10d4e93

11 files changed: +78 −244 lines

ansible/roles/kube_prometheus_stack/defaults/main/helm.yml

Lines changed: 3 additions & 3 deletions
@@ -73,7 +73,7 @@ kube_prometheus_stack_release_defaults:
 - ReadWriteOnce
 resources:
 requests:
-storage: "{{ kube_prometheus_stack_volume_size }}"
+storage: "{{ prometheus_volume_size }}"
 retention: "{{ prometheus_storage_retention }}"
 retentionSize: "{{ prometheus_storage_retention_size }}"
 additionalAlertRelabelConfigs: "{{ prometheus_alert_relabel_configs }}"
@@ -102,7 +102,7 @@ kube_prometheus_stack_release_defaults:
 serviceMonitor:
 enabled: false
 ingress:
-path: "/node/{{ groups['grafana'].0 }}/{{ grafana_port }}"
+path: "/node/{{ groups['prometheus'].0 }}/{{ grafana_port }}"
 sidecar:
 dashboards:
 searchNamespace: ALL
@@ -113,7 +113,7 @@ kube_prometheus_stack_release_defaults:
 serve_from_sub_path: true
 auth: "{{ grafana_auth }}"
 auth.anonymous:
-enabled: "{{ grafana_anonymous_auth }}"
+enabled: "{{ grafana_auth_anonymous }}"
 analytics: "{{ grafana_analytics }}"
 smtp: "{{ grafana_smtp }}"
 log: "{{ grafana_log }}"
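With this change the Prometheus PVC size is driven by prometheus_volume_size rather than kube_prometheus_stack_volume_size, and the Open OnDemand proxy path for Grafana is built from the first host in the prometheus group instead of a separate grafana group. A minimal sketch of how a site might now override these defaults in its environment (hypothetical file path and values, not part of this commit):

# e.g. environments/<site>/inventory/group_vars/all/monitoring.yml (hypothetical)
prometheus_volume_size: 100Gi      # replaces kube_prometheus_stack_volume_size
grafana_auth_anonymous: false      # replaces grafana_anonymous_auth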

ansible/roles/kube_prometheus_stack/defaults/main/main.yml

Lines changed: 6 additions & 131 deletions
@@ -16,9 +16,9 @@ login_ip: "{{ hostvars[groups['openondemand'][0]]['ansible_host'] }}" # probably
 control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}"
 control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io"
 
-grafana_claim_size: 10Gi
+grafana_volume_size: 10Gi
 
-grafana_anonymous_auth: true
+grafana_auth_anonymous: true
 
 slack_integration:
 channel: "#alerts"
@@ -64,7 +64,7 @@ prometheus_storage_retention: "30d"
 # supported: KB, MB, GB, TB, PB.
 prometheus_storage_retention_size: "40GB"
 
-kube_prometheus_stack_volume_size: 40Gi
+prometheus_volume_size: 40Gi
 
 prometheus_config_flags_extra: {}
 # prometheus_config_flags_extra:
@@ -126,138 +126,13 @@ prometheus_scrape_configs:
 # - prometheus/targets/*.yml
 # - prometheus/targets/*.json
 
+prometheus_extra_alert_rules: []
+
 prometheus_alert_rules:
 appliance-rules:
 groups:
 - name: all
-rules:
-- alert: Watchdog
-expr: vector(1)
-for: 10m
-labels:
-severity: warning
-alertname: Watchdog
-annotations:
-description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."
-summary: 'Ensure entire alerting pipeline is functional'
-- alert: InstanceDown
-expr: 'up == 0'
-for: 5m
-labels:
-severity: critical
-annotations:
-description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
-summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
-- alert: RebootRequired
-expr: 'node_reboot_required > 0'
-labels:
-severity: warning
-annotations:
-description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
-summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
-- alert: NodeFilesystemSpaceFillingUp
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
-summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
-expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: warning
-- alert: NodeFilesystemSpaceFillingUp
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
-summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
-expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: critical
-- alert: NodeFilesystemAlmostOutOfSpace
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
-summary: 'Filesystem has less than 5% space left.'
-expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: warning
-- alert: NodeFilesystemAlmostOutOfSpace
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
-summary: 'Filesystem has less than 3% space left.'
-expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: critical
-- alert: NodeFilesystemFilesFillingUp
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
-summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
-expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: warning
-- alert: NodeFilesystemFilesFillingUp
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
-summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
-expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: critical
-- alert: NodeFilesystemAlmostOutOfFiles
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
-summary: 'Filesystem has less than 5% inodes left.'
-expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: warning
-- alert: NodeFilesystemAlmostOutOfFiles
-annotations:
-description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
-summary: 'Filesystem has less than 3% inodes left.'
-expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
-for: 1h
-labels:
-severity: critical
-- alert: NodeNetworkReceiveErrs
-annotations:
-description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
-summary: 'Network interface is reporting many receive errors.'
-expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
-for: 1h
-labels:
-severity: warning
-- alert: NodeNetworkTransmitErrs
-annotations:
-description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
-summary: 'Network interface is reporting many transmit errors.'
-expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
-for: 1h
-labels:
-severity: warning
-- alert: NodeHighNumberConntrackEntriesUsed
-annotations:
-description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
-summary: 'Number of conntrack are getting close to the limit'
-expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
-labels:
-severity: warning
-- alert: NodeClockSkewDetected
-annotations:
-message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}'
-summary: 'Clock skew detected.'
-expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
-for: 10m
-labels:
-severity: warning
-- alert: NodeClockNotSynchronising
-annotations:
-message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
-summary: 'Clock not synchronising.'
-expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
-for: 10m
-labels:
-severity: warning
+rules: "{{ prometheus_extra_alert_rules }}"
 
 # ------------------------------------------------------------------------------------------
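The built-in appliance alert rules are gone from the role defaults; the appliance-rules group now just expands prometheus_extra_alert_rules, which defaults to an empty list. A site that still wants, say, the InstanceDown alert would re-add it itself — a minimal sketch using the same rule format as the removed defaults (illustrative, not part of this commit):

prometheus_extra_alert_rules:
  - alert: InstanceDown
    expr: 'up == 0'
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'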

ansible/roles/kube_prometheus_stack/tasks/main.yml

Lines changed: 3 additions & 3 deletions
@@ -46,7 +46,7 @@
 app.kubernetes.io/name: prometheus-dir
 spec:
 capacity:
-storage: "{{ kube_prometheus_stack_volume_size }}"
+storage: "{{ prometheus_volume_size }}"
 accessModes:
 - ReadWriteOnce
 hostPath:
@@ -65,7 +65,7 @@
 app.kubernetes.io/name: grafana-dir
 spec:
 capacity:
-storage: "{{ grafana_claim_size }}"
+storage: "{{ grafana_volume_size }}"
 accessModes:
 - ReadWriteOnce
 hostPath:
@@ -85,7 +85,7 @@
 - ReadWriteOnce
 resources:
 requests:
-storage: "{{ grafana_claim_size }}"
+storage: "{{ grafana_volume_size }}"
 volumeMode: Filesystem
 volumeName: grafana-dir

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+alertmanager_replicas: 1
+alertmanager_port: 30002 # Must be within K3s' reserved port range (default 30000-32767)
+
+# Add receivers here, uncomment below and add Slack bot app creds for Slack integration
+alertmanager_config:
+route:
+group_by: ['...']
+# receiver: slack-receiver
+global:
+resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}"
+receivers:
+- name: 'null'
+# - name: slack-receiver
+# slack_configs:
+# - channel: "{{ slack_integration.channel }}"
+# api_url: https://slack.com/api/chat.postMessage
+# http_config:
+# authorization:
+# credentials: "{{ slack_integration.app_creds }}"
+# text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}"
+# title_link: "http://{{ control_ip }}/alertmanager/#/alerts?receiver=slack-receiver"
+# send_resolved: true
+
+# slack_integration:
+# channel: '#alerts'
+# app_creds:
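The Slack receiver above is left commented out; enabling it amounts to defining slack_integration with real credentials and pointing the route at the receiver. Roughly (placeholder credential, illustrative only):

slack_integration:
  channel: '#alerts'
  app_creds: "xoxb-your-bot-token"   # placeholder Slack bot token

# ...then uncomment the slack-receiver entries in alertmanager_config above
# and set route.receiver: slack-receiver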

environments/common/inventory/group_vars/all/defaults.yml

Lines changed: 1 addition & 24 deletions
@@ -18,7 +18,7 @@ api_address: "{{ inventory_hostname }}"
 opensearch_address: "127.0.0.1"
 prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
 openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}"
-grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}"
+grafana_address: "{{ hostvars[groups['prometheus'].0].api_address }}"
 
 ############################# bootstrap: local user configuration #########################
 
@@ -50,29 +50,6 @@ appliances_local_users_default:
 shell: /sbin/nologin
 uid: 202
 system: true
-
-- group:
-name: prometheus
-gid: 976
-user:
-name: prometheus
-uid: 981
-home: "{{ prometheus_db_dir }}"
-shell: /usr/sbin/nologin
-system: true
-enable: "{{ 'prometheus' in group_names }}"
-
-- group:
-name: grafana
-gid: 979
-user:
-name: grafana
-comment: grafana user
-uid: 984
-home: /usr/share/grafana
-shell: /sbin/nologin
-system: true
-enable: "{{ 'grafana' in group_names }}"
 
 # Overide this to add extra users whilst keeping the defaults.
 appliances_local_users_extra: [] # see format of appliances_local_users_default above

environments/common/inventory/group_vars/all/grafana.yml

Lines changed: 7 additions & 37 deletions
@@ -1,24 +1,15 @@
----
-
-# See: https://github.com/cloudalchemy/ansible-grafana
-# for variable definitions.
-grafana_version: '9.5.21'
-
-# need to copy some role defaults here so we can use in inventory:
-grafana_port: 30001
+grafana_image_tag: '11.2.2'
+grafana_port: 30001 # Must be within K3s' reserved port range (default 30000-32767)
 
 # Define where state is stored
 grafana_data_dir: "{{ appliances_state_dir | default('/var/lib') }}/grafana"
 
-# Configure internal address & URL - note "api" means "internal" to cloudalchemy.grafana but "external" to appliance:
-grafana_api_address: "{{ hostvars[groups['grafana'].0].internal_address }}"
-grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}"
-
 # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy
+grafana_api_address: "{{ hostvars[groups['prometheus'].0].internal_address }}"
+grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}"
 grafana_url_direct: "http://{{ grafana_address }}:{{ grafana_port }}"
-grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['grafana'].0 }}/{{ grafana_port }}"
+grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['prometheus'].0 }}/{{ grafana_port }}"
 grafana_url: "{{ grafana_url_openondemand_proxy if groups['openondemand'] | count > 0 else grafana_url_direct }}"
-grafana_serve_from_sub_path: "{{ groups['openondemand'] | count > 0 }}"
 
 grafana_dashboards_default:
 # node exporter slurm:
@@ -49,7 +40,6 @@ grafana_dashboards_default:
 - placeholder: DS_PROMETHEUS
 replacement: prometheus
 revision_id: 3
-
 grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}"
 
 grafana_security:
@@ -58,10 +48,6 @@ grafana_security:
 allow_embedding: true
 
 grafana_datasources:
-# - name: prometheus
-# type: prometheus
-# url: "http://{{ prometheus_address }}:9090" # default prometheus port
-# editable: true
 - name: slurmstats
 # see https://github.com/grafana/opensearch-datasource#configure-the-data-source-with-provisioning
 type: grafana-opensearch-datasource
@@ -81,27 +67,11 @@ grafana_datasources:
 flavor: elasticsearch
 editable: true
 # readOnly: false
-
 grafana_plugins:
 - grafana-opensearch-datasource 2.8.1
 
-# want to set grafana_server.serve_from_sub_path if have Open Ondemand to proxy:
-grafana_server:
-# role defaults:
-protocol: http
-enforce_domain: false
-socket: ""
-cert_key: ""
-cert_file: ""
-enable_gzip: false
-static_root_path: public
-router_logging: false
-# appliance specific:
-serve_from_sub_path: "{{ grafana_serve_from_sub_path }}"
-
-
-grafana_auth_anonymous: false # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards
-
+grafana_auth_anonymous: true # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards
+grafana_volume_size: 10Gi
 _grafana_auth_anon_cfg:
 anonymous:
 org_name: "Main Org."
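Grafana is now configured through the kube-prometheus-stack chart rather than the cloudalchemy role, so the cloudalchemy-specific settings (grafana_version, grafana_server, grafana_serve_from_sub_path, the commented-out Prometheus datasource) are dropped in favour of a handful of chart-facing variables. A short sketch of overriding the new defaults (illustrative values, not part of this commit):

grafana_image_tag: '11.2.2'      # Grafana image tag used by the chart deployment
grafana_volume_size: 20Gi        # size of the grafana-dir persistent volume
grafana_auth_anonymous: false    # opt back out of anonymous view-only access
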
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+kube_prometheus_stack_chart_version: 59.1.0
+kube_prometheus_stack_release_namespace: monitoring-system
+kube_prometheus_stack_release_name: kube-prometheus-stack
+kube_prometheus_stack_wait_timeout: 5m
+
+# See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services
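These release variables presumably feed the Helm deployment of the chart. A rough sketch of how they could be consumed with the kubernetes.core.helm module — assumed wiring, not taken from this commit, and the exact option names (e.g. wait_timeout) should be checked against the module docs for the collection version in use:

- name: Deploy kube-prometheus-stack
  kubernetes.core.helm:
    chart_repo_url: https://prometheus-community.github.io/helm-charts
    chart_ref: kube-prometheus-stack
    chart_version: "{{ kube_prometheus_stack_chart_version }}"
    release_name: "{{ kube_prometheus_stack_release_name }}"
    release_namespace: "{{ kube_prometheus_stack_release_namespace }}"
    create_namespace: true
    wait: true
    wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}"   # assumed to map to the role's wait timeout
    release_values: "{{ kube_prometheus_stack_release_defaults }}"   # assumed; the role may merge overrides first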
