Skip to content

Commit 9c359d9

Browse files
committed
Now adds additional rules correctly
1 parent e724b5d commit 9c359d9

File tree

1 file changed

+131
-126
lines changed
  • ansible/roles/kube_prometheus_stack/defaults/main

1 file changed

+131
-126
lines changed

ansible/roles/kube_prometheus_stack/defaults/main/main.yml

Lines changed: 131 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -138,132 +138,137 @@ prometheus_scrape_configs:
138138
# - prometheus/targets/*.json
139139

140140
prometheus_alert_rules:
141-
- alert: Watchdog
142-
expr: vector(1)
143-
for: 10m
144-
labels:
145-
severity: warning
146-
annotations:
147-
description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."
148-
summary: 'Ensure entire alerting pipeline is functional'
149-
- alert: InstanceDown
150-
expr: 'up == 0'
151-
for: 5m
152-
labels:
153-
severity: critical
154-
annotations:
155-
description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
156-
summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
157-
- alert: RebootRequired
158-
expr: 'node_reboot_required > 0'
159-
labels:
160-
severity: warning
161-
annotations:
162-
description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
163-
summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
164-
- alert: NodeFilesystemSpaceFillingUp
165-
annotations:
166-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
167-
summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
168-
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
169-
for: 1h
170-
labels:
171-
severity: warning
172-
- alert: NodeFilesystemSpaceFillingUp
173-
annotations:
174-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
175-
summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
176-
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
177-
for: 1h
178-
labels:
179-
severity: critical
180-
- alert: NodeFilesystemAlmostOutOfSpace
181-
annotations:
182-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
183-
summary: 'Filesystem has less than 5% space left.'
184-
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
185-
for: 1h
186-
labels:
187-
severity: warning
188-
- alert: NodeFilesystemAlmostOutOfSpace
189-
annotations:
190-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
191-
summary: 'Filesystem has less than 3% space left.'
192-
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
193-
for: 1h
194-
labels:
195-
severity: critical
196-
- alert: NodeFilesystemFilesFillingUp
197-
annotations:
198-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
199-
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
200-
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
201-
for: 1h
202-
labels:
203-
severity: warning
204-
- alert: NodeFilesystemFilesFillingUp
205-
annotations:
206-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
207-
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
208-
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
209-
for: 1h
210-
labels:
211-
severity: critical
212-
- alert: NodeFilesystemAlmostOutOfFiles
213-
annotations:
214-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
215-
summary: 'Filesystem has less than 5% inodes left.'
216-
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
217-
for: 1h
218-
labels:
219-
severity: warning
220-
- alert: NodeFilesystemAlmostOutOfFiles
221-
annotations:
222-
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
223-
summary: 'Filesystem has less than 3% inodes left.'
224-
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
225-
for: 1h
226-
labels:
227-
severity: critical
228-
- alert: NodeNetworkReceiveErrs
229-
annotations:
230-
description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
231-
summary: 'Network interface is reporting many receive errors.'
232-
expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
233-
for: 1h
234-
labels:
235-
severity: warning
236-
- alert: NodeNetworkTransmitErrs
237-
annotations:
238-
description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
239-
summary: 'Network interface is reporting many transmit errors.'
240-
expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
241-
for: 1h
242-
labels:
243-
severity: warning
244-
- alert: NodeHighNumberConntrackEntriesUsed
245-
annotations:
246-
description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
247-
summary: 'Number of conntrack entries is getting close to the limit'
248-
expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
249-
labels:
250-
severity: warning
251-
- alert: NodeClockSkewDetected
252-
annotations:
253-
message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.{% endraw %}'
254-
summary: 'Clock skew detected.'
255-
expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
256-
for: 10m
257-
labels:
258-
severity: warning
259-
- alert: NodeClockNotSynchronising
260-
annotations:
261-
message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
262-
summary: 'Clock not synchronising.'
263-
expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
264-
for: 10m
265-
labels:
266-
severity: warning
141+
appliance-rules:
142+
groups:
143+
- name: all
144+
rules:
145+
- alert: Watchdog
146+
expr: vector(1)
147+
for: 10m
148+
labels:
149+
severity: warning
150+
alertname: Watchdog
151+
annotations:
152+
description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty."
153+
summary: 'Ensure entire alerting pipeline is functional'
154+
- alert: InstanceDown
155+
expr: 'up == 0'
156+
for: 5m
157+
labels:
158+
severity: critical
159+
annotations:
160+
description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}'
161+
summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}'
162+
- alert: RebootRequired
163+
expr: 'node_reboot_required > 0'
164+
labels:
165+
severity: warning
166+
annotations:
167+
description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}'
168+
summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}'
169+
- alert: NodeFilesystemSpaceFillingUp
170+
annotations:
171+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}'
172+
summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
173+
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
174+
for: 1h
175+
labels:
176+
severity: warning
177+
- alert: NodeFilesystemSpaceFillingUp
178+
annotations:
179+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}'
180+
summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
181+
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
182+
for: 1h
183+
labels:
184+
severity: critical
185+
- alert: NodeFilesystemAlmostOutOfSpace
186+
annotations:
187+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
188+
summary: 'Filesystem has less than 5% space left.'
189+
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
190+
for: 1h
191+
labels:
192+
severity: warning
193+
- alert: NodeFilesystemAlmostOutOfSpace
194+
annotations:
195+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}'
196+
summary: 'Filesystem has less than 3% space left.'
197+
expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
198+
for: 1h
199+
labels:
200+
severity: critical
201+
- alert: NodeFilesystemFilesFillingUp
202+
annotations:
203+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}'
204+
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
205+
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
206+
for: 1h
207+
labels:
208+
severity: warning
209+
- alert: NodeFilesystemFilesFillingUp
210+
annotations:
211+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}'
212+
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
213+
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
214+
for: 1h
215+
labels:
216+
severity: critical
217+
- alert: NodeFilesystemAlmostOutOfFiles
218+
annotations:
219+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
220+
summary: 'Filesystem has less than 5% inodes left.'
221+
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
222+
for: 1h
223+
labels:
224+
severity: warning
225+
- alert: NodeFilesystemAlmostOutOfFiles
226+
annotations:
227+
description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}'
228+
summary: 'Filesystem has less than 3% inodes left.'
229+
expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n"
230+
for: 1h
231+
labels:
232+
severity: critical
233+
- alert: NodeNetworkReceiveErrs
234+
annotations:
235+
description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}'
236+
summary: 'Network interface is reporting many receive errors.'
237+
expr: "increase(node_network_receive_errs_total[2m]) > 10\n"
238+
for: 1h
239+
labels:
240+
severity: warning
241+
- alert: NodeNetworkTransmitErrs
242+
annotations:
243+
description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}'
244+
summary: 'Network interface is reporting many transmit errors.'
245+
expr: "increase(node_network_transmit_errs_total[2m]) > 10\n"
246+
for: 1h
247+
labels:
248+
severity: warning
249+
- alert: NodeHighNumberConntrackEntriesUsed
250+
annotations:
251+
description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}'
252+
summary: 'Number of conntrack entries is getting close to the limit'
253+
expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n"
254+
labels:
255+
severity: warning
256+
- alert: NodeClockSkewDetected
257+
annotations:
258+
message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.{% endraw %}'
259+
summary: 'Clock skew detected.'
260+
expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n"
261+
for: 10m
262+
labels:
263+
severity: warning
264+
- alert: NodeClockNotSynchronising
265+
annotations:
266+
message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}'
267+
summary: 'Clock not synchronising.'
268+
expr: "min_over_time(node_timex_sync_status[5m]) == 0\n"
269+
for: 10m
270+
labels:
271+
severity: warning
267272

268273
# ------------------------------------------------------------------------------------------
269274

0 commit comments

Comments
 (0)