# prom-rule.yaml
# Helm values snippets for kube-prometheus-stack: custom Prometheus alerting
# rules (additionalPrometheusRules), an Alertmanager route/receiver for
# Discord, and a stress-test Pod manifest used to exercise the alerts.
# Custom infrastructure alerting rules injected via the kube-prometheus-stack
# `additionalPrometheusRules` value.
#
# NOTE(review): several rules carry a non-standard `resolved_desc` annotation —
# presumably consumed by a custom notification template; verify against the
# receiver before removing.
additionalPrometheusRules:
  - name: kubernetes-infra-rules
    groups:
      - name: infra.rules
        rules:
          - alert: NodeCPUUsage
            # instance:node_cpu_utilisation:rate5m is a 0-1 ratio in the
            # node-exporter mixin, so it is scaled by 100 to match the 85%
            # threshold (the unscaled ratio could never exceed 85) — confirm
            # your recording rule's definition matches the mixin.
            expr: (100 * instance:node_cpu_utilisation:rate5m) > 85
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "{{$labels.node}}: High CPU usage detected"
              description: '`{{$labels.node}}`:- CPU usage is above 85% (current value is: {{ $value | printf "%.2f" }}%)'
              resolved_desc: '`{{$labels.node}}`:- *RESOLVED* : CPU usage is within thresholds (current value is: {{ $value | printf "%.2f" }}%)'
          - alert: NodeMemoryUsage
            expr: (100 * instance:node_memory_utilisation:ratio) > 85
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "{{$labels.node}}: High memory usage detected"
              description: '`{{$labels.node}}`: Memory usage is above 85% (current value is: {{ $value | printf "%.2f" }}%)'
              resolved_desc: '`{{$labels.node}}`:- *RESOLVED* : Memory usage is within thresholds (current value is: {{ $value | printf "%.2f" }}%)'
          - alert: NodeDiskUsage
            # NOTE(review): node_filesystem_* metrics normally carry an
            # `instance` label rather than `node`; the {{$labels.node}}
            # references below may render empty — confirm relabeling. Also
            # fires for pseudo-filesystems (tmpfs etc.) unless filtered.
            expr: 100 - (100 * (node_filesystem_avail_bytes / node_filesystem_size_bytes)) > 85
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "{{$labels.node}}: High Disk Usage detected"
              description: '`{{$labels.node}}`:- Disk usage is above 85% on {{$labels.device}} (current value is: {{ $value | printf "%.2f" }}%)'
              resolved_desc: '`{{$labels.node}}`:- *RESOLVED* : Disk usage is within thresholds on {{$labels.device}} (current value is: {{ $value | printf "%.2f" }}%)'
          - alert: ContainerMemoryUsageHigh
            # The `> 0` inside the denominator drops containers without a
            # memory limit, avoiding division by zero.
            expr: (sum(container_memory_working_set_bytes{namespace=~".*"}) by (instance, name, pod, namespace) / sum(container_spec_memory_limit_bytes{namespace=~".*"} > 0) by (instance, name, pod, namespace) * 100) > 80
            for: 5m
            labels:
              severity: critical
            annotations:
              description: '{{ $value | printf "%.2f" }}% High memory usage in namespace {{
                $labels.namespace }} for pod {{ $labels.pod }}.'
              resolved_desc: '*RESOLVED* : Memory usage is normal now (current value is: {{ $value | printf "%.2f" }})'
              summary: Container Memory Usage is too high.
          # NOTE(review): spaces in an alert name are legal as a label value
          # but unconventional; kept as-is because routes/dashboards may match
          # this exact name.
          - alert: Too many Container restarts
            annotations:
              summary: Container named {{ $labels.container }} in {{ $labels.pod }} in {{ $labels.namespace }} was restarted
              description: "Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\nContainer name: {{$labels.container}}\n"
            # pod_template_hash="" excludes Deployment-managed ReplicaSet pods'
            # duplicate series; more than 5 restarts in 15m fires immediately.
            expr: sum(increase(kube_pod_container_status_restarts_total{pod_template_hash=""}[15m])) by (pod,namespace,container) > 5
            for: 0m
            labels:
              severity: critical
              app: "{{ $labels.pod }}"
          - alert: PodOOMKilled
            annotations:
              summary: Pod named {{ $labels.pod }} in {{ $labels.namespace }} was OOMKilled
              description: "Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\nContainer name: {{$labels.container}}\n"
            # Matching on `container` alone can raise many-to-many matching
            # errors when different pods share a container name; match on the
            # full (namespace, pod, container) identity instead.
            expr: (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0) and on (namespace, pod, container) (increase(kube_pod_container_status_restarts_total[24h]) > 0)
            for: 0m
            labels:
              severity: critical
              app: "{{ $labels.pod }}"
          # Sustained (>50% for 20m) CPU throttling — usually an undersized
          # CPU limit.
          - alert: CPUThrottlingHigh-override
            annotations:
              summary: CPUThrottlingHigh for container {{ $labels.container }} in pod {{ $labels.pod }}.
              message: "{{ $value | humanizePercentage }} throttling of CPU in namespace {{
                $labels.namespace }} for container {{ $labels.container }} in pod
                {{ $labels.pod }}."
            expr: >-
              sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace)
              /
              sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
              > ( 50 / 100 )
            for: 20m
            labels:
              severity: critical
          # Near-total (>95%) throttling — pages quickly (2m).
          - alert: 100%-CPUThrottling-Critical
            annotations:
              summary: 100% Throttling of CPU for container {{ $labels.container }} in pod {{ $labels.pod }}.
              message: "{{ $value | humanizePercentage }} throttling of CPU in namespace {{
                $labels.namespace }} for container {{ $labels.container }} in pod
                {{ $labels.pod }}."
            expr: >-
              sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace)
              /
              sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
              > ( 95 / 100 )
            for: 2m
            labels:
              severity: critical
          - alert: KubePodNotReady-override
            # Pods stuck Pending/Unknown, excluding Job-owned pods (which are
            # expected to terminate). A duplicate namespace matcher in the
            # original selector was removed.
            expr: sum by (namespace, pod) (max by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}) * on (namespace, pod) group_left(owner_kind) topk by (namespace, pod) (1, max by (namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
            for: 10m
            labels:
              severity: critical
              team: infra
            annotations:
              # Text aligned with `for: 10m` (previously said 5 minutes).
              description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 10 minutes.
              summary: Pod has been in a non-ready state for more than 10 minutes.
          - alert: KubePersistentVolumeFillingUp-warning
            expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.85
            for: 10m
            labels:
              severity: warning
            annotations:
              description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is filled up more than 85%. Currently {{ $value | humanizePercentage }}.
              summary: PersistentVolume {{ $labels.persistentvolumeclaim }} is filling up.
          - alert: KubePersistentVolumeFillingUp-critical
            expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.95
            for: 1m
            labels:
              severity: critical
              team: infra
            annotations:
              description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is filled up more than 95%. Currently {{ $value | humanizePercentage }}.
              summary: PersistentVolume {{ $labels.persistentvolumeclaim }} is filling up.
          - alert: KubeContainerWaiting-override
            annotations:
              summary: Container {{ $labels.container }} is waiting for more than an hour.
              message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{
                $labels.container}} has been in waiting state for longer than 1
                hour.
            expr: >-
              sum by (namespace, pod, container)
              (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
              namespace=~".*"}) > 0
            for: 1h
            labels:
              severity: critical
          - alert: KubeHpaReplicasMismatch-override
            # NOTE(review): kube_hpa_* metrics were renamed to
            # kube_horizontalpodautoscaler_* in kube-state-metrics v2 — this
            # rule matches nothing on newer KSM; confirm the deployed version.
            annotations:
              summary: HPA {{ $labels.hpa }} has not matched the desired no. of replicas for more than 15 mins.
              message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
                desired number of replicas for longer than 15 minutes.
            expr: >-
              (kube_hpa_status_desired_replicas{job="kube-state-metrics",
              namespace=~".*"}
              !=
              kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
              and
              changes(kube_hpa_status_current_replicas[15m]) == 0
            for: 15m
            labels:
              severity: critical
          - alert: KubeJobFailed-override
            annotations:
              summary: Job {{ $labels.job_name }} has failed to complete success within activeDeadlineSeconds.
              message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
            expr: increase(kube_job_status_failed{job="kube-state-metrics", namespace=~".*"}[1h]) > 0
            for: 0m
            labels:
              severity: critical
---
# Alertmanager routing: every alert goes to the Discord receiver except the
# always-firing Watchdog heartbeat, which is dropped via the 'null' receiver.
alertmanager:
  config:
    route:
      group_by: ['namespace']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: 'devtron-infra-alert'
      routes:
        - receiver: 'null'
          matchers:
            - alertname = "Watchdog"
    receivers:
      - name: 'null'
      - name: devtron-infra-alert
        discord_configs:
          # Placeholder — inject the real webhook from a secret; a Discord
          # webhook URL is a credential and must not be committed to VCS.
          - webhook_url: "https://discord.com/api/webhooks/<>/<>"
            send_resolved: true
            # e.g. "[FIRING:3] NodeCPUUsage"
            title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }}'
---
# Stress-test Pod used to trigger the CPU/OOM alerts above. The original
# document started with "- apiVersion: ..." which made it a YAML *sequence*
# rather than a Pod object — kubectl rejects that; the leading dash is removed.
apiVersion: v1
kind: Pod
metadata:
  name: alpine-stress
spec:
  containers:
    - name: alpine-stress
      image: containerstack/alpine-stress
      # Busy-loop pins one CPU; with a 0.5-CPU limit this exercises the
      # CPU-throttling alerts.
      args:
        - /bin/sh
        - -c
        - yes > /dev/null
      ports:
        - containerPort: 80
      resources:
        limits:
          cpu: "0.5"
          memory: 500Mi
        requests:
          cpu: "0.5"
          memory: 500Mi
---
# To test the PodOOMKilled alert, run inside the pod: stress --vm 1 --vm-bytes 1G --vm-keep -t 60s