# prom-rule.yaml
# Helm values snippets for kube-prometheus-stack: custom Prometheus alerting
# rules (additionalPrometheusRules), an Alertmanager route/receiver for
# Discord, and a stress-test Pod manifest used to exercise the alerts.
# Custom infrastructure alerting rules injected via the kube-prometheus-stack
# `additionalPrometheusRules` value.
#
# NOTE(review): several rules carry a non-standard `resolved_desc` annotation —
# presumably consumed by a custom notification template; verify against the
# receiver before removing.
additionalPrometheusRules:
  - name: kubernetes-infra-rules
    groups:
      - name: infra.rules
        rules:
          - alert: NodeCPUUsage
            # instance:node_cpu_utilisation:rate5m is a 0-1 ratio in the
            # node-exporter mixin, so it is scaled by 100 to match the 85%
            # threshold (the unscaled ratio could never exceed 85) — confirm
            # your recording rule's definition matches the mixin.
            expr: (100 * instance:node_cpu_utilisation:rate5m) > 85
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "{{$labels.node}}: High CPU usage detected"
              description: '`{{$labels.node}}`:- CPU usage is above 85% (current value is: {{ $value | printf "%.2f" }}%)'
              resolved_desc: '`{{$labels.node}}`:- *RESOLVED* : CPU usage is within thresholds (current value is: {{ $value | printf "%.2f" }}%)'
          - alert: NodeMemoryUsage
            expr: (100 * instance:node_memory_utilisation:ratio) > 85
            for: 2m
            labels:
              severity: critical
            annotations:
              summary: "{{$labels.node}}: High memory usage detected"
              description: '`{{$labels.node}}`: Memory usage is above 85% (current value is: {{ $value | printf "%.2f" }}%)'
              resolved_desc: '`{{$labels.node}}`:- *RESOLVED* : Memory usage is within thresholds (current value is: {{ $value | printf "%.2f" }}%)'
          - alert: NodeDiskUsage
            # NOTE(review): node_filesystem_* metrics normally carry an
            # `instance` label rather than `node`; the {{$labels.node}}
            # references below may render empty — confirm relabeling. Also
            # fires for pseudo-filesystems (tmpfs etc.) unless filtered.
            expr: 100 - (100 * (node_filesystem_avail_bytes / node_filesystem_size_bytes)) > 85
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "{{$labels.node}}: High Disk Usage detected"
              description: '`{{$labels.node}}`:- Disk usage is above 85% on {{$labels.device}} (current value is: {{ $value | printf "%.2f" }}%)'
              resolved_desc: '`{{$labels.node}}`:- *RESOLVED* : Disk usage is within thresholds on {{$labels.device}} (current value is: {{ $value | printf "%.2f" }}%)'
          - alert: ContainerMemoryUsageHigh
            # The `> 0` inside the denominator drops containers without a
            # memory limit, avoiding division by zero.
            expr: (sum(container_memory_working_set_bytes{namespace=~".*"}) by (instance, name, pod, namespace) / sum(container_spec_memory_limit_bytes{namespace=~".*"} > 0) by (instance, name, pod, namespace) * 100) > 80
            for: 5m
            labels:
              severity: critical
            annotations:
              description: '{{ $value | printf "%.2f" }}% High memory usage in namespace {{
                $labels.namespace }} for pod {{ $labels.pod }}.'
              resolved_desc: '*RESOLVED* : Memory usage is normal now (current value is: {{ $value | printf "%.2f" }})'
              summary: Container Memory Usage is too high.
          # NOTE(review): spaces in an alert name are legal as a label value
          # but unconventional; kept as-is because routes/dashboards may match
          # this exact name.
          - alert: Too many Container restarts
            annotations:
              summary: Container named {{ $labels.container }} in {{ $labels.pod }} in {{ $labels.namespace }} was restarted
              description: "Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\nContainer name: {{$labels.container}}\n"
            # pod_template_hash="" excludes Deployment-managed ReplicaSet pods'
            # duplicate series; more than 5 restarts in 15m fires immediately.
            expr: sum(increase(kube_pod_container_status_restarts_total{pod_template_hash=""}[15m])) by (pod,namespace,container) > 5
            for: 0m
            labels:
              severity: critical
              app: "{{ $labels.pod }}"
          - alert: PodOOMKilled
            annotations:
              summary: Pod named {{ $labels.pod }} in {{ $labels.namespace }} was OOMKilled
              description: "Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\nContainer name: {{$labels.container}}\n"
            # Matching on `container` alone can raise many-to-many matching
            # errors when different pods share a container name; match on the
            # full (namespace, pod, container) identity instead.
            expr: (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0) and on (namespace, pod, container) (increase(kube_pod_container_status_restarts_total[24h]) > 0)
            for: 0m
            labels:
              severity: critical
              app: "{{ $labels.pod }}"
          # Sustained (>50% for 20m) CPU throttling — usually an undersized
          # CPU limit.
          - alert: CPUThrottlingHigh-override
            annotations:
              summary: CPUThrottlingHigh for container {{ $labels.container }} in pod {{ $labels.pod }}.
              message: "{{ $value | humanizePercentage }} throttling of CPU in namespace {{
                $labels.namespace }} for container {{ $labels.container }} in pod
                {{ $labels.pod }}."
            expr: >-
              sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace)
              /
              sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
              > ( 50 / 100 )
            for: 20m
            labels:
              severity: critical
          # Near-total (>95%) throttling — pages quickly (2m).
          - alert: 100%-CPUThrottling-Critical
            annotations:
              summary: 100% Throttling of CPU for container {{ $labels.container }} in pod {{ $labels.pod }}.
              message: "{{ $value | humanizePercentage }} throttling of CPU in namespace {{
                $labels.namespace }} for container {{ $labels.container }} in pod
                {{ $labels.pod }}."
            expr: >-
              sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace)
              /
              sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
              > ( 95 / 100 )
            for: 2m
            labels:
              severity: critical
          - alert: KubePodNotReady-override
            # Pods stuck Pending/Unknown, excluding Job-owned pods (which are
            # expected to terminate). A duplicate namespace matcher in the
            # original selector was removed.
            expr: sum by (namespace, pod) (max by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown"}) * on (namespace, pod) group_left(owner_kind) topk by (namespace, pod) (1, max by (namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
            for: 10m
            labels:
              severity: critical
              team: infra
            annotations:
              # Text aligned with `for: 10m` (previously said 5 minutes).
              description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 10 minutes.
              summary: Pod has been in a non-ready state for more than 10 minutes.
          - alert: KubePersistentVolumeFillingUp-warning
            expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.85
            for: 10m
            labels:
              severity: warning
            annotations:
              description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is filled up more than 85%. Currently {{ $value | humanizePercentage }}.
              summary: PersistentVolume {{ $labels.persistentvolumeclaim }} is filling up.
          - alert: KubePersistentVolumeFillingUp-critical
            expr: (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.95
            for: 1m
            labels:
              severity: critical
              team: infra
            annotations:
              description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is filled up more than 95%. Currently {{ $value | humanizePercentage }}.
              summary: PersistentVolume {{ $labels.persistentvolumeclaim }} is filling up.
          - alert: KubeContainerWaiting-override
            annotations:
              summary: Container {{ $labels.container }} is waiting for more than an hour.
              message: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{
                $labels.container}} has been in waiting state for longer than 1
                hour.
            expr: >-
              sum by (namespace, pod, container)
              (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
              namespace=~".*"}) > 0
            for: 1h
            labels:
              severity: critical
          - alert: KubeHpaReplicasMismatch-override
            # NOTE(review): kube_hpa_* metrics were renamed to
            # kube_horizontalpodautoscaler_* in kube-state-metrics v2 — this
            # rule matches nothing on newer KSM; confirm the deployed version.
            annotations:
              summary: HPA {{ $labels.hpa }} has not matched the desired no. of replicas for more than 15 mins.
              message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
                desired number of replicas for longer than 15 minutes.
            expr: >-
              (kube_hpa_status_desired_replicas{job="kube-state-metrics",
              namespace=~".*"}
              !=
              kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
              and
              changes(kube_hpa_status_current_replicas[15m]) == 0
            for: 15m
            labels:
              severity: critical
          - alert: KubeJobFailed-override
            annotations:
              summary: Job {{ $labels.job_name }} has failed to complete success within activeDeadlineSeconds.
              message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
            expr: increase(kube_job_status_failed{job="kube-state-metrics", namespace=~".*"}[1h]) > 0
            for: 0m
            labels:
              severity: critical
---
# Alertmanager routing: every alert goes to the Discord receiver except the
# always-firing Watchdog heartbeat, which is dropped via the 'null' receiver.
alertmanager:
  config:
    route:
      group_by: ['namespace']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: 'devtron-infra-alert'
      routes:
        - receiver: 'null'
          matchers:
            - alertname = "Watchdog"
    receivers:
      - name: 'null'
      - name: devtron-infra-alert
        discord_configs:
          # Placeholder — inject the real webhook from a secret; a Discord
          # webhook URL is a credential and must not be committed to VCS.
          - webhook_url: "https://discord.com/api/webhooks/<>/<>"
            send_resolved: true
            # e.g. "[FIRING:3] NodeCPUUsage"
            title: '[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }}'
---
# Stress-test Pod used to trigger the CPU/OOM alerts above. The original
# document started with "- apiVersion: ..." which made it a YAML *sequence*
# rather than a Pod object — kubectl rejects that; the leading dash is removed.
apiVersion: v1
kind: Pod
metadata:
  name: alpine-stress
spec:
  containers:
    - name: alpine-stress
      image: containerstack/alpine-stress
      # Busy-loop pins one CPU; with a 0.5-CPU limit this exercises the
      # CPU-throttling alerts.
      args:
        - /bin/sh
        - -c
        - yes > /dev/null
      ports:
        - containerPort: 80
      resources:
        limits:
          cpu: "0.5"
          memory: 500Mi
        requests:
          cpu: "0.5"
          memory: 500Mi
---
# To test the PodOOMKilled alert, run inside the pod: stress --vm 1 --vm-bytes 1G --vm-keep -t 60s