Skip to content

Commit 61dbc3c

Browse files
authored
add alerts (#66)
1 parent c02c23d commit 61dbc3c

File tree

2 files changed

+114
-0
lines changed

2 files changed

+114
-0
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
# Deploys the alert rules in .nais/alerts.yaml to the test cluster and then,
# once that succeeds, to the prod cluster.
name: Deploy alerts
run-name: Deploy alerts for dapla-statbank-authenticator to test and prod

# Runs only for pushes to master that touch the alert rules or this workflow.
on:
  push:
    branches:
      - master
    paths:
      - '.nais/alerts.yaml'
      - '.github/workflows/alert-deploy.yml'

# Declaring any permission sets every unlisted scope to "none", so the read
# access needed by actions/checkout is granted explicitly.
# id-token: write is presumably what nais/deploy uses to authenticate
# against the deploy server - confirm against the NAIS deploy documentation.
permissions:
  contents: read
  id-token: write

jobs:
  test-deploy:
    name: Deploy alerts to test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Deploy to test
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: test
          RESOURCE: .nais/alerts.yaml
          DEPLOY_SERVER: deploy.ssb.cloud.nais.io:443

  prod-deploy:
    name: Deploy alerts to prod
    # Gate prod on a successful test deploy so a rule file that is rejected
    # in test never reaches prod.
    needs: test-deploy
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Deploy to prod
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: prod
          RESOURCE: .nais/alerts.yaml
          DEPLOY_SERVER: deploy.ssb.cloud.nais.io:443

.nais/alerts.yaml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
# Prometheus alert rules for dapla-statbank-authenticator in the dapla-stat
# namespace. Every alert carries service/namespace/severity labels and
# title/consequence/action annotations.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-dapla-statbank-authenticator
  namespace: dapla-stat
  labels:
    team: dapla-stat
spec:
  groups:
    - name: dapla-stat
      rules:
        # Fires when the deployment has reported zero available replicas for
        # one minute, i.e. the service is unavailable.
        # The namespace selector keeps a same-named deployment in another
        # namespace from matching.
        - alert: StatbankAuthenticatorUnavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-stat", deployment="dapla-statbank-authenticator"} == 0
          for: 1m
          annotations:
            title: "dapla-statbank-authenticator is unavailable"
            consequence: "The service is unavailable to users. Immediate investigation required."
            action: "Check the deployment status and logs for issues."
          labels:
            service: dapla-statbank-authenticator
            namespace: dapla-stat
            severity: critical

        # Fires when a process has consumed more than 0.8 CPU-seconds per
        # second (about 80% of one core), averaged over 5 minutes, and that
        # has held for 5 minutes.
        - alert: HighCPUUsage
          expr: rate(process_cpu_seconds_total{app="dapla-statbank-authenticator"}[5m]) > 0.8
          for: 5m
          annotations:
            title: "High CPU usage detected"
            consequence: "The service might experience performance degradation."
            action: "Investigate the cause of high CPU usage and optimize if necessary."
          labels:
            service: dapla-statbank-authenticator
            namespace: dapla-stat
            severity: warning

        # Fires when a pod's working-set memory has exceeded 90% of its
        # configured memory limit for 5 minutes (the original note gives the
        # limit as 12GB; the expression reads it from the cluster instead).
        # container!="" skips pod-level aggregate series so that usage is not
        # counted twice. The limit is read from
        # kube_pod_container_resource_limits{resource="memory"} because the
        # older *_memory_bytes metric is absent from kube-state-metrics v2.
        # NOTE(review): confirm both label sets against the cluster's metrics.
        - alert: HighMemoryUsage
          expr: >-
            sum by (namespace, pod) (container_memory_working_set_bytes{namespace="dapla-stat", pod=~"dapla-statbank-authenticator-.*", container!=""})
            > 0.9 *
            sum by (namespace, pod) (kube_pod_container_resource_limits{namespace="dapla-stat", pod=~"dapla-statbank-authenticator-.*", resource="memory"})
          for: 5m
          annotations:
            title: "High memory usage detected"
            consequence: "The service might experience instability due to high memory usage."
            action: "Check memory utilization and consider increasing resources or optimizing the service."
          labels:
            service: dapla-statbank-authenticator
            namespace: dapla-stat
            severity: warning

        # Fires when more than 10% of the log messages emitted over the last
        # 3 minutes were at Error level, sustained for 3 minutes.
        - alert: HighNumberOfErrors
          expr: >-
            (100 * sum by (app, namespace) (rate(log_messages_errors{app="dapla-statbank-authenticator", level=~"Error"}[3m]))
            / sum by (app, namespace) (rate(log_messages_total{app="dapla-statbank-authenticator"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged in dapla-statbank-authenticator"
            consequence: "The application is logging a significant number of errors."
            action: "Check the service logs for errors and address the root cause."
          labels:
            service: dapla-statbank-authenticator
            namespace: dapla-stat
            severity: critical

        # Fires as soon as a container has restarted more than 3 times within
        # a 15-minute window. The window already supplies the time dimension,
        # so no extra hold period is applied: holding for another 15 minutes
        # would require the count to stay above 3 throughout and could keep
        # the alert from ever firing.
        # Pods are selected by name, as in HighMemoryUsage, because "app" is
        # not a standard label on kube-state-metrics series.
        - alert: HighPodRestarts
          expr: increase(kube_pod_container_status_restarts_total{namespace="dapla-stat", pod=~"dapla-statbank-authenticator-.*"}[15m]) > 3
          for: 0m
          annotations:
            title: "High number of pod restarts"
            consequence: "The service may be unstable or misconfigured."
            action: "Investigate the cause of pod restarts and fix configuration or resource issues."
          labels:
            service: dapla-statbank-authenticator
            namespace: dapla-stat
            severity: warning

0 commit comments

Comments
 (0)