Skip to content

Commit 75e2f42

Browse files
authored
Merge pull request #118 from statisticsnorway/nais-deploy
NAIS deploy
2 parents c143b08 + 7ad1218 commit 75e2f42

File tree

10 files changed

+606
-8
lines changed

10 files changed

+606
-8
lines changed

.github/workflows/alert-deploy.yml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Deploys the Pseudo Service alert rules (.nais/alerts.yaml) together with the
# externally maintained AlertManager configuration to the test and prod clusters.
name: Deploy alerts
run-name: Deploy alerts for Pseudo Service to test and prod

# Runs on pushes to master that touch the alert rules or this workflow file,
# and on manual dispatch.
on:
  push:
    branches:
      - master
    paths:
      - ".nais/alerts.yaml"
      - ".github/workflows/alert-deploy.yml"
  workflow_dispatch:

# Declaring any permission sets every permission that is not listed to "none",
# so the read access that actions/checkout relies on is granted explicitly.
permissions:
  contents: read
  id-token: write

env:
  # Team name substituted into the deployed resources through VAR below.
  TEAM: dapla-stat

jobs:
  test-deploy:
    name: Deploy alerts to test
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Fetch only alertconfig.yaml from the alert-config repository into
      # ext_alertconfig/ (cone mode is off so a single file can be selected).
      - uses: actions/checkout@v4
        name: Retrieve AlertManager configuration
        with:
          repository: "statisticsnorway/nais-alert-config"
          path: "ext_alertconfig"
          sparse-checkout: |
            alertconfig.yaml
          sparse-checkout-cone-mode: false

      - name: Deploy to test
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: test
          RESOURCE: .nais/alerts.yaml,ext_alertconfig/alertconfig.yaml
          VAR: cluster=test,team=${{ env.TEAM }}
          DEPLOY_SERVER: deploy.ssb.cloud.nais.io:443

  # No `needs:` is declared, so this job runs independently of test-deploy.
  prod-deploy:
    name: Deploy alerts to prod
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Same sparse checkout of alertconfig.yaml as in the test job.
      - uses: actions/checkout@v4
        name: Retrieve AlertManager configuration
        with:
          repository: "statisticsnorway/nais-alert-config"
          path: "ext_alertconfig"
          sparse-checkout: |
            alertconfig.yaml
          sparse-checkout-cone-mode: false

      - name: Deploy to prod
        uses: nais/deploy/actions/deploy@v2
        env:
          CLUSTER: prod
          RESOURCE: .nais/alerts.yaml,ext_alertconfig/alertconfig.yaml
          VAR: cluster=prod,team=${{ env.TEAM }}
          DEPLOY_SERVER: deploy.ssb.cloud.nais.io:443
65+

.github/workflows/build-deploy-app.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
on:
22
release:
33
types: [ published ]
4-
pull_request: ## ONLY FOR TESTING, SHOULD BE REMOVED AFTER DEPLOY PR IS MERGED
5-
branches: [nais-deploy]
64
push:
75
branches:
86
- master

.github/workflows/deploy-to-nais.yml

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,33 @@ jobs:
7474
id-token: "write"
7575
steps:
7676
- uses: actions/checkout@v4
77+
78+
# Request a GitHub App token for the statisticsnorway/dapla-pseudo-iac
# repository; later steps read it from steps.app-token.outputs.token.
- id: app-token
  uses: actions/create-github-app-token@v1
  with:
    owner: statisticsnorway
    repositories: dapla-pseudo-iac
    app-id: ${{ secrets.DAPLA_BOT_APP_ID }}
    private-key: ${{ secrets.DAPLA_BOT_PRIVATE_KEY }}
85+
86+
# Sparse-checkout the pseudo-service configuration for the target cluster
# from the IaC repository into ext-config/, authenticating with the app token.
- name: Retrieve protected configuration
  uses: actions/checkout@v4
  with:
    token: ${{ steps.app-token.outputs.token }}
    repository: "statisticsnorway/dapla-pseudo-iac"
    path: "ext-config"
    sparse-checkout: |
      apps/nais/pseudo-service/${{ inputs.cluster }}
94+
95+
# Export the cluster-specific configuration directory as `ext_config` so that
# later steps can reference it through the env context.
# The cluster input is handed to the script via an environment variable rather
# than being expanded into the script text, so its value is never parsed as
# shell code; the redirect target is quoted for the same reason.
- name: Configure environment variables
  env:
    CLUSTER_NAME: ${{ inputs.cluster }}
  run: |
    ext_config_dir="ext-config/apps/nais/pseudo-service/${CLUSTER_NAME}"
    echo "ext_config=${ext_config_dir}" >> "$GITHUB_ENV"
7799
78100
- uses: nais/deploy/actions/deploy@v2
79101
env:
80102
CLUSTER: ${{ inputs.cluster }}
81-
RESOURCE: ${{ inputs.nais-config-path }}
82-
VAR: image=${{ inputs.registry }}/${{ secrets.NAIS_MANAGEMENT_PROJECT_ID }}/${{ inputs.repository }}/${{ inputs.image-name }}:${{ inputs.image-tag }}
103+
RESOURCE: ${{ inputs.nais-config-path }},${{ env.ext_config }}/app-roles.yml
104+
VAR: image=${{ inputs.registry }}/${{ secrets.NAIS_MANAGEMENT_PROJECT_ID }}/${{ inputs.repository }}/${{ inputs.image-name }}:${{ inputs.image-tag }},team=dapla-stat
83105
DEPLOY_SERVER: deploy.ssb.cloud.nais.io:443
84106
REF: ${{ inputs.ref }}

.nais/alerts.yaml

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Prometheus alert rules for pseudo-service in the dapla-stat namespace.
# `{{cluster}}` is a template placeholder filled in at deploy time.
apiVersion: "monitoring.coreos.com/v1"
kind: PrometheusRule
metadata:
  name: alert-pseudo-service
  namespace: dapla-stat
  labels:
    team: dapla-stat
    cluster: "{{cluster}}"
spec:
  groups:
    - name: dapla-stat
      rules:
        # This alert checks if no replicas of pseudo-service are available, indicating the service is unavailable.
        # Scoped to the dapla-stat namespace so a same-named deployment elsewhere cannot trigger it.
        - alert: PseudoServiceUnavailable
          expr: kube_deployment_status_replicas_available{namespace="dapla-stat", deployment="pseudo-service"} == 0
          for: 1m
          annotations:
            title: "Pseudo-service is unavailable"
            consequence: "The service is unavailable to users. Immediate investigation required."
            action: "Check the deployment status and logs for issues."
          labels:
            service: pseudo-service
            namespace: dapla-stat
            severity: critical
            alertmanager_custom_config: dapla-stat
            alert_type: custom

        # This alert detects high CPU usage: more than 0.8 CPU-seconds per second, averaged over 5 minutes.
        - alert: HighCPUUsage
          expr: rate(process_cpu_seconds_total{app="pseudo-service"}[5m]) > 0.8
          for: 5m
          annotations:
            title: "High CPU usage detected"
            consequence: "The service might experience performance degradation."
            action: "Investigate the cause of high CPU usage and optimize if necessary."
          labels:
            service: pseudo-service
            namespace: dapla-stat
            severity: warning
            alertmanager_custom_config: dapla-stat
            alert_type: custom

        # This alert checks if a pod's memory usage exceeds 90% of its configured memory limit, which could cause instability.
        # - container!="" drops the pod-level cgroup series so usage is not counted twice.
        # - The limit comes from kube_pod_container_resource_limits{resource="memory"}; the older
        #   kube_pod_container_resource_limits_memory_bytes metric is no longer exported by kube-state-metrics v2.
        - alert: HighMemoryUsage
          expr: |-
            sum by (namespace, pod) (
              container_memory_working_set_bytes{namespace="dapla-stat", pod=~"pseudo-service-.*", container!=""}
            )
            > 0.9 * sum by (namespace, pod) (
              kube_pod_container_resource_limits{namespace="dapla-stat", pod=~"pseudo-service-.*", resource="memory"}
            )
          for: 5m
          annotations:
            title: "High memory usage detected"
            consequence: "The service might experience instability due to high memory usage."
            action: "Check memory utilization and consider increasing resources or optimizing the service."
          labels:
            service: pseudo-service
            namespace: dapla-stat
            severity: warning
            alertmanager_custom_config: dapla-stat
            alert_type: custom

        # This alert detects a high share of error logs in pseudo-service (more than 10% of log messages over 3 minutes).
        - alert: HighNumberOfErrors
          expr: (100 * sum by (app, namespace) (rate(log_messages_errors{app="pseudo-service", level=~"Error"}[3m])) / sum by (app, namespace) (rate(log_messages_total{app="pseudo-service"}[3m]))) > 10
          for: 3m
          annotations:
            title: "High number of errors logged in pseudo-service"
            consequence: "The application is logging a significant number of errors."
            action: "Check the service logs for errors and address the root cause."
          labels:
            service: pseudo-service
            namespace: dapla-stat
            severity: critical
            alertmanager_custom_config: dapla-stat
            alert_type: custom

        # This alert monitors container restarts for pseudo-service pods: the condition is more than 3 restarts
        # within 15 minutes, and `for: 15m` requires it to keep holding for a further 15 minutes before firing.
        # Pods are selected by name because kube-state-metrics labels this metric with namespace/pod/container,
        # not with an `app` label.
        - alert: HighPodRestarts
          expr: increase(kube_pod_container_status_restarts_total{namespace="dapla-stat", pod=~"pseudo-service-.*"}[15m]) > 3
          for: 15m
          annotations:
            title: "High number of pod restarts"
            consequence: "The service may be unstable or misconfigured."
            action: "Investigate the cause of pod restarts and fix configuration or resource issues."
          labels:
            service: pseudo-service
            namespace: dapla-stat
            severity: warning
            alertmanager_custom_config: dapla-stat
            alert_type: custom

0 commit comments

Comments
 (0)