#! /bin/bash

# TODO dtfranz: The yaml in this file should be pulled out and organized into a kustomization.yaml (where possible) for maintainability/readability

# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

57help=" setup-monitoring.sh is used to set up prometheus monitoring for e2e testing.
9294 runAsUser: 65534
9395 seccompProfile:
9496 type: RuntimeDefault
97+ ruleSelector: {}
9598 serviceDiscoveryRole: EndpointSlice
9699 serviceMonitorSelector: {}
97100EOF
@@ -115,6 +118,49 @@ spec:
115118 - {} # Allows us to query prometheus
116119EOF
117120
# ServiceMonitor for the kubelet.
# Scrapes two paths on the kubelet's "https-metrics" port every 10s:
#   /metrics and /metrics/cadvisor
# Both endpoints authenticate with the pod's service-account token and skip
# TLS certificate verification (insecureSkipVerify: true).
# The "keep" relabeling drops every series except those whose pod label starts
# with "operator-controller" or "catalogd" and whose container label is
# "manager" (the two source labels are joined as "<pod>;<container>").
# Targets are services labelled k8s-app=kubelet in the kube-system namespace.
# The monitor is created in ${NAMESPACE}, matching the other monitoring
# resources applied by this script.
kubectl apply -f - << EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
  namespace: ${NAMESPACE}
  labels:
    k8s-app: kubelet
spec:
  jobLabel: k8s-app
  endpoints:
  - port: https-metrics
    scheme: https
    path: /metrics
    interval: 10s
    honorLabels: true
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    metricRelabelings:
    - action: keep
      sourceLabels: [pod,container]
      regex: (operator-controller|catalogd).*;manager
  - port: https-metrics
    scheme: https
    path: /metrics/cadvisor
    interval: 10s
    honorLabels: true
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    metricRelabelings:
    - action: keep
      sourceLabels: [pod,container]
      regex: (operator-controller|catalogd).*;manager
  selector:
    matchLabels:
      k8s-app: kubelet
  namespaceSelector:
    matchNames:
    - kube-system
EOF

# Give the operator time to create the pod
# First wait (up to 60s) for the prometheus-prometheus-0 pod object to exist,
# then wait (up to 120s) for it to report the Ready condition. Under
# `set -e` a timeout in either step aborts the script.
kubectl wait --for=create pods -n "${NAMESPACE}" prometheus-prometheus-0 --timeout=60s
kubectl wait --for=condition=Ready pods -n "${NAMESPACE}" prometheus-prometheus-0 --timeout=120s
@@ -131,6 +177,56 @@ metadata:
131177 kubernetes.io/service-account.name: prometheus
132178EOF
133179
# PrometheusRule "controller-alerts" in ${NAMESPACE}, with two rule groups:
#   controller-panic - fires when the controller-runtime reconcile or webhook
#                      panic counters are above zero.
#   resource-usage   - fires on container OOM events, on sustained memory
#                      growth (derivative of the working set over a 5m
#                      subquery above 50_000 bytes/sec), and on sustained CPU
#                      usage (5m rate * 100 above 20) for the "manager"
#                      container of operator-controller and catalogd pods.
# The growth/usage alerts must hold for 5m before firing and then keep firing
# for 1d after the condition clears (keep_firing_for).
#
# The heredoc delimiter is unquoted so that ${NAMESPACE} is expanded by the
# shell; every Prometheus template variable is therefore written as \$labels /
# \$value so the shell passes a literal "$" through instead of attempting
# (and, under `set -u`, failing) to expand it.
kubectl apply -f - << EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: controller-alerts
  namespace: ${NAMESPACE}
spec:
  groups:
  - name: controller-panic
    rules:
    - alert: reconciler-panic
      expr: controller_runtime_reconcile_panics_total{} > 0
      annotations:
        description: "controller of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
    - alert: webhook-panic
      expr: controller_runtime_webhook_panics_total{} > 0
      annotations:
        description: "controller webhook of pod {{ \$labels.pod }} experienced panic(s); count={{ \$value }}"
  - name: resource-usage
    rules:
    - alert: oom-events
      expr: container_oom_events_total > 0
      annotations:
        description: "container {{ \$labels.container }} of pod {{ \$labels.pod }} experienced OOM event(s); count={{ \$value }}"
    - alert: operator-controller-memory-growth
      expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
    - alert: catalogd-memory-growth
      expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "catalogd pod memory usage growing at a high rate for 5 minutes: {{ \$value | humanize }}B/sec"
    - alert: operator-controller-cpu-usage
      expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "operator-controller using high cpu resource for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
    - alert: catalogd-cpu-usage
      expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
      for: 5m
      keep_firing_for: 1d
      annotations:
        description: "catalogd using high cpu resources for 5 minutes: {{ \$value | printf \"%.2f\" }}%"
EOF

134230# ServiceMonitors for operator-controller and catalogd
135231kubectl apply -f - << EOF
136232apiVersion: monitoring.coreos.com/v1
@@ -141,6 +237,7 @@ metadata:
141237spec:
142238 endpoints:
143239 - path: /metrics
240+ interval: 10s
144241 port: https
145242 scheme: https
146243 authorization:
@@ -178,6 +275,7 @@ spec:
178275 endpoints:
179276 - path: /metrics
180277 port: metrics
278+ interval: 10s
181279 scheme: https
182280 authorization:
183281 credentials:
0 commit comments