From 1c1e762f3c7e79922856038e67cf09c324da8c1d Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 13 Jun 2025 19:56:55 +0200 Subject: [PATCH] Move Pyrra from addon to core component with SLOs - Move pyrra.libsonnet from addons to core components - Update to Pyrra v0.8.4 from upstream jsonnet - Add SLOs for various Kubernetes components: - API server (read/write errors and latency) - Kubelet (request and runtime errors) - CoreDNS (response errors and latency) - Prometheus Operator (reconcile and HTTP errors) - Prometheus (rule evaluation, SD, query, and notification errors) - Kube Controller Manager and Kube Proxy (request errors) - Enable generic rules for Pyrra by default - Update manifests and documentation accordingly --- ...prometheus-rules-and-grafana-dashboards.md | 16 +- docs/customizing.md | 16 +- example.jsonnet | 16 +- .../kube-prometheus/addons/pyrra.libsonnet | 626 ------------------ .../components/k8s-control-plane.libsonnet | 320 ++++++++- .../components/prometheus-operator.libsonnet | 69 ++ .../components/pyrra.libsonnet | 43 ++ jsonnet/kube-prometheus/jsonnetfile.json | 2 +- jsonnet/kube-prometheus/main.libsonnet | 11 +- jsonnetfile.lock.json | 6 +- kustomization.yaml | 39 +- ...sControlPlane-coredns-ServiceMonitor.yaml} | 2 +- ...trolPlane-coredns-slo-response-errors.yaml | 24 + ...rolPlane-coredns-slo-response-latency.yaml | 24 + ...kubeControllerManagerSLORequestErrors.yaml | 21 + ...-kubeControllerManagerServiceMonitor.yaml} | 0 ...ontrolPlane-kubeProxySLORequestErrors.yaml | 22 + ...ntrolPlane-kubelet-slo-request-errors.yaml | 24 + ...ntrolPlane-kubelet-slo-runtime-errors.yaml | 23 + ...esControlPlane-kubeletServiceMonitor.yaml} | 0 .../prometheusOperator-sloHTTPErrors.yaml | 23 + ...prometheusOperator-sloReconcileErrors.yaml | 27 + manifests/pyrra-apiDeployment.yaml | 41 ++ manifests/pyrra-apiService.yaml | 17 + manifests/pyrra-apiServiceAccount.yaml | 9 + manifests/pyrra-apiServiceMonitor.yaml | 19 + manifests/pyrra-kubernetesClusterRole.yaml | 48 ++ .../pyrra-kubernetesClusterRoleBinding.yaml | 17 + manifests/pyrra-kubernetesDeployment.yaml | 40 ++ manifests/pyrra-kubernetesService.yaml | 23 + manifests/pyrra-kubernetesServiceAccount.yaml | 9 + manifests/pyrra-kubernetesServiceMonitor.yaml | 19 + ...ra-slo-apiserver-read-cluster-latency.yaml | 18 + ...-slo-apiserver-read-namespace-latency.yaml | 18 + ...a-slo-apiserver-read-resource-latency.yaml | 18 + ...ra-slo-apiserver-read-response-errors.yaml | 18 + ...a-slo-apiserver-write-response-errors.yaml | 18 + .../pyrra-slo-coredns-response-errors.yaml | 18 + .../pyrra-slo-kubelet-request-errors.yaml | 18 + .../pyrra-slo-kubelet-runtime-errors.yaml | 18 + ...ra-slo-prometheus-notification-errors.yaml | 18 + ...a-slo-prometheus-operator-http-errors.yaml | 18 + ...-prometheus-operator-reconcile-errors.yaml | 20 + .../pyrra-slo-prometheus-query-errors.yaml | 20 + ...o-prometheus-rule-evaluation-failures.yaml | 18 + ...a-slo-prometheus-sd-kubernetes-errors.yaml | 18 + manifests/setup/crd.yaml | 179 +++++ 47 files changed, 1377 insertions(+), 654 deletions(-) delete mode 100644 jsonnet/kube-prometheus/addons/pyrra.libsonnet create mode 100644 jsonnet/kube-prometheus/components/pyrra.libsonnet rename manifests/{kubernetesControlPlane-serviceMonitorCoreDNS.yaml => kubernetesControlPlane-coredns-ServiceMonitor.yaml} (95%) create mode 100644 manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml create mode 100644 manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml create mode 100644 
manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml rename manifests/{kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml => kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml} (100%) create mode 100644 manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml create mode 100644 manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml create mode 100644 manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml rename manifests/{kubernetesControlPlane-serviceMonitorKubelet.yaml => kubernetesControlPlane-kubeletServiceMonitor.yaml} (100%) create mode 100644 manifests/prometheusOperator-sloHTTPErrors.yaml create mode 100644 manifests/prometheusOperator-sloReconcileErrors.yaml create mode 100644 manifests/pyrra-apiDeployment.yaml create mode 100644 manifests/pyrra-apiService.yaml create mode 100644 manifests/pyrra-apiServiceAccount.yaml create mode 100644 manifests/pyrra-apiServiceMonitor.yaml create mode 100644 manifests/pyrra-kubernetesClusterRole.yaml create mode 100644 manifests/pyrra-kubernetesClusterRoleBinding.yaml create mode 100644 manifests/pyrra-kubernetesDeployment.yaml create mode 100644 manifests/pyrra-kubernetesService.yaml create mode 100644 manifests/pyrra-kubernetesServiceAccount.yaml create mode 100644 manifests/pyrra-kubernetesServiceMonitor.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-cluster-latency.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-namespace-latency.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-resource-latency.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-response-errors.yaml create mode 100644 manifests/pyrra-slo-apiserver-write-response-errors.yaml create mode 100644 manifests/pyrra-slo-coredns-response-errors.yaml create mode 100644 manifests/pyrra-slo-kubelet-request-errors.yaml create mode 100644 manifests/pyrra-slo-kubelet-runtime-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-notification-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-operator-http-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-query-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml create mode 100644 manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml create mode 100644 manifests/setup/crd.yaml diff --git a/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md b/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md index cfab2e687f..0a61e0623e 100644 --- a/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md @@ -31,7 +31,6 @@ local kp = // (import 'kube-prometheus/addons/static-etcd.libsonnet') + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + - // (import 'kube-prometheus/addons/pyrra.libsonnet') + { values+:: { common+: { @@ -43,17 +42,26 @@ local kp = { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) + for name in std.filter( + (function(name) + name != 'serviceMonitor' && + name != 'prometheusRule' && + name != 'sloHTTPErrors' && + name != 
'sloReconcileErrors'), + std.objectFields(kp.prometheusOperator) + ) } + -// { 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + +{ 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'prometheus-operator-sloHTTPErrors': kp.prometheusOperator.sloHTTPErrors } + +{ 'prometheus-operator-sloReconcileErrors': kp.prometheusOperator.sloReconcileErrors } + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + -// { ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + +{ ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + diff --git a/docs/customizing.md b/docs/customizing.md index a051ed4437..3fb3038f27 100644 --- a/docs/customizing.md +++ b/docs/customizing.md @@ -51,7 +51,6 @@ local kp = // (import 'kube-prometheus/addons/static-etcd.libsonnet') + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + - // (import 'kube-prometheus/addons/pyrra.libsonnet') + { values+:: { common+: { @@ -63,17 +62,26 @@ local kp = { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) + for name in std.filter( + (function(name) + name != 'serviceMonitor' && + name != 'prometheusRule' && + name != 'sloHTTPErrors' && + name != 'sloReconcileErrors'), + std.objectFields(kp.prometheusOperator) + ) } + -// { 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + +{ 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'prometheus-operator-sloHTTPErrors': kp.prometheusOperator.sloHTTPErrors } + +{ 'prometheus-operator-sloReconcileErrors': kp.prometheusOperator.sloReconcileErrors } + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + -// { ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + +{ ['pyrra-' + 
name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + diff --git a/example.jsonnet b/example.jsonnet index 8974158bcd..5f9f1eae98 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -7,7 +7,6 @@ local kp = // (import 'kube-prometheus/addons/static-etcd.libsonnet') + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + - // (import 'kube-prometheus/addons/pyrra.libsonnet') + { values+:: { common+: { @@ -19,17 +18,26 @@ local kp = { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) + for name in std.filter( + (function(name) + name != 'serviceMonitor' && + name != 'prometheusRule' && + name != 'sloHTTPErrors' && + name != 'sloReconcileErrors'), + std.objectFields(kp.prometheusOperator) + ) } + -// { 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + +{ 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'prometheus-operator-sloHTTPErrors': kp.prometheusOperator.sloHTTPErrors } + +{ 'prometheus-operator-sloReconcileErrors': kp.prometheusOperator.sloReconcileErrors } + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + -// { ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + +{ ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + diff --git a/jsonnet/kube-prometheus/addons/pyrra.libsonnet b/jsonnet/kube-prometheus/addons/pyrra.libsonnet deleted file mode 100644 index c265d50906..0000000000 --- a/jsonnet/kube-prometheus/addons/pyrra.libsonnet +++ /dev/null @@ -1,626 +0,0 @@ -{ - values+:: { - common+: { - versions+: { - pyrra: error 'must provide version', - } + (import '../versions.json'), - images+: { - pyrra+: 'ghcr.io/pyrra-dev/pyrra:v' + $.values.common.versions.pyrra, - }, - }, - pyrra+: { - namespace: $.values.common.namespace, - version: $.values.common.versions.pyrra, - image: $.values.common.images.pyrra, - }, - }, - - local defaults = { - local defaults = self, - - name:: 'pyrra', - namespace:: error 'must provide namespace', - version:: 
error 'must provide version', - image: error 'must provide image', - replicas:: 1, - port:: 9099, - - commonLabels:: { - 'app.kubernetes.io/name': 'pyrra', - 'app.kubernetes.io/version': defaults.version, - 'app.kubernetes.io/part-of': 'kube-prometheus', - }, - }, - - local pyrra = function(params) { - local pyrra = self, - _config:: defaults + params, - - crd: ( - import 'github.com/pyrra-dev/pyrra/jsonnet/controller-gen/pyrra.dev_servicelevelobjectives.json' - ), - - - _apiMetadata:: { - name: pyrra._config.name + '-api', - namespace: pyrra._config.namespace, - labels: pyrra._config.commonLabels { - 'app.kubernetes.io/component': 'api', - }, - }, - apiSelectorLabels:: { - [labelName]: pyrra._apiMetadata.labels[labelName] - for labelName in std.objectFields(pyrra._apiMetadata.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - - apiService: { - apiVersion: 'v1', - kind: 'Service', - metadata: pyrra._apiMetadata, - spec: { - ports: [ - { name: 'http', targetPort: pyrra._config.port, port: pyrra._config.port }, - ], - selector: pyrra.apiSelectorLabels, - }, - }, - - apiDeployment: - local c = { - name: pyrra._config.name, - image: pyrra._config.image, - args: [ - 'api', - '--api-url=http://%s.%s.svc.cluster.local:9444' % [pyrra.kubernetesService.metadata.name, pyrra.kubernetesService.metadata.namespace], - '--prometheus-url=http://prometheus-k8s.%s.svc.cluster.local:9090' % pyrra._config.namespace, - ], - // resources: pyrra._config.resources, - ports: [{ containerPort: pyrra._config.port }], - securityContext: { - allowPrivilegeEscalation: false, - readOnlyRootFilesystem: true, - runAsNonRoot: true, - capabilities: { drop: ['ALL'] }, - seccompProfile: { type: 'RuntimeDefault' }, - }, - }; - - { - apiVersion: 'apps/v1', - kind: 'Deployment', - metadata: pyrra._apiMetadata, - spec: { - replicas: pyrra._config.replicas, - selector: { - matchLabels: pyrra.apiSelectorLabels, - }, - strategy: { - rollingUpdate: { - maxSurge: 1, - maxUnavailable: 1, - }, - }, - template: { - metadata: { labels: pyrra._apiMetadata.labels }, - spec: { - containers: [c], - // serviceAccountName: $.serviceAccount.metadata.name, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - }, - }, - }, - }, - - _kubernetesMetadata:: { - name: pyrra._config.name + '-kubernetes', - namespace: pyrra._config.namespace, - labels: pyrra._config.commonLabels { - 'app.kubernetes.io/component': 'kubernetes', - }, - }, - kubernetesSelectorLabels:: { - [labelName]: pyrra._kubernetesMetadata.labels[labelName] - for labelName in std.objectFields(pyrra._kubernetesMetadata.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - - kubernetesServiceAccount: { - apiVersion: 'v1', - kind: 'ServiceAccount', - metadata: pyrra._kubernetesMetadata, - }, - - kubernetesClusterRole: { - apiVersion: 'rbac.authorization.k8s.io/v1', - kind: 'ClusterRole', - metadata: pyrra._kubernetesMetadata, - rules: [{ - apiGroups: ['monitoring.coreos.com'], - resources: ['prometheusrules'], - verbs: ['create', 'delete', 'get', 'list', 'patch', 'update', 'watch'], - }, { - apiGroups: ['monitoring.coreos.com'], - resources: ['prometheusrules/status'], - verbs: ['get'], - }, { - apiGroups: ['pyrra.dev'], - resources: ['servicelevelobjectives'], - verbs: ['create', 'delete', 'get', 'list', 'patch', 'update', 'watch'], - }, { - apiGroups: ['pyrra.dev'], - resources: ['servicelevelobjectives/status'], - verbs: ['get', 'patch', 'update'], - }], - }, - - kubernetesClusterRoleBinding: { - apiVersion: 'rbac.authorization.k8s.io/v1', - 
kind: 'ClusterRoleBinding', - metadata: pyrra._kubernetesMetadata, - roleRef: { - apiGroup: 'rbac.authorization.k8s.io', - kind: 'ClusterRole', - name: pyrra.kubernetesClusterRole.metadata.name, - }, - subjects: [{ - kind: 'ServiceAccount', - name: pyrra.kubernetesServiceAccount.metadata.name, - namespace: pyrra._config.namespace, - }], - }, - - kubernetesService: { - apiVersion: 'v1', - kind: 'Service', - metadata: pyrra._kubernetesMetadata, - spec: { - ports: [ - { name: 'http', targetPort: 9444, port: 9444 }, - ], - selector: pyrra.kubernetesSelectorLabels, - }, - }, - - kubernetesDeployment: - local c = { - name: pyrra._config.name, - image: pyrra._config.image, - args: [ - 'kubernetes', - ], - // resources: pyrra._config.resources, - ports: [{ containerPort: pyrra._config.port }], - securityContext: { - allowPrivilegeEscalation: false, - readOnlyRootFilesystem: true, - }, - }; - - { - apiVersion: 'apps/v1', - kind: 'Deployment', - metadata: pyrra._kubernetesMetadata { - name: pyrra._config.name + '-kubernetes', - }, - spec: { - replicas: pyrra._config.replicas, - selector: { - matchLabels: pyrra.kubernetesSelectorLabels, - }, - strategy: { - rollingUpdate: { - maxSurge: 1, - maxUnavailable: 1, - }, - }, - template: { - metadata: { labels: pyrra._kubernetesMetadata.labels }, - spec: { - containers: [c], - serviceAccountName: pyrra.kubernetesServiceAccount.metadata.name, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - }, - }, - }, - }, - - // Most of these should eventually be moved to the components themselves. - // For now, this is a good start to have everything in one place. - 'slo-apiserver-read-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-response-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'apiserver_request_total{component="apiserver",verb=~"LIST|GET",code=~"5.."}', - }, - total: { - metric: 'apiserver_request_total{component="apiserver",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-write-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-write-response-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}', - }, - total: { - metric: 'apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-read-resource-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-resource-latency', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - metric: 'apiserver_request_duration_seconds_bucket{component="apiserver",scope=~"resource|",verb=~"LIST|GET",le="0.1"}', - }, - total: { - metric: 'apiserver_request_duration_seconds_count{component="apiserver",scope=~"resource|",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-read-namespace-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 
'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-namespace-latency', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - metric: 'apiserver_request_duration_seconds_bucket{component="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"}', - }, - total: { - metric: 'apiserver_request_duration_seconds_count{component="apiserver",scope=~"namespace|",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-read-cluster-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-cluster-latency', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - metric: 'apiserver_request_duration_seconds_bucket{component="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"}', - }, - total: { - metric: 'apiserver_request_duration_seconds_count{component="apiserver",scope=~"cluster|",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-kubelet-request-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'kubelet-request-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'rest_client_requests_total{job="kubelet",code=~"5.."}', - }, - total: { - metric: 'rest_client_requests_total{job="kubelet"}', - }, - }, - }, - }, - }, - - 'slo-kubelet-runtime-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'kubelet-runtime-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'kubelet_runtime_operations_errors_total{job="kubelet"}', - }, - total: { - metric: 'kubelet_runtime_operations_total{job="kubelet"}', - }, - }, - }, - }, - }, - - 'slo-coredns-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'coredns-response-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99.99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}', - }, - total: { - metric: 'coredns_dns_responses_total{job="kube-dns"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-operator-reconcile-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-operator-reconcile-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '95', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'prometheus_operator_reconcile_errors_total{job="prometheus-operator"}', - }, - total: { - metric: 'prometheus_operator_reconcile_operations_total{job="prometheus-operator"}', - }, - grouping: ['controller'], - }, - }, - }, - }, - - 'slo-prometheus-operator-http-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 
'prometheus-operator-http-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99.5', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",status_code=~"5.."}', - }, - total: { - metric: 'prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-rule-evaluation-failures': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-rule-evaluation-failures', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99.99', - window: '2w', - description: 'Rule and alerting rules are being evaluated every few seconds. This needs to work for recording rules to be created and most importantly for alerts to be evaluated.', - indicator: { - ratio: { - errors: { - metric: 'prometheus_rule_evaluation_failures_total{job="prometheus-k8s"}', - }, - total: { - metric: 'prometheus_rule_evaluations_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-sd-kubernetes-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-sd-kubernetes-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: 'If there are too many errors Prometheus is having a bad time discovering new Kubernetes services.', - indicator: { - ratio: { - errors: { - metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s",status_code=~"5..|"}', - }, - total: { - metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-query-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-query-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - grouping: ['handler'], - errors: { - metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*",code=~"5.."}', - }, - total: { - metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-notification-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-notification-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'prometheus_notifications_errors_total{job="prometheus-k8s"}', - }, - total: { - metric: 'prometheus_notifications_sent_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - }, - - pyrra: pyrra($.values.pyrra), -} diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet index 448ea4cf7e..dccc084599 100644 --- a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -19,13 +19,62 @@ local defaults = { kubeSchedulerSelector: 'job="kube-scheduler"', 
kubeControllerManagerSelector: 'job="kube-controller-manager"', kubeApiserverSelector: 'job="apiserver"', + kubeProxySelector: 'job="kube-proxy"', + coreDNSSelector: 'job="coredns"', podLabel: 'pod', runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/%s', diskDeviceSelector: 'device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"', hostNetworkInterfaceSelector: 'device!~"veth.+"', }, }, - kubeProxy:: false, + kubelet: { + slos: { + requestErrors: { + target: '99', + window: '2w', + }, + runtimeErrors: { + target: '99.5', + window: '2w', + }, + }, + }, + kubeControllerManager: { + slos: { + requestErrors: { + target: '99', + window: '2w', + }, + }, + }, + kubeProxy: false, + kubeProxyConfig: { // different name for backwards compatibility + slos: { + syncRulesLatency: { + target: '90', + latency: '0.512', // must exist as le label + window: '2w', + }, + requestErrors: { + target: '90', // kube-proxy makes very few requests + window: '2w', + }, + }, + }, + coredns: { + name: 'coredns', + slos: { + responseErrors: { + target: '99.99', + window: '2w', + }, + responseLatency: { + target: '99', + latency: '0.032', // must exist as le label + window: '2w', + }, + }, + }, }; function(params) { @@ -104,7 +153,7 @@ function(params) { }, }, - serviceMonitorKubelet: { + kubeletServiceMonitor: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: k8s._metadata { @@ -224,7 +273,80 @@ function(params) { }, }, - serviceMonitorKubeControllerManager: { + 'kubelet-slo-request-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kubelet-request-errors', + labels+: { + 'app.kubernetes.io/name': 'kubelet', + prometheus: 'k8s', //TODO + role: 'alert-rules', + 'pyrra.dev/component': 'kubelet', + }, + }, + spec: { + target: k8s._config.kubelet.slos.requestErrors.target, + window: k8s._config.kubelet.slos.requestErrors.window, + description: ||| + The kubelet is the primary “node agent” that runs on each node. + The kubelet ensures that the containers are running and healthy. + If these requests are failing the Kubelet might not know what to run exactly. + |||, + indicator: { + ratio: { + errors: { + metric: 'rest_client_requests_total{%s,code=~"5..|"}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + total: { + metric: 'rest_client_requests_total{%s}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + }, + }, + }, + }, + + 'kubelet-slo-runtime-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kubelet-runtime-errors', + labels+: { + 'app.kubernetes.io/name': 'kubelet', + prometheus: 'k8s', //TODO + role: 'alert-rules', + 'pyrra.dev/component': 'kubelet', + }, + }, + spec: { + target: k8s._config.kubelet.slos.runtimeErrors.target, + window: k8s._config.kubelet.slos.runtimeErrors.window, + description: ||| + The kubelet is the primary “node agent” that runs on each node. + If there are runtime errors the kubelet might be unable to check the containers are running and healthy.
+ |||, + indicator: { + ratio: { + errors: { + metric: 'kubelet_runtime_operations_errors_total{%s}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + total: { + metric: 'kubelet_runtime_operations_total{%s}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + }, + }, + }, + }, + + kubeControllerManagerServiceMonitor: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: k8s._metadata { @@ -277,6 +399,43 @@ function(params) { }, }, + kubeControllerManagerSLORequestErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kube-controller-manager-request-errors', + labels+: { + 'app.kubernetes.io/name': 'kube-controller-manager', + prometheus: 'k8s', //TODO + role: 'alert-rules', + 'pyrra.dev/component': 'kube-controller-manager', + }, + }, + spec: { + target: k8s._config.kubeControllerManager.slos.requestErrors.target, + window: k8s._config.kubeControllerManager.slos.requestErrors.window, + description: ||| + The Kubernetes controller manager is a daemon that embeds the core control loops shipped with Kubernetes. + In applications of robotics and automation, a control loop is a non-terminating loop that regulates the state of the system. + In Kubernetes, a controller is a control loop that watches the shared state of the cluster through the apiserver and makes changes attempting to move the current state towards the desired state. Examples of controllers that ship with Kubernetes today are the replication controller, endpoints controller, namespace controller, and serviceaccounts controller. + |||, + indicator: { + ratio: { + errors: { + metric: 'rest_client_requests_total{%s,code=~"5..|"}' % [ + k8s._config.mixin._config.kubeControllerManagerSelector, + ], + }, + total: { + metric: 'rest_client_requests_total{%s}' % [ + k8s._config.mixin._config.kubeControllerManagerSelector, + ], + }, + }, + }, + }, + }, + serviceMonitorApiserver: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -396,18 +555,92 @@ function(params) { }, }, + [if (defaults + params).kubeProxy then 'kubeProxySLOSyncRulesLatency']: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kube-proxy-sync-rules-latency', + labels+: { + 'app.kubernetes.io/name': 'kube-proxy', + 'app.kubernetes.io/component': 'controller', //TODO + prometheus: 'k8s', // TODO + 'pyrra.dev/component': 'kube-proxy', + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.kubeProxyConfig.slos.syncRulesLatency.target, + window: k8s._config.kubeProxyConfig.slos.syncRulesLatency.window, + description: ||| + The Kubernetes network proxy runs on each node. + This reflects services as defined in the Kubernetes API on each node and can do simple TCP, UDP + stream forwarding or round robin TCP,UDP forwarding across a set of backends. - serviceMonitorCoreDNS: { + If this is firing the networks might not be synchronized fast enough and services might be unable to reach the containers they want to reach. 
+ |||, + indicator: { + latency: { + success: { + metric: 'kubeproxy_sync_proxy_rules_duration_seconds_bucket{%s,le="%s"}' % [ + k8s._config.mixin._config.kubeProxySelector, + k8s._config.kubeProxyConfig.slos.syncRulesLatency.latency, + ], + }, + total: { + metric: 'kubeproxy_sync_proxy_rules_duration_seconds_count{%s}' % [ + k8s._config.mixin._config.kubeProxySelector, + ], + }, + }, + }, + }, + }, + + kubeProxySLORequestErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kube-proxy-request-errors', + labels+: { + 'app.kubernetes.io/name': 'kube-proxy', + 'app.kubernetes.io/component': 'controller', //TODO + prometheus: 'k8s', // TODO + 'pyrra.dev/component': 'kube-proxy', + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.kubeProxyConfig.slos.requestErrors.target, + window: k8s._config.kubeProxyConfig.slos.requestErrors.window, + description: '', + indicator: { + ratio: { + errors: { + metric: 'rest_client_requests_total{%s,code=~"5..|"}' % [ + k8s._config.mixin._config.kubeProxySelector, + ], + }, + total: { + metric: 'rest_client_requests_total{%s}' % [ + k8s._config.mixin._config.kubeProxySelector, + ], + }, + }, + }, + }, + }, + + 'coredns-ServiceMonitor': { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: k8s._metadata { - name: 'coredns', - labels+: { 'app.kubernetes.io/name': 'coredns' }, + name: k8s._config.coredns.name, + labels+: { 'app.kubernetes.io/name': k8s._config.coredns.name }, }, spec: { jobLabel: 'app.kubernetes.io/name', selector: { - matchLabels: { 'k8s-app': 'kube-dns' }, + matchLabels: { 'k8s-app': k8s._config.coredns.name }, }, namespaceSelector: { matchNames: ['kube-system'], @@ -431,5 +664,78 @@ function(params) { }, }, + 'coredns-slo-response-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: k8s._config.coredns.name + '-response-errors', + labels+: { + 'app.kubernetes.io/name': k8s._config.coredns.name, + 'app.kubernetes.io/component': 'controller', + prometheus: 'k8s', // TODO + 'pyrra.dev/component': k8s._config.coredns.name, + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.coredns.slos.responseErrors.target, + window: k8s._config.coredns.slos.responseErrors.window, + description: ||| + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forwards external requests. + If CoreDNS fails to answer requests applications might be unable to make requests. + |||, + indicator: { + ratio: { + errors: { + metric: 'coredns_dns_responses_total{%s,rcode="SERVFAIL"}' % [ + k8s._config.mixin._config.coreDNSSelector, + ], + }, + total: { + metric: 'coredns_dns_responses_total{%s}' % [ + k8s._config.mixin._config.coreDNSSelector, + ], + }, + }, + }, + }, + }, + 'coredns-slo-response-latency': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: k8s._config.coredns.name + '-response-latency', + labels+: { + 'app.kubernetes.io/name': 'coredns', + 'app.kubernetes.io/component': 'controller', + prometheus: 'k8s', // TODO + 'pyrra.dev/component': 'coredns', + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.coredns.slos.responseLatency.target, + window: k8s._config.coredns.slos.responseLatency.window, + description: ||| + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forwards external requests.
+ If CoreDNS gets too slow it might have an impact on the latency of other applications in this cluster. + |||, + indicator: { + latency: { + success: { + metric: 'coredns_dns_request_duration_seconds_bucket{%s,le="%s"}' % [ + k8s._config.mixin._config.coreDNSSelector, + k8s._config.coredns.slos.responseLatency.latency, + ], + }, + total: { + metric: 'coredns_dns_request_duration_seconds_count{%s}' % [ + k8s._config.mixin._config.coreDNSSelector, + ], + }, + }, + }, + }, + }, } diff --git a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet index 104d76a8ba..1737550ef5 100644 --- a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet @@ -43,6 +43,16 @@ local defaults = { runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s', }, }, + slos: { + reconcileErrors: { + target: '95', + window: '2w', + }, + HTTPErrors: { + target: '99.5', + window: '2w', + }, + }, }; function(params) @@ -171,4 +181,63 @@ function(params) }, }, }, + + sloReconcileErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: po.service.metadata { + name: po._config.name + '-reconcile-errors', + labels: po._config.commonLabels + po._config.mixin.ruleLabels + { + 'pyrra.dev/component': po._config.name, + }, + }, + spec: { + target: po._config.slos.reconcileErrors.target, + window: po._config.slos.reconcileErrors.window, + description: ||| + The Prometheus Operator reconciles the controllers' objects to have the underlying resources in the desired state. + If this is firing the object may not be running correctly. + |||, + indicator: { + ratio: { + errors: { + metric: 'prometheus_operator_reconcile_errors_total{%s}' % po._config.mixin._config.prometheusOperatorSelector, + }, + total: { + metric: 'prometheus_operator_reconcile_operations_total{%s}' % po._config.mixin._config.prometheusOperatorSelector, + }, + grouping: ['controller'], + }, + }, + }, + }, + + sloHTTPErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: po.service.metadata { + name: po._config.name + '-http-errors', + labels: po._config.commonLabels + po._config.mixin.ruleLabels + { + 'pyrra.dev/component': po._config.name, + }, + }, + spec: { + target: po._config.slos.HTTPErrors.target, + window: po._config.slos.HTTPErrors.window, + description: ||| + The Prometheus Operator makes HTTP requests to the Kubernetes API server to read and write the objects. + If this is firing the Prometheus Operator might not be able to read and write the latest objects.
+ |||, + indicator: { + ratio: { + errors: { + metric: 'prometheus_operator_kubernetes_client_http_requests_total{%s,status_code=~"5.."}' % po._config.mixin._config.prometheusOperatorSelector, + }, + total: { + metric: 'prometheus_operator_kubernetes_client_http_requests_total{%s}' % po._config.mixin._config.prometheusOperatorSelector, + }, + }, + }, + }, + }, } diff --git a/jsonnet/kube-prometheus/components/pyrra.libsonnet b/jsonnet/kube-prometheus/components/pyrra.libsonnet new file mode 100644 index 0000000000..00467cf6f7 --- /dev/null +++ b/jsonnet/kube-prometheus/components/pyrra.libsonnet @@ -0,0 +1,43 @@ +local pyrra = import 'github.com/pyrra-dev/pyrra/jsonnet/pyrra/kubernetes.libsonnet'; + +local defaults = { + local defaults = self, + + name:: 'pyrra', + namespace:: error 'must provide namespace', + version:: error 'must provide version', + image:: error 'must provide image', + resources:: { + limits: { cpu: '200m', memory: '512Mi' }, + requests: { cpu: '100m', memory: '100Mi' }, + }, +}; + +function(params) + local config = defaults { + values+:: { + pyrra+: params, + }, + }; + // Safety check + assert std.isObject(config.resources); + + (pyrra + config).pyrra { + // Enable generic rules for kube-prometheus by default + kubernetesDeployment+: { + spec+: { + template+: { + spec+: { + containers: [ + c { + args+: [ + '--generic-rules', + ], + } + for c in super.containers + ], + }, + }, + }, + }, + } diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index bf7d40ea01..5f8b08225a 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -108,7 +108,7 @@ "source": { "git": { "remote": "https://github.com/pyrra-dev/pyrra.git", - "subdir": "jsonnet/controller-gen" + "subdir": "jsonnet" } }, "version": "main", diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet index d8aa5028af..8a39eaf1c5 100644 --- a/jsonnet/kube-prometheus/main.libsonnet +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -8,6 +8,7 @@ local nodeExporter = import './components/node-exporter.libsonnet'; local prometheusAdapter = import './components/prometheus-adapter.libsonnet'; local prometheusOperator = import './components/prometheus-operator.libsonnet'; local prometheus = import './components/prometheus.libsonnet'; +local pyrra = import './components/pyrra.libsonnet'; local platformPatch = import './platforms/platforms.libsonnet'; @@ -35,6 +36,7 @@ local utils = import './lib/utils.libsonnet'; prometheusOperator: error 'must provide version', kubeRbacProxy: error 'must provide version', configmapReload: error 'must provide version', + pyrra: error 'must provide version', } + (import 'versions.json'), images: { alertmanager: 'quay.io/prometheus/alertmanager:v' + $.values.common.versions.alertmanager, @@ -48,6 +50,7 @@ local utils = import './lib/utils.libsonnet'; prometheusOperatorReloader: 'quay.io/prometheus-operator/prometheus-config-reloader:v' + $.values.common.versions.prometheusOperator, kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy:v' + $.values.common.versions.kubeRbacProxy, configmapReload: 'ghcr.io/jimmidyson/configmap-reload:v' + $.values.common.versions.configmapReload, + pyrra: 'ghcr.io/pyrra-dev/pyrra:v' + $.values.common.versions.pyrra, }, }, alertmanager: { @@ -112,7 +115,7 @@ local utils = import './lib/utils.libsonnet'; image: $.values.common.images.prometheusAdapter, prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.'
+ $.values.prometheus.namespace + '.svc:9090/', rangeIntervals+: { - kubelet: utils.rangeInterval($.kubernetesControlPlane.serviceMonitorKubelet.spec.endpoints[0].interval), + kubelet: utils.rangeInterval($.kubernetesControlPlane.kubeletServiceMonitor.spec.endpoints[0].interval), nodeExporter: utils.rangeInterval($.nodeExporter.serviceMonitor.spec.endpoints[0].interval), }, }, @@ -128,6 +131,11 @@ local utils = import './lib/utils.libsonnet'; namespace: $.values.common.namespace, mixin+: { ruleLabels: $.values.common.ruleLabels }, }, + pyrra: { + namespace: $.values.common.namespace, + version: $.values.common.versions.pyrra, + image: $.values.common.images.pyrra, + }, }, alertmanager: alertmanager($.values.alertmanager), @@ -138,6 +146,7 @@ local utils = import './lib/utils.libsonnet'; prometheus: prometheus($.values.prometheus), prometheusAdapter: prometheusAdapter($.values.prometheusAdapter), prometheusOperator: prometheusOperator($.values.prometheusOperator), + pyrra: pyrra($.values.pyrra), kubernetesControlPlane: kubernetesControlPlane($.values.kubernetesControlPlane), kubePrometheus: customMixin( { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index db11c37c45..dc990b74f8 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -198,11 +198,11 @@ "source": { "git": { "remote": "https://github.com/pyrra-dev/pyrra.git", - "subdir": "jsonnet/controller-gen" + "subdir": "jsonnet" } }, - "version": "1ed2f3301251455dee757da1658c88fd32e0f1ca", - "sum": "O3c9Uurei8MWAY0Ad7DOL1fMqSgdHyHB7MpHsxSITKM=", + "version": "e6d76176d1adbd4712561a1e61caca470edd4002", + "sum": "egH5yDS/wTfqNLFm7tSWafIsEGjOuk8xDj2PFhCWX2A=", "name": "pyrra" }, { diff --git a/kustomization.yaml b/kustomization.yaml index e0d8039ed7..9421247310 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -36,12 +36,18 @@ resources: - ./manifests/kubeStateMetrics-service.yaml - ./manifests/kubeStateMetrics-serviceAccount.yaml - ./manifests/kubeStateMetrics-serviceMonitor.yaml +- ./manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml +- ./manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml +- ./manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml +- ./manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml +- ./manifests/kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml +- ./manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml +- ./manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml +- ./manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml +- ./manifests/kubernetesControlPlane-kubeletServiceMonitor.yaml - ./manifests/kubernetesControlPlane-prometheusRule.yaml - ./manifests/kubernetesControlPlane-serviceMonitorApiserver.yaml -- ./manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml -- ./manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml - ./manifests/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml -- ./manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml - ./manifests/nodeExporter-clusterRole.yaml - ./manifests/nodeExporter-clusterRoleBinding.yaml - ./manifests/nodeExporter-daemonset.yaml @@ -85,6 +91,32 @@ resources: - ./manifests/prometheusOperator-service.yaml - ./manifests/prometheusOperator-serviceAccount.yaml - ./manifests/prometheusOperator-serviceMonitor.yaml +- ./manifests/prometheusOperator-sloHTTPErrors.yaml +- ./manifests/prometheusOperator-sloReconcileErrors.yaml +- ./manifests/pyrra-apiDeployment.yaml +- ./manifests/pyrra-apiService.yaml +- 
./manifests/pyrra-apiServiceAccount.yaml +- ./manifests/pyrra-apiServiceMonitor.yaml +- ./manifests/pyrra-kubernetesClusterRole.yaml +- ./manifests/pyrra-kubernetesClusterRoleBinding.yaml +- ./manifests/pyrra-kubernetesDeployment.yaml +- ./manifests/pyrra-kubernetesService.yaml +- ./manifests/pyrra-kubernetesServiceAccount.yaml +- ./manifests/pyrra-kubernetesServiceMonitor.yaml +- ./manifests/pyrra-slo-apiserver-read-cluster-latency.yaml +- ./manifests/pyrra-slo-apiserver-read-namespace-latency.yaml +- ./manifests/pyrra-slo-apiserver-read-resource-latency.yaml +- ./manifests/pyrra-slo-apiserver-read-response-errors.yaml +- ./manifests/pyrra-slo-apiserver-write-response-errors.yaml +- ./manifests/pyrra-slo-coredns-response-errors.yaml +- ./manifests/pyrra-slo-kubelet-request-errors.yaml +- ./manifests/pyrra-slo-kubelet-runtime-errors.yaml +- ./manifests/pyrra-slo-prometheus-notification-errors.yaml +- ./manifests/pyrra-slo-prometheus-operator-http-errors.yaml +- ./manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml +- ./manifests/pyrra-slo-prometheus-query-errors.yaml +- ./manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml +- ./manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml - ./manifests/setup/0alertmanagerConfigCustomResourceDefinition.yaml - ./manifests/setup/0alertmanagerCustomResourceDefinition.yaml - ./manifests/setup/0podmonitorCustomResourceDefinition.yaml @@ -95,4 +127,5 @@ resources: - ./manifests/setup/0scrapeconfigCustomResourceDefinition.yaml - ./manifests/setup/0servicemonitorCustomResourceDefinition.yaml - ./manifests/setup/0thanosrulerCustomResourceDefinition.yaml +- ./manifests/setup/crd.yaml - ./manifests/setup/namespace.yaml diff --git a/manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml b/manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml similarity index 95% rename from manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml rename to manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml index f3313d6da4..bdea0bf4e8 100644 --- a/manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml +++ b/manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml @@ -22,4 +22,4 @@ spec: - kube-system selector: matchLabels: - k8s-app: kube-dns + k8s-app: coredns diff --git a/manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml b/manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml new file mode 100644 index 0000000000..f28a2fff9a --- /dev/null +++ b/manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml @@ -0,0 +1,24 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: coredns + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: coredns + role: alert-rules + name: coredns-response-errors + namespace: monitoring +spec: + description: | + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forwards external requests. + If CoreDNS fails to answer requests applications might be unable to make requests.
+ indicator: + ratio: + errors: + metric: coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"} + total: + metric: coredns_dns_responses_total{job="coredns"} + target: "99.99" + window: 2w diff --git a/manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml b/manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml new file mode 100644 index 0000000000..c3eee8bf10 --- /dev/null +++ b/manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml @@ -0,0 +1,24 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: coredns + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: coredns + role: alert-rules + name: coredns-response-latency + namespace: monitoring +spec: + description: | + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forwards external requests. + If CoreDNS gets too slow it might have an impact on the latency of other applications in this cluster. + indicator: + latency: + success: + metric: coredns_dns_request_duration_seconds_bucket{job="coredns",le="0.032"} + total: + metric: coredns_dns_request_duration_seconds_count{job="coredns"} + target: "99" + window: 2w diff --git a/manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml b/manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml new file mode 100644 index 0000000000..dd066b421f --- /dev/null +++ b/manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml @@ -0,0 +1,21 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/name: kube-controller-manager + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kube-controller-manager + role: alert-rules + name: kube-controller-manager-request-errors + namespace: monitoring +spec: + description: "The Kubernetes controller manager is a daemon that embeds the core control loops shipped with Kubernetes. \nIn applications of robotics and automation, a control loop is a non-terminating loop that regulates the state of the system. \nIn Kubernetes, a controller is a control loop that watches the shared state of the cluster through the apiserver and makes changes attempting to move the current state towards the desired state.
Examples of controllers that ship with Kubernetes today are the replication controller, endpoints controller, namespace controller, and serviceaccounts controller.\n" + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kube-controller-manager",code=~"5..|"} + total: + metric: rest_client_requests_total{job="kube-controller-manager"} + target: "99" + window: 2w diff --git a/manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml b/manifests/kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml similarity index 100% rename from manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml rename to manifests/kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml diff --git a/manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml b/manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml new file mode 100644 index 0000000000..a69f01710b --- /dev/null +++ b/manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml @@ -0,0 +1,22 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kube-proxy + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kube-proxy + role: alert-rules + name: kube-proxy-request-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kube-proxy",code=~"5..|"} + total: + metric: rest_client_requests_total{job="kube-proxy"} + target: "90" + window: 2w diff --git a/manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml b/manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml new file mode 100644 index 0000000000..e1424b4fb0 --- /dev/null +++ b/manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml @@ -0,0 +1,24 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kubelet + role: alert-rules + name: kubelet-request-errors + namespace: monitoring +spec: + description: | + The kubelet is the primary “node agent” that runs on each node. + The kubelet ensures that the containers are running and healthy. + If these requests are failing the Kubelet might not know what to run exactly. + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kubelet", metrics_path="/metrics",code=~"5..|"} + total: + metric: rest_client_requests_total{job="kubelet", metrics_path="/metrics"} + target: "99" + window: 2w diff --git a/manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml b/manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml new file mode 100644 index 0000000000..079f7a953b --- /dev/null +++ b/manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml @@ -0,0 +1,23 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kubelet + role: alert-rules + name: kubelet-runtime-errors + namespace: monitoring +spec: + description: | + The kubelet is the primary “node agent” that runs on each node. + If there are runtime errors the kubelet might be unable to check the containers are running and healthy. 
+ indicator: + ratio: + errors: + metric: kubelet_runtime_operations_errors_total{job="kubelet", metrics_path="/metrics"} + total: + metric: kubelet_runtime_operations_total{job="kubelet", metrics_path="/metrics"} + target: "99.5" + window: 2w diff --git a/manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml b/manifests/kubernetesControlPlane-kubeletServiceMonitor.yaml similarity index 100% rename from manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml rename to manifests/kubernetesControlPlane-kubeletServiceMonitor.yaml diff --git a/manifests/prometheusOperator-sloHTTPErrors.yaml b/manifests/prometheusOperator-sloHTTPErrors.yaml new file mode 100644 index 0000000000..68dd6b886a --- /dev/null +++ b/manifests/prometheusOperator-sloHTTPErrors.yaml @@ -0,0 +1,23 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.83.0 + prometheus: k8s + pyrra.dev/component: prometheus-operator + role: alert-rules + name: prometheus-operator-http-errors + namespace: monitoring +spec: + description: "The Prometheus Operator makes HTTP requests to the Kubernetes API server to read and write the objects.\nIf this is firing the Prometheus Operator might not be able to read and write the latest objects.\n" + indicator: + ratio: + errors: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",namespace="monitoring",status_code=~"5.."} + total: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",namespace="monitoring"} + target: "99.5" + window: 2w diff --git a/manifests/prometheusOperator-sloReconcileErrors.yaml b/manifests/prometheusOperator-sloReconcileErrors.yaml new file mode 100644 index 0000000000..76115fa329 --- /dev/null +++ b/manifests/prometheusOperator-sloReconcileErrors.yaml @@ -0,0 +1,27 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.83.0 + prometheus: k8s + pyrra.dev/component: prometheus-operator + role: alert-rules + name: prometheus-operator-reconcile-errors + namespace: monitoring +spec: + description: | + The Prometheus Operator reconciles the controllers' objects to have the underlying resources in the desired state. + If this is firing the object may not be running correctly.
+ indicator: + ratio: + errors: + metric: prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"} + grouping: + - controller + total: + metric: prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"} + target: "95" + window: 2w diff --git a/manifests/pyrra-apiDeployment.yaml b/manifests/pyrra-apiDeployment.yaml new file mode 100644 index 0000000000..e682c0600c --- /dev/null +++ b/manifests/pyrra-apiDeployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-api + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + spec: + containers: + - args: + - api + - --api-url=http://pyrra-kubernetes.monitoring.svc.cluster.local:9444 + - --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090 + image: ghcr.io/pyrra-dev/pyrra:v0.8.4 + name: pyrra + ports: + - containerPort: 9099 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: pyrra-api diff --git a/manifests/pyrra-apiService.yaml b/manifests/pyrra-apiService.yaml new file mode 100644 index 0000000000..cae85601d8 --- /dev/null +++ b/manifests/pyrra-apiService.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-api + namespace: monitoring +spec: + ports: + - name: http + port: 9099 + targetPort: 9099 + selector: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra diff --git a/manifests/pyrra-apiServiceAccount.yaml b/manifests/pyrra-apiServiceAccount.yaml new file mode 100644 index 0000000000..98f31f614b --- /dev/null +++ b/manifests/pyrra-apiServiceAccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-api + namespace: monitoring diff --git a/manifests/pyrra-apiServiceMonitor.yaml b/manifests/pyrra-apiServiceMonitor.yaml new file mode 100644 index 0000000000..6cb4168f8d --- /dev/null +++ b/manifests/pyrra-apiServiceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-api + namespace: monitoring +spec: + endpoints: + - port: http + namespaceSelector: + matchNames: + - monitoring + selector: + matchLabels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra diff --git a/manifests/pyrra-kubernetesClusterRole.yaml b/manifests/pyrra-kubernetesClusterRole.yaml new file mode 100644 index 0000000000..b18068ff6e --- /dev/null +++ b/manifests/pyrra-kubernetesClusterRole.yaml @@ -0,0 +1,48 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-kubernetes + namespace: monitoring +rules: +- apiGroups: + - 
monitoring.coreos.com + resources: + - prometheusrules + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules/status + verbs: + - get +- apiGroups: + - pyrra.dev + resources: + - servicelevelobjectives + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - pyrra.dev + resources: + - servicelevelobjectives/status + verbs: + - get + - patch + - update diff --git a/manifests/pyrra-kubernetesClusterRoleBinding.yaml b/manifests/pyrra-kubernetesClusterRoleBinding.yaml new file mode 100644 index 0000000000..73e972b87e --- /dev/null +++ b/manifests/pyrra-kubernetesClusterRoleBinding.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-kubernetes + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pyrra-kubernetes +subjects: +- kind: ServiceAccount + name: pyrra-kubernetes + namespace: monitoring diff --git a/manifests/pyrra-kubernetesDeployment.yaml b/manifests/pyrra-kubernetesDeployment.yaml new file mode 100644 index 0000000000..f6a4f3c217 --- /dev/null +++ b/manifests/pyrra-kubernetesDeployment.yaml @@ -0,0 +1,40 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-kubernetes + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + spec: + containers: + - args: + - kubernetes + - --generic-rules + image: ghcr.io/pyrra-dev/pyrra:v0.8.4 + name: pyrra + ports: + - containerPort: 9099 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: pyrra-kubernetes diff --git a/manifests/pyrra-kubernetesService.yaml b/manifests/pyrra-kubernetesService.yaml new file mode 100644 index 0000000000..16bba76704 --- /dev/null +++ b/manifests/pyrra-kubernetesService.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-kubernetes + namespace: monitoring +spec: + ports: + - name: metrics + port: 8080 + targetPort: 8080 + - name: http + port: 9444 + targetPort: 9444 + - name: webhooks + port: 9443 + targetPort: 9443 + selector: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra diff --git a/manifests/pyrra-kubernetesServiceAccount.yaml b/manifests/pyrra-kubernetesServiceAccount.yaml new file mode 100644 index 0000000000..f7d359f9de --- /dev/null +++ b/manifests/pyrra-kubernetesServiceAccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-kubernetes + namespace: monitoring diff --git a/manifests/pyrra-kubernetesServiceMonitor.yaml b/manifests/pyrra-kubernetesServiceMonitor.yaml new file mode 100644 index 0000000000..46121413e1 
--- /dev/null +++ b/manifests/pyrra-kubernetesServiceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.8.4 + name: pyrra-kubernetes + namespace: monitoring +spec: + endpoints: + - port: metrics + namespaceSelector: + matchNames: + - monitoring + selector: + matchLabels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra diff --git a/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml new file mode 100644 index 0000000000..ddc701d37a --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: apiserver-read-cluster-latency + namespace: monitoring +spec: + description: "" + indicator: + latency: + success: + metric: apiserver_request_slo_duration_seconds_bucket{component="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"} + total: + metric: apiserver_request_slo_duration_seconds_count{component="apiserver",scope=~"cluster|",verb=~"LIST|GET"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml new file mode 100644 index 0000000000..86e4c391ff --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: apiserver-read-namespace-latency + namespace: monitoring +spec: + description: "" + indicator: + latency: + success: + metric: apiserver_request_slo_duration_seconds_bucket{component="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"} + total: + metric: apiserver_request_slo_duration_seconds_count{component="apiserver",scope=~"namespace|",verb=~"LIST|GET"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-resource-latency.yaml b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml new file mode 100644 index 0000000000..ec0d7598b3 --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: apiserver-read-resource-latency + namespace: monitoring +spec: + description: "" + indicator: + latency: + success: + metric: apiserver_request_slo_duration_seconds_bucket{verb=~"LIST|GET",le="0.1"} + total: + metric: apiserver_request_slo_duration_seconds_count{verb=~"LIST|GET"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-response-errors.yaml b/manifests/pyrra-slo-apiserver-read-response-errors.yaml new file mode 100644 index 0000000000..067ca6c928 --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-response-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: apiserver-read-response-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: apiserver_request_total{component="apiserver",verb=~"LIST|GET",code=~"5.."} + total: + metric: apiserver_request_total{component="apiserver",verb=~"LIST|GET"} + target: "99" + window: 2w diff --git 
a/manifests/pyrra-slo-apiserver-write-response-errors.yaml b/manifests/pyrra-slo-apiserver-write-response-errors.yaml new file mode 100644 index 0000000000..c94985d599 --- /dev/null +++ b/manifests/pyrra-slo-apiserver-write-response-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: apiserver-write-response-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."} + total: + metric: apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-coredns-response-errors.yaml b/manifests/pyrra-slo-coredns-response-errors.yaml new file mode 100644 index 0000000000..346a7fcf28 --- /dev/null +++ b/manifests/pyrra-slo-coredns-response-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: coredns-response-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"} + total: + metric: coredns_dns_responses_total{job="kube-dns"} + target: "99.99" + window: 2w diff --git a/manifests/pyrra-slo-kubelet-request-errors.yaml b/manifests/pyrra-slo-kubelet-request-errors.yaml new file mode 100644 index 0000000000..5696de98b1 --- /dev/null +++ b/manifests/pyrra-slo-kubelet-request-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: kubelet-request-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kubelet",code=~"5.."} + total: + metric: rest_client_requests_total{job="kubelet"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-kubelet-runtime-errors.yaml b/manifests/pyrra-slo-kubelet-runtime-errors.yaml new file mode 100644 index 0000000000..a7a95f89bc --- /dev/null +++ b/manifests/pyrra-slo-kubelet-runtime-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: kubelet-runtime-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: kubelet_runtime_operations_errors_total{job="kubelet"} + total: + metric: kubelet_runtime_operations_total{job="kubelet"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-notification-errors.yaml b/manifests/pyrra-slo-prometheus-notification-errors.yaml new file mode 100644 index 0000000000..c9e01cfe4c --- /dev/null +++ b/manifests/pyrra-slo-prometheus-notification-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-notification-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_notifications_errors_total{job="prometheus-k8s"} + total: + metric: prometheus_notifications_sent_total{job="prometheus-k8s"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-operator-http-errors.yaml b/manifests/pyrra-slo-prometheus-operator-http-errors.yaml new file mode 100644 index 0000000000..217cd0013b --- /dev/null +++ 
b/manifests/pyrra-slo-prometheus-operator-http-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-operator-http-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",status_code=~"5.."} + total: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator"} + target: "99.5" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml b/manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml new file mode 100644 index 0000000000..0c579bbdc8 --- /dev/null +++ b/manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml @@ -0,0 +1,20 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-operator-reconcile-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_operator_reconcile_errors_total{job="prometheus-operator"} + grouping: + - controller + total: + metric: prometheus_operator_reconcile_operations_total{job="prometheus-operator"} + target: "95" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-query-errors.yaml b/manifests/pyrra-slo-prometheus-query-errors.yaml new file mode 100644 index 0000000000..99357ea44f --- /dev/null +++ b/manifests/pyrra-slo-prometheus-query-errors.yaml @@ -0,0 +1,20 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-query-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*",code=~"5.."} + grouping: + - handler + total: + metric: prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml b/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml new file mode 100644 index 0000000000..32c542b6ec --- /dev/null +++ b/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-rule-evaluation-failures + namespace: monitoring +spec: + description: Recording and alerting rules are evaluated every few seconds. This needs to work for recording rules to be created and, most importantly, for alerts to be evaluated. + indicator: + ratio: + errors: + metric: prometheus_rule_evaluation_failures_total{job="prometheus-k8s"} + total: + metric: prometheus_rule_evaluations_total{job="prometheus-k8s"} + target: "99.99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml b/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml new file mode 100644 index 0000000000..c85d5205f5 --- /dev/null +++ b/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-sd-kubernetes-errors + namespace: monitoring +spec: + description: If there are too many errors, Prometheus has trouble discovering new Kubernetes services.
+ indicator: + ratio: + errors: + metric: prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s",status_code=~"5..|"} + total: + metric: prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s"} + target: "99" + window: 2w diff --git a/manifests/setup/crd.yaml b/manifests/setup/crd.yaml new file mode 100644 index 0000000000..847543716e --- /dev/null +++ b/manifests/setup/crd.yaml @@ -0,0 +1,179 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.11.1 + creationTimestamp: null + name: servicelevelobjectives.pyrra.dev +spec: + group: pyrra.dev + names: + kind: ServiceLevelObjective + listKind: ServiceLevelObjectiveList + plural: servicelevelobjectives + shortNames: + - slo + singular: servicelevelobjective + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.window + name: Window + type: string + - jsonPath: .spec.target + name: Target + type: string + - jsonPath: .status.type + name: Type + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: ServiceLevelObjective is the Schema for the ServiceLevelObjectives API. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: ServiceLevelObjectiveSpec defines the desired state of ServiceLevelObjective. + properties: + alerting: + description: Alerting customizes the alerting rules generated by Pyrra. + properties: + disabled: + description: Disabled is used to disable the generation of alerts. Recording rules are still generated. + type: boolean + name: + description: Name is used as the name of the alert generated by Pyrra. Defaults to "ErrorBudgetBurn". + type: string + type: object + description: + description: Description describes the ServiceLevelObjective in more detail and gives extra context for engineers that might not directly work on the service. + type: string + indicator: + description: ServiceLevelIndicator is the underlying data source that indicates how the service is doing. This will be a Prometheus metric with specific selectors for your service. + properties: + bool_gauge: + description: BoolGauge is the indicator that measures whether a boolean gauge is successful. + properties: + grouping: + description: Total is the metric that returns how many requests there are in total. + items: + type: string + type: array + metric: + type: string + required: + - metric + type: object + latency: + description: Latency is the indicator that measures a certain percentage to be faster than the expected latency. + properties: + grouping: + description: Grouping allows an SLO to be defined for many SLI at once, like HTTP handlers for example. 
+ items: + type: string + type: array + success: + description: Success is the metric that returns how many errors there are. + properties: + metric: + type: string + required: + - metric + type: object + total: + description: Total is the metric that returns how many requests there are in total. + properties: + metric: + type: string + required: + - metric + type: object + required: + - success + - total + type: object + latencyNative: + description: LatencyNative is the indicator that measures a certain percentage to be faster than the expected latency. This uses the new native histograms in Prometheus. + properties: + grouping: + description: Grouping allows an SLO to be defined for many SLI at once, like HTTP handlers for example. + items: + type: string + type: array + latency: + description: Latency the requests should be faster than. + type: string + total: + description: Total is the metric that returns how many requests there are in total. + properties: + metric: + type: string + required: + - metric + type: object + required: + - latency + - total + type: object + ratio: + description: Ratio is the indicator that measures against errors / total events. + properties: + errors: + description: Errors is the metric that returns how many errors there are. + properties: + metric: + type: string + required: + - metric + type: object + grouping: + description: Grouping allows an SLO to be defined for many SLI at once, like HTTP handlers for example. + items: + type: string + type: array + total: + description: Total is the metric that returns how many requests there are in total. + properties: + metric: + type: string + required: + - metric + type: object + required: + - errors + - total + type: object + type: object + target: + description: 'Target is a string that''s casted to a float64 between 0 - 100. It represents the desired availability of the service in the given window. float64 are not supported: https://github.com/kubernetes-sigs/controller-tools/issues/245' + type: string + window: + description: Window within which the Target is supposed to be kept. Usually something like 1d, 7d or 28d. + type: string + required: + - indicator + - target + - window + type: object + status: + description: ServiceLevelObjectiveStatus defines the observed state of ServiceLevelObjective. + properties: + type: + description: Type is the generated resource type, like PrometheusRule or ConfigMap + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {}
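
For reference, the CRD added above also accepts user-defined objectives beyond the ones shipped in this change. A minimal sketch of a custom ServiceLevelObjective using the ratio indicator might look as follows; the my-app job, metric names, and target are hypothetical placeholders and not part of this patch:

apiVersion: pyrra.dev/v1alpha1
kind: ServiceLevelObjective
metadata:
  # hypothetical example object, not generated by kube-prometheus
  name: my-app-request-errors
  namespace: monitoring
  labels:
    # same labels the shipped SLOs carry, so the generated rules are picked up by the k8s Prometheus
    prometheus: k8s
    role: alert-rules
spec:
  description: Example objective; replace the selectors with your own service's metrics.
  target: "99"
  window: 2w
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="my-app",code=~"5.."}
      total:
        metric: http_requests_total{job="my-app"}

Once applied, the pyrra-kubernetes Deployment above reconciles such an object into a PrometheusRule, which its ClusterRole permits it to create.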