kubernetes-monitoring
diff --git a/‎.github/workflows/check-with-upstream.yaml‎
Lines changed: 14 additions & 0 deletions b/‎.github/workflows/check-with-upstream.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 0 additions & 20 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 0 additions & 20 deletions
diff --git a/‎Makefile‎
Lines changed: 6 additions & 0 deletions b/‎Makefile‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎alerts/apps_alerts.libsonnet‎
Lines changed: 16 additions & 1 deletion b/‎alerts/apps_alerts.libsonnet‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎alerts/kube_apiserver.libsonnet‎
Lines changed: 2 additions & 2 deletions b/‎alerts/kube_apiserver.libsonnet‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎alerts/kubelet.libsonnet‎
Lines changed: 1 addition & 1 deletion b/‎alerts/kubelet.libsonnet‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎alerts/resource_alerts.libsonnet‎
Lines changed: 8 additions & 8 deletions b/‎alerts/resource_alerts.libsonnet‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎alerts/system_alerts.libsonnet‎
Lines changed: 3 additions & 2 deletions b/‎alerts/system_alerts.libsonnet‎
Lines changed: 3 additions & 2 deletions
@@ -0,0 +1,14 @@
+name: check-with-upstream
+# Run every Monday at 00:00 UTC.
+on:
+  schedule:
+    - cron: '0 0 * * 1'
+jobs:
+  check-selectors-ksm:
+    runs-on: ubuntu-latest
+    name: Check if KSM selectors are present on applicable metrics.
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        persist-credentials: false
+    - run: make --always-make check-selectors-ksm
@@ -2,8 +2,6 @@ name: ci
 on:
   - push
   - pull_request
-env:
-  golang-version: '1.17'
 jobs:
   generate:
     runs-on: ubuntu-latest
@@ -12,9 +10,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make generate && git diff --exit-code
   jsonnet-lint:
     runs-on: ubuntu-latest
@@ -23,9 +18,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make jsonnet-lint
   dashboards-lint:
     runs-on: ubuntu-latest
@@ -34,9 +26,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make dashboards-lint
   alerts-lint:
     runs-on: ubuntu-latest
@@ -45,9 +34,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make alerts-lint
   fmt:
     runs-on: ubuntu-latest
@@ -56,9 +42,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make fmt && git diff --exit-code
   unit-tests:
     runs-on: ubuntu-latest
@@ -67,7 +50,4 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make test
@@ -76,3 +76,9 @@ $(TOOLING): $(BIN_DIR)
 	@echo Installing tools from hack/tools.go
 	@cd scripts && go list -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %
 
+########################################
+# "check-with-upstream" workflow checks.
+########################################
+
+check-selectors-ksm:
+	@./scripts/check-selectors-ksm.sh
@@ -87,6 +87,21 @@
             'for': '15m',
             alert: 'KubeDeploymentReplicasMismatch',
           },
+          {
+            expr: |||
+              kube_deployment_status_condition{condition="Progressing", status="false",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
+              != 0
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
+              summary: 'Deployment rollout is not progressing.',
+            },
+            'for': '15m',
+            alert: 'KubeDeploymentRolloutStuck',
+          },
           {
             expr: |||
               (
@@ -104,7 +119,7 @@
             },
             annotations: {
               description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
-              summary: 'Deployment has not matched the expected number of replicas.',
+              summary: 'StatefulSet has not matched the expected number of replicas.',
             },
             'for': '15m',
             alert: 'KubeStatefulSetReplicasMismatch',
 
@@ -77,7 +77,7 @@ local utils = import '../lib/utils.libsonnet';
           {
             alert: 'KubeAggregatedAPIErrors',
             expr: |||
-              sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+              sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[10m])) > 4
             ||| % $._config,
             labels: {
               severity: 'warning',
@@ -90,7 +90,7 @@ local utils = import '../lib/utils.libsonnet';
           {
             alert: 'KubeAggregatedAPIDown',
             expr: |||
-              (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+              (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85
             ||| % $._config,
             'for': '5m',
             labels: {
 
@@ -72,7 +72,7 @@
           {
             alert: 'KubeNodeReadinessFlapping',
             expr: |||
-              sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
+              sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
             ||| % $._config,
             'for': '15m',
             labels: {
 
@@ -34,18 +34,18 @@
           } +
           if $._config.showMultiCluster then {
             expr: |||
-              sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
+              sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
               and
-              (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
+              (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
             ||| % $._config,
             annotations+: {
               description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
             },
           } else {
             expr: |||
-              sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
+              sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
               and
-              (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
+              (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
             ||| % $._config,
             annotations+: {
               description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
@@ -63,19 +63,19 @@
           } +
           if $._config.showMultiCluster then {
             expr: |||
-              sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
+              sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
               and
-              (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
+              (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
             ||| % $._config,
             annotations+: {
               description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
             },
           } else
             {
               expr: |||
-                sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
+                sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
                 and
-                (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
+                (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
               ||| % $._config,
               annotations+: {
                 description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
 
@@ -1,6 +1,7 @@
 {
   _config+:: {
     notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
+    kubeApiserverSelector: 'job="kube-apiserver"',
   },
 
   prometheusAlerts+:: {
@@ -28,9 +29,9 @@
             // this is normal and an expected error, therefore it should be
             // ignored in this alert.
             expr: |||
-              (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
+              (sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
                 /
-              sum(rate(rest_client_requests_total[5m])) by (%(clusterLabel)s, instance, job, namespace))
+              sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace))
               > 0.01
             ||| % $._config,
             'for': '15m',
Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@`
`72`	`72`	`{`
`73`	`73`	`alert: 'KubeNodeReadinessFlapping',`
`74`	`74`	`expr: \|\|\|`
`75`		`- sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2`
	`75`	`+ sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2`
`76`	`76`	`\|\|\| % $._config,`
`77`	`77`	`'for': '15m',`
`78`	`78`	`labels: {`