
Commit 04702d8

Merge branch 'master' into fix/scalar-quotas
2 parents: a796aaf + 003ba5e

19 files changed: +263 −97 lines
.github/workflows/check-with-upstream.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+name: check-with-upstream
+# Run every Monday.
+on:
+  schedule:
+    - cron: '0 0 * * 1'
+jobs:
+  check-selectors-ksm:
+    runs-on: ubuntu-latest
+    name: Check if KSM selectors are present on applicable metrics.
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        persist-credentials: false
+    - run: make --always-make check-selectors-ksm
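Note: the cron expression '0 0 * * 1' fires at 00:00 UTC every Monday. The job simply delegates to the new Makefile target introduced in this commit, so the same check can be run locally (a sketch; assumes a checkout with the repo's Makefile and scripts/ present):

    # Run the weekly kube-state-metrics selector check by hand:
    make --always-make check-selectors-ksm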

.github/workflows/ci.yaml

Lines changed: 0 additions & 20 deletions
@@ -2,8 +2,6 @@ name: ci
 on:
 - push
 - pull_request
-env:
-  golang-version: '1.17'
 jobs:
   generate:
     runs-on: ubuntu-latest
@@ -12,9 +10,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make generate && git diff --exit-code
   jsonnet-lint:
     runs-on: ubuntu-latest
@@ -23,9 +18,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make jsonnet-lint
   dashboards-lint:
     runs-on: ubuntu-latest
@@ -34,9 +26,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make dashboards-lint
   alerts-lint:
     runs-on: ubuntu-latest
@@ -45,9 +34,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make alerts-lint
   fmt:
     runs-on: ubuntu-latest
@@ -56,9 +42,6 @@ jobs:
     - uses: actions/checkout@v2
      with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make fmt && git diff --exit-code
   unit-tests:
     runs-on: ubuntu-latest
@@ -67,7 +50,4 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make test
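Note: dropping the pinned setup-go steps works because none of these jobs call go directly; every tool is built on demand by the Makefile's $(TOOLING) rule using whatever Go toolchain ubuntu-latest already provides. A sketch of what that rule executes (mirroring the Makefile; the output path ../tmp/bin stands in for $(BIN_DIR), which is defined outside this diff):

    # Enumerate tool imports declared under scripts/ and build each binary:
    cd scripts && go list -mod=mod -tags tools \
      -f '{{ range .Imports }}{{ printf "%s\n" . }}{{ end }}' ./ \
      | xargs -tI % go build -mod=mod -o ../tmp/bin %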

Makefile

Lines changed: 16 additions & 8 deletions
@@ -9,12 +9,14 @@ JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt
 PROMTOOL_BIN=$(BIN_DIR)/promtool
 TOOLING=$(JB_BIN) $(JSONNETLINT_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) $(PROMTOOL_BIN) $(GRAFANA_DASHBOARD_LINTER_BIN)
 JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s
+SRC_DIR ?=dashboards
+OUT_DIR ?=dashboards_out
 
 .PHONY: all
 all: fmt generate lint test
 
 .PHONY: generate
-generate: prometheus_alerts.yaml prometheus_rules.yaml dashboards_out
+generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR)
 
 $(JSONNET_VENDOR): $(JB_BIN) jsonnetfile.json
 	$(JB_BIN) install
@@ -30,9 +32,9 @@ prometheus_alerts.yaml: $(JSONNET_BIN) mixin.libsonnet lib/alerts.jsonnet alerts
 prometheus_rules.yaml: $(JSONNET_BIN) mixin.libsonnet lib/rules.jsonnet rules/*.libsonnet
 	@$(JSONNET_BIN) -J vendor -S lib/rules.jsonnet > $@
 
-dashboards_out: $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet dashboards/*.libsonnet
-	@mkdir -p dashboards_out
-	@$(JSONNET_BIN) -J vendor -m dashboards_out lib/dashboards.jsonnet
+$(OUT_DIR): $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet $(SRC_DIR)/*.libsonnet
+	@mkdir -p $(OUT_DIR)
+	@$(JSONNET_BIN) -J vendor -m $(OUT_DIR) lib/dashboards.jsonnet
 
 .PHONY: lint
 lint: jsonnet-lint alerts-lint dashboards-lint
@@ -48,14 +50,14 @@ alerts-lint: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml
 	@$(PROMTOOL_BIN) check rules prometheus_rules.yaml
 	@$(PROMTOOL_BIN) check rules prometheus_alerts.yaml
 
-dashboards_out/.lint: dashboards_out
+$(OUT_DIR)/.lint: $(OUT_DIR)
 	@cp .lint $@
 
 .PHONY: dashboards-lint
-dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) dashboards_out/.lint
+dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) $(OUT_DIR)/.lint
 # Replace $$interval:$$resolution var with $$__rate_interval to make dashboard-linter happy.
-	@sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' dashboards_out/*.json
-	@find dashboards_out -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict
+	@sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' $(OUT_DIR)/*.json
+	@find $(OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict
 
 
 .PHONY: clean
@@ -74,3 +76,9 @@ $(TOOLING): $(BIN_DIR)
 	@echo Installing tools from hack/tools.go
 	@cd scripts && go list -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %
 
+########################################
+# "check-with-upstream" workflow checks.
+########################################
+
+check-selectors-ksm:
+	@./scripts/check-selectors-ksm.sh
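Note: because SRC_DIR and OUT_DIR use ?= (assigned only if not already set), dashboards can now be rendered from and into custom locations without editing the Makefile. A usage sketch (directory names are illustrative):

    # Render dashboards into a custom output directory and lint the result:
    make generate OUT_DIR=build/dashboards
    make dashboards-lint OUT_DIR=build/dashboards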

README.md

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ A set of Grafana dashboards and Prometheus alerts for Kubernetes.
 | release-0.9 | v1.20+ | v2.11.0+ | v2.0+ |
 | release-0.10 | v1.20+ | v2.11.0+ | v2.0+ |
 | release-0.11 | v1.23+ | v2.11.0+ | v2.0+ |
+| release-0.12 | v1.23+ | v2.11.0+ | v2.0+ |
 | master | v1.23+ | v2.11.0+ | v2.0+ |
 
 In Kubernetes 1.14 there was a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) implemented.

alerts/apps_alerts.libsonnet

Lines changed: 16 additions & 1 deletion
@@ -87,6 +87,21 @@
           'for': '15m',
           alert: 'KubeDeploymentReplicasMismatch',
         },
+        {
+          expr: |||
+            kube_deployment_status_condition{condition="Progressing", status="false",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
+            != 0
+          ||| % $._config,
+          labels: {
+            severity: 'warning',
+          },
+          annotations: {
+            description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
+            summary: 'Deployment rollout is not progressing.',
+          },
+          'for': '15m',
+          alert: 'KubeDeploymentRolloutStuck',
+        },
         {
           expr: |||
             (
@@ -285,7 +300,7 @@
           },
           annotations: {
             description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.',
-            summary: 'HPA has not matched descired number of replicas.',
+            summary: 'HPA has not matched desired number of replicas.',
           },
           'for': '15m',
           alert: 'KubeHpaReplicasMismatch',
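Note: with the mixin's default selectors substituted (assuming kubeStateMetricsSelector is job="kube-state-metrics" and prefixedNamespaceSelector is empty; both defaults live in config.libsonnet, outside this diff), the new KubeDeploymentRolloutStuck expression renders to roughly the following, and can be checked locally:

    # Rendered expression (assumed defaults substituted):
    #   kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics"} != 0
    # Validate the generated rule file locally with the existing targets:
    make generate alerts-lint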

alerts/kube_apiserver.libsonnet

Lines changed: 4 additions & 2 deletions
@@ -51,6 +51,7 @@ local utils = import '../lib/utils.libsonnet';
           expr: |||
             apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s
           ||| % $._config,
+          'for': '5m',
           labels: {
             severity: 'warning',
           },
@@ -64,6 +65,7 @@ local utils = import '../lib/utils.libsonnet';
           expr: |||
             apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s
           ||| % $._config,
+          'for': '5m',
           labels: {
             severity: 'critical',
           },
@@ -75,7 +77,7 @@ local utils = import '../lib/utils.libsonnet';
         {
           alert: 'KubeAggregatedAPIErrors',
           expr: |||
-            sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+            sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[10m])) > 4
           ||| % $._config,
           labels: {
             severity: 'warning',
@@ -88,7 +90,7 @@ local utils = import '../lib/utils.libsonnet';
         {
           alert: 'KubeAggregatedAPIDown',
           expr: |||
-            (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+            (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85
           ||| % $._config,
           'for': '5m',
           labels: {
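Note: scoping the aggregator metrics with %(kubeApiserverSelector)s keeps these alerts from matching identically named series exposed by other scrape jobs. Assuming defaults of job="apiserver" for kubeApiserverSelector and cluster for clusterLabel (both defined in config.libsonnet, outside this diff), KubeAggregatedAPIErrors renders to roughly:

    sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4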

alerts/kubelet.libsonnet

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@
         {
           alert: 'KubeNodeReadinessFlapping',
           expr: |||
-            sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
+            sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
           ||| % $._config,
           'for': '15m',
           labels: {
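Note: this is exactly the class of omission the new check-selectors-ksm workflow is meant to catch: kube_node_status_condition comes from kube-state-metrics, so it needs %(kubeStateMetricsSelector)s like its sibling expressions. Assuming defaults of job="kube-state-metrics" for that selector and cluster for clusterLabel, the fixed expression renders to roughly:

    sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2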

alerts/resource_alerts.libsonnet

Lines changed: 84 additions & 27 deletions
@@ -24,70 +24,127 @@
       rules: [
         {
           alert: 'KubeCPUOvercommit',
-          expr: |||
-            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
-            and
-            (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
-          ||| % $._config,
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.',
             summary: 'Cluster has overcommitted CPU resource requests.',
           },
           'for': '10m',
-        },
-        {
-          alert: 'KubeMemoryOvercommit',
+        } +
+        if $._config.showMultiCluster then {
+          expr: |||
+            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(kubeStateMetricsSelector)s,%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+            and
+            (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
+          },
+        } else {
           expr: |||
-            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
+            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
             and
-            (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
+            (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
           ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
+          },
+        },
+        {
+          alert: 'KubeMemoryOvercommit',
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
             summary: 'Cluster has overcommitted memory resource requests.',
           },
           'for': '10m',
-        },
-        {
-          alert: 'KubeCPUQuotaOvercommit',
+        } +
+        if $._config.showMultiCluster then {
           expr: |||
-            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"}))
-            /
-            sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})
-            > %(namespaceOvercommitFactor)s
+            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+            and
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
           ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
+          },
+        } else
+        {
+          expr: |||
+            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+            and
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
+          },
+        },
+        {
+          alert: 'KubeCPUQuotaOvercommit',
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
             summary: 'Cluster has overcommitted CPU resource requests.',
           },
           'for': '5m',
-        },
-        {
-          alert: 'KubeMemoryQuotaOvercommit',
+        } +
+        if $._config.showMultiCluster then {
          expr: |||
-            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"}))
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"})) by (%(clusterLabel)s)
             /
-            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})
+            sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
             > %(namespaceOvercommitFactor)s
           ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Namespaces.' % $._config,
+          },
+        } else
+        {
+          expr: |||
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"}))
+            /
+            sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})
+            > %(namespaceOvercommitFactor)s
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
+          },
+        },
+        {
+          alert: 'KubeMemoryQuotaOvercommit',
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted memory resource requests for Namespaces.',
             summary: 'Cluster has overcommitted memory resource requests.',
           },
           'for': '5m',
-        },
+        } +
+        if $._config.showMultiCluster then {
+          expr: |||
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"})) by (%(clusterLabel)s)
+            /
+            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
+            > %(namespaceOvercommitFactor)s
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Namespaces.' % $._config,
+          },
+        } else
+        {
+          expr: |||
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"}))
+            /
+            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})
+            > %(namespaceOvercommitFactor)s
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted memory resource requests for Namespaces.',
+          },
+        },
         {
           alert: 'KubeQuotaAlmostFull',
           expr: |||
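Note: the structure above leans on Jsonnet object composition: writing { base } + if cond then { overlay } else { other } merges exactly one of the two overlays into the shared base, and annotations+: extends the base's annotations rather than replacing them, so the shared summary survives while description is branch-specific. A minimal, standalone sketch of the pattern (not tied to this mixin):

    # Toy demonstration of base + conditional overlay:
    jsonnet -e '{ a: { x: 1 } } + if true then { a+: { y: 2 } } else { a: {} }'
    # => { "a": { "x": 1, "y": 2 } }   (output whitespace condensed)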

alerts/system_alerts.libsonnet

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
 {
   _config+:: {
     notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
+    kubeApiserverSelector: 'job="kube-apiserver"',
   },
 
   prometheusAlerts+:: {
@@ -28,9 +29,9 @@
           // this is normal and an expected error, therefore it should be
           // ignored in this alert.
           expr: |||
-            (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
+            (sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
             /
-            sum(rate(rest_client_requests_total[5m])) by (%(clusterLabel)s, instance, job, namespace))
+            sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace))
             > 0.01
           ||| % $._config,
           'for': '15m',
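Note: with the newly added default kubeApiserverSelector of job="kube-apiserver" (and assuming the conventional clusterLabel default of cluster), the client-error ratio now renders to roughly:

    (sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
    /
    sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (cluster, instance, job, namespace))
    > 0.01

which restricts the alert to the API server's own client traffic rather than every component exposing rest_client metrics.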
