
Commit 04702d8

Merge branch 'master' into fix/scalar-quotas
2 parents: a796aaf + 003ba5e

19 files changed: +263 −97 lines
.github/workflows/check-with-upstream.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+name: check-with-upstream
+# Run every Monday.
+on:
+  schedule:
+    - cron: '0 0 * * 1'
+jobs:
+  check-selectors-ksm:
+    runs-on: ubuntu-latest
+    name: Check if KSM selectors are present on applicable metrics.
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        persist-credentials: false
+    - run: make --always-make check-selectors-ksm
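Note: the cron expression '0 0 * * 1' fires at 00:00 UTC every Monday. The job simply delegates to the new Makefile target introduced in this commit, so the same check can be run locally (a sketch; assumes a checkout with the repo's Makefile and scripts/ present):

    # Run the weekly kube-state-metrics selector check by hand:
    make --always-make check-selectors-ksm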

.github/workflows/ci.yaml

Lines changed: 0 additions & 20 deletions
@@ -2,8 +2,6 @@ name: ci
 on:
 - push
 - pull_request
-env:
-  golang-version: '1.17'
 jobs:
   generate:
     runs-on: ubuntu-latest
@@ -12,9 +10,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make generate && git diff --exit-code
   jsonnet-lint:
     runs-on: ubuntu-latest
@@ -23,9 +18,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make jsonnet-lint
   dashboards-lint:
     runs-on: ubuntu-latest
@@ -34,9 +26,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make dashboards-lint
   alerts-lint:
     runs-on: ubuntu-latest
@@ -45,9 +34,6 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make alerts-lint
   fmt:
     runs-on: ubuntu-latest
@@ -56,9 +42,6 @@ jobs:
     - uses: actions/checkout@v2
      with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make fmt && git diff --exit-code
   unit-tests:
     runs-on: ubuntu-latest
@@ -67,7 +50,4 @@ jobs:
     - uses: actions/checkout@v2
       with:
         persist-credentials: false
-    - uses: actions/setup-go@v2
-      with:
-        go-version: ${{ env.golang-version }}
     - run: make --always-make test
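Note: dropping the pinned setup-go steps works because none of these jobs call go directly; every tool is built on demand by the Makefile's $(TOOLING) rule using whatever Go toolchain ubuntu-latest already provides. A sketch of what that rule executes (mirroring the Makefile; the output path ../tmp/bin stands in for $(BIN_DIR), which is defined outside this diff):

    # Enumerate tool imports declared under scripts/ and build each binary:
    cd scripts && go list -mod=mod -tags tools \
      -f '{{ range .Imports }}{{ printf "%s\n" . }}{{ end }}' ./ \
      | xargs -tI % go build -mod=mod -o ../tmp/bin %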

Makefile

Lines changed: 16 additions & 8 deletions
@@ -9,12 +9,14 @@ JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt
 PROMTOOL_BIN=$(BIN_DIR)/promtool
 TOOLING=$(JB_BIN) $(JSONNETLINT_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) $(PROMTOOL_BIN) $(GRAFANA_DASHBOARD_LINTER_BIN)
 JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s
+SRC_DIR ?=dashboards
+OUT_DIR ?=dashboards_out
 
 .PHONY: all
 all: fmt generate lint test
 
 .PHONY: generate
-generate: prometheus_alerts.yaml prometheus_rules.yaml dashboards_out
+generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR)
 
 $(JSONNET_VENDOR): $(JB_BIN) jsonnetfile.json
 	$(JB_BIN) install
@@ -30,9 +32,9 @@ prometheus_alerts.yaml: $(JSONNET_BIN) mixin.libsonnet lib/alerts.jsonnet alerts
 prometheus_rules.yaml: $(JSONNET_BIN) mixin.libsonnet lib/rules.jsonnet rules/*.libsonnet
 	@$(JSONNET_BIN) -J vendor -S lib/rules.jsonnet > $@
 
-dashboards_out: $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet dashboards/*.libsonnet
-	@mkdir -p dashboards_out
-	@$(JSONNET_BIN) -J vendor -m dashboards_out lib/dashboards.jsonnet
+$(OUT_DIR): $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet $(SRC_DIR)/*.libsonnet
+	@mkdir -p $(OUT_DIR)
+	@$(JSONNET_BIN) -J vendor -m $(OUT_DIR) lib/dashboards.jsonnet
 
 .PHONY: lint
 lint: jsonnet-lint alerts-lint dashboards-lint
@@ -48,14 +50,14 @@ alerts-lint: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml
 	@$(PROMTOOL_BIN) check rules prometheus_rules.yaml
 	@$(PROMTOOL_BIN) check rules prometheus_alerts.yaml
 
-dashboards_out/.lint: dashboards_out
+$(OUT_DIR)/.lint: $(OUT_DIR)
 	@cp .lint $@
 
 .PHONY: dashboards-lint
-dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) dashboards_out/.lint
+dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) $(OUT_DIR)/.lint
 # Replace $$interval:$$resolution var with $$__rate_interval to make dashboard-linter happy.
-	@sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' dashboards_out/*.json
-	@find dashboards_out -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict
+	@sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' $(OUT_DIR)/*.json
+	@find $(OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict
 
 
 .PHONY: clean
@@ -74,3 +76,9 @@ $(TOOLING): $(BIN_DIR)
 	@echo Installing tools from hack/tools.go
 	@cd scripts && go list -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %
 
+########################################
+# "check-with-upstream" workflow checks.
+########################################
+
+check-selectors-ksm:
+	@./scripts/check-selectors-ksm.sh
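Note: because SRC_DIR and OUT_DIR use ?= (assigned only if not already set), dashboards can now be rendered from and into custom locations without editing the Makefile. A usage sketch (directory names are illustrative):

    # Render dashboards into a custom output directory and lint the result:
    make generate OUT_DIR=build/dashboards
    make dashboards-lint OUT_DIR=build/dashboards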

README.md

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ A set of Grafana dashboards and Prometheus alerts for Kubernetes.
 | release-0.9 | v1.20+ | v2.11.0+ | v2.0+ |
 | release-0.10 | v1.20+ | v2.11.0+ | v2.0+ |
 | release-0.11 | v1.23+ | v2.11.0+ | v2.0+ |
+| release-0.12 | v1.23+ | v2.11.0+ | v2.0+ |
 | master | v1.23+ | v2.11.0+ | v2.0+ |
 
 In Kubernetes 1.14 there was a major [metrics overhaul](https://github.com/kubernetes/enhancements/issues/1206) implemented.

alerts/apps_alerts.libsonnet

Lines changed: 16 additions & 1 deletion
@@ -87,6 +87,21 @@
           'for': '15m',
           alert: 'KubeDeploymentReplicasMismatch',
         },
+        {
+          expr: |||
+            kube_deployment_status_condition{condition="Progressing", status="false",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
+            != 0
+          ||| % $._config,
+          labels: {
+            severity: 'warning',
+          },
+          annotations: {
+            description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
+            summary: 'Deployment rollout is not progressing.',
+          },
+          'for': '15m',
+          alert: 'KubeDeploymentRolloutStuck',
+        },
         {
           expr: |||
             (
@@ -285,7 +300,7 @@
           },
           annotations: {
             description: 'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes.',
-            summary: 'HPA has not matched descired number of replicas.',
+            summary: 'HPA has not matched desired number of replicas.',
           },
           'for': '15m',
           alert: 'KubeHpaReplicasMismatch',
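Note: with the mixin's default selectors substituted (assuming kubeStateMetricsSelector is job="kube-state-metrics" and prefixedNamespaceSelector is empty; both defaults live in config.libsonnet, outside this diff), the new KubeDeploymentRolloutStuck expression renders to roughly the following, and can be checked locally:

    # Rendered expression (assumed defaults substituted):
    #   kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics"} != 0
    # Validate the generated rule file locally with the existing targets:
    make generate alerts-lint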

alerts/kube_apiserver.libsonnet

Lines changed: 4 additions & 2 deletions
@@ -51,6 +51,7 @@ local utils = import '../lib/utils.libsonnet';
           expr: |||
             apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s
           ||| % $._config,
+          'for': '5m',
           labels: {
             severity: 'warning',
           },
@@ -64,6 +65,7 @@ local utils = import '../lib/utils.libsonnet';
           expr: |||
             apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s
           ||| % $._config,
+          'for': '5m',
           labels: {
             severity: 'critical',
           },
@@ -75,7 +77,7 @@ local utils = import '../lib/utils.libsonnet';
         {
           alert: 'KubeAggregatedAPIErrors',
           expr: |||
-            sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
+            sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[10m])) > 4
           ||| % $._config,
           labels: {
             severity: 'warning',
@@ -88,7 +90,7 @@ local utils = import '../lib/utils.libsonnet';
         {
           alert: 'KubeAggregatedAPIDown',
           expr: |||
-            (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
+            (1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85
           ||| % $._config,
           'for': '5m',
           labels: {
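Note: scoping the aggregator metrics with %(kubeApiserverSelector)s keeps these alerts from matching identically named series exposed by other scrape jobs. Assuming defaults of job="apiserver" for kubeApiserverSelector and cluster for clusterLabel (both defined in config.libsonnet, outside this diff), KubeAggregatedAPIErrors renders to roughly:

    sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[10m])) > 4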

alerts/kubelet.libsonnet

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@
         {
           alert: 'KubeNodeReadinessFlapping',
           expr: |||
-            sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
+            sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
           ||| % $._config,
           'for': '15m',
           labels: {
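Note: this is exactly the class of omission the new check-selectors-ksm workflow is meant to catch: kube_node_status_condition comes from kube-state-metrics, so it needs %(kubeStateMetricsSelector)s like its sibling expressions. Assuming defaults of job="kube-state-metrics" for that selector and cluster for clusterLabel, the fixed expression renders to roughly:

    sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2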

alerts/resource_alerts.libsonnet

Lines changed: 84 additions & 27 deletions
@@ -24,70 +24,127 @@
       rules: [
         {
           alert: 'KubeCPUOvercommit',
-          expr: |||
-            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
-            and
-            (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
-          ||| % $._config,
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.',
             summary: 'Cluster has overcommitted CPU resource requests.',
           },
           'for': '10m',
-        },
-        {
-          alert: 'KubeMemoryOvercommit',
+        } +
+        if $._config.showMultiCluster then {
+          expr: |||
+            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(kubeStateMetricsSelector)s,%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+            and
+            (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
+          },
+        } else {
           expr: |||
-            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
+            sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
             and
-            (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
+            (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
           ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
+          },
+        },
+        {
+          alert: 'KubeMemoryOvercommit',
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
             summary: 'Cluster has overcommitted memory resource requests.',
           },
           'for': '10m',
-        },
-        {
-          alert: 'KubeCPUQuotaOvercommit',
+        } +
+        if $._config.showMultiCluster then {
           expr: |||
-            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"}))
-            /
-            sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})
-            > %(namespaceOvercommitFactor)s
+            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+            and
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
           ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
+          },
+        } else
+        {
+          expr: |||
+            sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+            and
+            (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
+          },
+        },
+        {
+          alert: 'KubeCPUQuotaOvercommit',
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
             summary: 'Cluster has overcommitted CPU resource requests.',
           },
           'for': '5m',
-        },
-        {
-          alert: 'KubeMemoryQuotaOvercommit',
+        } +
+        if $._config.showMultiCluster then {
          expr: |||
-            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"}))
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"})) by (%(clusterLabel)s)
             /
-            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})
+            sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
             > %(namespaceOvercommitFactor)s
           ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Namespaces.' % $._config,
+          },
+        } else
+        {
+          expr: |||
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"}))
+            /
+            sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})
+            > %(namespaceOvercommitFactor)s
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
+          },
+        },
+        {
+          alert: 'KubeMemoryQuotaOvercommit',
           labels: {
             severity: 'warning',
           },
           annotations: {
-            description: 'Cluster has overcommitted memory resource requests for Namespaces.',
             summary: 'Cluster has overcommitted memory resource requests.',
           },
           'for': '5m',
-        },
+        } +
+        if $._config.showMultiCluster then {
+          expr: |||
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"})) by (%(clusterLabel)s)
+            /
+            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
+            > %(namespaceOvercommitFactor)s
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Namespaces.' % $._config,
+          },
+        } else
+        {
+          expr: |||
+            sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"}))
+            /
+            sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})
+            > %(namespaceOvercommitFactor)s
+          ||| % $._config,
+          annotations+: {
+            description: 'Cluster has overcommitted memory resource requests for Namespaces.',
+          },
+        },
         {
           alert: 'KubeQuotaAlmostFull',
           expr: |||
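Note: the structure above leans on Jsonnet object composition: writing { base } + if cond then { overlay } else { other } merges exactly one of the two overlays into the shared base, and annotations+: extends the base's annotations rather than replacing them, so the shared summary survives while description is branch-specific. A minimal, standalone sketch of the pattern (not tied to this mixin):

    # Toy demonstration of base + conditional overlay:
    jsonnet -e '{ a: { x: 1 } } + if true then { a+: { y: 2 } } else { a: {} }'
    # => { "a": { "x": 1, "y": 2 } }   (output whitespace condensed)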

alerts/system_alerts.libsonnet

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
 {
   _config+:: {
     notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
+    kubeApiserverSelector: 'job="kube-apiserver"',
   },
 
   prometheusAlerts+:: {
@@ -28,9 +29,9 @@
           // this is normal and an expected error, therefore it should be
           // ignored in this alert.
           expr: |||
-            (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
+            (sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
             /
-            sum(rate(rest_client_requests_total[5m])) by (%(clusterLabel)s, instance, job, namespace))
+            sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace))
             > 0.01
           ||| % $._config,
           'for': '15m',
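Note: with the newly added default kubeApiserverSelector of job="kube-apiserver" (and assuming the conventional clusterLabel default of cluster), the client-error ratio now renders to roughly:

    (sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
    /
    sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (cluster, instance, job, namespace))
    > 0.01

which restricts the alert to the API server's own client traffic rather than every component exposing rest_client metrics.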
