Skip to content

Commit 559d01e

Browse files
authored
Merge branch 'master' into master
2 parents 4e72db2 + 63337d9 commit 559d01e

18 files changed

+222
-126
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Scheduled workflow: checks out the repository and runs the
# `check-selectors-ksm` make target. It has no push/pull_request trigger.
name: check-with-upstream
2+
# Run every Monday at 00:00 (cron fields: minute 0, hour 0, day-of-week 1).
# NOTE(review): GitHub evaluates schedule crons in UTC — confirm if the local time matters.
3+
on:
4+
schedule:
5+
- cron: '0 0 * * 1'
6+
jobs:
7+
check-selectors-ksm:
8+
runs-on: ubuntu-latest
9+
name: Check if KSM selectors are present on applicable metrics.
10+
steps:
11+
- uses: actions/checkout@v2
12+
with:
13+
# The job only reads the repository, so the checkout token is not kept for later git commands.
persist-credentials: false
14+
# --always-make treats every target as out of date, so the check always runs.
- run: make --always-make check-selectors-ksm

.github/workflows/ci.yaml

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ name: ci
22
on:
33
- push
44
- pull_request
5-
env:
6-
golang-version: '1.17'
75
jobs:
86
generate:
97
runs-on: ubuntu-latest
@@ -12,9 +10,6 @@ jobs:
1210
- uses: actions/checkout@v2
1311
with:
1412
persist-credentials: false
15-
- uses: actions/setup-go@v2
16-
with:
17-
go-version: ${{ env.golang-version }}
1813
- run: make --always-make generate && git diff --exit-code
1914
jsonnet-lint:
2015
runs-on: ubuntu-latest
@@ -23,9 +18,6 @@ jobs:
2318
- uses: actions/checkout@v2
2419
with:
2520
persist-credentials: false
26-
- uses: actions/setup-go@v2
27-
with:
28-
go-version: ${{ env.golang-version }}
2921
- run: make --always-make jsonnet-lint
3022
dashboards-lint:
3123
runs-on: ubuntu-latest
@@ -34,9 +26,6 @@ jobs:
3426
- uses: actions/checkout@v2
3527
with:
3628
persist-credentials: false
37-
- uses: actions/setup-go@v2
38-
with:
39-
go-version: ${{ env.golang-version }}
4029
- run: make --always-make dashboards-lint
4130
alerts-lint:
4231
runs-on: ubuntu-latest
@@ -45,9 +34,6 @@ jobs:
4534
- uses: actions/checkout@v2
4635
with:
4736
persist-credentials: false
48-
- uses: actions/setup-go@v2
49-
with:
50-
go-version: ${{ env.golang-version }}
5137
- run: make --always-make alerts-lint
5238
fmt:
5339
runs-on: ubuntu-latest
@@ -56,9 +42,6 @@ jobs:
5642
- uses: actions/checkout@v2
5743
with:
5844
persist-credentials: false
59-
- uses: actions/setup-go@v2
60-
with:
61-
go-version: ${{ env.golang-version }}
6245
- run: make --always-make fmt && git diff --exit-code
6346
unit-tests:
6447
runs-on: ubuntu-latest
@@ -67,7 +50,4 @@ jobs:
6750
- uses: actions/checkout@v2
6851
with:
6952
persist-credentials: false
70-
- uses: actions/setup-go@v2
71-
with:
72-
go-version: ${{ env.golang-version }}
7353
- run: make --always-make test

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,9 @@ $(TOOLING): $(BIN_DIR)
7676
@echo Installing tools from hack/tools.go
7777
@cd scripts && go list -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) %
7878

79+
########################################
80+
# "check-with-upstream" workflow checks.
81+
########################################
82+
83+
# Runs scripts/check-selectors-ksm.sh; invoked by the scheduled
# "check-with-upstream" workflow. Declared phony because the target names a
# command rather than a file, so a stray file called `check-selectors-ksm`
# can never make it look up to date.
.PHONY: check-selectors-ksm
check-selectors-ksm:
84+
	@./scripts/check-selectors-ksm.sh

alerts/apps_alerts.libsonnet

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,21 @@
8787
'for': '15m',
8888
alert: 'KubeDeploymentReplicasMismatch',
8989
},
90+
// KubeDeploymentRolloutStuck: warning-level alert on deployments whose
// "Progressing" condition reports status="false" with a non-zero sample value.
// The label matchers are completed from $._config (prefixedNamespaceSelector
// and kubeStateMetricsSelector) via the `%` format operator.
{
91+
expr: |||
92+
kube_deployment_status_condition{condition="Progressing", status="false",%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
93+
!= 0
94+
||| % $._config,
95+
labels: {
96+
severity: 'warning',
97+
},
98+
annotations: {
99+
// The {{ $labels.* }} placeholders are left as-is here; only the expr above is formatted with $._config.
description: 'Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.',
100+
summary: 'Deployment rollout is not progressing.',
101+
},
102+
// The condition must hold for 15 minutes before firing, matching the description text.
'for': '15m',
103+
alert: 'KubeDeploymentRolloutStuck',
104+
},
90105
{
91106
expr: |||
92107
(
@@ -104,7 +119,7 @@
104119
},
105120
annotations: {
106121
description: 'StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.',
107-
summary: 'Deployment has not matched the expected number of replicas.',
122+
summary: 'StatefulSet has not matched the expected number of replicas.',
108123
},
109124
'for': '15m',
110125
alert: 'KubeStatefulSetReplicasMismatch',

alerts/kube_apiserver.libsonnet

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ local utils = import '../lib/utils.libsonnet';
7777
{
7878
alert: 'KubeAggregatedAPIErrors',
7979
expr: |||
80-
sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
80+
sum by(name, namespace, %(clusterLabel)s)(increase(aggregator_unavailable_apiservice_total{%(kubeApiserverSelector)s}[10m])) > 4
8181
||| % $._config,
8282
labels: {
8383
severity: 'warning',
@@ -90,7 +90,7 @@ local utils = import '../lib/utils.libsonnet';
9090
{
9191
alert: 'KubeAggregatedAPIDown',
9292
expr: |||
93-
(1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
93+
(1 - max by(name, namespace, %(clusterLabel)s)(avg_over_time(aggregator_unavailable_apiservice{%(kubeApiserverSelector)s}[10m]))) * 100 < 85
9494
||| % $._config,
9595
'for': '5m',
9696
labels: {

alerts/kubelet.libsonnet

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272
{
7373
alert: 'KubeNodeReadinessFlapping',
7474
expr: |||
75-
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
75+
sum(changes(kube_node_status_condition{%(kubeStateMetricsSelector)s,status="true",condition="Ready"}[15m])) by (%(clusterLabel)s, node) > 2
7676
||| % $._config,
7777
'for': '15m',
7878
labels: {

alerts/resource_alerts.libsonnet

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,18 @@
3434
} +
3535
if $._config.showMultiCluster then {
3636
expr: |||
37-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
37+
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
3838
and
39-
(sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
39+
(sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
4040
||| % $._config,
4141
annotations+: {
4242
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
4343
},
4444
} else {
4545
expr: |||
46-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
46+
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
4747
and
48-
(sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
48+
(sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
4949
||| % $._config,
5050
annotations+: {
5151
description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
@@ -63,19 +63,19 @@
6363
} +
6464
if $._config.showMultiCluster then {
6565
expr: |||
66-
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
66+
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
6767
and
68-
(sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
68+
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
6969
||| % $._config,
7070
annotations+: {
7171
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
7272
},
7373
} else
7474
{
7575
expr: |||
76-
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
76+
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
7777
and
78-
(sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
78+
(sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
7979
||| % $._config,
8080
annotations+: {
8181
description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',

alerts/system_alerts.libsonnet

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
_config+:: {
33
notKubeDnsCoreDnsSelector: 'job!~"kube-dns|coredns"',
4+
kubeApiserverSelector: 'job="kube-apiserver"',
45
},
56

67
prometheusAlerts+:: {
@@ -28,9 +29,9 @@
2829
// this is normal and an expected error, therefore it should be
2930
// ignored in this alert.
3031
expr: |||
31-
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
32+
(sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s,code=~"5.."}[5m])) by (%(clusterLabel)s, instance, job, namespace)
3233
/
33-
sum(rate(rest_client_requests_total[5m])) by (%(clusterLabel)s, instance, job, namespace))
34+
sum(rate(rest_client_requests_total{%(kubeApiserverSelector)s}[5m])) by (%(clusterLabel)s, instance, job, namespace))
3435
> 0.01
3536
||| % $._config,
3637
'for': '15m',

0 commit comments

Comments
 (0)