Commit 5b7bf4c

add highcpuusagesnoalertcontroller tests
1 parent 0894a6a commit 5b7bf4c

3 files changed: +342 -0 lines changed
Lines changed: 248 additions & 0 deletions
@@ -0,0 +1,248 @@
package highcpuusagealertcontroller

import (
	"bytes"
	"context"
	"io"
	"math/rand"
	"os"
	"strconv"
	"testing"

	v1 "github.com/openshift/api/config/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/dynamic/fake"
)

// TestSNOAlert checks that snoAlert renders the expected PrometheusRule for
// different combinations of enabled capabilities, CPU partitioning mode, and
// PerformanceProfile objects, comparing the output against golden files.
func TestSNOAlert(t *testing.T) {
	type args struct {
		clusterObjects      []runtime.Object
		enabledCapabilities []v1.ClusterVersionCapability
		cpuMode             v1.CPUPartitioningMode
	}
	tests := []struct {
		name       string
		args       args
		goldenFile string
		wantErr    bool
	}{
		{
			name:       "No node tuning capability",
			goldenFile: "./testdata/alert_8_cores.yaml",
			wantErr:    false,
		},
		{
			name: "Node tuning capability, but wrong cpu mode",
			args: args{
				enabledCapabilities: []v1.ClusterVersionCapability{v1.ClusterVersionCapabilityNodeTuning},
				cpuMode:             v1.CPUPartitioningNone,
			},
			goldenFile: "./testdata/alert_8_cores.yaml",
			wantErr:    false,
		},
		{
			name: "Node tuning capability, correct cpu mode, but no PerformanceProfile",
			args: args{
				enabledCapabilities: []v1.ClusterVersionCapability{v1.ClusterVersionCapabilityNodeTuning},
				cpuMode:             v1.CPUPartitioningAllNodes,
			},
			goldenFile: "./testdata/alert_8_cores.yaml",
			wantErr:    false,
		},
		{
			name: "Node tuning capability, correct cpu mode, correct PerformanceProfile",
			args: args{
				enabledCapabilities: []v1.ClusterVersionCapability{v1.ClusterVersionCapabilityNodeTuning},
				cpuMode:             v1.CPUPartitioningAllNodes,
				clusterObjects:      []runtime.Object{performanceProfileWithNodeSelector("node-role.kubernetes.io/master")},
			},
			goldenFile: "./testdata/alert_2_cores.yaml",
			wantErr:    false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := snoAlert(context.Background(), clientWithObjects(tt.args.clusterObjects...), tt.args.enabledCapabilities, tt.args.cpuMode)
			if (err != nil) != tt.wantErr {
				t.Fatalf("snoAlert() error = %v, wantErr = %v", err, tt.wantErr)
			}
			goldenFileAlert := readBytesFromFile(t, tt.goldenFile)

			if !bytes.Equal(got, goldenFileAlert) {
				t.Errorf("snoAlert() got = %v, goldenFile = %v", got, goldenFileAlert)
			}
		})
	}
}

// TestPerformanceProfileControlPlaneCores covers how the reserved control-plane
// core count is derived from PerformanceProfile objects, including profiles that
// only target workers, lack a node selector or cpu section, or carry an
// unparseable CPU set.
func TestPerformanceProfileControlPlaneCores(t *testing.T) {
	tests := []struct {
		name           string
		clusterObjects []runtime.Object

		expectedCores       int
		expectedToFindCores bool
		expectedErr         bool
	}{
		{
			name:                "no performanceProfile",
			expectedToFindCores: false,
			expectedErr:         false,
		},
		{
			name:                "one performanceProfile",
			clusterObjects:      []runtime.Object{performanceProfileWithNodeSelector("node-role.kubernetes.io/master")},
			expectedCores:       2,
			expectedToFindCores: true,
			expectedErr:         false,
		},
		{
			name:                "only worker performanceProfile",
			clusterObjects:      []runtime.Object{performanceProfileWithNodeSelector("node-role.kubernetes.io/worker")},
			expectedToFindCores: false,
			expectedErr:         false,
		},
		{
			name: "multiple performanceProfiles",
			clusterObjects: []runtime.Object{
				performanceProfileWithNodeSelector("node-role.kubernetes.io/master"),
				performanceProfileWithNodeSelector("node-role.kubernetes.io/worker")},
			expectedCores:       2,
			expectedToFindCores: true,
			expectedErr:         false,
		},
		{
			name:                "invalid cpu set in performance profile",
			clusterObjects:      []runtime.Object{invalidCPUSetInvalidPerformanceProfile()},
			expectedToFindCores: false,
			expectedErr:         true,
		},
		{
			name:                "no node selector in performance profile",
			clusterObjects:      []runtime.Object{noNodeSelectorInvalidPerformanceProfile()},
			expectedToFindCores: false,
		},
		{
			name:                "no cpu set in performance profile",
			clusterObjects:      []runtime.Object{noCPUSetInvalidPerformanceProfile()},
			expectedToFindCores: false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			coresFound, isFound, err := performanceProfileControlPlaneCores(context.Background(), clientWithObjects(tt.clusterObjects...))
			if (err != nil) != tt.expectedErr {
				t.Fatalf("performanceProfileControlPlaneCores() error = %v, expectedErr = %v", err, tt.expectedErr)
			}
			if coresFound != tt.expectedCores {
				t.Errorf("performanceProfileControlPlaneCores() coresFound = %v, expectedCores = %v", coresFound, tt.expectedCores)
			}
			if isFound != tt.expectedToFindCores {
				t.Errorf("performanceProfileControlPlaneCores() isFound = %v, expectedToFindCores = %v", isFound, tt.expectedToFindCores)
			}
		})
	}
}

// clientWithObjects builds a fake dynamic client seeded with the given objects;
// performanceGroup (defined in the controller package) is the PerformanceProfile
// GroupVersionResource registered with its list kind.
func clientWithObjects(objs ...runtime.Object) dynamic.Interface {
	scheme := runtime.NewScheme()
	return fake.NewSimpleDynamicClientWithCustomListKinds(scheme, map[schema.GroupVersionResource]string{
		performanceGroup: "PerformanceProfileList",
	}, objs...)
}

// performanceProfileWithNodeSelector returns a valid PerformanceProfile targeting
// the given node selector and reserving two CPUs ("14-15").
func performanceProfileWithNodeSelector(selector string) runtime.Unstructured {
	return &unstructured.Unstructured{
		Object: map[string]interface{}{
			"apiVersion": "performance.openshift.io/v2",
			"kind":       "PerformanceProfile",
			"metadata": map[string]interface{}{
				"name": "performanceProfile" + strconv.Itoa(rand.Int()),
			},
			"spec": map[string]interface{}{
				"nodeSelector": map[string]interface{}{
					selector: "",
				},
				"cpu": map[string]interface{}{
					"isolated": "0-13",
					"reserved": "14-15",
				},
			},
		},
	}
}

// invalidCPUSetInvalidPerformanceProfile returns a PerformanceProfile whose
// reserved CPU set ("14+15") cannot be parsed.
func invalidCPUSetInvalidPerformanceProfile() runtime.Unstructured {
	return &unstructured.Unstructured{
		Object: map[string]interface{}{
			"apiVersion": "performance.openshift.io/v2",
			"kind":       "PerformanceProfile",
			"metadata": map[string]interface{}{
				"name": "performanceProfile" + strconv.Itoa(rand.Int()),
			},
			"spec": map[string]interface{}{
				"nodeSelector": map[string]interface{}{
					"node-role.kubernetes.io/master": "",
				},
				"cpu": map[string]interface{}{
					"isolated": "0-13",
					"reserved": "14+15",
				},
			},
		},
	}
}

// noNodeSelectorInvalidPerformanceProfile returns a PerformanceProfile without a
// nodeSelector.
func noNodeSelectorInvalidPerformanceProfile() runtime.Unstructured {
	return &unstructured.Unstructured{
		Object: map[string]interface{}{
			"apiVersion": "performance.openshift.io/v2",
			"kind":       "PerformanceProfile",
			"metadata": map[string]interface{}{
				"name": "performanceProfile" + strconv.Itoa(rand.Int()),
			},
			"spec": map[string]interface{}{
				"cpu": map[string]interface{}{
					"isolated": "0-13",
					"reserved": "14-15",
				},
			},
		},
	}
}

// noCPUSetInvalidPerformanceProfile returns a PerformanceProfile without a cpu
// section.
func noCPUSetInvalidPerformanceProfile() runtime.Unstructured {
	return &unstructured.Unstructured{
		Object: map[string]interface{}{
			"apiVersion": "performance.openshift.io/v2",
			"kind":       "PerformanceProfile",
			"metadata": map[string]interface{}{
				"name": "performanceProfile" + strconv.Itoa(rand.Int()),
			},
			"spec": map[string]interface{}{
				"nodeSelector": map[string]interface{}{
					"node-role.kubernetes.io/master": "",
				},
			},
		},
	}
}

// readBytesFromFile reads the whole file, failing the test on any error.
func readBytesFromFile(t *testing.T, filename string) []byte {
	file, err := os.Open(filename)
	if err != nil {
		t.Fatal(err)
	}
	defer file.Close()

	data, err := io.ReadAll(file)
	if err != nil {
		t.Fatal(err)
	}

	return data
}
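
The functions under test, snoAlert and performanceProfileControlPlaneCores, are defined in the controller itself and are not part of this diff. A minimal sketch of the core-counting step, assuming the controller parses spec.cpu.reserved from a master-selecting PerformanceProfile with k8s.io/utils/cpuset (controlPlaneCoresFromReserved is a hypothetical name used only for illustration), might look like this:

	package main

	import (
		"fmt"

		"k8s.io/utils/cpuset"
	)

	// controlPlaneCoresFromReserved counts the CPUs in a reserved set such as "14-15".
	// A malformed string like "14+15" fails to parse, matching the test case that
	// expects an error from performanceProfileControlPlaneCores.
	func controlPlaneCoresFromReserved(reserved string) (int, error) {
		set, err := cpuset.Parse(reserved)
		if err != nil {
			return 0, err
		}
		return set.Size(), nil
	}

	func main() {
		cores, err := controlPlaneCoresFromReserved("14-15")
		fmt.Println(cores, err) // 2 <nil>
	}

Under that assumption, the profile built by performanceProfileWithNodeSelector reserves "14-15", i.e. 2 CPUs, which lines up with expectedCores: 2 in the table above and with the golden file whose expression divides by 2.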
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cpu-utilization
  namespace: openshift-kube-apiserver
spec:
  groups:
    - name: control-plane-cpu-utilization
      rules:
        - alert: HighOverallControlPlaneCPU
          annotations:
            summary: >-
              CPU utilization across control plane pods is more than 60% of total CPU. High CPU usage usually means that something is going wrong.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
            description: >-
              This level of CPU utilization of a control plane is probably not a problem under most circumstances, but high levels of utilization may indicate
              problems with the cluster or control plane pods. To manage this alert or modify its threshold in case of false positives, see the following link:
              https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
          expr: |
            sum(rate(container_cpu_usage_seconds_total{namespace=~"openshift-.*",image!=""}[4m])) / 2 * 100 > 60
          for: 10m
          labels:
            namespace: openshift-kube-apiserver
            severity: warning
        - alert: ExtremelyHighIndividualControlPlaneCPU
          annotations:
            summary: >-
              CPU utilization across control plane pods is more than 90% of total CPU. High CPU usage usually means that something is going wrong.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
            description: >-
              This level of CPU utilization of a control plane is probably not a problem under most circumstances, but high levels of utilization may indicate
              problems with the cluster or control plane pods. When workload partitioning is enabled,
              extreme CPU pressure can cause slow serialization and poor performance from the kube-apiserver and etcd.
              When this happens, there is a risk of clients seeing non-responsive API requests which are issued again,
              causing even more CPU pressure.
              It can also cause failing liveness probes due to slow etcd responsiveness on the backend.
              If one kube-apiserver fails under this condition, chances are you will experience a cascade as the remaining
              kube-apiservers are also under-provisioned.
              To fix this, increase the CPU and memory on your control plane nodes.
              To manage this alert or modify its threshold in case of false positives, see the following link:
              https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
          expr: |
            sum(rate(container_cpu_usage_seconds_total{namespace=~"openshift-.*",image!=""}[4m])) / 2 * 100 > 90
          for: 1h
          labels:
            namespace: openshift-kube-apiserver
            severity: critical
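
This golden rule divides CPU usage by 2 cores, so it presumably corresponds to ./testdata/alert_2_cores.yaml, the file expected by the "correct PerformanceProfile" test case above; the next file divides by 8 and presumably corresponds to ./testdata/alert_8_cores.yaml used by the remaining cases.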
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: cpu-utilization
  namespace: openshift-kube-apiserver
spec:
  groups:
    - name: control-plane-cpu-utilization
      rules:
        - alert: HighOverallControlPlaneCPU
          annotations:
            summary: >-
              CPU utilization across control plane pods is more than 60% of total CPU. High CPU usage usually means that something is going wrong.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
            description: >-
              This level of CPU utilization of a control plane is probably not a problem under most circumstances, but high levels of utilization may indicate
              problems with the cluster or control plane pods. To manage this alert or modify its threshold in case of false positives, see the following link:
              https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
          expr: |
            sum(rate(container_cpu_usage_seconds_total{namespace=~"openshift-.*",image!=""}[4m])) / 8 * 100 > 60
          for: 10m
          labels:
            namespace: openshift-kube-apiserver
            severity: warning
        - alert: ExtremelyHighIndividualControlPlaneCPU
          annotations:
            summary: >-
              CPU utilization across control plane pods is more than 90% of total CPU. High CPU usage usually means that something is going wrong.
            runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
            description: >-
              This level of CPU utilization of a control plane is probably not a problem under most circumstances, but high levels of utilization may indicate
              problems with the cluster or control plane pods. When workload partitioning is enabled,
              extreme CPU pressure can cause slow serialization and poor performance from the kube-apiserver and etcd.
              When this happens, there is a risk of clients seeing non-responsive API requests which are issued again,
              causing even more CPU pressure.
              It can also cause failing liveness probes due to slow etcd responsiveness on the backend.
              If one kube-apiserver fails under this condition, chances are you will experience a cascade as the remaining
              kube-apiservers are also under-provisioned.
              To fix this, increase the CPU and memory on your control plane nodes.
              To manage this alert or modify its threshold in case of false positives, see the following link:
              https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
          expr: |
            sum(rate(container_cpu_usage_seconds_total{namespace=~"openshift-.*",image!=""}[4m])) / 8 * 100 > 90
          for: 1h
          labels:
            namespace: openshift-kube-apiserver
            severity: critical
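
The two golden files differ only in the core-count divisor, which fixes the absolute usage at which the rules fire: in the 2-core variant the 60% warning triggers once summed control-plane CPU usage exceeds 0.6 × 2 = 1.2 cores and the 90% critical rule at 1.8 cores, while in this 8-core variant the same thresholds work out to 4.8 and 7.2 cores respectively.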
