Skip to content

Commit 2972625

Browse files
Merge pull request #28411 from dgoodwin/target-down-kube-system
TRT-1235: Add ability to specify alerts that should never fire
2 parents e5ca74e + 76dc35d commit 2972625

File tree

3 files changed

+94
-49
lines changed

3 files changed

+94
-49
lines changed

pkg/monitortestlibrary/allowedalerts/all.go

Lines changed: 51 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -12,64 +12,71 @@ func AllAlertTests(jobType *platformidentification.JobType, etcdAllowance AlertT
1212

1313
ret := []AlertTest{}
1414
ret = append(ret, newWatchdogAlert(jobType))
15-
ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).pending().neverFail().toTests()...)
16-
ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).firing().toTests()...)
17-
18-
ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).pending().neverFail().toTests()...)
19-
ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).firing().toTests()...)
20-
ret = append(ret, newAlert("etcd", "etcdGRPCRequestsSlow", jobType).pending().neverFail().toTests()...)
21-
ret = append(ret, newAlert("etcd", "etcdGRPCRequestsSlow", jobType).firing().toTests()...)
22-
ret = append(ret, newAlert("etcd", "etcdHighNumberOfFailedGRPCRequests", jobType).pending().neverFail().toTests()...)
23-
ret = append(ret, newAlert("etcd", "etcdHighNumberOfFailedGRPCRequests", jobType).firing().toTests()...)
24-
ret = append(ret, newAlert("etcd", "etcdMemberCommunicationSlow", jobType).pending().neverFail().toTests()...)
25-
ret = append(ret, newAlert("etcd", "etcdMemberCommunicationSlow", jobType).firing().toTests()...)
26-
ret = append(ret, newAlert("etcd", "etcdNoLeader", jobType).pending().neverFail().toTests()...)
27-
ret = append(ret, newAlert("etcd", "etcdNoLeader", jobType).firing().toTests()...)
28-
ret = append(ret, newAlert("etcd", "etcdHighFsyncDurations", jobType).pending().neverFail().toTests()...)
29-
ret = append(ret, newAlert("etcd", "etcdHighFsyncDurations", jobType).firing().toTests()...)
30-
ret = append(ret, newAlert("etcd", "etcdHighCommitDurations", jobType).pending().neverFail().toTests()...)
31-
ret = append(ret, newAlert("etcd", "etcdHighCommitDurations", jobType).firing().toTests()...)
32-
ret = append(ret, newAlert("etcd", "etcdInsufficientMembers", jobType).pending().neverFail().toTests()...)
33-
ret = append(ret, newAlert("etcd", "etcdInsufficientMembers", jobType).firing().toTests()...)
34-
ret = append(ret, newAlert("etcd", "etcdHighNumberOfLeaderChanges", jobType).pending().neverFail().toTests()...)
15+
ret = append(ret, newAlertTestPerNamespace("KubePodNotReady", jobType).pending().neverFail().toTests()...)
16+
ret = append(ret, newAlertTestPerNamespace("KubePodNotReady", jobType).firing().toTests()...)
17+
18+
ret = append(ret, newAlertTest("bz-etcd", "etcdMembersDown", jobType).pending().neverFail().toTests()...)
19+
ret = append(ret, newAlertTest("bz-etcd", "etcdMembersDown", jobType).firing().toTests()...)
20+
ret = append(ret, newAlertTest("bz-etcd", "etcdGRPCRequestsSlow", jobType).pending().neverFail().toTests()...)
21+
ret = append(ret, newAlertTest("bz-etcd", "etcdGRPCRequestsSlow", jobType).firing().toTests()...)
22+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighNumberOfFailedGRPCRequests", jobType).pending().neverFail().toTests()...)
23+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighNumberOfFailedGRPCRequests", jobType).firing().toTests()...)
24+
ret = append(ret, newAlertTest("bz-etcd", "etcdMemberCommunicationSlow", jobType).pending().neverFail().toTests()...)
25+
ret = append(ret, newAlertTest("bz-etcd", "etcdMemberCommunicationSlow", jobType).firing().toTests()...)
26+
ret = append(ret, newAlertTest("bz-etcd", "etcdNoLeader", jobType).pending().neverFail().toTests()...)
27+
ret = append(ret, newAlertTest("bz-etcd", "etcdNoLeader", jobType).firing().toTests()...)
28+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighFsyncDurations", jobType).pending().neverFail().toTests()...)
29+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighFsyncDurations", jobType).firing().toTests()...)
30+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighCommitDurations", jobType).pending().neverFail().toTests()...)
31+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighCommitDurations", jobType).firing().toTests()...)
32+
ret = append(ret, newAlertTest("bz-etcd", "etcdInsufficientMembers", jobType).pending().neverFail().toTests()...)
33+
ret = append(ret, newAlertTest("bz-etcd", "etcdInsufficientMembers", jobType).firing().toTests()...)
34+
35+
// A rare and pretty serious failure, should always be accompanied by other failures but we want to see a specific test failure for this.
36+
// It likely means a kubelet is down.
37+
ret = append(ret, newAlertTest(
38+
"sig-node", "TargetDown", jobType).inNamespace("kube-system").
39+
firing().alwaysFail().toTests()...)
40+
41+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighNumberOfLeaderChanges", jobType).pending().neverFail().toTests()...)
3542

3643
// This test gets a little special treatment, if we're moving through etcd updates, we expect leader changes, so if this scenario is detected
3744
// this test is given fixed leeway for the alert to fire, otherwise it too falls back to historical data.
38-
ret = append(ret, newAlert("etcd", "etcdHighNumberOfLeaderChanges", jobType).withAllowance(etcdAllowance).firing().toTests()...)
45+
ret = append(ret, newAlertTest("bz-etcd", "etcdHighNumberOfLeaderChanges", jobType).withAllowance(etcdAllowance).firing().toTests()...)
3946

40-
ret = append(ret, newAlert("kube-apiserver", "KubeAPIErrorBudgetBurn", jobType).pending().neverFail().toTests()...)
41-
ret = append(ret, newAlert("kube-apiserver", "KubeAPIErrorBudgetBurn", jobType).firing().toTests()...)
42-
ret = append(ret, newAlert("kube-apiserver", "KubeClientErrors", jobType).pending().neverFail().toTests()...)
43-
ret = append(ret, newAlert("kube-apiserver", "KubeClientErrors", jobType).firing().toTests()...)
47+
ret = append(ret, newAlertTest("bz-kube-apiserver", "KubeAPIErrorBudgetBurn", jobType).pending().neverFail().toTests()...)
48+
ret = append(ret, newAlertTest("bz-kube-apiserver", "KubeAPIErrorBudgetBurn", jobType).firing().toTests()...)
49+
ret = append(ret, newAlertTest("bz-kube-apiserver", "KubeClientErrors", jobType).pending().neverFail().toTests()...)
50+
ret = append(ret, newAlertTest("bz-kube-apiserver", "KubeClientErrors", jobType).firing().toTests()...)
4451

45-
ret = append(ret, newAlert("storage", "KubePersistentVolumeErrors", jobType).pending().neverFail().toTests()...)
46-
ret = append(ret, newAlert("storage", "KubePersistentVolumeErrors", jobType).firing().toTests()...)
52+
ret = append(ret, newAlertTest("bz-storage", "KubePersistentVolumeErrors", jobType).pending().neverFail().toTests()...)
53+
ret = append(ret, newAlertTest("bz-storage", "KubePersistentVolumeErrors", jobType).firing().toTests()...)
4754

48-
ret = append(ret, newAlert("machine config operator", "MCDDrainError", jobType).pending().neverFail().toTests()...)
49-
ret = append(ret, newAlert("machine config operator", "MCDDrainError", jobType).firing().toTests()...)
55+
ret = append(ret, newAlertTest("bz-machine config operator", "MCDDrainError", jobType).pending().neverFail().toTests()...)
56+
ret = append(ret, newAlertTest("bz-machine config operator", "MCDDrainError", jobType).firing().toTests()...)
5057

51-
ret = append(ret, newAlert("single-node", "KubeMemoryOvercommit", jobType).pending().neverFail().toTests()...)
58+
ret = append(ret, newAlertTest("bz-single-node", "KubeMemoryOvercommit", jobType).pending().neverFail().toTests()...)
5259
// this appears to have no direct impact on the cluster in CI. It's important in general, but for CI we're willing to run pretty hot.
53-
ret = append(ret, newAlert("single-node", "KubeMemoryOvercommit", jobType).firing().neverFail().toTests()...)
54-
ret = append(ret, newAlert("machine config operator", "MCDPivotError", jobType).pending().neverFail().toTests()...)
55-
ret = append(ret, newAlert("machine config operator", "MCDPivotError", jobType).firing().toTests()...)
60+
ret = append(ret, newAlertTest("bz-single-node", "KubeMemoryOvercommit", jobType).firing().neverFail().toTests()...)
61+
ret = append(ret, newAlertTest("bz-machine config operator", "MCDPivotError", jobType).pending().neverFail().toTests()...)
62+
ret = append(ret, newAlertTest("bz-machine config operator", "MCDPivotError", jobType).firing().toTests()...)
5663

57-
ret = append(ret, newAlert("monitoring", "PrometheusOperatorWatchErrors", jobType).pending().neverFail().toTests()...)
58-
ret = append(ret, newAlert("monitoring", "PrometheusOperatorWatchErrors", jobType).firing().toTests()...)
64+
ret = append(ret, newAlertTest("bz-monitoring", "PrometheusOperatorWatchErrors", jobType).pending().neverFail().toTests()...)
65+
ret = append(ret, newAlertTest("bz-monitoring", "PrometheusOperatorWatchErrors", jobType).firing().toTests()...)
5966

60-
ret = append(ret, newAlert("networking", "OVNKubernetesResourceRetryFailure", jobType).pending().neverFail().toTests()...)
61-
ret = append(ret, newAlert("networking", "OVNKubernetesResourceRetryFailure", jobType).firing().toTests()...)
67+
ret = append(ret, newAlertTest("bz-networking", "OVNKubernetesResourceRetryFailure", jobType).pending().neverFail().toTests()...)
68+
ret = append(ret, newAlertTest("bz-networking", "OVNKubernetesResourceRetryFailure", jobType).firing().toTests()...)
6269

63-
ret = append(ret, newAlert("OLM", "RedhatOperatorsCatalogError", jobType).pending().neverFail().toTests()...)
64-
ret = append(ret, newAlert("OLM", "RedhatOperatorsCatalogError", jobType).firing().toTests()...)
70+
ret = append(ret, newAlertTest("bz-OLM", "RedhatOperatorsCatalogError", jobType).pending().neverFail().toTests()...)
71+
ret = append(ret, newAlertTest("bz-OLM", "RedhatOperatorsCatalogError", jobType).firing().toTests()...)
6572

66-
ret = append(ret, newAlert("storage", "VSphereOpenshiftNodeHealthFail", jobType).pending().neverFail().toTests()...)
67-
ret = append(ret, newAlert("storage", "VSphereOpenshiftNodeHealthFail", jobType).firing().neverFail().toTests()...) // https://bugzilla.redhat.com/show_bug.cgi?id=2055729
73+
ret = append(ret, newAlertTest("bz-storage", "VSphereOpenshiftNodeHealthFail", jobType).pending().neverFail().toTests()...)
74+
ret = append(ret, newAlertTest("bz-storage", "VSphereOpenshiftNodeHealthFail", jobType).firing().neverFail().toTests()...) // https://bugzilla.redhat.com/show_bug.cgi?id=2055729
6875

69-
ret = append(ret, newAlert("samples", "SamplesImagestreamImportFailing", jobType).pending().neverFail().toTests()...)
70-
ret = append(ret, newAlert("samples", "SamplesImagestreamImportFailing", jobType).firing().toTests()...)
76+
ret = append(ret, newAlertTest("bz-samples", "SamplesImagestreamImportFailing", jobType).pending().neverFail().toTests()...)
77+
ret = append(ret, newAlertTest("bz-samples", "SamplesImagestreamImportFailing", jobType).firing().toTests()...)
7178

72-
ret = append(ret, newAlert("apiserver-auth", "PodSecurityViolation", jobType).firing().toTests()...)
79+
ret = append(ret, newAlertTest("bz-apiserver-auth", "PodSecurityViolation", jobType).firing().toTests()...)
7380

7481
return ret
7582
}

pkg/monitortestlibrary/allowedalerts/basic_alert.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ type alertBuilder struct {
4646
bugzillaComponent string
4747
divideByNamespaces bool
4848
alertName string
49+
alertNamespace string
4950
alertState AlertState
5051
jobType *platformidentification2.JobType
5152

@@ -62,7 +63,8 @@ type basicAlertTest struct {
6263
allowanceCalculator AlertTestAllowanceCalculator
6364
}
6465

65-
func newAlert(bugzillaComponent, alertName string, jobType *platformidentification2.JobType) *alertBuilder {
66+
// newAlertTest creates a single alert test with no consideration of namespace.
67+
func newAlertTest(bugzillaComponent, alertName string, jobType *platformidentification2.JobType) *alertBuilder {
6668
return &alertBuilder{
6769
bugzillaComponent: bugzillaComponent,
6870
alertName: alertName,
@@ -72,7 +74,8 @@ func newAlert(bugzillaComponent, alertName string, jobType *platformidentificati
7274
}
7375
}
7476

75-
func newNamespacedAlert(alertName string, jobType *platformidentification2.JobType) *alertBuilder {
77+
// newAlertTestPerNamespace creates an alert test builder per entry in the hardcoded list of namespaces we're interested in.
78+
func newAlertTestPerNamespace(alertName string, jobType *platformidentification2.JobType) *alertBuilder {
7679
return &alertBuilder{
7780
divideByNamespaces: true,
7881
alertName: alertName,
@@ -92,6 +95,12 @@ func (a *alertBuilder) pending() *alertBuilder {
9295
return a
9396
}
9497

98+
// inNamespace limits the alert test to a specific namespace.
99+
func (a *alertBuilder) inNamespace(namespace string) *alertBuilder {
100+
a.alertNamespace = namespace
101+
return a
102+
}
103+
95104
func (a *alertBuilder) firing() *alertBuilder {
96105
a.alertState = AlertInfo
97106
return a
@@ -112,17 +121,27 @@ func (a *alertBuilder) neverFail() *alertBuilder {
112121
return a
113122
}
114123

124+
// alwaysFlake will flake the test if the alert enters the given state for any amount of time,
125+
// regardless of historical data.
115126
func (a *alertBuilder) alwaysFlake() *alertBuilder {
116127
a.allowanceCalculator = alwaysFlake()
117128
return a
118129
}
119130

131+
// alwaysFail will fail the test if the alert enters the given state for any amount of time,
132+
// regardless of historical data.
133+
func (a *alertBuilder) alwaysFail() *alertBuilder {
134+
a.allowanceCalculator = failOnAny()
135+
return a
136+
}
137+
120138
func (a *alertBuilder) toTests() []AlertTest {
121139
if !a.divideByNamespaces {
122140
return []AlertTest{
123141
&basicAlertTest{
124142
bugzillaComponent: a.bugzillaComponent,
125143
alertName: a.alertName,
144+
namespace: a.alertNamespace, // will be populated if we're creating for a specific namespace
126145
alertState: a.alertState,
127146
allowanceCalculator: a.allowanceCalculator,
128147
jobType: a.jobType,
@@ -156,11 +175,11 @@ func (a *alertBuilder) toTests() []AlertTest {
156175
func (a *basicAlertTest) InvariantTestName() string {
157176
switch {
158177
case len(a.namespace) == 0:
159-
return fmt.Sprintf("[bz-%v][invariant] alert/%s should not be at or above %s", a.bugzillaComponent, a.alertName, a.alertState)
178+
return fmt.Sprintf("[%v][invariant] alert/%s should not be at or above %s", a.bugzillaComponent, a.alertName, a.alertState)
160179
case a.namespace == platformidentification2.NamespaceOther:
161-
return fmt.Sprintf("[bz-%v][invariant] alert/%s should not be at or above %s in all the other namespaces", a.bugzillaComponent, a.alertName, a.alertState)
180+
return fmt.Sprintf("[%v][invariant] alert/%s should not be at or above %s in all the other namespaces", a.bugzillaComponent, a.alertName, a.alertState)
162181
default:
163-
return fmt.Sprintf("[bz-%v][invariant] alert/%s should not be at or above %s in ns/%s", a.bugzillaComponent, a.alertName, a.alertState, a.namespace)
182+
return fmt.Sprintf("[%v][invariant] alert/%s should not be at or above %s in ns/%s", a.bugzillaComponent, a.alertName, a.alertState, a.namespace)
164183
}
165184
}
166185

pkg/monitortestlibrary/allowedalerts/matches.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import (
66
historicaldata2 "github.com/openshift/origin/pkg/monitortestlibrary/historicaldata"
77
)
88

9+
// neverFailAllowance will ignore historical data and impose a FailAfter limit that should not be
10+
// reachable in a CI job run, so the test can never fail, only flake if beyond historical limits.
911
type neverFailAllowance struct {
1012
flakeDelegate AlertTestAllowanceCalculator
1113
}
@@ -71,3 +73,20 @@ func (d *alwaysFlakeAllowance) FailAfter(key historicaldata2.AlertDataKey) (time
7173
// FlakeAfter flakes the test once the alert has been in the given state for
// one second — i.e. effectively on any occurrence at all.
func (d *alwaysFlakeAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
	return time.Second
}
76+
77+
func failOnAny() AlertTestAllowanceCalculator {
78+
return &alwaysFailAllowance{}
79+
}
80+
81+
// alwaysFailAllowance is for alerts we want to fail a test if they occur at all.
82+
type alwaysFailAllowance struct {
83+
}
84+
85+
func (d *alwaysFailAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) {
86+
return 1 * time.Second, nil
87+
}
88+
89+
func (d *alwaysFailAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
90+
// flake is irrelevant here, we're going to fail on ANY duration
91+
return 24 * time.Hour
92+
}

0 commit comments

Comments (0)