@@ -12,64 +12,71 @@ func AllAlertTests(jobType *platformidentification.JobType, etcdAllowance AlertT
1212
1313 ret := []AlertTest {}
1414 ret = append (ret , newWatchdogAlert (jobType ))
15- ret = append (ret , newNamespacedAlert ("KubePodNotReady" , jobType ).pending ().neverFail ().toTests ()... )
16- ret = append (ret , newNamespacedAlert ("KubePodNotReady" , jobType ).firing ().toTests ()... )
17-
18- ret = append (ret , newAlert ("etcd" , "etcdMembersDown" , jobType ).pending ().neverFail ().toTests ()... )
19- ret = append (ret , newAlert ("etcd" , "etcdMembersDown" , jobType ).firing ().toTests ()... )
20- ret = append (ret , newAlert ("etcd" , "etcdGRPCRequestsSlow" , jobType ).pending ().neverFail ().toTests ()... )
21- ret = append (ret , newAlert ("etcd" , "etcdGRPCRequestsSlow" , jobType ).firing ().toTests ()... )
22- ret = append (ret , newAlert ("etcd" , "etcdHighNumberOfFailedGRPCRequests" , jobType ).pending ().neverFail ().toTests ()... )
23- ret = append (ret , newAlert ("etcd" , "etcdHighNumberOfFailedGRPCRequests" , jobType ).firing ().toTests ()... )
24- ret = append (ret , newAlert ("etcd" , "etcdMemberCommunicationSlow" , jobType ).pending ().neverFail ().toTests ()... )
25- ret = append (ret , newAlert ("etcd" , "etcdMemberCommunicationSlow" , jobType ).firing ().toTests ()... )
26- ret = append (ret , newAlert ("etcd" , "etcdNoLeader" , jobType ).pending ().neverFail ().toTests ()... )
27- ret = append (ret , newAlert ("etcd" , "etcdNoLeader" , jobType ).firing ().toTests ()... )
28- ret = append (ret , newAlert ("etcd" , "etcdHighFsyncDurations" , jobType ).pending ().neverFail ().toTests ()... )
29- ret = append (ret , newAlert ("etcd" , "etcdHighFsyncDurations" , jobType ).firing ().toTests ()... )
30- ret = append (ret , newAlert ("etcd" , "etcdHighCommitDurations" , jobType ).pending ().neverFail ().toTests ()... )
31- ret = append (ret , newAlert ("etcd" , "etcdHighCommitDurations" , jobType ).firing ().toTests ()... )
32- ret = append (ret , newAlert ("etcd" , "etcdInsufficientMembers" , jobType ).pending ().neverFail ().toTests ()... )
33- ret = append (ret , newAlert ("etcd" , "etcdInsufficientMembers" , jobType ).firing ().toTests ()... )
34- ret = append (ret , newAlert ("etcd" , "etcdHighNumberOfLeaderChanges" , jobType ).pending ().neverFail ().toTests ()... )
15+ ret = append (ret , newAlertTestPerNamespace ("KubePodNotReady" , jobType ).pending ().neverFail ().toTests ()... )
16+ ret = append (ret , newAlertTestPerNamespace ("KubePodNotReady" , jobType ).firing ().toTests ()... )
17+
18+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdMembersDown" , jobType ).pending ().neverFail ().toTests ()... )
19+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdMembersDown" , jobType ).firing ().toTests ()... )
20+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdGRPCRequestsSlow" , jobType ).pending ().neverFail ().toTests ()... )
21+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdGRPCRequestsSlow" , jobType ).firing ().toTests ()... )
22+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdHighNumberOfFailedGRPCRequests" , jobType ).pending ().neverFail ().toTests ()... )
23+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdHighNumberOfFailedGRPCRequests" , jobType ).firing ().toTests ()... )
24+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdMemberCommunicationSlow" , jobType ).pending ().neverFail ().toTests ()... )
25+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdMemberCommunicationSlow" , jobType ).firing ().toTests ()... )
26+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdNoLeader" , jobType ).pending ().neverFail ().toTests ()... )
27+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdNoLeader" , jobType ).firing ().toTests ()... )
28+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdHighFsyncDurations" , jobType ).pending ().neverFail ().toTests ()... )
29+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdHighFsyncDurations" , jobType ).firing ().toTests ()... )
30+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdHighCommitDurations" , jobType ).pending ().neverFail ().toTests ()... )
31+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdHighCommitDurations" , jobType ).firing ().toTests ()... )
32+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdInsufficientMembers" , jobType ).pending ().neverFail ().toTests ()... )
33+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdInsufficientMembers" , jobType ).firing ().toTests ()... )
34+
35+ // A rare and pretty serious failure. It should always be accompanied by other failures, but we want to see a specific test failure for this.
36+ // It likely means a kubelet is down.
37+ ret = append (ret , newAlertTest (
38+ "sig-node" , "TargetDown" , jobType ).inNamespace ("kube-system" ).
39+ firing ().alwaysFail ().toTests ()... )
40+
41+ ret = append (ret , newAlertTest ("bz-etcd" , "etcdHighNumberOfLeaderChanges" , jobType ).pending ().neverFail ().toTests ()... )
3542
3643 // This test gets a little special treatment: if we're moving through etcd updates, we expect leader changes, so if this scenario is detected
3744 // this test is given fixed leeway for the alert to fire; otherwise it too falls back to historical data.
38- ret = append (ret , newAlert ( " etcd" , "etcdHighNumberOfLeaderChanges" , jobType ).withAllowance (etcdAllowance ).firing ().toTests ()... )
45+ ret = append (ret , newAlertTest ( "bz- etcd" , "etcdHighNumberOfLeaderChanges" , jobType ).withAllowance (etcdAllowance ).firing ().toTests ()... )
3946
40- ret = append (ret , newAlert ( " kube-apiserver" , "KubeAPIErrorBudgetBurn" , jobType ).pending ().neverFail ().toTests ()... )
41- ret = append (ret , newAlert ( " kube-apiserver" , "KubeAPIErrorBudgetBurn" , jobType ).firing ().toTests ()... )
42- ret = append (ret , newAlert ( " kube-apiserver" , "KubeClientErrors" , jobType ).pending ().neverFail ().toTests ()... )
43- ret = append (ret , newAlert ( " kube-apiserver" , "KubeClientErrors" , jobType ).firing ().toTests ()... )
47+ ret = append (ret , newAlertTest ( "bz- kube-apiserver" , "KubeAPIErrorBudgetBurn" , jobType ).pending ().neverFail ().toTests ()... )
48+ ret = append (ret , newAlertTest ( "bz- kube-apiserver" , "KubeAPIErrorBudgetBurn" , jobType ).firing ().toTests ()... )
49+ ret = append (ret , newAlertTest ( "bz- kube-apiserver" , "KubeClientErrors" , jobType ).pending ().neverFail ().toTests ()... )
50+ ret = append (ret , newAlertTest ( "bz- kube-apiserver" , "KubeClientErrors" , jobType ).firing ().toTests ()... )
4451
45- ret = append (ret , newAlert ( " storage" , "KubePersistentVolumeErrors" , jobType ).pending ().neverFail ().toTests ()... )
46- ret = append (ret , newAlert ( " storage" , "KubePersistentVolumeErrors" , jobType ).firing ().toTests ()... )
52+ ret = append (ret , newAlertTest ( "bz- storage" , "KubePersistentVolumeErrors" , jobType ).pending ().neverFail ().toTests ()... )
53+ ret = append (ret , newAlertTest ( "bz- storage" , "KubePersistentVolumeErrors" , jobType ).firing ().toTests ()... )
4754
48- ret = append (ret , newAlert ( " machine config operator" , "MCDDrainError" , jobType ).pending ().neverFail ().toTests ()... )
49- ret = append (ret , newAlert ( " machine config operator" , "MCDDrainError" , jobType ).firing ().toTests ()... )
55+ ret = append (ret , newAlertTest ( "bz- machine config operator" , "MCDDrainError" , jobType ).pending ().neverFail ().toTests ()... )
56+ ret = append (ret , newAlertTest ( "bz- machine config operator" , "MCDDrainError" , jobType ).firing ().toTests ()... )
5057
51- ret = append (ret , newAlert ( " single-node" , "KubeMemoryOvercommit" , jobType ).pending ().neverFail ().toTests ()... )
58+ ret = append (ret , newAlertTest ( "bz- single-node" , "KubeMemoryOvercommit" , jobType ).pending ().neverFail ().toTests ()... )
5259 // this appears to have no direct impact on the cluster in CI. It's important in general, but for CI we're willing to run pretty hot.
53- ret = append (ret , newAlert ( " single-node" , "KubeMemoryOvercommit" , jobType ).firing ().neverFail ().toTests ()... )
54- ret = append (ret , newAlert ( " machine config operator" , "MCDPivotError" , jobType ).pending ().neverFail ().toTests ()... )
55- ret = append (ret , newAlert ( " machine config operator" , "MCDPivotError" , jobType ).firing ().toTests ()... )
60+ ret = append (ret , newAlertTest ( "bz- single-node" , "KubeMemoryOvercommit" , jobType ).firing ().neverFail ().toTests ()... )
61+ ret = append (ret , newAlertTest ( "bz- machine config operator" , "MCDPivotError" , jobType ).pending ().neverFail ().toTests ()... )
62+ ret = append (ret , newAlertTest ( "bz- machine config operator" , "MCDPivotError" , jobType ).firing ().toTests ()... )
5663
57- ret = append (ret , newAlert ( " monitoring" , "PrometheusOperatorWatchErrors" , jobType ).pending ().neverFail ().toTests ()... )
58- ret = append (ret , newAlert ( " monitoring" , "PrometheusOperatorWatchErrors" , jobType ).firing ().toTests ()... )
64+ ret = append (ret , newAlertTest ( "bz- monitoring" , "PrometheusOperatorWatchErrors" , jobType ).pending ().neverFail ().toTests ()... )
65+ ret = append (ret , newAlertTest ( "bz- monitoring" , "PrometheusOperatorWatchErrors" , jobType ).firing ().toTests ()... )
5966
60- ret = append (ret , newAlert ( " networking" , "OVNKubernetesResourceRetryFailure" , jobType ).pending ().neverFail ().toTests ()... )
61- ret = append (ret , newAlert ( " networking" , "OVNKubernetesResourceRetryFailure" , jobType ).firing ().toTests ()... )
67+ ret = append (ret , newAlertTest ( "bz- networking" , "OVNKubernetesResourceRetryFailure" , jobType ).pending ().neverFail ().toTests ()... )
68+ ret = append (ret , newAlertTest ( "bz- networking" , "OVNKubernetesResourceRetryFailure" , jobType ).firing ().toTests ()... )
6269
63- ret = append (ret , newAlert ( " OLM" , "RedhatOperatorsCatalogError" , jobType ).pending ().neverFail ().toTests ()... )
64- ret = append (ret , newAlert ( " OLM" , "RedhatOperatorsCatalogError" , jobType ).firing ().toTests ()... )
70+ ret = append (ret , newAlertTest ( "bz- OLM" , "RedhatOperatorsCatalogError" , jobType ).pending ().neverFail ().toTests ()... )
71+ ret = append (ret , newAlertTest ( "bz- OLM" , "RedhatOperatorsCatalogError" , jobType ).firing ().toTests ()... )
6572
66- ret = append (ret , newAlert ( " storage" , "VSphereOpenshiftNodeHealthFail" , jobType ).pending ().neverFail ().toTests ()... )
67- ret = append (ret , newAlert ( " storage" , "VSphereOpenshiftNodeHealthFail" , jobType ).firing ().neverFail ().toTests ()... ) // https://bugzilla.redhat.com/show_bug.cgi?id=2055729
73+ ret = append (ret , newAlertTest ( "bz- storage" , "VSphereOpenshiftNodeHealthFail" , jobType ).pending ().neverFail ().toTests ()... )
74+ ret = append (ret , newAlertTest ( "bz- storage" , "VSphereOpenshiftNodeHealthFail" , jobType ).firing ().neverFail ().toTests ()... ) // https://bugzilla.redhat.com/show_bug.cgi?id=2055729
6875
69- ret = append (ret , newAlert ( " samples" , "SamplesImagestreamImportFailing" , jobType ).pending ().neverFail ().toTests ()... )
70- ret = append (ret , newAlert ( " samples" , "SamplesImagestreamImportFailing" , jobType ).firing ().toTests ()... )
76+ ret = append (ret , newAlertTest ( "bz- samples" , "SamplesImagestreamImportFailing" , jobType ).pending ().neverFail ().toTests ()... )
77+ ret = append (ret , newAlertTest ( "bz- samples" , "SamplesImagestreamImportFailing" , jobType ).firing ().toTests ()... )
7178
72- ret = append (ret , newAlert ( " apiserver-auth" , "PodSecurityViolation" , jobType ).firing ().toTests ()... )
79+ ret = append (ret , newAlertTest ( "bz- apiserver-auth" , "PodSecurityViolation" , jobType ).firing ().toTests ()... )
7380
7481 return ret
7582}
0 commit comments