Commit d90ca10

Support pinning alerts that we want to fail on any occurrence
New initiative to allow pinning alerts that are simply not allowed to fire, regardless of historical data.
1 parent fc55c26 commit d90ca10

6 files changed: 40 additions, 15 deletions
pkg/alerts/check.go

Lines changed: 6 additions & 12 deletions
@@ -6,7 +6,7 @@ import (
 	"strings"
 	"time"
 
-	allowedalerts2 "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
+	allowedalerts "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
 	"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
 
 	o "github.com/onsi/gomega"
@@ -18,20 +18,14 @@ import (
 	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/client-go/rest"
 	"k8s.io/kubernetes/test/e2e/framework"
 )
 
 type AllowedAlertsFunc func(featureSet configv1.FeatureSet) (allowedFiringWithBugs, allowedFiring, allowedPendingWithBugs, allowedPending helper.MetricConditions)
 
 // CheckAlerts will query prometheus and ensure no-unexpected alerts were pending or firing.
 // Used by both upgrade and conformance suites, with different allowances for each.
-func CheckAlerts(allowancesFunc AllowedAlertsFunc,
-	restConfig *rest.Config,
-	prometheusClient prometheusv1.API, // TODO: remove
-	configClient configclient.Interface, // TODO: remove
-	testDuration time.Duration,
-	f *framework.Framework) {
+func CheckAlerts(allowancesFunc AllowedAlertsFunc, prometheusClient prometheusv1.API, configClient configclient.Interface, testDuration time.Duration, f *framework.Framework) {
 
 	featureSet := configv1.Default
 	featureGate, err := configClient.ConfigV1().FeatureGates().Get(context.TODO(), "cluster", metav1.GetOptions{})
@@ -46,11 +40,11 @@ func CheckAlerts(allowancesFunc AllowedAlertsFunc,
 	// In addition to the alert allowances passed in (which can differ for upgrades vs conformance),
 	// we also exclude alerts that have their own separate tests codified. This is a backstop test for
 	// everything else.
-	for _, alertTest := range allowedalerts2.AllAlertTests(&platformidentification.JobType{},
-		allowedalerts2.DefaultAllowances) {
+	for _, alertTest := range allowedalerts.AllAlertTests(&platformidentification.JobType{},
+		allowedalerts.DefaultAllowances) {
 
 		switch alertTest.AlertState() {
-		case allowedalerts2.AlertPending:
+		case allowedalerts.AlertPending:
 			// a pending test covers pending and everything above (firing)
 			allowedPendingAlerts = append(allowedPendingAlerts,
 				helper.MetricCondition{
@@ -64,7 +58,7 @@ func CheckAlerts(allowancesFunc AllowedAlertsFunc,
 					Text: "has a separate e2e test",
 				},
 			)
-		case allowedalerts2.AlertInfo:
+		case allowedalerts.AlertInfo:
 			// an info test covers all firing
 			allowedFiringAlerts = append(allowedFiringAlerts,
 				helper.MetricCondition{
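With the unused rest.Config parameter dropped, callers now pass only the Prometheus client, the config client, the test duration, and an optional framework. A minimal sketch of the new call shape, mirroring the conformance call site updated further down (oc is assumed to be the test CLI helper already used by these suites):

    // Sketch only: oc is assumed to be the *exutil.CLI used elsewhere in these tests.
    testDuration := exutil.DurationSinceStartInSeconds()
    alerts.CheckAlerts(
        alerts.AllowedAlertsDuringConformance,  // allowances differ between conformance and upgrade
        oc.NewPrometheusClient(context.TODO()), // prometheusv1.API
        oc.AdminConfigClient(),                 // configclient.Interface
        testDuration,                           // only samples within this window are considered
        nil,                                    // *framework.Framework may be nil for this path
    )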

pkg/monitortestlibrary/allowedalerts/all.go

Lines changed: 9 additions & 0 deletions
@@ -15,6 +15,15 @@ func AllAlertTests(jobType *platformidentification.JobType, etcdAllowance AlertT
 	ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).pending().neverFail().toTests()...)
 	ret = append(ret, newNamespacedAlert("KubePodNotReady", jobType).firing().toTests()...)
 
+	// ClusterOperatorDown and ClusterOperatorDegraded should not occur during a normal upgrade.
+	// In CI firing is rare, but does happen a few times a month. Pending however occurs all the time, which implies operators are routinely in
+	// a state they're not supposed to be during upgrade.
+	// https://github.com/openshift/enhancements/blob/cb81452fddf86c1099acd87610b88369cd6192db/dev-guide/cluster-version-operator/dev/clusteroperator.md#there-are-a-set-of-guarantees-components-are-expected-to-honor-in-return
+	ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).pending().alwaysFail().toTests()...)
+	ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).pending().alwaysFail().toTests()...)
+	ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDown", jobType).firing().alwaysFail().toTests()...)
+	ret = append(ret, newAlert("Cluster Version Operator", "ClusterOperatorDegraded", jobType).firing().alwaysFail().toTests()...)
+
 	ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).pending().neverFail().toTests()...)
 	ret = append(ret, newAlert("etcd", "etcdMembersDown", jobType).firing().toTests()...)
 	ret = append(ret, newAlert("etcd", "etcdGRPCRequestsSlow", jobType).pending().neverFail().toTests()...)

pkg/monitortestlibrary/allowedalerts/basic_alert.go

Lines changed: 5 additions & 0 deletions
@@ -111,6 +111,11 @@ func (a *alertBuilder) neverFail() *alertBuilder {
 	return a
 }
 
+func (a *alertBuilder) alwaysFail() *alertBuilder {
+	a.allowanceCalculator = alwaysFail()
+	return a
+}
+
 func (a *alertBuilder) toTests() []AlertTest {
 	if !a.divideByNamespaces {
 		return []AlertTest{
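The new builder method swaps the allowanceCalculator from the historical-percentile implementation to the fixed one returned by alwaysFail(). Judging from the two implementations visible in matches.go, the calculator interface looks roughly like the sketch below; the exact definition lives elsewhere in the allowedalerts package and is inferred here, not quoted:

    // Inferred shape of AlertTestAllowanceCalculator, based on the method sets of
    // percentileAllowances and alwaysFailAllowance in matches.go.
    type AlertTestAllowanceCalculator interface {
        // FailAfter returns how long the alert may be active before the test fails outright.
        FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error)
        // FlakeAfter returns how long the alert may be active before the test is only flaked.
        FlakeAfter(key historicaldata2.AlertDataKey) time.Duration
    }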

pkg/monitortestlibrary/allowedalerts/matches.go

Lines changed: 18 additions & 0 deletions
@@ -54,3 +54,21 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time
 func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) {
 	return getCurrentResults().BestMatchDuration(key)
 }
+
+func alwaysFail() AlertTestAllowanceCalculator {
+	return &alwaysFailAllowance{}
+}
+
+// alwaysFailAllowance is for alerts we want to fail a test if they occur at all.
+type alwaysFailAllowance struct {
+}
+
+func (d *alwaysFailAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) {
+	// TODO: right now we're just flaking until we're certain this doesn't happen too often. Once we're sure,
+	// change to 1 second.
+	return 24 * time.Hour, nil
+}
+
+func (d *alwaysFailAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
+	return 1 * time.Second
+}
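Because FailAfter is pinned at 24 hours while FlakeAfter is 1 second, any occurrence of a pinned alert during a CI run is surfaced as a flake rather than a hard failure until the TODO above is resolved. A rough sketch of how a harness could apply the two thresholds (assumed consumer logic, not the actual test code in this repository):

    // Assumed logic: compare how long the alert was pending/firing against the allowance.
    func evaluate(calc AlertTestAllowanceCalculator, key historicaldata2.AlertDataKey, activeFor time.Duration) (string, error) {
        failAfter, err := calc.FailAfter(key)
        if err != nil {
            return "", err
        }
        switch {
        case activeFor >= failAfter:
            return "fail", nil
        case activeFor >= calc.FlakeAfter(key):
            return "flake", nil
        default:
            return "pass", nil
        }
    }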

test/e2e/upgrade/alert/alert.go

Lines changed: 1 addition & 2 deletions
@@ -48,8 +48,7 @@ func (t *UpgradeTest) Test(ctx context.Context, f *framework.Framework, done <-c
 
 	testDuration := time.Now().Sub(startTime).Round(time.Second)
 
-	alerts.CheckAlerts(alerts.AllowedAlertsDuringUpgrade, t.oc.AdminConfig(),
-		t.oc.NewPrometheusClient(context.TODO()), t.oc.AdminConfigClient(), testDuration, f)
+	alerts.CheckAlerts(alerts.AllowedAlertsDuringUpgrade, t.oc.NewPrometheusClient(context.TODO()), t.oc.AdminConfigClient(), testDuration, f)
 }
 
 // Teardown cleans up any remaining resources.

test/extended/prometheus/prometheus.go

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ var _ = g.Describe("[sig-instrumentation][Late] Alerts", func() {
 	g.It("shouldn't report any unexpected alerts in firing or pending state", func() {
 		// we only consider samples since the beginning of the test
 		testDuration := exutil.DurationSinceStartInSeconds()
-		alerts.CheckAlerts(alerts.AllowedAlertsDuringConformance, oc.AdminConfig(), oc.NewPrometheusClient(context.TODO()), oc.AdminConfigClient(), testDuration, nil)
+		alerts.CheckAlerts(alerts.AllowedAlertsDuringConformance, oc.NewPrometheusClient(context.TODO()), oc.AdminConfigClient(), testDuration, nil)
 	})
 
 	g.It("shouldn't exceed the series limit of total series sent via telemetry from each cluster", func() {
