Skip to content

Commit 617bc41

Browse files
Merge pull request #28332 from dgoodwin/alert-pin-tests
Improvements and Additions to Alert Testing Stack
2 parents abb76df + dc3c6fc commit 617bc41

File tree

24 files changed

+383
-304
lines changed

24 files changed

+383
-304
lines changed

pkg/alerts/check.go

Lines changed: 0 additions & 152 deletions
This file was deleted.

pkg/cmd/openshift-tests/dev/dev.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ func readIntervalsFromFile(intervalsFile string) (monitorapi.Intervals, error) {
135135
return nil, err
136136
}
137137

138-
return monitorserialization.EventsFromJSON(jsonBytes)
138+
return monitorserialization.IntervalsFromJSON(jsonBytes)
139139
}
140140

141141
func newRunDisruptionInvariantsCommand() *cobra.Command {

pkg/cmd/openshift-tests/monitor/timeline/timeline_command.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func NewTimelineOptions(ioStreams genericclioptions.IOStreams) *TimelineOptions
5757

5858
IOStreams: ioStreams,
5959
KnownRenderers: map[string]RenderFunc{
60-
"json": monitorserialization.EventsToJSON,
60+
"json": monitorserialization.IntervalsToJSON,
6161
"html": renderHTML,
6262
},
6363
KnownTimelines: map[string]monitorapi.EventIntervalMatchesFunc{

pkg/disruption/backend/sampler/remote.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ func fetchEventsFromFileOnNode(ctx context.Context, clientset *kubernetes.Client
380380
if err != nil {
381381
return filteredEvents, fmt.Errorf("failed to fetch file %s on node %s: %v", filePath, nodeName, err)
382382
}
383-
allEvents, err := monitorserialization.EventsFromJSON(allBytes)
383+
allEvents, err := monitorserialization.IntervalsFromJSON(allBytes)
384384
if err != nil {
385385
return nil, fmt.Errorf("failed to convert file %s from node %s to intervals: %v", filePath, nodeName, err)
386386
}

pkg/monitor/monitorapi/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ const (
199199
type AnnotationKey string
200200

201201
const (
202+
AnnotationAlertState AnnotationKey = "alertstate"
203+
AnnotationSeverity AnnotationKey = "severity"
202204
AnnotationReason AnnotationKey = "reason"
203205
AnnotationContainerExitCode AnnotationKey = "code"
204206
AnnotationCause AnnotationKey = "cause"

pkg/monitor/serialization/serialize.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ type EventIntervalList struct {
3737
}
3838

3939
func EventsToFile(filename string, events monitorapi.Intervals) error {
40-
json, err := EventsToJSON(events)
40+
json, err := IntervalsToJSON(events)
4141
if err != nil {
4242
return err
4343
}
@@ -49,10 +49,10 @@ func EventsFromFile(filename string) (monitorapi.Intervals, error) {
4949
if err != nil {
5050
return nil, err
5151
}
52-
return EventsFromJSON(data)
52+
return IntervalsFromJSON(data)
5353
}
5454

55-
func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
55+
func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) {
5656
var list EventIntervalList
5757
if err := json.Unmarshal(data, &list); err != nil {
5858
return nil, err
@@ -68,8 +68,8 @@ func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
6868
Condition: monitorapi.Condition{
6969
Level: level,
7070
Locator: interval.Locator,
71-
Message: interval.Message,
7271
StructuredLocator: interval.StructuredLocator,
72+
Message: interval.Message,
7373
StructuredMessage: interval.StructuredMessage,
7474
},
7575

@@ -120,9 +120,9 @@ func IntervalToOneLineJSON(interval monitorapi.Interval) ([]byte, error) {
120120
return buf.Bytes(), nil
121121
}
122122

123-
func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
123+
func IntervalsToJSON(intervals monitorapi.Intervals) ([]byte, error) {
124124
outputEvents := []EventInterval{}
125-
for _, curr := range events {
125+
for _, curr := range intervals {
126126
outputEvents = append(outputEvents, monitorEventIntervalToEventInterval(curr))
127127
}
128128

@@ -131,14 +131,16 @@ func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
131131
return json.MarshalIndent(list, "", " ")
132132
}
133133

134-
func EventsIntervalsToFile(filename string, events monitorapi.Intervals) error {
135-
json, err := EventsIntervalsToJSON(events)
134+
func IntervalsToFile(filename string, intervals monitorapi.Intervals) error {
135+
json, err := EventsIntervalsToJSON(intervals)
136136
if err != nil {
137137
return err
138138
}
139139
return ioutil.WriteFile(filename, json, 0644)
140140
}
141141

142+
// TODO: this is very similar but subtly different to the function above, what is the purpose of skipping those
143+
// with from/to equal or empty to?
142144
func EventsIntervalsToJSON(events monitorapi.Intervals) ([]byte, error) {
143145
outputEvents := []EventInterval{}
144146
for _, curr := range events {
@@ -165,7 +167,6 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInte
165167
From: metav1.Time{Time: interval.From},
166168
To: metav1.Time{Time: interval.To},
167169
}
168-
169170
return ret
170171
}
171172

pkg/monitortestlibrary/allowedalerts/basic_alert.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ type AlertTest interface {
3131

3232
// AlertState is the state of the alert. They are logically ordered, so if a test says it limits on "pending", then
3333
// any state above pending (like info or warning) will cause the test to fail.
34+
// TODO this looks wrong, AlertState (pending|firing) and AlertLevel (info|warning|critical) are different things, but they seem lumped together here.
3435
type AlertState string
3536

3637
const (
@@ -111,6 +112,11 @@ func (a *alertBuilder) neverFail() *alertBuilder {
111112
return a
112113
}
113114

115+
func (a *alertBuilder) alwaysFlake() *alertBuilder {
116+
a.allowanceCalculator = alwaysFlake()
117+
return a
118+
}
119+
114120
func (a *alertBuilder) toTests() []AlertTest {
115121
if !a.divideByNamespaces {
116122
return []AlertTest{

pkg/monitortestlibrary/allowedalerts/matches.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,22 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time
5252
// getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
5353
// We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
5454
func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) {
55-
return getCurrentResults().BestMatchDuration(key)
55+
return GetHistoricalData().BestMatchDuration(key)
56+
}
57+
58+
func alwaysFlake() AlertTestAllowanceCalculator {
59+
return &alwaysFlakeAllowance{}
60+
}
61+
62+
// alwaysFlakeAllowance is for alerts we want to flake a test if they occur at all.
63+
type alwaysFlakeAllowance struct {
64+
}
65+
66+
func (d *alwaysFlakeAllowance) FailAfter(key historicaldata2.AlertDataKey) (time.Duration, error) {
67+
// make it effectively impossible for a test failure here, we only want flakes
68+
return 24 * time.Hour, nil
69+
}
70+
71+
func (d *alwaysFlakeAllowance) FlakeAfter(key historicaldata2.AlertDataKey) time.Duration {
72+
return 1 * time.Second
5673
}

pkg/monitortestlibrary/allowedalerts/matches_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ func TestGetClosestP99Value(t *testing.T) {
171171
// from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
172172
func TestAlertDataFileParsing(t *testing.T) {
173173

174-
alertMatcher := getCurrentResults()
174+
alertMatcher := GetHistoricalData()
175175

176176
// The list of known alerts that goes into this file is composed of everything we've ever
177177
// seen fire in that release. As such it can change from one release to the next as alerts

pkg/monitortestlibrary/allowedalerts/types.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ var (
2222
historicalData *historicaldata.AlertBestMatcher
2323
)
2424

25-
func getCurrentResults() *historicaldata.AlertBestMatcher {
25+
func GetHistoricalData() *historicaldata.AlertBestMatcher {
2626
readResults.Do(
2727
func() {
2828
var err error
@@ -34,3 +34,14 @@ func getCurrentResults() *historicaldata.AlertBestMatcher {
3434

3535
return historicalData
3636
}
37+
38+
// AllowedAlertNames is a list of alerts we do not test against.
39+
var AllowedAlertNames = []string{
40+
"Watchdog",
41+
"AlertmanagerReceiversNotConfigured",
42+
"PrometheusRemoteWriteDesiredShards",
43+
"KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the prometheus tests.
44+
45+
// indicates a problem in the external Telemeter service, presently very common, does not impact our ability to e2e test:
46+
"TelemeterClientFailures",
47+
}

0 commit comments

Comments
 (0)