Commit d334db8

Add new test that will fail on new alerts firing
Attempts to be forgiving: scan the entire data file, regardless of NURP, to see whether we've seen this alert name before anywhere. If we haven't, or we have but it was first observed less than two weeks ago, fail the test. This approach should help us survive the cutover to new releases.
1 parent 70e7a31 commit d334db8
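
The check described above could look roughly like the sketch below. This is a hypothetical illustration, not the test added by this commit (that file is not part of this excerpt): the function name checkForRecentlyAddedAlert, the import paths, the two-week cutoff wiring, and the direct iteration over HistoricalData are all assumptions. It does lean on pieces that are in this diff: allowedalerts.AllowedAlertNames, the newly exported allowedalerts.GetHistoricalData(), and the new FirstObserved field on AlertStatisticalData.

package newalerts // hypothetical package, for illustration only

import (
    "fmt"
    "time"

    // import path inferred from the file layout in this commit
    "github.com/openshift/origin/pkg/monitortestlibrary/allowedalerts"
)

// checkForRecentlyAddedAlert returns an error (i.e. the test would fail) when an
// alert that fired during the run has never been observed in the historical data
// file, or was first observed less than two weeks ago.
func checkForRecentlyAddedAlert(firedAlertName string) error {
    // Some alerts are explicitly exempt from testing.
    for _, allowed := range allowedalerts.AllowedAlertNames {
        if firedAlertName == allowed {
            return nil
        }
    }

    cutoff := time.Now().Add(-14 * 24 * time.Hour)
    matcher := allowedalerts.GetHistoricalData()

    // Scan every entry in the data file, regardless of NURP, looking for any
    // prior observation of this alert name anywhere.
    for _, datum := range matcher.HistoricalData {
        if datum.Name != firedAlertName {
            continue
        }
        if datum.FirstObserved.Before(cutoff) {
            // Seen before, and long enough ago to have an established baseline.
            return nil
        }
    }
    return fmt.Errorf("alert %s fired but has no historical record older than two weeks", firedAlertName)
}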

File tree

20 files changed: +324 lines, -80 lines

pkg/cmd/openshift-tests/dev/dev.go

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ func readIntervalsFromFile(intervalsFile string) (monitorapi.Intervals, error) {
         return nil, err
     }
 
-    return monitorserialization.EventsFromJSON(jsonBytes)
+    return monitorserialization.IntervalsFromJSON(jsonBytes)
 }
 
 func newRunDisruptionInvariantsCommand() *cobra.Command {

pkg/cmd/openshift-tests/monitor/timeline/timeline_command.go

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ func NewTimelineOptions(ioStreams genericclioptions.IOStreams) *TimelineOptions
 
         IOStreams: ioStreams,
         KnownRenderers: map[string]RenderFunc{
-            "json": monitorserialization.EventsToJSON,
+            "json": monitorserialization.IntervalsToJSON,
             "html": renderHTML,
         },
         KnownTimelines: map[string]monitorapi.EventIntervalMatchesFunc{
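
For context, KnownRenderers maps an output format name to a RenderFunc; since monitorserialization.IntervalsToJSON can be stored in it, RenderFunc presumably has the signature func(monitorapi.Intervals) ([]byte, error). A minimal sketch of how such a renderer might be looked up and invoked (the render helper is illustrative, not code from this commit):

// Illustrative only: pick a renderer by format name and apply it to the intervals.
func render(o *TimelineOptions, format string, intervals monitorapi.Intervals) ([]byte, error) {
    renderFn, ok := o.KnownRenderers[format]
    if !ok {
        return nil, fmt.Errorf("unknown render format %q", format)
    }
    return renderFn(intervals)
}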

pkg/disruption/backend/sampler/remote.go

Lines changed: 1 addition & 1 deletion
@@ -380,7 +380,7 @@ func fetchEventsFromFileOnNode(ctx context.Context, clientset *kubernetes.Client
     if err != nil {
         return filteredEvents, fmt.Errorf("failed to fetch file %s on node %s: %v", filePath, nodeName, err)
     }
-    allEvents, err := monitorserialization.EventsFromJSON(allBytes)
+    allEvents, err := monitorserialization.IntervalsFromJSON(allBytes)
     if err != nil {
         return nil, fmt.Errorf("failed to convert file %s from node %s to intervals: %v", filePath, nodeName, err)
     }

pkg/monitor/monitorapi/types.go

Lines changed: 2 additions & 0 deletions
@@ -199,6 +199,8 @@ const (
 type AnnotationKey string
 
 const (
+    AnnotationAlertState        AnnotationKey = "alertstate"
+    AnnotationSeverity          AnnotationKey = "severity"
     AnnotationReason            AnnotationKey = "reason"
     AnnotationContainerExitCode AnnotationKey = "code"
     AnnotationCause             AnnotationKey = "cause"
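
The two new annotation keys let alert intervals carry the Prometheus alertstate and severity labels. A small hedged example of how a consumer might read them, assuming annotations are exposed as a map[monitorapi.AnnotationKey]string (the annotation map itself is not shown in this diff):

// Illustrative helper: true when the annotations describe a firing, critical alert.
// Only the two AnnotationKey constants come from this commit; the map type is assumed.
func isFiringCritical(annotations map[monitorapi.AnnotationKey]string) bool {
    return annotations[monitorapi.AnnotationAlertState] == "firing" &&
        annotations[monitorapi.AnnotationSeverity] == "critical"
}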

pkg/monitor/serialization/serialize.go

Lines changed: 17 additions & 12 deletions
@@ -21,7 +21,8 @@ type EventInterval struct {
 
     // TODO: Remove the omitempty, just here to keep from having to repeatedly updated the json
     // files used in some new tests
-    Source string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive
+    Source  string `json:"tempSource,omitempty"` // also temporary, unsure if this concept will survive
+    Display bool   `json:"display"`
 
     // TODO: we're hoping to move these to just locator/message when everything is ready.
     StructuredLocator monitorapi.Locator `json:"tempStructuredLocator"`
@@ -37,7 +38,7 @@ type EventIntervalList struct {
 }
 
 func EventsToFile(filename string, events monitorapi.Intervals) error {
-    json, err := EventsToJSON(events)
+    json, err := IntervalsToJSON(events)
     if err != nil {
         return err
     }
@@ -49,10 +50,10 @@ func EventsFromFile(filename string) (monitorapi.Intervals, error) {
     if err != nil {
         return nil, err
     }
-    return EventsFromJSON(data)
+    return IntervalsFromJSON(data)
 }
 
-func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
+func IntervalsFromJSON(data []byte) (monitorapi.Intervals, error) {
     var list EventIntervalList
     if err := json.Unmarshal(data, &list); err != nil {
         return nil, err
@@ -64,12 +65,13 @@ func EventsFromJSON(data []byte) (monitorapi.Intervals, error) {
             return nil, err
         }
         events = append(events, monitorapi.Interval{
-            Source: monitorapi.IntervalSource(interval.Source),
+            Source:  monitorapi.IntervalSource(interval.Source),
+            Display: interval.Display,
             Condition: monitorapi.Condition{
                 Level:             level,
                 Locator:           interval.Locator,
-                Message:           interval.Message,
                 StructuredLocator: interval.StructuredLocator,
+                Message:           interval.Message,
                 StructuredMessage: interval.StructuredMessage,
             },
 
@@ -91,7 +93,8 @@ func IntervalFromJSON(data []byte) (*monitorapi.Interval, error) {
         return nil, err
     }
     return &monitorapi.Interval{
-        Source: monitorapi.IntervalSource(serializedInterval.Source),
+        Source:  monitorapi.IntervalSource(serializedInterval.Source),
+        Display: serializedInterval.Display,
         Condition: monitorapi.Condition{
             Level:   level,
             Locator: serializedInterval.Locator,
@@ -120,9 +123,9 @@ func IntervalToOneLineJSON(interval monitorapi.Interval) ([]byte, error) {
     return buf.Bytes(), nil
 }
 
-func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
+func IntervalsToJSON(intervals monitorapi.Intervals) ([]byte, error) {
     outputEvents := []EventInterval{}
-    for _, curr := range events {
+    for _, curr := range intervals {
         outputEvents = append(outputEvents, monitorEventIntervalToEventInterval(curr))
     }
 
@@ -131,14 +134,16 @@ func EventsToJSON(events monitorapi.Intervals) ([]byte, error) {
     return json.MarshalIndent(list, "", " ")
 }
 
-func EventsIntervalsToFile(filename string, events monitorapi.Intervals) error {
-    json, err := EventsIntervalsToJSON(events)
+func IntervalsToFile(filename string, intervals monitorapi.Intervals) error {
+    json, err := EventsIntervalsToJSON(intervals)
     if err != nil {
         return err
     }
     return ioutil.WriteFile(filename, json, 0644)
 }
 
+// TODO: this is very similar but subtly different to the function above, what is the purpose of skipping those
+// with from/to equal or empty to?
 func EventsIntervalsToJSON(events monitorapi.Intervals) ([]byte, error) {
     outputEvents := []EventInterval{}
     for _, curr := range events {
@@ -161,11 +166,11 @@ func monitorEventIntervalToEventInterval(interval monitorapi.Interval) EventInte
         StructuredLocator: interval.StructuredLocator,
         StructuredMessage: interval.StructuredMessage,
         Source:            string(interval.Source),
+        Display:           interval.Display,
 
         From: metav1.Time{Time: interval.From},
         To:   metav1.Time{Time: interval.To},
     }
-
     return ret
 }
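
With the renames in place, callers round-trip through IntervalsToJSON and IntervalsFromJSON, and the new Display field survives the trip because EventInterval copies it in both directions. A minimal sketch, assuming monitorapi.Intervals is the usual slice of monitorapi.Interval:

// Sketch: serialize intervals to JSON and read them back with the renamed helpers.
func roundTripIntervals(intervals monitorapi.Intervals) (monitorapi.Intervals, error) {
    data, err := monitorserialization.IntervalsToJSON(intervals)
    if err != nil {
        return nil, err
    }
    // Display is preserved on the way back in.
    return monitorserialization.IntervalsFromJSON(data)
}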

pkg/monitortestlibrary/allowedalerts/basic_alert.go

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ type AlertTest interface {
 
 // AlertState is the state of the alert. They are logically ordered, so if a test says it limits on "pending", then
 // any state above pending (like info or warning) will cause the test to fail.
+// TODO this looks wrong, AlertState (pending|firing) and AlertLevel (info|warning|critical) are different things, but they seem lumped together here.
 type AlertState string
 
 const (

pkg/monitortestlibrary/allowedalerts/matches.go

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ func (d *percentileAllowances) FlakeAfter(key historicaldata2.AlertDataKey) time
 // getClosestPercentilesValues uses the backend and information about the cluster to choose the best historical p99 to operate against.
 // We enforce "don't get worse" for disruption by watching the aggregate data in CI over many runs.
 func getClosestPercentilesValues(key historicaldata2.AlertDataKey) (historicaldata2.StatisticalDuration, string, error) {
-    return getCurrentResults().BestMatchDuration(key)
+    return GetHistoricalData().BestMatchDuration(key)
 }
 
 func alwaysFlake() AlertTestAllowanceCalculator {

pkg/monitortestlibrary/allowedalerts/matches_test.go

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ func TestGetClosestP99Value(t *testing.T) {
 // from bigquery and commit into origin. Test ensures we can parse it and the data looks sane.
 func TestAlertDataFileParsing(t *testing.T) {
 
-    alertMatcher := getCurrentResults()
+    alertMatcher := GetHistoricalData()
 
     // The list of known alerts that goes into this file is composed of everything we've ever
     // seen fire in that release. As such it can change from one release to the next as alerts

pkg/monitortestlibrary/allowedalerts/types.go

Lines changed: 12 additions & 1 deletion
@@ -22,7 +22,7 @@ var (
     historicalData *historicaldata.AlertBestMatcher
 )
 
-func getCurrentResults() *historicaldata.AlertBestMatcher {
+func GetHistoricalData() *historicaldata.AlertBestMatcher {
     readResults.Do(
         func() {
             var err error
@@ -34,3 +34,14 @@ func getCurrentResults() *historicaldata.AlertBestMatcher {
 
     return historicalData
 }
+
+// AllowedAlertNames is a list of alerts we do not test against.
+var AllowedAlertNames = []string{
+    "Watchdog",
+    "AlertmanagerReceiversNotConfigured",
+    "PrometheusRemoteWriteDesiredShards",
+    "KubeJobFailed", // this is a result of bug https://bugzilla.redhat.com/show_bug.cgi?id=2054426 . We should catch these in the prometheus tests.
+
+    // indicates a problem in the external Telemeter service, presently very common, does not impact our ability to e2e test:
+    "TelemeterClientFailures",
+}
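
Exporting GetHistoricalData and adding AllowedAlertNames lets code outside this package, such as the new test, consult the same historical data and skip list. A small illustrative helper (not part of the commit) for checking the exemption list from another package:

// isAllowedAlert reports whether an alert name is on the exemption list.
// Illustrative only; assumes the caller imports the allowedalerts package.
func isAllowedAlert(alertName string) bool {
    for _, name := range allowedalerts.AllowedAlertNames {
        if name == alertName {
            return true
        }
    }
    return false
}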

pkg/monitortestlibrary/historicaldata/alert_types.go

Lines changed: 17 additions & 9 deletions
@@ -12,11 +12,15 @@ import (
 )
 
 type AlertStatisticalData struct {
-    AlertDataKey `json:",inline"`
-    Name         string
-    P95          float64
-    P99          float64
-    JobRuns      int64
+    AlertDataKey  `json:",inline"`
+    Name          string
+    P50           float64
+    P75           float64
+    P95           float64
+    P99           float64
+    FirstObserved time.Time
+    LastObserved  time.Time
+    JobRuns       int64
 }
 
 type AlertDataKey struct {
@@ -84,7 +88,7 @@ func (b *AlertBestMatcher) bestMatch(key AlertDataKey) (AlertStatisticalData, st
     Debugf("searching for best match for %+v", key.JobType)
 
     if percentiles, ok := b.HistoricalData[exactMatchKey]; ok {
-        if percentiles.JobRuns > minJobRuns {
+        if percentiles.JobRuns >= minJobRuns {
             logrus.Infof("found exact match: %+v", percentiles)
             return percentiles, "", nil
         }
@@ -159,8 +163,12 @@ func (b *AlertBestMatcher) BestMatchP99(key AlertDataKey) (*time.Duration, strin
 
 func toAlertStatisticalDuration(in AlertStatisticalData) StatisticalDuration {
     return StatisticalDuration{
-        JobType: in.AlertDataKey.JobType,
-        P95:     DurationOrDie(in.P95),
-        P99:     DurationOrDie(in.P99),
+        JobType:       in.AlertDataKey.JobType,
+        P50:           DurationOrDie(in.P50),
+        P75:           DurationOrDie(in.P75),
+        P95:           DurationOrDie(in.P95),
+        P99:           DurationOrDie(in.P99),
+        FirstObserved: in.FirstObserved,
+        LastObserved:  in.LastObserved,
     }
 }
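
The expanded StatisticalDuration (P50/P75 plus FirstObserved/LastObserved) is what BestMatchDuration hands back to callers like getClosestPercentilesValues. A hedged sketch of a consumer deriving thresholds from it; the P95-for-flake / P99-for-fail split mirrors the general pattern in this package rather than anything this diff specifies, and DurationOrDie is assumed to produce a time.Duration:

// Illustrative only: derive flake and fail thresholds for one alert/job combination,
// assuming imports of time plus the allowedalerts and historicaldata packages from this repo.
func thresholdsFor(key historicaldata.AlertDataKey) (flakeAfter, failAfter time.Duration, err error) {
    durations, _, err := allowedalerts.GetHistoricalData().BestMatchDuration(key)
    if err != nil {
        return 0, 0, err
    }
    return durations.P95, durations.P99, nil
}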
