Skip to content

Commit 0d7b97d

Browse files
committed
Drop the "etcd should not log excessive took too long messages" test
This test has caused nothing but problems: etcd logs the "took too long" message almost constantly on some platforms, and the test has never really caught a bug that we can recall. Trying to tune the limit is leading to such absurd values that it's clearly not worth monitoring for failures. Removing with staff eng approval.
1 parent 33d26ed commit 0d7b97d

File tree

2 files changed

+0
-40
lines changed

2 files changed

+0
-40
lines changed

pkg/monitortests/etcd/legacyetcdmonitortests/etcd.go

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -80,45 +80,6 @@ func testEtcdShouldNotLogDroppedRaftMessages(events monitorapi.Intervals) []*jun
8080
return []*junitapi.JUnitTestCase{failure, success}
8181
}
8282

83-
// etcdTookTooLongMaxRatePerFourHours is the max rate of messages allowed over a 4-hour period.
84-
// This replaces the fixed limit approach with a rate-based approach.
85-
// Virtually all jobs log these messages at some point, we're just interested in the ones that do so excessively.
86-
const etcdTookTooLongMaxRatePerFourHours = 12000
87-
88-
func testEtcdDoesNotLogExcessiveTookTooLongMessages(events monitorapi.Intervals, startTime time.Time) []*junitapi.JUnitTestCase {
89-
const testName = "[sig-etcd] etcd should not log excessive took too long messages"
90-
success := &junitapi.JUnitTestCase{Name: testName}
91-
92-
counter := 0
93-
for _, event := range events {
94-
if event.Source == monitorapi.SourceEtcdLog &&
95-
strings.Contains(event.Message.HumanMessage, "took too long") {
96-
counter++
97-
}
98-
}
99-
100-
maxAllowedCount := calculateRateBasedLimit(startTime, etcdTookTooLongMaxRatePerFourHours)
101-
actualDuration := time.Since(startTime)
102-
103-
if counter <= maxAllowedCount {
104-
return []*junitapi.JUnitTestCase{success}
105-
}
106-
107-
msg := fmt.Sprintf("Etcd logged %d 'took too long' messages in %v, exceeding the rate-based limit of %d "+
108-
"(based on max rate of %d messages per 4 hours). This is a strong indicator that etcd was very unhealthy "+
109-
"throughout the run. This can cause sporadic e2e failures and disruption and typically indicates faster "+
110-
"disks are needed. These log message intervals are included in spyglass chart artifacts and can be used "+
111-
"to correlate with disruption and failed tests.",
112-
counter, actualDuration.Round(time.Minute), maxAllowedCount, etcdTookTooLongMaxRatePerFourHours)
113-
failure := &junitapi.JUnitTestCase{
114-
Name: testName,
115-
FailureOutput: &junitapi.FailureOutput{
116-
Output: msg,
117-
},
118-
}
119-
return []*junitapi.JUnitTestCase{failure}
120-
}
121-
12283
// etcdOverloadedNetworkMaxRatePerFourHours uses the same rate-based approach for overloaded network messages.
12384
// We use the same rate limit as the "took too long" messages since both indicate severe etcd health issues.
12485
//

pkg/monitortests/etcd/legacyetcdmonitortests/monitortest.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
7171
junits = append(junits, testEtcdShouldNotLogSlowFdataSyncs(finalIntervals)...)
7272
junits = append(junits, testEtcdShouldNotLogDroppedRaftMessages(finalIntervals)...)
7373
junits = append(junits, testOperatorStatusChanged(finalIntervals)...)
74-
junits = append(junits, testEtcdDoesNotLogExcessiveTookTooLongMessages(finalIntervals, w.startTime)...)
7574
junits = append(junits, testEtcdDoesNotLogExcessiveOverloadedNetworkMessages(finalIntervals, w.startTime)...)
7675

7776
return junits, nil

0 commit comments

Comments
 (0)