Drop the "etcd should not log excessive took too long messages" test

dgoodwin · dgoodwin · commit 0d7b97d6f830 · 2025-08-20T14:43:16.000-03:00
This test has caused nothing but problems: the "took too long" message is logged almost constantly on some platforms, and the test has never really caught a bug that we can recall. Trying to tune it is leading to such absurd limits that it's clearly not worth monitoring for failures. Removing with staff eng approval.
diff --git a/pkg/monitortests/etcd/legacyetcdmonitortests/etcd.go b/pkg/monitortests/etcd/legacyetcdmonitortests/etcd.go
@@ -80,45 +80,6 @@ func testEtcdShouldNotLogDroppedRaftMessages(events monitorapi.Intervals) []*jun
 	return []*junitapi.JUnitTestCase{failure, success}
 }
 
-// etcdTookTooLongMaxRatePerFourHours is the max rate of messages allowed over a 4-hour period.
-// This replaces the fixed limit approach with a rate-based approach.
-// Virtually all jobs log these messages at some point, we're just interested in the ones that do so excessively.
-const etcdTookTooLongMaxRatePerFourHours = 12000
-
-func testEtcdDoesNotLogExcessiveTookTooLongMessages(events monitorapi.Intervals, startTime time.Time) []*junitapi.JUnitTestCase {
-	const testName = "[sig-etcd] etcd should not log excessive took too long messages"
-	success := &junitapi.JUnitTestCase{Name: testName}
-
-	counter := 0
-	for _, event := range events {
-		if event.Source == monitorapi.SourceEtcdLog &&
-			strings.Contains(event.Message.HumanMessage, "took too long") {
-			counter++
-		}
-	}
-
-	maxAllowedCount := calculateRateBasedLimit(startTime, etcdTookTooLongMaxRatePerFourHours)
-	actualDuration := time.Since(startTime)
-
-	if counter <= maxAllowedCount {
-		return []*junitapi.JUnitTestCase{success}
-	}
-
-	msg := fmt.Sprintf("Etcd logged %d 'took too long' messages in %v, exceeding the rate-based limit of %d "+
-		"(based on max rate of %d messages per 4 hours). This is a strong indicator that etcd was very unhealthy "+
-		"throughout the run. This can cause sporadic e2e failures and disruption and typically indicates faster "+
-		"disks are needed. These log message intervals are included in spyglass chart artifacts and can be used "+
-		"to correlate with disruption and failed tests.",
-		counter, actualDuration.Round(time.Minute), maxAllowedCount, etcdTookTooLongMaxRatePerFourHours)
-	failure := &junitapi.JUnitTestCase{
-		Name: testName,
-		FailureOutput: &junitapi.FailureOutput{
-			Output: msg,
-		},
-	}
-	return []*junitapi.JUnitTestCase{failure}
-}
-
 // etcdOverloadedNetworkMaxRatePerFourHours uses the same rate-based approach for overloaded network messages.
 // We use the same rate limit as the "took too long" messages since both indicate severe etcd health issues.
 //
diff --git a/pkg/monitortests/etcd/legacyetcdmonitortests/monitortest.go b/pkg/monitortests/etcd/legacyetcdmonitortests/monitortest.go
@@ -71,7 +71,6 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
 	junits = append(junits, testEtcdShouldNotLogSlowFdataSyncs(finalIntervals)...)
 	junits = append(junits, testEtcdShouldNotLogDroppedRaftMessages(finalIntervals)...)
 	junits = append(junits, testOperatorStatusChanged(finalIntervals)...)
-	junits = append(junits, testEtcdDoesNotLogExcessiveTookTooLongMessages(finalIntervals, w.startTime)...)
 	junits = append(junits, testEtcdDoesNotLogExcessiveOverloadedNetworkMessages(finalIntervals, w.startTime)...)
 
 	return junits, nil