Skip to content

Commit fa60863

Browse files
Merge pull request #28386 from xueqzhan/revert-28381-revert-28233-1699276048783
Revert "TRT-1339: Revert #28233 "ignore repeated TopologyAwareHintsDisabled events""
2 parents 9aa824d + 89829e1 commit fa60863

File tree

4 files changed

+251
-12
lines changed

4 files changed

+251
-12
lines changed

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,6 @@ var KnownEventsBugs = []KnownProblem{
223223
Regexp: regexp.MustCompile("ns/openshift-etcd-operator namespace/openshift-etcd-operator -.*rpc error: code = Canceled desc = grpc: the client connection is closing.*"),
224224
BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2006975",
225225
},
226-
{
227-
Regexp: regexp.MustCompile("reason/TopologyAwareHintsDisabled"),
228-
BZ: "https://issues.redhat.com/browse/OCPBUGS-13366",
229-
},
230226
{
231227
Regexp: regexp.MustCompile("ns/.*reason/.*APICheckFailed.*503.*"),
232228
BZ: "https://bugzilla.redhat.com/show_bug.cgi?id=2017435",

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events.go

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"regexp"
77
"strconv"
88
"strings"
9+
"time"
910

1011
"github.com/sirupsen/logrus"
1112
e2e "k8s.io/kubernetes/test/e2e/framework"
@@ -235,6 +236,13 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl
235236
})
236237
logrus.Infof("found %d NodeUpdate intervals", len(nodeUpdateIntervals))
237238

239+
type timeRange struct {
240+
from time.Time
241+
to time.Time
242+
}
243+
buildTopologyHintAllowedTimeRanges := true
244+
topologyHintAllowedTimeRanges := []*timeRange{}
245+
238246
displayToCount := map[string]*pathologicalEvents{}
239247
for _, event := range events {
240248
// TODO: port to use structured message reason once kube event intervals are ported over
@@ -252,9 +260,73 @@ func (d duplicateEventsEvaluator) testDuplicatedEvents(testName string, flakeOnl
252260
continue
253261
}
254262
}
263+
if strings.Contains(event.Message, "reason/TopologyAwareHintsDisabled") {
264+
// Build the allowed time range only once
265+
if buildTopologyHintAllowedTimeRanges {
266+
taintManagerTestIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool {
267+
return eventInterval.Source == monitorapi.SourceE2ETest &&
268+
strings.Contains(eventInterval.StructuredLocator.Keys[monitorapi.LocatorE2ETestKey], "NoExecuteTaintManager")
269+
})
270+
// Start the allowed time range from time range of the tests. But events lag behind the tests since the tests do not wait
271+
// until all dns pods are properly scheduled and reach ready state. So we will need to expand the allowed time range after.
272+
for _, test := range taintManagerTestIntervals {
273+
topologyHintAllowedTimeRanges = append(topologyHintAllowedTimeRanges, &timeRange{from: test.From, to: test.To})
274+
logrus.WithField("from", test.From).WithField("to", test.To).Infof("found time range for test: %s", testName)
275+
}
276+
dnsUpdateIntervals := events.Filter(func(eventInterval monitorapi.Interval) bool {
277+
return eventInterval.Source == monitorapi.SourcePodState &&
278+
(eventInterval.StructuredLocator.Type == monitorapi.LocatorTypePod || eventInterval.StructuredLocator.Type == monitorapi.LocatorTypeContainer) &&
279+
eventInterval.StructuredLocator.Keys[monitorapi.LocatorNamespaceKey] == "openshift-dns" &&
280+
eventInterval.StructuredMessage.Annotations[monitorapi.AnnotationConstructed] == monitorapi.ConstructionOwnerPodLifecycle
281+
})
282+
283+
// Now expand the allowed time range until the replacement dns pod gets ready
284+
for _, r := range topologyHintAllowedTimeRanges {
285+
var lastReadyTime time.Time
286+
count := 0
287+
for _, interval := range dnsUpdateIntervals {
288+
if interval.From.Before(r.from) {
289+
continue
290+
}
291+
// If there is a GracefulDelete of dns-default pod, we will have to wait until the replacement dns container becomes ready
292+
if interval.StructuredMessage.Reason == monitorapi.PodReasonGracefulDeleteStarted &&
293+
strings.Contains(interval.StructuredLocator.Keys[monitorapi.LocatorPodKey], "dns-default") {
294+
count++
295+
}
296+
if interval.StructuredMessage.Reason == monitorapi.ContainerReasonReady &&
297+
interval.StructuredLocator.Keys[monitorapi.LocatorContainerKey] == "dns" && count > 0 {
298+
lastReadyTime = interval.From
299+
count--
300+
}
301+
if interval.From.After(r.to) && count == 0 {
302+
if lastReadyTime.After(r.to) {
303+
r.to = lastReadyTime
304+
}
305+
break
306+
}
307+
}
308+
}
309+
// Log final adjusted time ranges
310+
for _, test := range taintManagerTestIntervals {
311+
logrus.WithField("from", test.From).WithField("to", test.To).Infof("adjusted time range for test: %s", testName)
312+
}
313+
buildTopologyHintAllowedTimeRanges = false
314+
}
315+
// Filter out TopologyAwareHintsDisabled events within allowed time range
316+
var allowed bool
317+
for _, r := range topologyHintAllowedTimeRanges {
318+
if r.from.Before(event.From) && r.to.After(event.To) {
319+
logrus.Infof("%s was found to fall into the allowed time range %+v, ignoring pathological event as we expect these during NoExecuteTaintManager test", event, r)
320+
allowed = true
321+
break
322+
}
323+
}
324+
if allowed {
325+
continue
326+
}
327+
}
255328
eventDisplayMessage, times := GetTimesAnEventHappened(fmt.Sprintf("%s - %s", event.Locator, event.Message))
256329
if times > DuplicateEventThreshold {
257-
258330
// If we marked this message earlier in recordAddOrUpdateEvent as interesting/true, we know it matched one of
259331
// the existing patterns or one of the AllowedRepeatedEventFns functions returned true.
260332
if strings.Contains(eventDisplayMessage, InterestingMark) {

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,3 +558,173 @@ func TestMakeProbeTestEventsGroup(t *testing.T) {
558558
})
559559
}
560560
}
561+
562+
func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
563+
evaluator := duplicateEventsEvaluator{
564+
allowedRepeatedEventPatterns: AllowedRepeatedEventPatterns,
565+
knownRepeatedEventsBugs: []KnownProblem{},
566+
}
567+
from := time.Unix(872827200, 0).In(time.UTC)
568+
to := time.Unix(872827200, 0).In(time.UTC)
569+
570+
tests := []struct {
571+
name string
572+
namespace string
573+
platform v1.PlatformType
574+
topology v1.TopologyMode
575+
intervals []monitorapi.Interval
576+
expectedMessage string
577+
}{
578+
{
579+
// This is ignored because the node is tainted by test
580+
name: "ignore TopologyAwareHintsDisabled before dns container ready",
581+
intervals: []monitorapi.Interval{
582+
{
583+
Condition: monitorapi.Condition{
584+
Level: monitorapi.Info,
585+
StructuredLocator: monitorapi.Locator{
586+
Type: monitorapi.LocatorTypeE2ETest,
587+
Keys: map[monitorapi.LocatorKey]string{
588+
monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]",
589+
},
590+
},
591+
StructuredMessage: monitorapi.Message{},
592+
},
593+
Source: monitorapi.SourceE2ETest,
594+
From: from.Add(-10 * time.Minute),
595+
To: to.Add(10 * time.Minute),
596+
},
597+
{
598+
Condition: monitorapi.Condition{
599+
Level: monitorapi.Info,
600+
StructuredLocator: monitorapi.Locator{
601+
Type: monitorapi.LocatorTypePod,
602+
Keys: map[monitorapi.LocatorKey]string{
603+
monitorapi.LocatorNamespaceKey: "openshift-dns",
604+
monitorapi.LocatorPodKey: "dns-default-jq2qn",
605+
},
606+
},
607+
StructuredMessage: monitorapi.Message{
608+
Reason: monitorapi.PodReasonGracefulDeleteStarted,
609+
Annotations: map[monitorapi.AnnotationKey]string{
610+
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
611+
monitorapi.AnnotationReason: "GracefulDelete",
612+
},
613+
},
614+
},
615+
Source: monitorapi.SourcePodState,
616+
From: from.Add(-5 * time.Minute),
617+
To: to.Add(1 * time.Minute),
618+
},
619+
{
620+
Condition: monitorapi.Condition{
621+
Level: monitorapi.Info,
622+
Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)",
623+
},
624+
From: from.Add(11 * time.Minute),
625+
To: to.Add(12 * time.Minute),
626+
},
627+
{
628+
Condition: monitorapi.Condition{
629+
Level: monitorapi.Info,
630+
StructuredLocator: monitorapi.Locator{
631+
Type: monitorapi.LocatorTypeContainer,
632+
Keys: map[monitorapi.LocatorKey]string{
633+
monitorapi.LocatorNamespaceKey: "openshift-dns",
634+
monitorapi.LocatorContainerKey: "dns",
635+
monitorapi.LocatorPodKey: "dns-default-jq2qn",
636+
},
637+
},
638+
StructuredMessage: monitorapi.Message{
639+
Reason: monitorapi.ContainerReasonReady,
640+
Annotations: map[monitorapi.AnnotationKey]string{
641+
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
642+
monitorapi.AnnotationReason: "Ready",
643+
},
644+
},
645+
},
646+
Source: monitorapi.SourcePodState,
647+
From: from.Add(15 * time.Minute),
648+
To: to.Add(16 * time.Minute),
649+
},
650+
},
651+
namespace: "openshift-dns",
652+
expectedMessage: "",
653+
},
654+
{
655+
// This is not ignored because there is no dns ready following
656+
name: "fire TopologyAwareHintsDisabled when there is no dns container ready",
657+
intervals: []monitorapi.Interval{
658+
{
659+
Condition: monitorapi.Condition{
660+
Level: monitorapi.Info,
661+
StructuredLocator: monitorapi.Locator{
662+
Type: monitorapi.LocatorTypeE2ETest,
663+
Keys: map[monitorapi.LocatorKey]string{
664+
monitorapi.LocatorE2ETestKey: "[sig-node] NoExecuteTaintManager Single Pod [Serial] doesn't evict pod with tolerations from tainted nodes [Skipped:SingleReplicaTopology] [Suite:openshift/conformance/serial] [Suite:k8s]",
665+
},
666+
},
667+
StructuredMessage: monitorapi.Message{},
668+
},
669+
Source: monitorapi.SourceE2ETest,
670+
From: from.Add(-10 * time.Minute),
671+
To: to.Add(10 * time.Minute),
672+
},
673+
{
674+
Condition: monitorapi.Condition{
675+
Level: monitorapi.Info,
676+
StructuredLocator: monitorapi.Locator{
677+
Type: monitorapi.LocatorTypePod,
678+
Keys: map[monitorapi.LocatorKey]string{
679+
monitorapi.LocatorNamespaceKey: "openshift-dns",
680+
monitorapi.LocatorPodKey: "dns-default-jq2qn",
681+
},
682+
},
683+
StructuredMessage: monitorapi.Message{
684+
Reason: monitorapi.PodReasonGracefulDeleteStarted,
685+
Annotations: map[monitorapi.AnnotationKey]string{
686+
monitorapi.AnnotationConstructed: "pod-lifecycle-constructor",
687+
monitorapi.AnnotationReason: "GracefulDelete",
688+
},
689+
},
690+
},
691+
Source: monitorapi.SourcePodState,
692+
From: from.Add(-5 * time.Minute),
693+
To: to.Add(1 * time.Minute),
694+
},
695+
{
696+
Condition: monitorapi.Condition{
697+
Level: monitorapi.Info,
698+
Locator: "ns/openshift-dns service/dns-default hmsg/ade328ddf3",
699+
Message: "pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 (23 times)",
700+
},
701+
From: from.Add(11 * time.Minute),
702+
To: to.Add(12 * time.Minute),
703+
},
704+
},
705+
namespace: "openshift-dns",
706+
expectedMessage: "1 events happened too frequently\n\nevent happened 23 times, something is wrong: ns/openshift-dns service/dns-default hmsg/ade328ddf3 - pathological/true reason/TopologyAwareHintsDisabled Unable to allocate minimum required endpoints to each zone without exceeding overload threshold (5 endpoints, 3 zones), addressType: IPv4 From: 04:11:00Z To: 04:12:00Z result=reject ",
707+
},
708+
}
709+
710+
for _, test := range tests {
711+
t.Run(test.name, func(t *testing.T) {
712+
713+
events := monitorapi.Intervals(test.intervals)
714+
715+
testName := "events should not repeat"
716+
junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false)
717+
jUnitName := getJUnitName(testName, test.namespace)
718+
for _, junit := range junits {
719+
if (junit.Name == jUnitName) && (test.expectedMessage != "") {
720+
assert.Equal(t, test.expectedMessage, junit.FailureOutput.Output)
721+
} else {
722+
if !assert.Nil(t, junit.FailureOutput, "expected success but got failure output") {
723+
t.Logf(junit.FailureOutput.Output)
724+
}
725+
}
726+
}
727+
728+
})
729+
}
730+
}

pkg/monitortests/testframework/watchevents/event_test.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -98,25 +98,26 @@ func Test_recordAddOrUpdateEvent(t *testing.T) {
9898
expectedMessage: "pathological/true interesting/true reason/SomethingHappened Readiness probe failed (40 times)",
9999
},
100100
{
101-
name: "allowed pathological event with known bug",
101+
name: "allowed pathological event with known bug (BZ 2000234)",
102102
args: args{
103103
ctx: context.TODO(),
104104
m: monitor.NewRecorder(),
105105
kubeEvent: &corev1.Event{
106106
Count: 40,
107-
Reason: "TopologyAwareHintsDisabled",
107+
Reason: "ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy",
108108
InvolvedObject: corev1.ObjectReference{
109109
Kind: "Pod",
110-
Namespace: "any",
111-
Name: "any",
110+
Namespace: "openshift-etcd",
111+
Name: "etcd-quorum-guard-42",
112112
},
113-
Message: "irrelevant",
113+
Message: "Readiness probe failed:",
114114
LastTimestamp: metav1.Now(),
115115
},
116116
significantlyBeforeNow: now.UTC().Add(-15 * time.Minute),
117117
},
118-
expectedLocator: "ns/any pod/any hmsg/e13faa98ab",
119-
expectedMessage: "pathological/true interesting/true reason/TopologyAwareHintsDisabled irrelevant (40 times)",
118+
// hmsg in expectedLocator is the hash of the entire expectedMessage except the number of times
119+
expectedLocator: "ns/openshift-etcd pod/etcd-quorum-guard-42 hmsg/9100aa725d",
120+
expectedMessage: "pathological/true interesting/true reason/ns/openshift-etcd pod/etcd-quorum-guard-42 node/worker-42 - reason/Unhealthy Readiness probe failed: (40 times)",
120121
},
121122
}
122123
for _, tt := range tests {

0 commit comments

Comments
 (0)