@@ -16,14 +16,89 @@ import (
1616 "k8s.io/client-go/rest"
1717)
1818
19+ // exceptionCallback consumes a suspicious condition and returns an
20+ // exception string if it does not think the condition should be fatal. The caller in testOperatorStateTransitions treats an empty string or a non-nil error as fatal.
21+ type exceptionCallback func (operator string , condition * configv1.ClusterOperatorStatusCondition ) (string , error )
22+
// testStableSystemOperatorStateTransitions runs testOperatorStateTransitions over
// the Available and Degraded condition types. In the added ("+") lines it passes a
// callback that returns a non-empty exception string and a nil error on every path.
1923func testStableSystemOperatorStateTransitions (events monitorapi.Intervals ) []* junitapi.JUnitTestCase {
20- return testOperatorStateTransitions (events , []configv1.ClusterStatusConditionType {configv1 .OperatorAvailable , configv1 .OperatorDegraded })
24+ except := func (_ string , condition * configv1.ClusterOperatorStatusCondition ) (string , error ) {
// Available=True and Degraded=False get a "happy case" exception string.
25+ if condition .Status == configv1 .ConditionTrue {
26+ if condition .Type == configv1 .OperatorAvailable {
27+ return fmt .Sprintf ("%s=%s is the happy case" , condition .Type , condition .Status ), nil
28+ }
29+ } else if condition .Status == configv1 .ConditionFalse {
30+ if condition .Type == configv1 .OperatorDegraded {
31+ return fmt .Sprintf ("%s=%s is the happy case" , condition .Type , condition .Status ), nil
32+ }
33+ }
34+
// Every other type/status combination gets one blanket exception string.
35+ return "We are not worried about Available=False or Degraded=True blips for stable-system tests yet." , nil
36+ }
37+
38+ return testOperatorStateTransitions (events , []configv1.ClusterStatusConditionType {configv1 .OperatorAvailable , configv1 .OperatorDegraded }, except )
2139}
2240
// testUpgradeOperatorStateTransitions runs testOperatorStateTransitions over the
// Available and Degraded condition types. In the added ("+") lines it passes a
// callback that returns an exception string for the cases listed below and ""
// (with a nil error) when none of them match.
2341func testUpgradeOperatorStateTransitions (events monitorapi.Intervals ) []* junitapi.JUnitTestCase {
24- return testOperatorStateTransitions (events , []configv1.ClusterStatusConditionType {configv1 .OperatorAvailable , configv1 .OperatorDegraded })
42+ except := func (operator string , condition * configv1.ClusterOperatorStatusCondition ) (string , error ) {
// Available=True and Degraded=False get a "happy case" exception string.
43+ if condition .Status == configv1 .ConditionTrue {
44+ if condition .Type == configv1 .OperatorAvailable {
45+ return fmt .Sprintf ("%s=%s is the happy case" , condition .Type , condition .Status ), nil
46+ }
47+ } else if condition .Status == configv1 .ConditionFalse {
48+ if condition .Type == configv1 .OperatorDegraded {
49+ return fmt .Sprintf ("%s=%s is the happy case" , condition .Type , condition .Status ), nil
50+ }
51+ }
52+
// Any remaining Degraded condition is excepted, whatever the operator name.
53+ if condition .Type == configv1 .OperatorDegraded {
54+ return "We are not worried about Degraded=True blips for update tests yet." , nil
55+ }
56+
// Per-operator exceptions: each match returns an issues.redhat.com URL as the
// exception string. Operators without a case fall through to the final return.
57+ switch operator {
58+ case "authentication" :
59+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && (condition .Reason == "APIServices_Error" || condition .Reason == "APIServerDeployment_NoDeployment" || condition .Reason == "APIServerDeployment_NoPod" || condition .Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition .Reason == "APIServices_PreconditionNotReady" || condition .Reason == "OAuthServerDeployment_NoDeployment" || condition .Reason == "OAuthServerRouteEndpointAccessibleController_EndpointUnavailable" || condition .Reason == "OAuthServerServiceEndpointAccessibleController_EndpointUnavailable" || condition .Reason == "WellKnown_NotReady" ) {
60+ return "https://issues.redhat.com/browse/OCPBUGS-20056" , nil
61+ }
62+ case "console" :
63+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && (condition .Reason == "RouteHealth_FailedGet" || condition .Reason == "RouteHealth_RouteNotAdmitted" || condition .Reason == "RouteHealth_StatusError" ) {
64+ return "https://issues.redhat.com/browse/OCPBUGS-24041" , nil
65+ }
66+ case "control-plane-machine-set" :
67+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && condition .Reason == "UnavailableReplicas" {
68+ return "https://issues.redhat.com/browse/OCPBUGS-20061" , nil
69+ }
70+ case "kube-storage-version-migrator" :
71+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && condition .Reason == "KubeStorageVersionMigrator_Deploying" {
72+ return "https://issues.redhat.com/browse/OCPBUGS-20062" , nil
73+ }
74+ case "machine-config" :
// Both machine-config checks also match on a substring of condition.Message.
75+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && condition .Reason == "MachineConfigControllerFailed" && strings .Contains (condition .Message , "notAfter: Required value" ) {
76+ return "https://issues.redhat.com/browse/OCPBUGS-22364" , nil
77+ }
78+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && strings .Contains (condition .Message , "missing HTTP content-type" ) {
79+ return "https://issues.redhat.com/browse/OCPBUGS-24228" , nil
80+ }
81+ case "monitoring" :
// NOTE(review): in Go, && binds tighter than ||, so this reads as
// (Type==Available && Status==False && <reasons>) || (Status==Unknown && Reason=="UpdatingPrometheusFailed").
// The Unknown alternative is therefore not guarded by the Type check. Degraded
// conditions already returned above, so this may be harmless; confirm the intent.
82+ if condition .Type == configv1 .OperatorAvailable && (condition .Status == configv1 .ConditionFalse && (condition .Reason == "PlatformTasksFailed" || condition .Reason == "UpdatingAlertmanagerFailed" || condition .Reason == "UpdatingConsolePluginComponentsFailed" || condition .Reason == "UpdatingPrometheusK8SFailed" || condition .Reason == "UpdatingPrometheusOperatorFailed" )) || (condition .Status == configv1 .ConditionUnknown && condition .Reason == "UpdatingPrometheusFailed" ) {
83+ return "https://issues.redhat.com/browse/OCPBUGS-23745" , nil
84+ }
85+ case "openshift-apiserver" :
86+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && (condition .Reason == "APIServerDeployment_NoDeployment" || condition .Reason == "APIServerDeployment_NoPod" || condition .Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition .Reason == "APIServices_Error" ) {
87+ return "https://issues.redhat.com/browse/OCPBUGS-23746" , nil
88+ }
89+ case "operator-lifecycle-manager-packageserver" :
90+ if condition .Type == configv1 .OperatorAvailable && condition .Status == configv1 .ConditionFalse && condition .Reason == "ClusterServiceVersionNotSucceeded" {
91+ return "https://issues.redhat.com/browse/OCPBUGS-23744" , nil
92+ }
93+ }
94+
// No exception matched: the empty string is what testOperatorStateTransitions
// checks for before recording the transition as fatal.
95+ return "" , nil
96+ }
97+
98+ return testOperatorStateTransitions (events , []configv1.ClusterStatusConditionType {configv1 .OperatorAvailable , configv1 .OperatorDegraded }, except )
2599}
26- func testOperatorStateTransitions (events monitorapi.Intervals , conditionTypes []configv1.ClusterStatusConditionType ) []* junitapi.JUnitTestCase {
100+
101+ func testOperatorStateTransitions (events monitorapi.Intervals , conditionTypes []configv1.ClusterStatusConditionType , except exceptionCallback ) []* junitapi.JUnitTestCase {
27102 ret := []* junitapi.JUnitTestCase {}
28103
29104 var start , stop time.Time
@@ -39,13 +114,13 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
39114
40115 eventsByOperator := getEventsByOperator (events )
41116 e2eEventIntervals := operatorstateanalyzer .E2ETestEventIntervals (events )
42- for _ , condition := range conditionTypes {
117+ for _ , conditionType := range conditionTypes {
43118 for _ , operatorName := range platformidentification .KnownOperators .List () {
44119 bzComponent := platformidentification .GetBugzillaComponentForOperator (operatorName )
45120 if bzComponent == "Unknown" {
46121 bzComponent = operatorName
47122 }
48- testName := fmt .Sprintf ("[bz-%v] clusteroperator/%v should not change condition/%v" , bzComponent , operatorName , condition )
123+ testName := fmt .Sprintf ("[bz-%v] clusteroperator/%v should not change condition/%v" , bzComponent , operatorName , conditionType )
49124 operatorEvents := eventsByOperator [operatorName ]
50125 if len (operatorEvents ) == 0 {
51126 ret = append (ret , & junitapi.JUnitTestCase {
@@ -55,19 +130,78 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
55130 continue
56131 }
57132
58- failures := testOperatorState (condition , operatorEvents , e2eEventIntervals )
59- if len (failures ) > 0 {
133+ excepted := []string {}
134+ fatal := []string {}
135+
136+ for _ , eventInterval := range operatorEvents {
137+ condition := monitorapi .GetOperatorConditionStatus (eventInterval )
138+ if condition == nil {
139+ continue // ignore non-condition intervals
140+ }
141+ if len (condition .Type ) == 0 {
142+ fatal = append (fatal , fmt .Sprintf ("failed to convert %v into a condition with a type" , eventInterval ))
143+ }
144+
145+ if condition .Type != conditionType {
146+ continue
147+ }
148+
149+ // if there was any switch, it was wrong/unexpected at some point
150+ failure := fmt .Sprintf ("%v" , eventInterval )
151+
152+ overlappingE2EIntervals := operatorstateanalyzer .FindOverlap (e2eEventIntervals , eventInterval .From , eventInterval .From )
153+ concurrentE2E := []string {}
154+ for _ , overlap := range overlappingE2EIntervals {
155+ if overlap .Level == monitorapi .Info {
156+ continue
157+ }
158+ e2eTest , ok := monitorapi .E2ETestFromLocator (overlap .StructuredLocator )
159+ if ! ok {
160+ continue
161+ }
162+ concurrentE2E = append (concurrentE2E , fmt .Sprintf ("%v" , e2eTest ))
163+ }
164+
165+ if len (concurrentE2E ) > 0 {
166+ failure = fmt .Sprintf ("%s\n %d tests failed during this blip (%v to %v): %v" , failure , len (concurrentE2E ), eventInterval .From , eventInterval .From , strings .Join (concurrentE2E , "\n " ))
167+ }
168+
169+ exception , err := except (operatorName , condition )
170+ if err != nil || exception == "" {
171+ fatal = append (fatal , failure )
172+ } else {
173+ excepted = append (excepted , fmt .Sprintf ("%s (exception: %s)" , failure , exception ))
174+ }
175+ }
176+
177+ output := fmt .Sprintf ("%d unexpected clusteroperator state transitions during e2e test run" , len (fatal ))
178+ if len (fatal ) > 0 {
179+ output = fmt .Sprintf ("%s. These did not match any known exceptions, so they cause this test-case to fail:\n \n %v\n " , output , strings .Join (fatal , "\n " ))
180+ } else {
181+ output = fmt .Sprintf ("%s, as desired." , output )
182+ }
183+ output = fmt .Sprintf ("%s\n %d unwelcome but acceptable clusteroperator state transitions during e2e test run" , output , len (excepted ))
184+ if len (excepted ) > 0 {
185+ output = fmt .Sprintf ("%s. These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n \n %v\n " , output , strings .Join (excepted , "\n " ))
186+ } else {
187+ output = fmt .Sprintf ("%s, as desired." , output )
188+ }
189+
190+ if len (fatal ) > 0 || len (excepted ) > 0 {
60191 ret = append (ret , & junitapi.JUnitTestCase {
61192 Name : testName ,
62193 Duration : duration ,
63- SystemOut : strings . Join ( failures , " \n " ) ,
194+ SystemOut : output ,
64195 FailureOutput : & junitapi.FailureOutput {
65- Output : fmt . Sprintf ( "%d unexpected clusteroperator state transitions during e2e test run \n \n %v" , len ( failures ), strings . Join ( failures , " \n " )) ,
196+ Output : output ,
66197 },
67198 })
68199 }
69- // always add a success so we flake and not fail
70- ret = append (ret , & junitapi.JUnitTestCase {Name : testName })
200+
201+ if len (fatal ) == 0 {
202+ // add a success so we flake (or pass) and don't fail
203+ ret = append (ret , & junitapi.JUnitTestCase {Name : testName })
204+ }
71205 }
72206 }
73207
@@ -267,44 +401,3 @@ func getEventsByOperator(events monitorapi.Intervals) map[string]monitorapi.Inte
267401 }
268402 return eventsByClusterOperator
269403}
270-
// testOperatorState (every line of it is removed, "-", in this diff) collects one
// string per interval whose condition type equals interestingCondition, plus a
// second string listing overlapping e2e tests when at least one overlap has a
// non-Info level and a locator that E2ETestFromLocator accepts.
271- func testOperatorState (interestingCondition configv1.ClusterStatusConditionType , eventIntervals monitorapi.Intervals , e2eEventIntervals monitorapi.Intervals ) []string {
272- failures := []string {}
273-
274- for _ , eventInterval := range eventIntervals {
275- // skip zero-length intervals (From == To)
276- if eventInterval .From == eventInterval .To {
277- continue
278- }
279-
280- condition := monitorapi .GetOperatorConditionStatus (eventInterval )
281- if condition == nil {
282- continue
283- }
284-
285- if condition .Type != interestingCondition {
286- continue
287- }
288-
289- // if there was any switch, it was wrong/unexpected at some point
290- failures = append (failures , fmt .Sprintf ("%v" , eventInterval ))
291-
// NOTE(review): eventInterval.From is passed as both bounds here and printed
// twice in the message below; confirm whether eventInterval.To was intended.
292- overlappingE2EIntervals := operatorstateanalyzer .FindOverlap (e2eEventIntervals , eventInterval .From , eventInterval .From )
293- concurrentE2E := []string {}
294- for _ , overlap := range overlappingE2EIntervals {
295- if overlap .Level == monitorapi .Info {
296- continue
297- }
298- e2eTest , ok := monitorapi .E2ETestFromLocator (overlap .StructuredLocator )
299- if ! ok {
300- continue
301- }
302- concurrentE2E = append (concurrentE2E , fmt .Sprintf ("%v" , e2eTest ))
303- }
304-
305- if len (concurrentE2E ) > 0 {
306- failures = append (failures , fmt .Sprintf ("%d tests failed during this blip (%v to %v): %v" , len (concurrentE2E ), eventInterval .From , eventInterval .From , strings .Join (concurrentE2E , "\n " )))
307- }
308- }
309- return failures
310- }
0 commit comments