Skip to content

Commit d75092b

Browse files
Merge pull request #27231 from wking/fatal-available-false
OTA-362: pkg/monitortests/clusterversionoperator: Fatal unless Available=False in allow-list
2 parents 5b587d9 + 0b4bb1d commit d75092b

File tree

1 file changed

+145
-52
lines changed
  • pkg/monitortests/clusterversionoperator/legacycvomonitortests

1 file changed

+145
-52
lines changed

pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go

Lines changed: 145 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,89 @@ import (
1616
"k8s.io/client-go/rest"
1717
)
1818

19+
// exceptionCallback consumes a suspicious condition and returns an
20+
// exception string if it does not think the condition should be fatal.
21+
type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition) (string, error)
22+
1923
func testStableSystemOperatorStateTransitions(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
20-
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded})
24+
except := func(_ string, condition *configv1.ClusterOperatorStatusCondition) (string, error) {
25+
if condition.Status == configv1.ConditionTrue {
26+
if condition.Type == configv1.OperatorAvailable {
27+
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
28+
}
29+
} else if condition.Status == configv1.ConditionFalse {
30+
if condition.Type == configv1.OperatorDegraded {
31+
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
32+
}
33+
}
34+
35+
return "We are not worried about Available=False or Degraded=True blips for stable-system tests yet.", nil
36+
}
37+
38+
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except)
2139
}
2240

2341
func testUpgradeOperatorStateTransitions(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
24-
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded})
42+
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition) (string, error) {
43+
if condition.Status == configv1.ConditionTrue {
44+
if condition.Type == configv1.OperatorAvailable {
45+
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
46+
}
47+
} else if condition.Status == configv1.ConditionFalse {
48+
if condition.Type == configv1.OperatorDegraded {
49+
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status), nil
50+
}
51+
}
52+
53+
if condition.Type == configv1.OperatorDegraded {
54+
return "We are not worried about Degraded=True blips for update tests yet.", nil
55+
}
56+
57+
switch operator {
58+
case "authentication":
59+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "APIServices_Error" || condition.Reason == "APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition.Reason == "APIServices_PreconditionNotReady" || condition.Reason == "OAuthServerDeployment_NoDeployment" || condition.Reason == "OAuthServerRouteEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "OAuthServerServiceEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "WellKnown_NotReady") {
60+
return "https://issues.redhat.com/browse/OCPBUGS-20056", nil
61+
}
62+
case "console":
63+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "RouteHealth_FailedGet" || condition.Reason == "RouteHealth_RouteNotAdmitted" || condition.Reason == "RouteHealth_StatusError") {
64+
return "https://issues.redhat.com/browse/OCPBUGS-24041", nil
65+
}
66+
case "control-plane-machine-set":
67+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "UnavailableReplicas" {
68+
return "https://issues.redhat.com/browse/OCPBUGS-20061", nil
69+
}
70+
case "kube-storage-version-migrator":
71+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "KubeStorageVersionMigrator_Deploying" {
72+
return "https://issues.redhat.com/browse/OCPBUGS-20062", nil
73+
}
74+
case "machine-config":
75+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "MachineConfigControllerFailed" && strings.Contains(condition.Message, "notAfter: Required value") {
76+
return "https://issues.redhat.com/browse/OCPBUGS-22364", nil
77+
}
78+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && strings.Contains(condition.Message, "missing HTTP content-type") {
79+
return "https://issues.redhat.com/browse/OCPBUGS-24228", nil
80+
}
81+
case "monitoring":
82+
if condition.Type == configv1.OperatorAvailable && (condition.Status == configv1.ConditionFalse && (condition.Reason == "PlatformTasksFailed" || condition.Reason == "UpdatingAlertmanagerFailed" || condition.Reason == "UpdatingConsolePluginComponentsFailed" || condition.Reason == "UpdatingPrometheusK8SFailed" || condition.Reason == "UpdatingPrometheusOperatorFailed")) || (condition.Status == configv1.ConditionUnknown && condition.Reason == "UpdatingPrometheusFailed") {
83+
return "https://issues.redhat.com/browse/OCPBUGS-23745", nil
84+
}
85+
case "openshift-apiserver":
86+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition.Reason == "APIServices_Error") {
87+
return "https://issues.redhat.com/browse/OCPBUGS-23746", nil
88+
}
89+
case "operator-lifecycle-manager-packageserver":
90+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" {
91+
return "https://issues.redhat.com/browse/OCPBUGS-23744", nil
92+
}
93+
}
94+
95+
return "", nil
96+
}
97+
98+
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except)
2599
}
26-
func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType) []*junitapi.JUnitTestCase {
100+
101+
func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback) []*junitapi.JUnitTestCase {
27102
ret := []*junitapi.JUnitTestCase{}
28103

29104
var start, stop time.Time
@@ -39,13 +114,13 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
39114

40115
eventsByOperator := getEventsByOperator(events)
41116
e2eEventIntervals := operatorstateanalyzer.E2ETestEventIntervals(events)
42-
for _, condition := range conditionTypes {
117+
for _, conditionType := range conditionTypes {
43118
for _, operatorName := range platformidentification.KnownOperators.List() {
44119
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
45120
if bzComponent == "Unknown" {
46121
bzComponent = operatorName
47122
}
48-
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, condition)
123+
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, conditionType)
49124
operatorEvents := eventsByOperator[operatorName]
50125
if len(operatorEvents) == 0 {
51126
ret = append(ret, &junitapi.JUnitTestCase{
@@ -55,19 +130,78 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
55130
continue
56131
}
57132

58-
failures := testOperatorState(condition, operatorEvents, e2eEventIntervals)
59-
if len(failures) > 0 {
133+
excepted := []string{}
134+
fatal := []string{}
135+
136+
for _, eventInterval := range operatorEvents {
137+
condition := monitorapi.GetOperatorConditionStatus(eventInterval)
138+
if condition == nil {
139+
continue // ignore non-condition intervals
140+
}
141+
if len(condition.Type) == 0 {
142+
fatal = append(fatal, fmt.Sprintf("failed to convert %v into a condition with a type", eventInterval))
143+
}
144+
145+
if condition.Type != conditionType {
146+
continue
147+
}
148+
149+
// if there was any switch, it was wrong/unexpected at some point
150+
failure := fmt.Sprintf("%v", eventInterval)
151+
152+
overlappingE2EIntervals := operatorstateanalyzer.FindOverlap(e2eEventIntervals, eventInterval.From, eventInterval.From)
153+
concurrentE2E := []string{}
154+
for _, overlap := range overlappingE2EIntervals {
155+
if overlap.Level == monitorapi.Info {
156+
continue
157+
}
158+
e2eTest, ok := monitorapi.E2ETestFromLocator(overlap.StructuredLocator)
159+
if !ok {
160+
continue
161+
}
162+
concurrentE2E = append(concurrentE2E, fmt.Sprintf("%v", e2eTest))
163+
}
164+
165+
if len(concurrentE2E) > 0 {
166+
failure = fmt.Sprintf("%s\n%d tests failed during this blip (%v to %v): %v", failure, len(concurrentE2E), eventInterval.From, eventInterval.From, strings.Join(concurrentE2E, "\n"))
167+
}
168+
169+
exception, err := except(operatorName, condition)
170+
if err != nil || exception == "" {
171+
fatal = append(fatal, failure)
172+
} else {
173+
excepted = append(excepted, fmt.Sprintf("%s (exception: %s)", failure, exception))
174+
}
175+
}
176+
177+
output := fmt.Sprintf("%d unexpected clusteroperator state transitions during e2e test run", len(fatal))
178+
if len(fatal) > 0 {
179+
output = fmt.Sprintf("%s. These did not match any known exceptions, so they cause this test-case to fail:\n\n%v\n", output, strings.Join(fatal, "\n"))
180+
} else {
181+
output = fmt.Sprintf("%s, as desired.", output)
182+
}
183+
output = fmt.Sprintf("%s\n%d unwelcome but acceptable clusteroperator state transitions during e2e test run", output, len(excepted))
184+
if len(excepted) > 0 {
185+
output = fmt.Sprintf("%s. These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n\n%v\n", output, strings.Join(excepted, "\n"))
186+
} else {
187+
output = fmt.Sprintf("%s, as desired.", output)
188+
}
189+
190+
if len(fatal) > 0 || len(excepted) > 0 {
60191
ret = append(ret, &junitapi.JUnitTestCase{
61192
Name: testName,
62193
Duration: duration,
63-
SystemOut: strings.Join(failures, "\n"),
194+
SystemOut: output,
64195
FailureOutput: &junitapi.FailureOutput{
65-
Output: fmt.Sprintf("%d unexpected clusteroperator state transitions during e2e test run \n\n%v", len(failures), strings.Join(failures, "\n")),
196+
Output: output,
66197
},
67198
})
68199
}
69-
// always add a success so we flake and not fail
70-
ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
200+
201+
if len(fatal) == 0 {
202+
// add a success so we flake (or pass) and don't fail
203+
ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
204+
}
71205
}
72206
}
73207

@@ -267,44 +401,3 @@ func getEventsByOperator(events monitorapi.Intervals) map[string]monitorapi.Inte
267401
}
268402
return eventsByClusterOperator
269403
}
270-
271-
func testOperatorState(interestingCondition configv1.ClusterStatusConditionType, eventIntervals monitorapi.Intervals, e2eEventIntervals monitorapi.Intervals) []string {
272-
failures := []string{}
273-
274-
for _, eventInterval := range eventIntervals {
275-
// ignore non-interval eventInterval intervals
276-
if eventInterval.From == eventInterval.To {
277-
continue
278-
}
279-
280-
condition := monitorapi.GetOperatorConditionStatus(eventInterval)
281-
if condition == nil {
282-
continue
283-
}
284-
285-
if condition.Type != interestingCondition {
286-
continue
287-
}
288-
289-
// if there was any switch, it was wrong/unexpected at some point
290-
failures = append(failures, fmt.Sprintf("%v", eventInterval))
291-
292-
overlappingE2EIntervals := operatorstateanalyzer.FindOverlap(e2eEventIntervals, eventInterval.From, eventInterval.From)
293-
concurrentE2E := []string{}
294-
for _, overlap := range overlappingE2EIntervals {
295-
if overlap.Level == monitorapi.Info {
296-
continue
297-
}
298-
e2eTest, ok := monitorapi.E2ETestFromLocator(overlap.StructuredLocator)
299-
if !ok {
300-
continue
301-
}
302-
concurrentE2E = append(concurrentE2E, fmt.Sprintf("%v", e2eTest))
303-
}
304-
305-
if len(concurrentE2E) > 0 {
306-
failures = append(failures, fmt.Sprintf("%d tests failed during this blip (%v to %v): %v", len(concurrentE2E), eventInterval.From, eventInterval.From, strings.Join(concurrentE2E, "\n")))
307-
}
308-
}
309-
return failures
310-
}

0 commit comments

Comments
 (0)