Skip to content

Commit 15d3649

Browse files
committed
ClusterOperators should not go Progressing only for a node reboot
This is to cover the node rebooting case from the rule [1] that is introduced recently: ``` Operators should not report Progressing only because DaemonSets owned by them are adjusting to a new node from cluster scaleup or a node rebooting from cluster upgrade. ``` The test fails if - `co/machine-config` never became Progressing=True during a cluster upgrade, or - some CO left Progressing=False during the upgrade after `machine-config` became Progressing=True. This should not have taken place as `machine-config` was rebooting the nodes which was the only thing ongoing to the cluster during that time. [1]. https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164
1 parent 83a1f05 commit 15d3649

File tree

2 files changed

+143
-7
lines changed

2 files changed

+143
-7
lines changed

pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
4545
isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
4646
if isUpgrade {
4747
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
48+
junits = append(junits, clusterOperatorIsNotProgressingWhenMachineConfigIs(finalIntervals)...)
4849
} else {
4950
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
5051
}

pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go

Lines changed: 142 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,21 @@ import (
66
"strings"
77
"time"
88

9-
"github.com/openshift/origin/pkg/monitortestlibrary/utility"
9+
configv1 "github.com/openshift/api/config/v1"
10+
clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
11+
"github.com/sirupsen/logrus"
1012
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/apimachinery/pkg/util/sets"
1114
"k8s.io/client-go/kubernetes"
15+
"k8s.io/client-go/rest"
1216

13-
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
14-
"github.com/sirupsen/logrus"
15-
16-
configv1 "github.com/openshift/api/config/v1"
17-
clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
1817
"github.com/openshift/origin/pkg/monitor/monitorapi"
1918
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
2019
platformidentification2 "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
20+
"github.com/openshift/origin/pkg/monitortestlibrary/utility"
21+
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
2122
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
2223
exutil "github.com/openshift/origin/test/extended/util"
23-
"k8s.io/client-go/rest"
2424
)
2525

2626
// exceptionCallback consumes a suspicious condition and returns an
@@ -586,6 +586,9 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
586586
}
587587

588588
if len(fatal) > 0 || len(excepted) > 0 {
589+
// add a failure so we
590+
// either flake (or pass) in case len(fatal) == 0 by adding a success to the same test
591+
// or fail in case len(fatal) > 0 by leaving the failure as the only output for the test
589592
ret = append(ret, &junitapi.JUnitTestCase{
590593
Name: testName,
591594
Duration: duration,
@@ -606,6 +609,138 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
606609
return ret
607610
}
608611

612+
func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
613+
var ret []*junitapi.JUnitTestCase
614+
upgradeWindows := getUpgradeWindows(events)
615+
616+
var machineConfigProgressing time.Time
617+
var eventsInUpgradeWindows monitorapi.Intervals
618+
619+
var start, stop time.Time
620+
for _, event := range events {
621+
if !isInUpgradeWindow(upgradeWindows, event) {
622+
continue
623+
}
624+
eventsInUpgradeWindows = append(eventsInUpgradeWindows, event)
625+
if start.IsZero() || event.From.Before(start) {
626+
start = event.From
627+
}
628+
if stop.IsZero() || event.To.After(stop) {
629+
stop = event.To
630+
}
631+
}
632+
duration := stop.Sub(start).Seconds()
633+
634+
eventsByOperator := getEventsByOperator(eventsInUpgradeWindows)
635+
for _, mcEvent := range eventsByOperator["machine-config"] {
636+
condition := monitorapi.GetOperatorConditionStatus(mcEvent)
637+
if condition == nil {
638+
continue // ignore non-condition intervals
639+
}
640+
if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue {
641+
machineConfigProgressing = mcEvent.To
642+
break
643+
}
644+
}
645+
646+
mcTestCase := &junitapi.JUnitTestCase{
647+
Name: fmt.Sprintf("[bz-Machine Config Operator] clusteroperator/machine-config must go Progressing=True during an upgrade test"),
648+
Duration: duration,
649+
}
650+
if machineConfigProgressing.IsZero() {
651+
mcTestCase.FailureOutput = &junitapi.FailureOutput{
652+
Output: fmt.Sprintf("machine-config was never Progressing=True during the upgrade window from %s to %s", start.Format(time.RFC3339), stop.Format(time.RFC3339)),
653+
}
654+
return []*junitapi.JUnitTestCase{mcTestCase}
655+
} else {
656+
mcTestCase.SystemOut = fmt.Sprintf("machine-config became Progressing=True at %s during the upgrade window from %s to %s", machineConfigProgressing.Format(time.RFC3339), start.Format(time.RFC3339), stop.Format(time.RFC3339))
657+
}
658+
ret = append(ret, mcTestCase)
659+
660+
for _, operatorName := range platformidentification.KnownOperators.Difference(sets.NewString("machine-config")).List() {
661+
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
662+
if bzComponent == "Unknown" {
663+
bzComponent = operatorName
664+
}
665+
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName)
666+
operatorEvents := eventsByOperator[operatorName]
667+
if len(operatorEvents) == 0 {
668+
ret = append(ret, &junitapi.JUnitTestCase{
669+
Name: testName,
670+
Duration: duration,
671+
})
672+
continue
673+
}
674+
675+
except := func(co string, condition *configv1.ClusterOperatorStatusCondition) string {
676+
return ""
677+
}
678+
679+
var excepted, fatal []string
680+
for _, operatorEvent := range operatorEvents {
681+
if operatorEvent.From.Before(machineConfigProgressing) {
682+
continue
683+
}
684+
condition := monitorapi.GetOperatorConditionStatus(operatorEvent)
685+
if condition == nil {
686+
continue // ignore non-condition intervals
687+
}
688+
if condition.Type == "" {
689+
fatal = append(fatal, fmt.Sprintf("failed to convert %v into a condition with a type", operatorEvent))
690+
continue
691+
}
692+
693+
if condition.Type != configv1.OperatorProgressing || condition.Status == configv1.ConditionFalse {
694+
continue
695+
}
696+
697+
// if there was any switch, it was wrong/unexpected at some point
698+
failure := fmt.Sprintf("%v", operatorEvent)
699+
700+
exception := except(operatorName, condition)
701+
if exception == "" {
702+
fatal = append(fatal, failure)
703+
} else {
704+
excepted = append(excepted, fmt.Sprintf("%s (exception: %s)", failure, exception))
705+
}
706+
}
707+
708+
output := fmt.Sprintf("%d (out of %d) unexpected clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", len(fatal), len(operatorEvents), start.Format(time.RFC3339), stop.Format(time.RFC3339))
709+
if len(fatal) > 0 {
710+
output = fmt.Sprintf("%s. These did not match any known exceptions, so they cause this test-case to fail:\n\n%v\n", output, strings.Join(fatal, "\n"))
711+
} else {
712+
output = fmt.Sprintf("%s, as desired.", output)
713+
}
714+
output = fmt.Sprintf("%s\n%d unwelcome but acceptable clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", output, len(excepted), start.Format(time.RFC3339), stop.Format(time.RFC3339))
715+
if len(excepted) > 0 {
716+
output = fmt.Sprintf("%s. These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n\n%v\n", output, strings.Join(excepted, "\n"))
717+
} else {
718+
output = fmt.Sprintf("%s, as desired.", output)
719+
}
720+
721+
if len(fatal) > 0 || len(excepted) > 0 {
722+
// add a failure so we
723+
// either flake (or pass) in case len(fatal) == 0 by adding a success to the same test
724+
// or fail in case len(fatal) > 0 by leaving the failure as the only output for the test
725+
ret = append(ret, &junitapi.JUnitTestCase{
726+
Name: testName,
727+
Duration: duration,
728+
SystemOut: output,
729+
FailureOutput: &junitapi.FailureOutput{
730+
Output: output,
731+
},
732+
})
733+
}
734+
735+
if len(fatal) == 0 {
736+
// add a success so we flake (or pass) and don't fail
737+
ret = append(ret, &junitapi.JUnitTestCase{Name: testName})
738+
}
739+
}
740+
741+
return ret
742+
}
743+
609744
type startedStaged struct {
610745
// OSUpdateStarted is the event Reason emitted by the machine config operator when a node begins extracting
611746
// it's OS content.

0 commit comments

Comments
 (0)