@@ -3,6 +3,7 @@ package main
33import (
44 "context"
55 "crypto/x509"
6+ "fmt"
67 "os"
78 "path/filepath"
89 "strings"
@@ -15,6 +16,8 @@ import (
1516 apierrors "k8s.io/apimachinery/pkg/api/errors"
1617 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1718 "k8s.io/apimachinery/pkg/fields"
19+ "k8s.io/apimachinery/pkg/labels"
20+ "k8s.io/apimachinery/pkg/util/sets"
1821 "k8s.io/apimachinery/pkg/util/wait"
1922 "k8s.io/apimachinery/pkg/watch"
2023 "k8s.io/client-go/kubernetes"
@@ -25,6 +28,8 @@ import (
2528
2629 configv1 "github.com/openshift/api/config/v1"
2730 configclient "github.com/openshift/client-go/config/clientset/versioned"
31+ configinformers "github.com/openshift/client-go/config/informers/externalversions"
32+ configlisters "github.com/openshift/client-go/config/listers/config/v1"
2833 routeclient "github.com/openshift/client-go/route/clientset/versioned"
2934 "github.com/openshift/installer/cmd/openshift-install/command"
3035 "github.com/openshift/installer/pkg/asset"
@@ -54,6 +59,11 @@ const (
5459 exitCodeInfrastructureFailed
5560 exitCodeBootstrapFailed
5661 exitCodeInstallFailed
62+ exitCodeOperatorStabilityFailed
63+
64+ // coStabilityThreshold is how long a cluster operator must have Progressing=False
65+ // in order to be considered stable. Measured in seconds.
66+ coStabilityThreshold float64 = 30
5767)
5868
5969// each target is a variable to preserve the order when creating subcommands and still
@@ -501,7 +511,7 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
501511 defer cancel ()
502512
503513 failing := configv1 .ClusterStatusConditionType ("Failing" )
504- timer .StartTimer ("Cluster Operators" )
514+ timer .StartTimer ("Cluster Operators Available " )
505515 var lastError string
506516 _ , err = clientwatch .UntilWithSync (
507517 clusterVersionContext ,
@@ -519,7 +529,7 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
519529 if cov1helpers .IsStatusConditionTrue (cv .Status .Conditions , configv1 .OperatorAvailable ) &&
520530 cov1helpers .IsStatusConditionFalse (cv .Status .Conditions , failing ) &&
521531 cov1helpers .IsStatusConditionFalse (cv .Status .Conditions , configv1 .OperatorProgressing ) {
522- timer .StopTimer ("Cluster Operators" )
532+ timer .StopTimer ("Cluster Operators Available " )
523533 return true , nil
524534 }
525535 if cov1helpers .IsStatusConditionTrue (cv .Status .Conditions , failing ) {
@@ -551,6 +561,54 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
551561 return errors .Wrap (err , "failed to initialize the cluster" )
552562}
553563
564+ // waitForStableOperators ensures that each cluster operator is "stable", i.e. the
565+ // operator has not been in a progressing state for at least a certain duration,
566+ // 30 seconds by default. Returns an error if any operator does meet this threshold
567+ // after a deadline, 30 minutes by default.
568+ func waitForStableOperators (ctx context.Context , config * rest.Config ) error {
569+ timer .StartTimer ("Cluster Operators Stable" )
570+
571+ stabilityCheckDuration := 30 * time .Minute
572+ stabilityContext , cancel := context .WithTimeout (ctx , stabilityCheckDuration )
573+ defer cancel ()
574+
575+ untilTime := time .Now ().Add (stabilityCheckDuration )
576+ timezone , _ := untilTime .Zone ()
577+ logrus .Infof ("Waiting up to %v (until %v %s) to ensure each cluster operator has finished progressing..." ,
578+ stabilityCheckDuration , untilTime .Format (time .Kitchen ), timezone )
579+
580+ cc , err := configclient .NewForConfig (config )
581+ if err != nil {
582+ return errors .Wrap (err , "failed to create a config client" )
583+ }
584+ configInformers := configinformers .NewSharedInformerFactory (cc , 0 )
585+ clusterOperatorInformer := configInformers .Config ().V1 ().ClusterOperators ().Informer ()
586+ clusterOperatorLister := configInformers .Config ().V1 ().ClusterOperators ().Lister ()
587+ configInformers .Start (ctx .Done ())
588+ if ! cache .WaitForCacheSync (ctx .Done (), clusterOperatorInformer .HasSynced ) {
589+ return fmt .Errorf ("informers never started" )
590+ }
591+
592+ waitErr := wait .PollUntilContextCancel (stabilityContext , 1 * time .Second , true , waitForAllClusterOperators (clusterOperatorLister ))
593+ if waitErr != nil {
594+ logrus .Errorf ("Error checking cluster operator Progressing status: %q" , waitErr )
595+ stableOperators , unstableOperators , err := currentOperatorStability (clusterOperatorLister )
596+ if err != nil {
597+ logrus .Errorf ("Error checking final cluster operator Progressing status: %q" , err )
598+ }
599+ logrus .Debugf ("These cluster operators were stable: [%s]" , strings .Join (sets .List (stableOperators ), ", " ))
600+ logrus .Errorf ("These cluster operators were not stable: [%s]" , strings .Join (sets .List (unstableOperators ), ", " ))
601+
602+ logrus .Exit (exitCodeOperatorStabilityFailed )
603+ }
604+
605+ timer .StopTimer ("Cluster Operators Stable" )
606+
607+ logrus .Info ("All cluster operators have completed progressing" )
608+
609+ return nil
610+ }
611+
554612// getConsole returns the console URL from the route 'console' in namespace openshift-console
555613func getConsole (ctx context.Context , config * rest.Config ) (string , error ) {
556614 url := ""
@@ -637,6 +695,10 @@ func waitForInstallComplete(ctx context.Context, config *rest.Config, directory
637695 return err
638696 }
639697
698+ if err := waitForStableOperators (ctx , config ); err != nil {
699+ return err
700+ }
701+
640702 consoleURL , err := getConsole (ctx , config )
641703 if err != nil {
642704 logrus .Warnf ("Cluster does not have a console available: %v" , err )
@@ -657,3 +719,62 @@ func checkIfAgentCommand(assetStore asset.Store) {
657719 logrus .Warning ("An agent configuration was detected but this command is not the agent wait-for command" )
658720 }
659721}
722+
723+ func waitForAllClusterOperators (clusterOperatorLister configlisters.ClusterOperatorLister ) func (ctx context.Context ) (bool , error ) {
724+ previouslyStableOperators := sets.Set [string ]{}
725+
726+ return func (ctx context.Context ) (bool , error ) {
727+ stableOperators , unstableOperators , err := currentOperatorStability (clusterOperatorLister )
728+ if err != nil {
729+ return false , err
730+ }
731+ if newlyStableOperators := stableOperators .Difference (previouslyStableOperators ); len (newlyStableOperators ) > 0 {
732+ for _ , name := range sets .List (newlyStableOperators ) {
733+ logrus .Debugf ("Cluster Operator %s is stable" , name )
734+ }
735+ }
736+ if newlyUnstableOperators := previouslyStableOperators .Difference (stableOperators ); len (newlyUnstableOperators ) > 0 {
737+ for _ , name := range sets .List (newlyUnstableOperators ) {
738+ logrus .Debugf ("Cluster Operator %s became unstable" , name )
739+ }
740+ }
741+ previouslyStableOperators = stableOperators
742+
743+ if len (unstableOperators ) == 0 {
744+ return true , nil
745+ }
746+
747+ return false , nil
748+ }
749+ }
750+
751+ func currentOperatorStability (clusterOperatorLister configlisters.ClusterOperatorLister ) (sets.Set [string ], sets.Set [string ], error ) {
752+ clusterOperators , err := clusterOperatorLister .List (labels .Everything ())
753+ if err != nil {
754+ return nil , nil , err // lister should never fail
755+ }
756+
757+ stableOperators := sets.Set [string ]{}
758+ unstableOperators := sets.Set [string ]{}
759+ for _ , clusterOperator := range clusterOperators {
760+ name := clusterOperator .Name
761+ progressing := cov1helpers .FindStatusCondition (clusterOperator .Status .Conditions , configv1 .OperatorProgressing )
762+ if progressing == nil {
763+ logrus .Debugf ("Cluster Operator %s progressing == nil" , name )
764+ unstableOperators .Insert (name )
765+ continue
766+ }
767+ if meetsStabilityThreshold (progressing ) {
768+ stableOperators .Insert (name )
769+ } else {
770+ logrus .Debugf ("Cluster Operator %s is Progressing=%s LastTransitionTime=%v DurationSinceTransition=%.fs Reason=%s Message=%s" , name , progressing .Status , progressing .LastTransitionTime .Time , time .Since (progressing .LastTransitionTime .Time ).Seconds (), progressing .Reason , progressing .Message )
771+ unstableOperators .Insert (name )
772+ }
773+ }
774+
775+ return stableOperators , unstableOperators , nil
776+ }
777+
778+ func meetsStabilityThreshold (progressing * configv1.ClusterOperatorStatusCondition ) bool {
779+ return progressing .Status == configv1 .ConditionFalse && time .Since (progressing .LastTransitionTime .Time ).Seconds () > coStabilityThreshold
780+ }
0 commit comments