Skip to content

Commit 3a738d8

Browse files
Merge pull request #7535 from deads2k/inefficient
Inefficient wait for all clusteroperators
2 parents 732271d + 94c54e0 commit 3a738d8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+3489
-2
lines changed

cmd/openshift-install/create.go

Lines changed: 123 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"crypto/x509"
6+
"fmt"
67
"os"
78
"path/filepath"
89
"strings"
@@ -15,6 +16,8 @@ import (
1516
apierrors "k8s.io/apimachinery/pkg/api/errors"
1617
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1718
"k8s.io/apimachinery/pkg/fields"
19+
"k8s.io/apimachinery/pkg/labels"
20+
"k8s.io/apimachinery/pkg/util/sets"
1821
"k8s.io/apimachinery/pkg/util/wait"
1922
"k8s.io/apimachinery/pkg/watch"
2023
"k8s.io/client-go/kubernetes"
@@ -25,6 +28,8 @@ import (
2528

2629
configv1 "github.com/openshift/api/config/v1"
2730
configclient "github.com/openshift/client-go/config/clientset/versioned"
31+
configinformers "github.com/openshift/client-go/config/informers/externalversions"
32+
configlisters "github.com/openshift/client-go/config/listers/config/v1"
2833
routeclient "github.com/openshift/client-go/route/clientset/versioned"
2934
"github.com/openshift/installer/cmd/openshift-install/command"
3035
"github.com/openshift/installer/pkg/asset"
@@ -54,6 +59,11 @@ const (
5459
exitCodeInfrastructureFailed
5560
exitCodeBootstrapFailed
5661
exitCodeInstallFailed
62+
exitCodeOperatorStabilityFailed
63+
64+
// coStabilityThreshold is how long a cluster operator must have Progressing=False
65+
// in order to be considered stable. Measured in seconds.
66+
coStabilityThreshold float64 = 30
5767
)
5868

5969
// each target is a variable to preserve the order when creating subcommands and still
@@ -501,7 +511,7 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
501511
defer cancel()
502512

503513
failing := configv1.ClusterStatusConditionType("Failing")
504-
timer.StartTimer("Cluster Operators")
514+
timer.StartTimer("Cluster Operators Available")
505515
var lastError string
506516
_, err = clientwatch.UntilWithSync(
507517
clusterVersionContext,
@@ -519,7 +529,7 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
519529
if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorAvailable) &&
520530
cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, failing) &&
521531
cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, configv1.OperatorProgressing) {
522-
timer.StopTimer("Cluster Operators")
532+
timer.StopTimer("Cluster Operators Available")
523533
return true, nil
524534
}
525535
if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, failing) {
@@ -551,6 +561,54 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
551561
return errors.Wrap(err, "failed to initialize the cluster")
552562
}
553563

564+
// waitForStableOperators ensures that each cluster operator is "stable", i.e. the
565+
// operator has not been in a progressing state for at least a certain duration,
566+
// 30 seconds by default. Returns an error if any operator does meet this threshold
567+
// after a deadline, 30 minutes by default.
568+
func waitForStableOperators(ctx context.Context, config *rest.Config) error {
569+
timer.StartTimer("Cluster Operators Stable")
570+
571+
stabilityCheckDuration := 30 * time.Minute
572+
stabilityContext, cancel := context.WithTimeout(ctx, stabilityCheckDuration)
573+
defer cancel()
574+
575+
untilTime := time.Now().Add(stabilityCheckDuration)
576+
timezone, _ := untilTime.Zone()
577+
logrus.Infof("Waiting up to %v (until %v %s) to ensure each cluster operator has finished progressing...",
578+
stabilityCheckDuration, untilTime.Format(time.Kitchen), timezone)
579+
580+
cc, err := configclient.NewForConfig(config)
581+
if err != nil {
582+
return errors.Wrap(err, "failed to create a config client")
583+
}
584+
configInformers := configinformers.NewSharedInformerFactory(cc, 0)
585+
clusterOperatorInformer := configInformers.Config().V1().ClusterOperators().Informer()
586+
clusterOperatorLister := configInformers.Config().V1().ClusterOperators().Lister()
587+
configInformers.Start(ctx.Done())
588+
if !cache.WaitForCacheSync(ctx.Done(), clusterOperatorInformer.HasSynced) {
589+
return fmt.Errorf("informers never started")
590+
}
591+
592+
waitErr := wait.PollUntilContextCancel(stabilityContext, 1*time.Second, true, waitForAllClusterOperators(clusterOperatorLister))
593+
if waitErr != nil {
594+
logrus.Errorf("Error checking cluster operator Progressing status: %q", waitErr)
595+
stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
596+
if err != nil {
597+
logrus.Errorf("Error checking final cluster operator Progressing status: %q", err)
598+
}
599+
logrus.Debugf("These cluster operators were stable: [%s]", strings.Join(sets.List(stableOperators), ", "))
600+
logrus.Errorf("These cluster operators were not stable: [%s]", strings.Join(sets.List(unstableOperators), ", "))
601+
602+
logrus.Exit(exitCodeOperatorStabilityFailed)
603+
}
604+
605+
timer.StopTimer("Cluster Operators Stable")
606+
607+
logrus.Info("All cluster operators have completed progressing")
608+
609+
return nil
610+
}
611+
554612
// getConsole returns the console URL from the route 'console' in namespace openshift-console
555613
func getConsole(ctx context.Context, config *rest.Config) (string, error) {
556614
url := ""
@@ -637,6 +695,10 @@ func waitForInstallComplete(ctx context.Context, config *rest.Config, directory
637695
return err
638696
}
639697

698+
if err := waitForStableOperators(ctx, config); err != nil {
699+
return err
700+
}
701+
640702
consoleURL, err := getConsole(ctx, config)
641703
if err != nil {
642704
logrus.Warnf("Cluster does not have a console available: %v", err)
@@ -657,3 +719,62 @@ func checkIfAgentCommand(assetStore asset.Store) {
657719
logrus.Warning("An agent configuration was detected but this command is not the agent wait-for command")
658720
}
659721
}
722+
723+
func waitForAllClusterOperators(clusterOperatorLister configlisters.ClusterOperatorLister) func(ctx context.Context) (bool, error) {
724+
previouslyStableOperators := sets.Set[string]{}
725+
726+
return func(ctx context.Context) (bool, error) {
727+
stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
728+
if err != nil {
729+
return false, err
730+
}
731+
if newlyStableOperators := stableOperators.Difference(previouslyStableOperators); len(newlyStableOperators) > 0 {
732+
for _, name := range sets.List(newlyStableOperators) {
733+
logrus.Debugf("Cluster Operator %s is stable", name)
734+
}
735+
}
736+
if newlyUnstableOperators := previouslyStableOperators.Difference(stableOperators); len(newlyUnstableOperators) > 0 {
737+
for _, name := range sets.List(newlyUnstableOperators) {
738+
logrus.Debugf("Cluster Operator %s became unstable", name)
739+
}
740+
}
741+
previouslyStableOperators = stableOperators
742+
743+
if len(unstableOperators) == 0 {
744+
return true, nil
745+
}
746+
747+
return false, nil
748+
}
749+
}
750+
751+
func currentOperatorStability(clusterOperatorLister configlisters.ClusterOperatorLister) (sets.Set[string], sets.Set[string], error) {
752+
clusterOperators, err := clusterOperatorLister.List(labels.Everything())
753+
if err != nil {
754+
return nil, nil, err // lister should never fail
755+
}
756+
757+
stableOperators := sets.Set[string]{}
758+
unstableOperators := sets.Set[string]{}
759+
for _, clusterOperator := range clusterOperators {
760+
name := clusterOperator.Name
761+
progressing := cov1helpers.FindStatusCondition(clusterOperator.Status.Conditions, configv1.OperatorProgressing)
762+
if progressing == nil {
763+
logrus.Debugf("Cluster Operator %s progressing == nil", name)
764+
unstableOperators.Insert(name)
765+
continue
766+
}
767+
if meetsStabilityThreshold(progressing) {
768+
stableOperators.Insert(name)
769+
} else {
770+
logrus.Debugf("Cluster Operator %s is Progressing=%s LastTransitionTime=%v DurationSinceTransition=%.fs Reason=%s Message=%s", name, progressing.Status, progressing.LastTransitionTime.Time, time.Since(progressing.LastTransitionTime.Time).Seconds(), progressing.Reason, progressing.Message)
771+
unstableOperators.Insert(name)
772+
}
773+
}
774+
775+
return stableOperators, unstableOperators, nil
776+
}
777+
778+
func meetsStabilityThreshold(progressing *configv1.ClusterOperatorStatusCondition) bool {
779+
return progressing.Status == configv1.ConditionFalse && time.Since(progressing.LastTransitionTime.Time).Seconds() > coStabilityThreshold
780+
}

vendor/github.com/openshift/client-go/config/informers/externalversions/config/interface.go

Lines changed: 38 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/openshift/client-go/config/informers/externalversions/config/v1/apiserver.go

Lines changed: 73 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/openshift/client-go/config/informers/externalversions/config/v1/authentication.go

Lines changed: 73 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)