Skip to content

Commit 2ea001d

Browse files
Merge pull request #9261 from benluddy/wait-for-etcd-bootstrap-member-removal
OCPBUGS-45482: Wait for etcd bootstrap member removal
2 parents 0170a6a + e1f9d5c commit 2ea001d

File tree

246 files changed

+22322
-64
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

246 files changed

+22322
-64
lines changed

cmd/openshift-install/create.go

Lines changed: 13 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,9 @@ import (
1818
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1919
"k8s.io/apimachinery/pkg/fields"
2020
"k8s.io/apimachinery/pkg/labels"
21-
"k8s.io/apimachinery/pkg/runtime"
22-
"k8s.io/apimachinery/pkg/runtime/schema"
2321
"k8s.io/apimachinery/pkg/util/sets"
2422
"k8s.io/apimachinery/pkg/util/wait"
2523
"k8s.io/apimachinery/pkg/watch"
26-
"k8s.io/client-go/dynamic"
2724
"k8s.io/client-go/kubernetes"
2825
"k8s.io/client-go/rest"
2926
"k8s.io/client-go/tools/cache"
@@ -35,6 +32,7 @@ import (
3532
configclient "github.com/openshift/client-go/config/clientset/versioned"
3633
configinformers "github.com/openshift/client-go/config/informers/externalversions"
3734
configlisters "github.com/openshift/client-go/config/listers/config/v1"
35+
operatorclient "github.com/openshift/client-go/operator/clientset/versioned"
3836
routeclient "github.com/openshift/client-go/route/clientset/versioned"
3937
"github.com/openshift/installer/cmd/openshift-install/command"
4038
"github.com/openshift/installer/pkg/asset"
@@ -483,7 +481,7 @@ func waitForBootstrapComplete(ctx context.Context, config *rest.Config) *cluster
483481
return err
484482
}
485483

486-
if err := waitForStableSNOBootstrap(ctx, config); err != nil {
484+
if err := waitForEtcdBootstrapMemberRemoval(ctx, config); err != nil {
487485
return newBootstrapError(err)
488486
}
489487

@@ -525,58 +523,31 @@ func waitForBootstrapConfigMap(ctx context.Context, client *kubernetes.Clientset
525523
return nil
526524
}
527525

528-
// When bootstrapping SNO deployments, we should not remove the bootstrap node prematurely;
529-
// here we make sure that the deployment is stable.
530-
// Given the nature of single node we just need to make sure things such as etcd are in the proper state
531-
// before continuing.
532-
func waitForStableSNOBootstrap(ctx context.Context, config *rest.Config) error {
526+
// If the bootstrap etcd member is cleaned up before it has been removed from the etcd cluster, the
527+
// etcd cluster cannot maintain quorum through the rollout of any single permanent member.
528+
func waitForEtcdBootstrapMemberRemoval(ctx context.Context, config *rest.Config) error {
533529
timeout := 5 * time.Minute
534530

535-
// If we're not in a single node deployment, bail early
536-
if isSNO, err := IsSingleNode(); err != nil {
537-
logrus.Warningf("Can not determine if installing a Single Node cluster, continuing as normal install: %v", err)
538-
return nil
539-
} else if !isSNO {
540-
return nil
541-
}
542-
543-
snoBootstrapContext, cancel := context.WithTimeout(ctx, timeout)
544-
defer cancel()
545-
546531
untilTime := time.Now().Add(timeout)
547532
timezone, _ := untilTime.Zone()
548-
logrus.Info("Detected Single Node deployment")
549533
logrus.Infof("Waiting up to %v (until %v %s) for the bootstrap etcd member to be removed...",
550534
timeout, untilTime.Format(time.Kitchen), timezone)
551535

552-
client, err := dynamic.NewForConfig(config)
536+
client, err := operatorclient.NewForConfig(config)
553537
if err != nil {
554-
return fmt.Errorf("error creating dynamic client: %w", err)
538+
return fmt.Errorf("error creating operator client: %w", err)
555539
}
556-
gvr := schema.GroupVersionResource{
557-
Group: operatorv1.SchemeGroupVersion.Group,
558-
Version: operatorv1.SchemeGroupVersion.Version,
559-
Resource: "etcds",
560-
}
561-
resourceClient := client.Resource(gvr)
562540
// Validate the etcd operator has removed the bootstrap etcd member
563-
return wait.PollUntilContextCancel(snoBootstrapContext, 1*time.Second, true, func(ctx context.Context) (done bool, err error) {
564-
etcdOperator := &operatorv1.Etcd{}
565-
etcdUnstructured, err := resourceClient.Get(ctx, "cluster", metav1.GetOptions{})
541+
return wait.PollUntilContextTimeout(ctx, 1*time.Second, timeout, true, func(ctx context.Context) (done bool, err error) {
542+
etcd, err := client.OperatorV1().Etcds().Get(ctx, "cluster", metav1.GetOptions{})
566543
if err != nil {
567-
// There might be service disruptions in SNO; we log those here but keep trying within the time limit
568-
logrus.Debugf("Error getting ETCD Cluster resource, retrying: %v", err)
544+
logrus.Debugf("Error getting etcd operator singleton, retrying: %v", err)
569545
return false, nil
570546
}
571-
err = runtime.DefaultUnstructuredConverter.FromUnstructured(etcdUnstructured.Object, etcdOperator)
572-
if err != nil {
573-
// This error should not happen; if it does, we log the error and keep retrying until we hit the limit
574-
logrus.Debugf("Error parsing etcds resource, retrying: %v", err)
575-
return false, nil
576-
}
577-
for _, condition := range etcdOperator.Status.Conditions {
547+
548+
for _, condition := range etcd.Status.Conditions {
578549
if condition.Type == "EtcdBootstrapMemberRemoved" {
579-
return configv1.ConditionStatus(condition.Status) == configv1.ConditionTrue, nil
550+
return condition.Status == operatorv1.ConditionTrue, nil
580551
}
581552
}
582553
return false, nil
@@ -942,25 +913,3 @@ func handleUnreachableAPIServer(ctx context.Context, config *rest.Config) error
942913

943914
return nil
944915
}
945-
946-
// IsSingleNode determines if we are in a single node configuration based on the install config
947-
// loaded from the asset store.
948-
func IsSingleNode() (bool, error) {
949-
assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
950-
if err != nil {
951-
return false, fmt.Errorf("error loading asset store: %w", err)
952-
}
953-
installConfig, err := assetStore.Load(&installconfig.InstallConfig{})
954-
if err != nil {
955-
return false, fmt.Errorf("error loading installConfig: %w", err)
956-
}
957-
if installConfig == nil {
958-
return false, fmt.Errorf("installConfig loaded from asset store was nil")
959-
}
960-
961-
config := installConfig.(*installconfig.InstallConfig).Config
962-
if machinePool := config.ControlPlane; machinePool != nil {
963-
return *machinePool.Replicas == int64(1), nil
964-
}
965-
return false, nil
966-
}

0 commit comments

Comments
 (0)