Skip to content

Commit 5f466c2

Browse files
Merge pull request openshift#8004 from eggfoobar/fix-sno-bootstrap-race-condition
OCPBUGS-29429: feat: add check for SNO bootstrap condition
2 parents 882a294 + 4cbcb6c commit 5f466c2

File tree

1 file changed

+93
-1
lines changed

1 file changed

+93
-1
lines changed

cmd/openshift-install/create.go

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,20 @@ import (
1818
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1919
"k8s.io/apimachinery/pkg/fields"
2020
"k8s.io/apimachinery/pkg/labels"
21+
"k8s.io/apimachinery/pkg/runtime"
22+
"k8s.io/apimachinery/pkg/runtime/schema"
2123
"k8s.io/apimachinery/pkg/util/sets"
2224
"k8s.io/apimachinery/pkg/util/wait"
2325
"k8s.io/apimachinery/pkg/watch"
26+
"k8s.io/client-go/dynamic"
2427
"k8s.io/client-go/kubernetes"
2528
"k8s.io/client-go/rest"
2629
"k8s.io/client-go/tools/cache"
2730
"k8s.io/client-go/tools/clientcmd"
2831
clientwatch "k8s.io/client-go/tools/watch"
2932

3033
configv1 "github.com/openshift/api/config/v1"
34+
operatorv1 "github.com/openshift/api/operator/v1"
3135
configclient "github.com/openshift/client-go/config/clientset/versioned"
3236
configinformers "github.com/openshift/client-go/config/informers/externalversions"
3337
configlisters "github.com/openshift/client-go/config/listers/config/v1"
@@ -431,7 +435,15 @@ func waitForBootstrapComplete(ctx context.Context, config *rest.Config) *cluster
431435
return newAPIError(err)
432436
}
433437

434-
return waitForBootstrapConfigMap(ctx, client)
438+
if err := waitForBootstrapConfigMap(ctx, client); err != nil {
439+
return err
440+
}
441+
442+
if err := waitForStableSNOBootstrap(ctx, config); err != nil {
443+
return newBootstrapError(err)
444+
}
445+
446+
return nil
435447
}
436448

437449
// waitForBootstrapConfigMap watches the configmaps in the kube-system namespace
@@ -488,6 +500,64 @@ func waitForBootstrapConfigMap(ctx context.Context, client *kubernetes.Clientset
488500
return nil
489501
}
490502

503+
// When bootstrap on SNO deployments, we should not remove the bootstrap node prematurely,
504+
// here we make sure that the deployment is stable.
505+
// Given the nature of single node we just need to make sure things such as etcd are in the proper state
506+
// before continuing.
507+
func waitForStableSNOBootstrap(ctx context.Context, config *rest.Config) error {
508+
timeout := 5 * time.Minute
509+
510+
// If we're not in a single node deployment, bail early
511+
if isSNO, err := IsSingleNode(); err != nil {
512+
logrus.Warningf("Can not determine if installing a Single Node cluster, continuing as normal install: %v", err)
513+
return nil
514+
} else if !isSNO {
515+
return nil
516+
}
517+
518+
snoBootstrapContext, cancel := context.WithTimeout(ctx, timeout)
519+
defer cancel()
520+
521+
untilTime := time.Now().Add(timeout)
522+
timezone, _ := untilTime.Zone()
523+
logrus.Info("Detected Single Node deployment")
524+
logrus.Infof("Waiting up to %v (until %v %s) for the bootstrap etcd member to be removed...",
525+
timeout, untilTime.Format(time.Kitchen), timezone)
526+
527+
client, err := dynamic.NewForConfig(config)
528+
if err != nil {
529+
return fmt.Errorf("error creating dynamic client: %w", err)
530+
}
531+
gvr := schema.GroupVersionResource{
532+
Group: operatorv1.SchemeGroupVersion.Group,
533+
Version: operatorv1.SchemeGroupVersion.Version,
534+
Resource: "etcds",
535+
}
536+
resourceClient := client.Resource(gvr)
537+
// Validate the etcd operator has removed the bootstrap etcd member
538+
return wait.PollUntilContextCancel(snoBootstrapContext, 1*time.Second, true, func(ctx context.Context) (done bool, err error) {
539+
etcdOperator := &operatorv1.Etcd{}
540+
etcdUnstructured, err := resourceClient.Get(ctx, "cluster", metav1.GetOptions{})
541+
if err != nil {
542+
// There might be service disruptions in SNO, we log those here but keep trying with in the time limit
543+
logrus.Debugf("Error getting ETCD Cluster resource, retrying: %v", err)
544+
return false, nil
545+
}
546+
err = runtime.DefaultUnstructuredConverter.FromUnstructured(etcdUnstructured.Object, etcdOperator)
547+
if err != nil {
548+
// This error should not happen, if we do, we log the error and keep retrying until we hit the limit
549+
logrus.Debugf("Error parsing etcds resource, retrying: %v", err)
550+
return false, nil
551+
}
552+
for _, condition := range etcdOperator.Status.Conditions {
553+
if condition.Type == "EtcdBootstrapMemberRemoved" {
554+
return configv1.ConditionStatus(condition.Status) == configv1.ConditionTrue, nil
555+
}
556+
}
557+
return false, nil
558+
})
559+
}
560+
491561
// waitForInitializedCluster watches the ClusterVersion waiting for confirmation
492562
// that the cluster has been initialized.
493563
func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
@@ -839,3 +909,25 @@ func handleUnreachableAPIServer(config *rest.Config) error {
839909

840910
return nil
841911
}
912+
913+
// IsSingleNode determines if we are in a single node configuration based off of the install config
914+
// loaded from the asset store.
915+
func IsSingleNode() (bool, error) {
916+
assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
917+
if err != nil {
918+
return false, fmt.Errorf("error loading asset store: %w", err)
919+
}
920+
installConfig, err := assetStore.Load(&installconfig.InstallConfig{})
921+
if err != nil {
922+
return false, fmt.Errorf("error loading installConfig: %w", err)
923+
}
924+
if installConfig == nil {
925+
return false, fmt.Errorf("installConfig loaded from asset store was nil")
926+
}
927+
928+
config := installConfig.(*installconfig.InstallConfig).Config
929+
if machinePool := config.ControlPlane; machinePool != nil {
930+
return *machinePool.Replicas == int64(1), nil
931+
}
932+
return false, nil
933+
}

0 commit comments

Comments
 (0)