diff --git a/cloud/interfaces.go b/cloud/interfaces.go
index 27abdcb42..0202bb9f1 100644
--- a/cloud/interfaces.go
+++ b/cloud/interfaces.go
@@ -86,6 +86,8 @@ type MachineGetter interface {
 	Project() string
 	Role() string
 	IsControlPlane() bool
+	IsFirstMachine() bool
+	IsAPIServerHealthy() bool
 	ControlPlaneGroupName() string
 	GetInstanceID() *string
 	GetProviderID() string
diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go
index 8173728f2..25e2842e6 100644
--- a/cloud/scope/machine.go
+++ b/cloud/scope/machine.go
@@ -47,6 +47,7 @@ type MachineScopeParams struct {
 	ClusterGetter cloud.ClusterGetter
 	Machine       *clusterv1.Machine
 	GCPMachine    *infrav1.GCPMachine
+	IsFirst       bool
 }
 
 // NewMachineScope creates a new MachineScope from the supplied parameters.
@@ -73,6 +74,7 @@ func NewMachineScope(params MachineScopeParams) (*MachineScope, error) {
 		GCPMachine:    params.GCPMachine,
 		ClusterGetter: params.ClusterGetter,
 		patchHelper:   helper,
+		IsFirst:       params.IsFirst,
 	}, nil
 }
 
@@ -83,6 +85,7 @@ type MachineScope struct {
 	ClusterGetter cloud.ClusterGetter
 	Machine       *clusterv1.Machine
 	GCPMachine    *infrav1.GCPMachine
+	IsFirst       bool
 }
 
 // ANCHOR: MachineGetter
@@ -140,6 +143,27 @@ func (m *MachineScope) IsControlPlane() bool {
 	return IsControlPlaneMachine(m.Machine)
 }
 
+// IsFirstMachine reports whether this machine was flagged as the first machine
+// of the cluster (MachineScopeParams.IsFirst) when the scope was created.
+func (m *MachineScope) IsFirstMachine() bool {
+	return m.IsFirst
+}
+
+// IsAPIServerHealthy reports whether the Machine's v1beta2 status carries an
+// "APIServerPodHealthy" condition whose status is "True". It returns false
+// when the v1beta2 status is not set or the condition is absent.
+func (m *MachineScope) IsAPIServerHealthy() bool {
+	if m.Machine.Status.V1Beta2 == nil {
+		return false
+	}
+	for _, condition := range m.Machine.Status.V1Beta2.Conditions {
+		if condition.Type == "APIServerPodHealthy" && condition.Status == "True" {
+			return true
+		}
+	}
+	return false
+}
+
 // Role returns the machine role from the labels.
 func (m *MachineScope) Role() string {
 	if IsControlPlaneMachine(m.Machine) {
diff --git a/cloud/services/compute/instances/reconcile.go b/cloud/services/compute/instances/reconcile.go
index 76277d072..462b38072 100644
--- a/cloud/services/compute/instances/reconcile.go
+++ b/cloud/services/compute/instances/reconcile.go
@@ -89,8 +89,17 @@ func (s *Service) Reconcile(ctx context.Context) error {
 	s.scope.SetInstanceStatus(infrav1.InstanceStatus(instance.Status))
 
 	if s.scope.IsControlPlane() {
-		if err := s.registerControlPlaneInstance(ctx, instance); err != nil {
-			return err
+		// If the instance is part of the control plane, we need to ensure it's
+		// registered with the instance group. We only do this if the API server is healthy or if this is the first
+		// control plane machine. This prevents a hairpinning issue where a new control plane machine attempts to reach
+		// the API server via a load balancer that is not yet ready. The first control plane machine is handled specially
+		// by the kubeadm controller, so it can be added to the instance group immediately.
+		if s.scope.IsAPIServerHealthy() || s.scope.IsFirstMachine() {
+			if err := s.registerControlPlaneInstance(ctx, instance); err != nil {
+				return err
+			}
+		} else {
+			log.Info("Waiting for API server to be healthy before registering control plane instance in instance group")
 		}
 	}
 
diff --git a/controllers/gcpmachine_controller.go b/controllers/gcpmachine_controller.go
index 87c4f582c..6ca685e79 100644
--- a/controllers/gcpmachine_controller.go
+++ b/controllers/gcpmachine_controller.go
@@ -178,6 +178,21 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		return ctrl.Result{}, nil
 	}
 
+	// Count the control plane machines of this cluster to decide whether this
+	// machine is the first one. Worker machines are deliberately ignored: they
+	// can already exist while the first control plane machine is being created,
+	// and counting them would keep that machine out of the instance group.
+	machineList := &clusterv1.MachineList{}
+	if err := r.List(ctx, machineList, client.InNamespace(cluster.Namespace), client.MatchingLabels{clusterv1.ClusterNameLabel: cluster.Name}); err != nil {
+		return ctrl.Result{}, errors.Wrap(err, "failed to list machines for cluster")
+	}
+	controlPlaneMachines := 0
+	for i := range machineList.Items {
+		if scope.IsControlPlaneMachine(&machineList.Items[i]) {
+			controlPlaneMachines++
+		}
+	}
+
 	// Create the cluster scope
 	clusterScope, err := scope.NewClusterScope(ctx, scope.ClusterScopeParams{
 		Client: r.Client,
@@ -194,6 +209,7 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		Machine:       machine,
 		GCPMachine:    gcpMachine,
 		ClusterGetter: clusterScope,
+		IsFirst:       scope.IsControlPlaneMachine(machine) && controlPlaneMachines == 1,
 	})
 	if err != nil {
 		return ctrl.Result{}, errors.Errorf("failed to create scope: %+v", err)