Skip to content

Commit 07b6f8d

Browse files
Merge pull request #8450 from patrickdillon/debug-gather
OCPBUGS-34295: wait for ip addresses to be present on machines
2 parents 555a04b + 943f807 commit 07b6f8d

File tree

10 files changed

+95
-16
lines changed

10 files changed

+95
-16
lines changed

pkg/infrastructure/aws/clusterapi/aws.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ type Provider struct{}
4040
// Name gives the name of the provider, AWS.
4141
func (*Provider) Name() string { return awstypes.Name }
4242

43+
// BootstrapHasPublicIP indicates that machine ready checks
44+
// should wait for an ExternalIP in the status.
45+
func (*Provider) BootstrapHasPublicIP() bool { return true }
46+
4347
// PreProvision creates the IAM roles used by all nodes in the cluster.
4448
func (*Provider) PreProvision(ctx context.Context, in clusterapi.PreProvisionInput) error {
4549
if err := createIAMRoles(ctx, in.InfraID, in.InstallConfig); err != nil {

pkg/infrastructure/azure/azure.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ func (p *Provider) Name() string {
5151
return aztypes.Name
5252
}
5353

54+
// BootstrapHasPublicIP indicates that an ExternalIP is not
55+
// required in the machine ready checks.
56+
func (*Provider) BootstrapHasPublicIP() bool { return false }
57+
5458
// PreProvision is called before provisioning using CAPI controllers has begun.
5559
func (p *Provider) PreProvision(ctx context.Context, in clusterapi.PreProvisionInput) error {
5660
session, err := in.InstallConfig.Azure.Session()

pkg/infrastructure/clusterapi/clusterapi.go

Lines changed: 57 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1616
utilerrors "k8s.io/apimachinery/pkg/util/errors"
1717
"k8s.io/apimachinery/pkg/util/wait"
18+
"k8s.io/utils/ptr"
1819
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
1920
utilkubeconfig "sigs.k8s.io/cluster-api/util/kubeconfig"
2021
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -269,36 +270,39 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
269270
masterIgnSecret := IgnitionSecret(masterIgnAsset.Files()[0].Data, clusterID.InfraID, "master")
270271
machineManifests = append(machineManifests, bootstrapIgnSecret, masterIgnSecret)
271272

272-
timer.StartTimer(machineStage)
273273
// Create the machine manifests.
274+
timer.StartTimer(machineStage)
275+
machineNames := []string{}
276+
274277
for _, m := range machineManifests {
275278
m.SetNamespace(capiutils.Namespace)
276279
if err := cl.Create(ctx, m); err != nil {
277280
return fileList, fmt.Errorf("failed to create control-plane manifest: %w", err)
278281
}
279282
i.appliedManifests = append(i.appliedManifests, m)
283+
284+
if machine, ok := m.(*clusterv1.Machine); ok {
285+
machineNames = append(machineNames, machine.Name)
286+
}
280287
logrus.Infof("Created manifest %+T, namespace=%s name=%s", m, m.GetNamespace(), m.GetName())
281288
}
282289

283290
{
284-
masterCount := int64(1)
285-
if reps := installConfig.Config.ControlPlane.Replicas; reps != nil {
286-
masterCount = *reps
287-
}
288-
289291
untilTime := time.Now().Add(timeout)
290292
timezone, _ := untilTime.Zone()
291-
logrus.Infof("Waiting up to %v (until %v %s) for machines to provision...", timeout, untilTime.Format(time.Kitchen), timezone)
293+
reqBootstrapPubIP := installConfig.Config.Publish == types.ExternalPublishingStrategy && i.impl.BootstrapHasPublicIP()
294+
logrus.Infof("Waiting up to %v (until %v %s) for machines %v to provision...", timeout, untilTime.Format(time.Kitchen), timezone, machineNames)
292295
if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
293296
Duration: time.Second * 10,
294297
Factor: float64(1.5),
295298
Steps: 32,
296299
Cap: timeout,
297300
}, func(ctx context.Context) (bool, error) {
298-
for i := int64(0); i < masterCount; i++ {
301+
allReady := true
302+
for _, machineName := range machineNames {
299303
machine := &clusterv1.Machine{}
300304
if err := cl.Get(ctx, client.ObjectKey{
301-
Name: fmt.Sprintf("%s-%s-%d", clusterID.InfraID, "master", i),
305+
Name: machineName,
302306
Namespace: capiutils.Namespace,
303307
}, machine); err != nil {
304308
if apierrors.IsNotFound(err) {
@@ -307,15 +311,18 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
307311
}
308312
return false, err
309313
}
310-
if machine.Status.Phase != string(clusterv1.MachinePhaseProvisioned) &&
311-
machine.Status.Phase != string(clusterv1.MachinePhaseRunning) {
312-
return false, nil
313-
} else if machine.Status.Phase == string(clusterv1.MachinePhaseFailed) {
314-
return false, fmt.Errorf("machine %s failed to provision: %q", machine.Name, *machine.Status.FailureMessage)
314+
reqPubIP := reqBootstrapPubIP && machineName == capiutils.GenerateBoostrapMachineName(clusterID.InfraID)
315+
ready, err := checkMachineReady(machine, reqPubIP)
316+
if err != nil {
317+
return false, fmt.Errorf("failed waiting for machines: %w", err)
318+
}
319+
if !ready {
320+
allReady = false
321+
} else {
322+
logrus.Debugf("Machine %s is ready. Phase: %s", machine.Name, machine.Status.Phase)
315323
}
316-
logrus.Debugf("Machine %s is ready. Phase: %s", machine.Name, machine.Status.Phase)
317324
}
318-
return true, nil
325+
return allReady, nil
319326
}); err != nil {
320327
if wait.Interrupted(err) {
321328
return fileList, fmt.Errorf("control-plane machines were not provisioned within %v: %w", timeout, err)
@@ -550,3 +557,37 @@ func (i *InfraProvider) collectManifests(ctx context.Context, cl client.Client)
550557
}
551558
return fileList, errorList
552559
}
560+
561+
func checkMachineReady(machine *clusterv1.Machine, requirePublicIP bool) (bool, error) {
562+
logrus.Debugf("Checking that machine %s has provisioned...", machine.Name)
563+
if machine.Status.Phase != string(clusterv1.MachinePhaseProvisioned) &&
564+
machine.Status.Phase != string(clusterv1.MachinePhaseRunning) {
565+
logrus.Debugf("Machine %s has not yet provisioned: %s", machine.Name, machine.Status.Phase)
566+
return false, nil
567+
} else if machine.Status.Phase == string(clusterv1.MachinePhaseFailed) {
568+
msg := ptr.Deref(machine.Status.FailureMessage, "machine.Status.FailureMessage was not set")
569+
return false, fmt.Errorf("machine %s failed to provision: %s", machine.Name, msg)
570+
}
571+
logrus.Debugf("Machine %s has status: %s", machine.Name, machine.Status.Phase)
572+
return hasRequiredIP(machine, requirePublicIP), nil
573+
}
574+
575+
func hasRequiredIP(machine *clusterv1.Machine, requirePublicIP bool) bool {
576+
logrus.Debugf("Checking that IP addresses are populated in the status of machine %s...", machine.Name)
577+
578+
for _, addr := range machine.Status.Addresses {
579+
switch {
580+
case len(addr.Address) == 0:
581+
continue
582+
case addr.Type == clusterv1.MachineExternalIP:
583+
logrus.Debugf("Found external IP address: %s", addr.Address)
584+
return true
585+
case addr.Type == clusterv1.MachineInternalIP && !requirePublicIP:
586+
logrus.Debugf("Found internal IP address: %s", addr.Address)
587+
return true
588+
}
589+
logrus.Debugf("Checked IP %s: %s", addr.Type, addr.Address)
590+
}
591+
logrus.Debugf("Still waiting for machine %s to get required IPs", machine.Name)
592+
return false
593+
}

pkg/infrastructure/clusterapi/types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ import (
1818
type Provider interface {
	// Name returns the name of the cloud platform.
	Name() string

	// BootstrapHasPublicIP reports whether the bootstrap node is expected
	// to have a public IP address in a public cluster. A true result makes
	// the machine ready checks wait until an ExternalIP address has been
	// populated in the machine status.
	BootstrapHasPublicIP() bool
}
2228

2329
// PreProvider defines the PreProvision hook, which is called prior to

pkg/infrastructure/gcp/clusterapi/clusterapi.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ func (p Provider) Name() string {
3737
return gcptypes.Name
3838
}
3939

40+
// BootstrapHasPublicIP indicates that machine ready checks
41+
// should wait for an ExternalIP in the status.
42+
func (Provider) BootstrapHasPublicIP() bool { return true }
43+
4044
// PreProvision is called before provisioning using CAPI controllers has initiated.
4145
// GCP resources that are not created by CAPG (and are required for other stages of the install) are
4246
// created here using the gcp sdk.

pkg/infrastructure/ibmcloud/clusterapi/clusterapi.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ func (p Provider) Name() string {
1818
return ibmcloudtypes.Name
1919
}
2020

21+
// BootstrapHasPublicIP indicates that an ExternalIP is not
22+
// required in the machine ready checks.
23+
func (Provider) BootstrapHasPublicIP() bool { return false }
24+
2125
// PreProvision creates the IBM Cloud objects required prior to running capibmcloud.
2226
func (p Provider) PreProvision(ctx context.Context, in clusterapi.PreProvisionInput) error {
2327
return nil

pkg/infrastructure/nutanix/clusterapi/clusterapi.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ func (p Provider) Name() string {
2626
return nutanixtypes.Name
2727
}
2828

29+
// BootstrapHasPublicIP indicates that an ExternalIP is not
30+
// required in the machine ready checks.
31+
func (Provider) BootstrapHasPublicIP() bool { return false }
32+
2933
// PreProvision creates the resources required prior to running capi nutanix controller.
3034
func (p Provider) PreProvision(ctx context.Context, in infracapi.PreProvisionInput) error {
3135
// create categories with name "kubernetes-io-cluster-<cluster_id>" and values ["owned", "shared"].

pkg/infrastructure/openstack/clusterapi/clusterapi.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ func (p Provider) Name() string {
3131
return openstack.Name
3232
}
3333

34+
// BootstrapHasPublicIP indicates that an ExternalIP is not
35+
// required in the machine ready checks.
36+
func (Provider) BootstrapHasPublicIP() bool { return false }
37+
3438
var _ clusterapi.PreProvider = Provider{}
3539

3640
// PreProvision tags the VIP ports, and creates the security groups and the

pkg/infrastructure/powervs/clusterapi/powervs.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ func (p Provider) Name() string {
3838
return powervstypes.Name
3939
}
4040

41+
// BootstrapHasPublicIP indicates that an ExternalIP is not
42+
// required in the machine ready checks.
43+
func (Provider) BootstrapHasPublicIP() bool { return false }
44+
4145
func leftInContext(ctx context.Context) time.Duration {
4246
deadline, ok := ctx.Deadline()
4347
if !ok {

pkg/infrastructure/vsphere/clusterapi/clusterapi.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ func (p Provider) Name() string {
2727
return vsphere.Name
2828
}
2929

30+
// BootstrapHasPublicIP indicates that an ExternalIP is not
31+
// required in the machine ready checks.
32+
func (Provider) BootstrapHasPublicIP() bool { return false }
33+
3034
func initializeFoldersAndTemplates(ctx context.Context, cachedImage string, failureDomain vsphere.FailureDomain, session *session.Session, diskType vsphere.DiskType, clusterID, tagID string) error {
3135
finder := session.Finder
3236

0 commit comments

Comments
 (0)