Skip to content

Commit 9c8cfd4

Browse files
Merge pull request #8307 from patrickdillon/capi-timeout
CORS-3437: infra/capi: add provisioning timeout
2 parents faa2e21 + 4e2c8f6 commit 9c8cfd4

File tree

1 file changed

+43
-4
lines changed

1 file changed

+43
-4
lines changed

pkg/infrastructure/clusterapi/clusterapi.go

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"github.com/openshift/installer/pkg/asset/rhcos"
3333
"github.com/openshift/installer/pkg/clusterapi"
3434
"github.com/openshift/installer/pkg/infrastructure"
35+
"github.com/openshift/installer/pkg/metrics/timer"
3536
"github.com/openshift/installer/pkg/types"
3637
)
3738

@@ -40,6 +41,18 @@ import (
4041
// interface the installer uses to call this provider.
4142
var _ infrastructure.Provider = (*InfraProvider)(nil)
4243

44+
const (
45+
// timeout for each provisioning step.
46+
timeout = 15 * time.Minute
47+
48+
preProvisionStage = "Infrastructure Pre-provisioning"
49+
infrastructureStage = "Network-infrastructure Provisioning"
50+
infrastructureReadyStage = "Post-network, pre-machine Provisioning"
51+
ignitionStage = "Bootstrap Ignition Provisioning"
52+
machineStage = "Machine Provisioning"
53+
postProvisionStage = "Infrastructure Post-provisioning"
54+
)
55+
4356
// InfraProvider implements common Cluster API logic and
4457
// contains the platform CAPI provider, which is called
4558
// in the lifecycle defined by the Provider interface.
@@ -107,15 +120,17 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
107120
MachineManifests: machineManifests,
108121
WorkersAsset: workersAsset,
109122
}
110-
123+
timer.StartTimer(preProvisionStage)
111124
if err := p.PreProvision(ctx, preProvisionInput); err != nil {
112125
return fileList, fmt.Errorf("failed during pre-provisioning: %w", err)
113126
}
127+
timer.StopTimer(preProvisionStage)
114128
} else {
115129
logrus.Debugf("No pre-provisioning requirements for the %s provider", i.impl.Name())
116130
}
117131

118132
// Run the CAPI system.
133+
timer.StartTimer(infrastructureStage)
119134
capiSystem := clusterapi.System()
120135
if err := capiSystem.Run(ctx, installConfig); err != nil {
121136
return fileList, fmt.Errorf("failed to run cluster api system: %w", err)
@@ -153,12 +168,16 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
153168

154169
// Wait for successful provisioning by checking the InfrastructureReady
155170
// status on the cluster object.
171+
untilTime := time.Now().Add(timeout)
172+
timezone, _ := untilTime.Zone()
173+
logrus.Infof("Waiting up to %v (until %v %s) for network infrastructure to become ready...", timeout, untilTime.Format(time.Kitchen), timezone)
156174
var cluster *clusterv1.Cluster
157175
{
158176
if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
159177
Duration: time.Second * 10,
160178
Factor: float64(1.5),
161179
Steps: 32,
180+
Cap: timeout,
162181
}, func(ctx context.Context) (bool, error) {
163182
c := &clusterv1.Cluster{}
164183
if err := cl.Get(ctx, client.ObjectKey{
@@ -173,7 +192,10 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
173192
cluster = c
174193
return cluster.Status.InfrastructureReady, nil
175194
}); err != nil {
176-
return fileList, err
195+
if wait.Interrupted(err) {
196+
return fileList, fmt.Errorf("infrastructure was not ready within %v: %w", timeout, err)
197+
}
198+
return fileList, fmt.Errorf("infrastructure is not ready: %w", err)
177199
}
178200
if cluster == nil {
179201
return fileList, fmt.Errorf("error occurred during load balancer ready check")
@@ -182,6 +204,8 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
182204
return fileList, fmt.Errorf("control plane endpoint is not set")
183205
}
184206
}
207+
timer.StopTimer(infrastructureStage)
208+
logrus.Info("Network infrastructure is ready")
185209

186210
if p, ok := i.impl.(InfraReadyProvider); ok {
187211
infraReadyInput := InfraReadyInput{
@@ -190,9 +214,11 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
190214
InfraID: clusterID.InfraID,
191215
}
192216

217+
timer.StartTimer(infrastructureReadyStage)
193218
if err := p.InfraReady(ctx, infraReadyInput); err != nil {
194219
return fileList, fmt.Errorf("failed provisioning resources after infrastructure ready: %w", err)
195220
}
221+
timer.StopTimer(infrastructureReadyStage)
196222
} else {
197223
logrus.Debugf("No infrastructure ready requirements for the %s provider", i.impl.Name())
198224
}
@@ -213,16 +239,19 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
213239
TFVarsAsset: tfvarsAsset,
214240
}
215241

242+
timer.StartTimer(ignitionStage)
216243
if bootstrapIgnData, err = p.Ignition(ctx, ignInput); err != nil {
217244
return fileList, fmt.Errorf("failed preparing ignition data: %w", err)
218245
}
246+
timer.StopTimer(ignitionStage)
219247
} else {
220248
logrus.Debugf("No Ignition requirements for the %s provider", i.impl.Name())
221249
}
222250
bootstrapIgnSecret := IgnitionSecret(bootstrapIgnData, clusterID.InfraID, "bootstrap")
223251
masterIgnSecret := IgnitionSecret(masterIgnAsset.Files()[0].Data, clusterID.InfraID, "master")
224252
machineManifests = append(machineManifests, bootstrapIgnSecret, masterIgnSecret)
225253

254+
timer.StartTimer(machineStage)
226255
// Create the machine manifests.
227256
for _, m := range machineManifests {
228257
m.SetNamespace(capiutils.Namespace)
@@ -238,11 +267,14 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
238267
masterCount = *reps
239268
}
240269

241-
logrus.Debugf("Waiting for machines to provision")
270+
untilTime := time.Now().Add(timeout)
271+
timezone, _ := untilTime.Zone()
272+
logrus.Infof("Waiting up to %v (until %v %s) for machines to provision...", timeout, untilTime.Format(time.Kitchen), timezone)
242273
if err := wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
243274
Duration: time.Second * 10,
244275
Factor: float64(1.5),
245276
Steps: 32,
277+
Cap: timeout,
246278
}, func(ctx context.Context) (bool, error) {
247279
for i := int64(0); i < masterCount; i++ {
248280
machine := &clusterv1.Machine{}
@@ -266,9 +298,14 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
266298
}
267299
return true, nil
268300
}); err != nil {
269-
return fileList, err
301+
if wait.Interrupted(err) {
302+
return fileList, fmt.Errorf("control-plane machines were not provisioned within %v: %w", timeout, err)
303+
}
304+
return fileList, fmt.Errorf("control-plane machines are not ready: %w", err)
270305
}
271306
}
307+
timer.StopTimer(machineStage)
308+
logrus.Info("Control-plane machines are ready")
272309

273310
if p, ok := i.impl.(PostProvider); ok {
274311
postMachineInput := PostProvisionInput{
@@ -277,9 +314,11 @@ func (i *InfraProvider) Provision(ctx context.Context, dir string, parents asset
277314
InfraID: clusterID.InfraID,
278315
}
279316

317+
timer.StartTimer(postProvisionStage)
280318
if err = p.PostProvision(ctx, postMachineInput); err != nil {
281319
return fileList, fmt.Errorf("failed during post-machine creation hook: %w", err)
282320
}
321+
timer.StopTimer(postProvisionStage)
283322
}
284323

285324
// For each manifest we created, retrieve it and store it in the asset.

0 commit comments

Comments (0)