Skip to content

Commit d4bcb83

Browse files
committed
OCPBUGS-33789: [Nutanix] installer intermittently fails to upload bootstrap image data to prism-central when CAPI is enabled
1 parent d0d6bad commit d4bcb83

File tree

1 file changed

+114
-27
lines changed

1 file changed

+114
-27
lines changed

pkg/infrastructure/nutanix/clusterapi/clusterapi.go

Lines changed: 114 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@ package clusterapi
33
import (
44
"context"
55
"fmt"
6+
"strings"
7+
"time"
68

79
nutanixclientv3 "github.com/nutanix-cloud-native/prism-go-client/v3"
810
"github.com/sirupsen/logrus"
11+
"k8s.io/apimachinery/pkg/util/wait"
912
"k8s.io/utils/ptr"
1013

1114
infracapi "github.com/openshift/installer/pkg/infrastructure/clusterapi"
@@ -146,44 +149,110 @@ func (p Provider) Ignition(ctx context.Context, in infracapi.IgnitionInput) ([]b
146149
}
147150
imgReq.Metadata = imgMeta
148151

152+
// Wait for successful creation of the bootstrap image object and upload the image data to it in PC.
153+
// Put both createImage() and uploadImageData() in the same wait.ExponentialBackoffWithContext(),
154+
// because if createImage() succeeds but uploadImageData() fails, we need to delete the image object
155+
// and retry to call both createImage() and uploadImageData() again. The old-version prism-api server sometimes
156+
// returns an error for the uploadImage call and does not allow retrying the uploadImage call on the same image object.
157+
timeout := 20 * time.Minute
158+
if err = wait.ExponentialBackoffWithContext(ctx, wait.Backoff{
159+
Duration: time.Minute * 4,
160+
Factor: float64(1.0),
161+
Steps: 5,
162+
Cap: timeout,
163+
}, func(ctx context.Context) (bool, error) {
164+
// create the bootstrap image object in PC
165+
imgUUID, err1 := createImage(ctx, nutanixCl, imgReq, imgName)
166+
if err1 != nil {
167+
logrus.Errorf("failed to create the bootstrap image object %s in PC: %v", imgName, err1)
168+
// no need to retry if the error code is 401 or 403
169+
if strings.Contains(err1.Error(), `"code": 401`) || strings.Contains(err1.Error(), `"code": 403`) {
170+
return false, err1
171+
}
172+
173+
// delete the image object if uuid is not empty
174+
if imgUUID != "" {
175+
if e2 := deleteImage(ctx, nutanixCl, imgUUID); e2 != nil {
176+
logrus.Errorf("failed to delete image object %s (uuid: %s): %v", imgName, imgUUID, e2)
177+
}
178+
}
179+
return false, nil
180+
}
181+
182+
// upload the image data to the bootstrap image object in PC
183+
err2 := uploadImageData(ctx, nutanixCl, imgName, imgUUID, imgPath)
184+
if err2 != nil {
185+
logrus.Errorf("failed to upload the bootstrap image %s data: %v", imgName, err2)
186+
// no need to retry if the error code is 401 or 403
187+
if strings.Contains(err2.Error(), `"code": 401`) || strings.Contains(err2.Error(), `"code": 403`) {
188+
return false, err2
189+
}
190+
191+
// delete the image object
192+
if e2 := deleteImage(ctx, nutanixCl, imgUUID); e2 != nil {
193+
logrus.Errorf("failed to delete image object %s (uuid: %s): %v", imgName, imgUUID, e2)
194+
}
195+
return false, nil
196+
}
197+
198+
return true, nil
199+
}); err != nil {
200+
if wait.Interrupted(err) {
201+
err = fmt.Errorf("timeout/interrupt to create/upload the bootstrap image object %s in PC within %v: %w", imgName, timeout, err)
202+
} else {
203+
err = fmt.Errorf("failed to create/upload the bootstrap image object %s in PC: %w", imgName, err)
204+
}
205+
206+
return in.BootstrapIgnData, err
207+
}
208+
logrus.Infof("Successfully created the bootstrap image object %s and uploaded its image data", imgName)
209+
210+
return in.BootstrapIgnData, nil
211+
}
212+
213+
// createImage creates the image object in PC, with the provided request input.
214+
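// Returns the image UUID once the create request is accepted, even if the creation task later fails (with a non-nil error), so that the caller can delete the image object.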
215+
func createImage(ctx context.Context, nutanixCl *nutanixclientv3.Client, imgReq *nutanixclientv3.ImageIntentInput, imgName string) (string, error) {
216+
t1 := time.Now()
217+
149218
// create the image object.
150219
respi, err := nutanixCl.V3.CreateImage(ctx, imgReq)
151220
if err != nil {
152-
return nil, fmt.Errorf("failed to create the bootstrap image %q: %w", imgName, err)
221+
return "", fmt.Errorf("failed to create the image %q: %w", imgName, err)
153222
}
154223
imgUUID := *respi.Metadata.UUID
155224

156225
if taskUUID, ok := respi.Status.ExecutionContext.TaskUUID.(string); ok {
157-
logrus.Infof("creating the bootstrap image %s (uuid: %s), taskUUID: %s.", imgName, imgUUID, taskUUID)
226+
logrus.Infof("creating the image %s (uuid: %s), taskUUID: %s", imgName, imgUUID, taskUUID)
158227

159-
// Wait till the image creation task is successed.
228+
// Wait for the image creation task
160229
if err = nutanixtypes.WaitForTask(nutanixCl.V3, taskUUID); err != nil {
161-
err = fmt.Errorf("failed to create the bootstrap image %q: %w", imgName, err)
162-
logrus.Errorf(err.Error())
163-
return nil, err
230+
err = fmt.Errorf("failed to create the image %s (uuid: %s), taskUUID: %s: %w", imgName, imgUUID, taskUUID, err)
231+
} else {
232+
logrus.Infof("created the image %s (uuid: %s). used_time %v", imgName, imgUUID, time.Since(t1))
164233
}
165-
logrus.Infof("created the bootstrap image %s (uuid: %s).", imgName, imgUUID)
166234
} else {
167235
err = fmt.Errorf("failed to convert the task UUID %v to string", respi.Status.ExecutionContext.TaskUUID)
168-
logrus.Error(err)
169-
return nil, err
170236
}
171237

238+
return imgUUID, err
239+
}
240+
241+
// uploadImageData uploads the image data from the specified file path to the image object in PC.
242+
func uploadImageData(ctx context.Context, nutanixCl *nutanixclientv3.Client, imgName, imgUUID, imgPath string) error {
172243
// upload the image data.
173-
logrus.Infof("preparing to upload the bootstrap image %s (uuid: %s) data from file %s", imgName, imgUUID, imgPath)
174-
err = nutanixCl.V3.UploadImage(ctx, imgUUID, imgPath)
244+
logrus.Infof("preparing to upload the image %s (uuid: %s) data from file %s", imgName, imgUUID, imgPath)
245+
t1 := time.Now()
246+
err := nutanixCl.V3.UploadImage(ctx, imgUUID, imgPath)
175247
if err != nil {
176-
e1 := fmt.Errorf("failed to upload the bootstrap image data %q from filepath %s: %w", imgName, imgPath, err)
177-
logrus.Error(e1)
178-
return nil, e1
248+
return fmt.Errorf("failed to upload the image data %q from filepath %s: %w used_time %v", imgName, imgPath, err, time.Since(t1))
179249
}
180-
logrus.Infof("uploading the bootstrap image %s data", imgName)
250+
logrus.Infof("uploading the image %s data. used_time %v", imgName, time.Since(t1))
251+
181252
// wait for the image data uploading task to complete.
182253
respb, err := nutanixCl.V3.GetImage(ctx, imgUUID)
183254
if err != nil {
184-
e1 := fmt.Errorf("failed to get the bootstrap image %q. %w", imgName, err)
185-
logrus.Error(e1)
186-
return nil, e1
255+
return fmt.Errorf("failed to get the image %q. %w", imgName, err)
187256
}
188257

189258
if taskUUIDs, ok := respb.Status.ExecutionContext.TaskUUID.([]interface{}); ok {
@@ -193,18 +262,36 @@ func (p Provider) Ignition(ctx context.Context, in infracapi.IgnitionInput) ([]b
193262
tUUIDs = append(tUUIDs, tUUIDstr)
194263
}
195264
}
196-
logrus.Infof("waiting for the bootstrap image data uploading task to complete, taskUUIDs: %v", tUUIDs)
265+
logrus.Infof("waiting for the image data uploading task to complete, taskUUIDs: %v", tUUIDs)
197266
if err = nutanixtypes.WaitForTasks(nutanixCl.V3, tUUIDs); err != nil {
198-
e1 := fmt.Errorf("failed to upload the bootstrap image data %q from filepath %s: %w", imgName, imgPath, err)
199-
logrus.Error(e1)
200-
return nil, e1
267+
return fmt.Errorf("failed to upload the bootstrap image data %q from filepath %s: %w", imgName, imgPath, err)
201268
}
202-
logrus.Infof("completed uploading the bootstrap image data %s (uuid: %s)", imgName, imgUUID)
203269
} else {
204-
err = fmt.Errorf("failed to convert the taskUUIDs %v to array", respb.Status.ExecutionContext.TaskUUID)
205-
logrus.Error(err)
206-
return nil, err
270+
return fmt.Errorf("failed to convert the taskUUIDs %v to array", respb.Status.ExecutionContext.TaskUUID)
207271
}
208272

209-
return in.BootstrapIgnData, nil
273+
return nil
274+
}
275+
276+
// deleteImage deletes the image object with the given uuid in PC.
277+
func deleteImage(ctx context.Context, nutanixCl *nutanixclientv3.Client, imgUUID string) error {
278+
logrus.Infof("preparing to delete the image with uuid %s", imgUUID)
279+
280+
respd, err := nutanixCl.V3.DeleteImage(ctx, imgUUID)
281+
if err != nil {
282+
return fmt.Errorf("failed to delete the image with uuid %s: %w", imgUUID, err)
283+
}
284+
285+
if taskUUID, ok := respd.Status.ExecutionContext.TaskUUID.(string); ok {
286+
logrus.Infof("deleting the image with uuid %s, taskUUID: %s", imgUUID, taskUUID)
287+
288+
// Wait until the image deletion task has succeeded.
289+
if err = nutanixtypes.WaitForTask(nutanixCl.V3, taskUUID); err != nil {
290+
return fmt.Errorf("failed to delete the image with uuid: %s, taskUUID: %s: %w", imgUUID, taskUUID, err)
291+
}
292+
} else {
293+
return fmt.Errorf("failed to convert the task UUID %v to string", respd.Status.ExecutionContext.TaskUUID)
294+
}
295+
296+
return nil
210297
}

0 commit comments

Comments
 (0)