Commit 77e070f

Make AttachDisk idempotent again
If DescribeInstance tells us the volume is already attached, we enter the waitForAttachmentStatus loop without issuing another AttachVolume call. But if DescribeVolume then tells us the device is actually detached, we no longer wait for 20 minutes; we return an error early so AttachDisk can be retried.
1 parent db28b02 · commit 77e070f
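
The 20-minute figure comes from the backoff-driven poll inside waitForAttachmentStatus: the loop is a condition function handed to a backoff helper, and returning a non-nil error from that function aborts the wait immediately, while returning (false, nil) keeps polling until the backoff is exhausted. Below is a minimal, self-contained sketch of that mechanism using k8s.io/apimachinery's wait package; the backoff values here are made up for illustration (the real ones are volumeAttachmentStatusPollDelay, volumeAttachmentStatusFactor, and related constants in aws.go).

package main

import (
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    // Made-up backoff values for illustration; the production settings in
    // aws.go stretch the same loop out to roughly 20 minutes.
    backoff := wait.Backoff{
        Duration: 2 * time.Second,
        Factor:   1.5,
        Steps:    5,
    }

    alreadyAttached := true        // DescribeInstance claimed the volume is attached
    attachmentStatus := "detached" // ...but DescribeVolume says otherwise

    err := wait.ExponentialBackoff(backoff, func() (bool, error) {
        // This mirrors the short-circuit the commit adds: a non-nil error
        // terminates the whole wait at once instead of burning every step.
        if alreadyAttached && attachmentStatus != "attached" {
            return false, fmt.Errorf("expected device to be attached but was %s", attachmentStatus)
        }
        // (true, nil) would end the wait successfully; (false, nil) retries.
        return attachmentStatus == "attached", nil
    })
    fmt.Println(err) // fails on the first poll, not after the full backoff
}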

staging/src/k8s.io/legacy-cloud-providers/aws/aws.go

Lines changed: 22 additions & 8 deletions
@@ -257,6 +257,14 @@ const (
 	filterNodeLimit = 150
 )
 
+const (
+	// represents expected attachment status of a volume after attach
+	volumeAttachedStatus = "attached"
+
+	// represents expected attachment status of a volume after detach
+	volumeDetachedStatus = "detached"
+)
+
 // awsTagNameMasterRoles is a set of well-known AWS tag names that indicate the instance is a master
 // The major consequence is that it is then not considered for AWS zone discovery for dynamic volume creation.
 var awsTagNameMasterRoles = sets.NewString("kubernetes.io/role/master", "k8s.io/role/master")
@@ -1967,7 +1975,6 @@ func (c *Cloud) getMountDevice(
 		// AWS API returns consistent result next time (i.e. the volume is detached).
 		status := volumeStatus[mappingVolumeID]
 		klog.Warningf("Got assignment call for already-assigned volume: %s@%s, volume status: %s", mountDevice, mappingVolumeID, status)
-		return mountDevice, false, fmt.Errorf("volume is still being detached from the node")
 	}
 	return mountDevice, true, nil
 }
@@ -2168,7 +2175,7 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string)
 
 // waitForAttachmentStatus polls until the attachment status is the expected value
 // On success, it returns the last attachment state.
-func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string) (*ec2.VolumeAttachment, error) {
+func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string, alreadyAttached bool) (*ec2.VolumeAttachment, error) {
 	backoff := wait.Backoff{
 		Duration: volumeAttachmentStatusPollDelay,
 		Factor:   volumeAttachmentStatusFactor,
@@ -2193,7 +2200,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
 		if err != nil {
 			// The VolumeNotFound error is special -- we don't need to wait for it to repeat
 			if isAWSErrorVolumeNotFound(err) {
-				if status == "detached" {
+				if status == volumeDetachedStatus {
 					// The disk doesn't exist, assume it's detached, log warning and stop waiting
 					klog.Warningf("Waiting for volume %q to be detached but the volume does not exist", d.awsID)
 					stateStr := "detached"
@@ -2202,7 +2209,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
 				}
 				return true, nil
 			}
-			if status == "attached" {
+			if status == volumeAttachedStatus {
 				// The disk doesn't exist, complain, give up waiting and report error
 				klog.Warningf("Waiting for volume %q to be attached but the volume does not exist", d.awsID)
 				return false, err
@@ -2237,7 +2244,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
 			}
 		}
 		if attachmentStatus == "" {
-			attachmentStatus = "detached"
+			attachmentStatus = volumeDetachedStatus
 		}
 		if attachment != nil {
 			// AWS eventual consistency can go back in time.
@@ -2266,6 +2273,13 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
 			}
 		}
 
+		// if we expected volume to be attached and it was reported as already attached via DescribeInstance call
+		// but DescribeVolume told us volume is detached, we will short-circuit this long wait loop and return error
+		// so as AttachDisk can be retried without waiting for 20 minutes.
+		if (status == volumeAttachedStatus) && alreadyAttached && (attachmentStatus != status) {
+			return false, fmt.Errorf("attachment of disk %q failed, expected device to be attached but was %s", d.name, attachmentStatus)
+		}
+
 		if attachmentStatus == status {
 			// Attachment is in requested state, finish waiting
 			return true, nil
@@ -2411,7 +2425,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
 		klog.V(2).Infof("AttachVolume volume=%q instance=%q request returned %v", disk.awsID, awsInstance.awsID, attachResponse)
 	}
 
-	attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device)
+	attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device, alreadyAttached)
 
 	if err != nil {
 		if err == wait.ErrWaitTimeout {
@@ -2489,7 +2503,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
 		return "", errors.New("no response from DetachVolume")
 	}
 
-	attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "")
+	attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "", false)
 	if err != nil {
 		return "", err
 	}
@@ -4797,7 +4811,7 @@ func setNodeDisk(
 }
 
 func getInitialAttachDetachDelay(status string) time.Duration {
-	if status == "detached" {
+	if status == volumeDetachedStatus {
 		return volumeDetachmentStatusInitialDelay
 	}
 	return volumeAttachmentStatusInitialDelay
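
With the new parameter, waitForAttachmentStatus can fail in two distinct ways from AttachDisk's point of view. The snippet below is a sketch, not the file's actual error handling: it condenses the call site shown in the diff above, with the branch comments added here for illustration.

attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device, alreadyAttached)
if err != nil {
    if err == wait.ErrWaitTimeout {
        // pre-existing path: polling exhausted its backoff without converging
    }
    // new path: alreadyAttached was stale and DescribeVolume reported the volume
    // detached, so the error surfaces immediately and AttachDisk can be retried.
    return "", err
}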
