Skip to content

Commit b920b38

Browse files
authored
Merge pull request kubernetes#85675 from jsafrane/aws-attach-consistency
Fix AWS eventual consistency of AttachDisk
2 parents 650c797 + 044b315 commit b920b38

File tree

1 file changed

+38
-10
lines changed
  • staging/src/k8s.io/legacy-cloud-providers/aws

1 file changed

+38
-10
lines changed

staging/src/k8s.io/legacy-cloud-providers/aws/aws.go

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2105,17 +2105,18 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string)
21052105

21062106
// waitForAttachmentStatus polls until the attachment status is the expected value
21072107
// On success, it returns the last attachment state.
2108-
func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment, error) {
2108+
func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string) (*ec2.VolumeAttachment, error) {
21092109
backoff := wait.Backoff{
21102110
Duration: volumeAttachmentStatusPollDelay,
21112111
Factor: volumeAttachmentStatusFactor,
21122112
Steps: volumeAttachmentStatusSteps,
21132113
}
21142114

2115-
// Because of rate limiting, we often see errors from describeVolume
2115+
// Because of rate limiting, we often see errors from describeVolume.
2116+
// Or AWS eventual consistency returns unexpected data.
21162117
// So we tolerate a limited number of failures.
2117-
// But once we see more than 10 errors in a row, we return the error
2118-
describeErrorCount := 0
2118+
// But once we see more than 10 errors in a row, we return the error.
2119+
errorCount := 0
21192120

21202121
// Attach/detach usually takes time. It does not make sense to start
21212122
// polling DescribeVolumes before some initial delay to let AWS
@@ -2144,8 +2145,8 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
21442145
return false, err
21452146
}
21462147
}
2147-
describeErrorCount++
2148-
if describeErrorCount > volumeAttachmentStatusConsecutiveErrorLimit {
2148+
errorCount++
2149+
if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
21492150
// report the error
21502151
return false, err
21512152
}
@@ -2154,8 +2155,6 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
21542155
return false, nil
21552156
}
21562157

2157-
describeErrorCount = 0
2158-
21592158
if len(info.Attachments) > 1 {
21602159
// Shouldn't happen; log so we know if it is
21612160
klog.Warningf("Found multiple attachments for volume %q: %v", d.awsID, info)
@@ -2177,11 +2176,39 @@ func (d *awsDisk) waitForAttachmentStatus(status string) (*ec2.VolumeAttachment,
21772176
if attachmentStatus == "" {
21782177
attachmentStatus = "detached"
21792178
}
2179+
if attachment != nil {
2180+
// AWS eventual consistency can go back in time.
2181+
// For example, we're waiting for a volume to be attached as /dev/xvdba, but AWS can tell us it's
2182+
// attached as /dev/xvdbb, where it was attached before and it was already detached.
2183+
// Retry couple of times, hoping AWS starts reporting the right status.
2184+
device := aws.StringValue(attachment.Device)
2185+
if expectedDevice != "" && device != "" && device != expectedDevice {
2186+
klog.Warningf("Expected device %s %s for volume %s, but found device %s %s", expectedDevice, status, d.name, device, attachmentStatus)
2187+
errorCount++
2188+
if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
2189+
// report the error
2190+
return false, fmt.Errorf("attachment of disk %q failed: requested device %q but found %q", d.name, expectedDevice, device)
2191+
}
2192+
return false, nil
2193+
}
2194+
instanceID := aws.StringValue(attachment.InstanceId)
2195+
if expectedInstance != "" && instanceID != "" && instanceID != expectedInstance {
2196+
klog.Warningf("Expected instance %s/%s for volume %s, but found instance %s/%s", expectedInstance, status, d.name, instanceID, attachmentStatus)
2197+
errorCount++
2198+
if errorCount > volumeAttachmentStatusConsecutiveErrorLimit {
2199+
// report the error
2200+
return false, fmt.Errorf("attachment of disk %q failed: requested device %q but found %q", d.name, expectedDevice, device)
2201+
}
2202+
return false, nil
2203+
}
2204+
}
2205+
21802206
if attachmentStatus == status {
21812207
// Attachment is in requested state, finish waiting
21822208
return true, nil
21832209
}
21842210
// continue waiting
2211+
errorCount = 0
21852212
klog.V(2).Infof("Waiting for volume %q state: actual=%s, desired=%s", d.awsID, attachmentStatus, status)
21862213
return false, nil
21872214
})
@@ -2321,7 +2348,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
23212348
klog.V(2).Infof("AttachVolume volume=%q instance=%q request returned %v", disk.awsID, awsInstance.awsID, attachResponse)
23222349
}
23232350

2324-
attachment, err := disk.waitForAttachmentStatus("attached")
2351+
attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device)
23252352

23262353
if err != nil {
23272354
if err == wait.ErrWaitTimeout {
@@ -2341,6 +2368,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
23412368
return "", fmt.Errorf("unexpected state: attachment nil after attached %q to %q", diskName, nodeName)
23422369
}
23432370
if ec2Device != aws.StringValue(attachment.Device) {
2371+
// Already checked in waitForAttachmentStatus(), but just to be sure...
23442372
return "", fmt.Errorf("disk attachment of %q to %q failed: requested device %q but found %q", diskName, nodeName, ec2Device, aws.StringValue(attachment.Device))
23452373
}
23462374
if awsInstance.awsID != aws.StringValue(attachment.InstanceId) {
@@ -2398,7 +2426,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
23982426
return "", errors.New("no response from DetachVolume")
23992427
}
24002428

2401-
attachment, err := diskInfo.disk.waitForAttachmentStatus("detached")
2429+
attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "")
24022430
if err != nil {
24032431
return "", err
24042432
}

0 commit comments

Comments
 (0)