Skip to content

Commit a640545

Browse files
authored
Merge pull request kubernetes#93567 from gnufied/fix-stale-attachments
Make AttachDisk for EBS idempotent again
2 parents 97c5f1f + f91d448 commit a640545

File tree

4 files changed

+353
-8
lines changed

4 files changed

+353
-8
lines changed

staging/src/k8s.io/legacy-cloud-providers/aws/aws.go

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,14 @@ const (
257257
filterNodeLimit = 150
258258
)
259259

260+
const (
261+
// represents expected attachment status of a volume after attach
262+
volumeAttachedStatus = "attached"
263+
264+
// represents expected attachment status of a volume after detach
265+
volumeDetachedStatus = "detached"
266+
)
267+
260268
// awsTagNameMasterRoles is a set of well-known AWS tag names that indicate the instance is a master
261269
// The major consequence is that it is then not considered for AWS zone discovery for dynamic volume creation.
262270
var awsTagNameMasterRoles = sets.NewString("kubernetes.io/role/master", "k8s.io/role/master")
@@ -1943,7 +1951,6 @@ func (c *Cloud) getMountDevice(
19431951
// AWS API returns consistent result next time (i.e. the volume is detached).
19441952
status := volumeStatus[mappingVolumeID]
19451953
klog.Warningf("Got assignment call for already-assigned volume: %s@%s, volume status: %s", mountDevice, mappingVolumeID, status)
1946-
return mountDevice, false, fmt.Errorf("volume is still being detached from the node")
19471954
}
19481955
return mountDevice, true, nil
19491956
}
@@ -2144,7 +2151,7 @@ func (c *Cloud) applyUnSchedulableTaint(nodeName types.NodeName, reason string)
21442151

21452152
// waitForAttachmentStatus polls until the attachment status is the expected value
21462153
// On success, it returns the last attachment state.
2147-
func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string) (*ec2.VolumeAttachment, error) {
2154+
func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expectedDevice string, alreadyAttached bool) (*ec2.VolumeAttachment, error) {
21482155
backoff := wait.Backoff{
21492156
Duration: volumeAttachmentStatusPollDelay,
21502157
Factor: volumeAttachmentStatusFactor,
@@ -2169,7 +2176,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
21692176
if err != nil {
21702177
// The VolumeNotFound error is special -- we don't need to wait for it to repeat
21712178
if isAWSErrorVolumeNotFound(err) {
2172-
if status == "detached" {
2179+
if status == volumeDetachedStatus {
21732180
// The disk doesn't exist, assume it's detached, log warning and stop waiting
21742181
klog.Warningf("Waiting for volume %q to be detached but the volume does not exist", d.awsID)
21752182
stateStr := "detached"
@@ -2178,7 +2185,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
21782185
}
21792186
return true, nil
21802187
}
2181-
if status == "attached" {
2188+
if status == volumeAttachedStatus {
21822189
// The disk doesn't exist, complain, give up waiting and report error
21832190
klog.Warningf("Waiting for volume %q to be attached but the volume does not exist", d.awsID)
21842191
return false, err
@@ -2213,7 +2220,7 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
22132220
}
22142221
}
22152222
if attachmentStatus == "" {
2216-
attachmentStatus = "detached"
2223+
attachmentStatus = volumeDetachedStatus
22172224
}
22182225
if attachment != nil {
22192226
// AWS eventual consistency can go back in time.
@@ -2242,6 +2249,13 @@ func (d *awsDisk) waitForAttachmentStatus(status string, expectedInstance, expec
22422249
}
22432250
}
22442251

2252+
// if we expected volume to be attached and it was reported as already attached via DescribeInstance call
2253+
// but DescribeVolume told us volume is detached, we will short-circuit this long wait loop and return error
2254+
// so as AttachDisk can be retried without waiting for 20 minutes.
2255+
if (status == volumeAttachedStatus) && alreadyAttached && (attachmentStatus != status) {
2256+
return false, fmt.Errorf("attachment of disk %q failed, expected device to be attached but was %s", d.name, attachmentStatus)
2257+
}
2258+
22452259
if attachmentStatus == status {
22462260
// Attachment is in requested state, finish waiting
22472261
return true, nil
@@ -2387,7 +2401,7 @@ func (c *Cloud) AttachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
23872401
klog.V(2).Infof("AttachVolume volume=%q instance=%q request returned %v", disk.awsID, awsInstance.awsID, attachResponse)
23882402
}
23892403

2390-
attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device)
2404+
attachment, err := disk.waitForAttachmentStatus("attached", awsInstance.awsID, ec2Device, alreadyAttached)
23912405

23922406
if err != nil {
23932407
if err == wait.ErrWaitTimeout {
@@ -2465,7 +2479,7 @@ func (c *Cloud) DetachDisk(diskName KubernetesVolumeID, nodeName types.NodeName)
24652479
return "", errors.New("no response from DetachVolume")
24662480
}
24672481

2468-
attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "")
2482+
attachment, err := diskInfo.disk.waitForAttachmentStatus("detached", awsInstance.awsID, "", false)
24692483
if err != nil {
24702484
return "", err
24712485
}
@@ -4773,7 +4787,7 @@ func setNodeDisk(
47734787
}
47744788

47754789
func getInitialAttachDetachDelay(status string) time.Duration {
4776-
if status == "detached" {
4790+
if status == volumeDetachedStatus {
47774791
return volumeDetachmentStatusInitialDelay
47784792
}
47794793
return volumeAttachmentStatusInitialDelay

test/e2e/storage/pd.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,37 @@ var _ = utils.SIGDescribe("Pod Disks", func() {
451451
ginkgo.By("delete a PD")
452452
framework.ExpectNoError(e2epv.DeletePDWithRetry("non-exist"))
453453
})
454+
455+
// This test is marked to run as serial so as device selection on AWS does not
456+
// conflict with other concurrent attach operations.
457+
ginkgo.It("[Serial] attach on previously attached volumes should work", func() {
458+
e2eskipper.SkipUnlessProviderIs("gce", "gke", "aws")
459+
ginkgo.By("creating PD")
460+
diskName, err := e2epv.CreatePDWithRetry()
461+
framework.ExpectNoError(err, "Error creating PD")
462+
463+
// this should be safe to do because if attach fails then detach will be considered
464+
// successful and we will delete the volume.
465+
defer func() {
466+
detachAndDeletePDs(diskName, []types.NodeName{host0Name})
467+
}()
468+
469+
ginkgo.By("Attaching volume to a node")
470+
err = attachPD(host0Name, diskName)
471+
framework.ExpectNoError(err, "Error attaching PD")
472+
473+
pod := testPDPod([]string{diskName}, host0Name /*readOnly*/, false, 1)
474+
ginkgo.By("Creating test pod with same volume")
475+
_, err = podClient.Create(context.TODO(), pod, metav1.CreateOptions{})
476+
framework.ExpectNoError(err, "Failed to create pod")
477+
framework.ExpectNoError(e2epod.WaitForPodRunningInNamespaceSlow(f.ClientSet, pod.Name, f.Namespace.Name))
478+
479+
ginkgo.By("deleting the pod")
480+
framework.ExpectNoError(podClient.Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0)), "Failed to delete pod")
481+
framework.Logf("deleted pod %q", pod.Name)
482+
ginkgo.By("waiting for PD to detach")
483+
framework.ExpectNoError(waitForPDDetach(diskName, host0Name))
484+
})
454485
})
455486

456487
func countReadyNodes(c clientset.Interface, hostName types.NodeName) int {
@@ -474,6 +505,7 @@ func verifyPDContentsViaContainer(namespace string, f *framework.Framework, podN
474505
}
475506
}
476507

508+
// TODO: move detachPD to standard cloudprovider functions so as these tests can run on other cloudproviders too
477509
func detachPD(nodeName types.NodeName, pdName string) error {
478510
if framework.TestContext.Provider == "gce" || framework.TestContext.Provider == "gke" {
479511
gceCloud, err := gce.GetGCECloud()
@@ -512,6 +544,38 @@ func detachPD(nodeName types.NodeName, pdName string) error {
512544
}
513545
}
514546

547+
// TODO: move attachPD to standard cloudprovider functions so as these tests can run on other cloudproviders too
548+
func attachPD(nodeName types.NodeName, pdName string) error {
549+
if framework.TestContext.Provider == "gce" || framework.TestContext.Provider == "gke" {
550+
gceCloud, err := gce.GetGCECloud()
551+
if err != nil {
552+
return err
553+
}
554+
err = gceCloud.AttachDisk(pdName, nodeName, false /*readOnly*/, false /*regional*/)
555+
if err != nil {
556+
framework.Logf("Error attaching PD %q: %v", pdName, err)
557+
}
558+
return err
559+
560+
} else if framework.TestContext.Provider == "aws" {
561+
awsSession, err := session.NewSession()
562+
if err != nil {
563+
return fmt.Errorf("error creating session: %v", err)
564+
}
565+
client := ec2.New(awsSession)
566+
tokens := strings.Split(pdName, "/")
567+
awsVolumeID := tokens[len(tokens)-1]
568+
ebsUtil := utils.NewEBSUtil(client)
569+
err = ebsUtil.AttachDisk(awsVolumeID, string(nodeName))
570+
if err != nil {
571+
return fmt.Errorf("error attaching volume %s to node %s: %v", awsVolumeID, nodeName, err)
572+
}
573+
return nil
574+
} else {
575+
return fmt.Errorf("Provider does not support volume attaching")
576+
}
577+
}
578+
515579
// Returns pod spec suitable for api Create call. Handles gce, gke and aws providers only and
516580
// escapes if a different provider is supplied.
517581
// The first container name is hard-coded to "mycontainer". Subsequent containers are named:

test/e2e/storage/utils/BUILD

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ go_library(
77
srcs = [
88
"create.go",
99
"deployment.go",
10+
"ebs.go",
1011
"framework.go",
1112
"host_exec.go",
1213
"local.go",
@@ -36,9 +37,12 @@ go_library(
3637
"//test/e2e/framework/ssh:go_default_library",
3738
"//test/e2e/framework/testfiles:go_default_library",
3839
"//test/utils/image:go_default_library",
40+
"//vendor/github.com/aws/aws-sdk-go/aws:go_default_library",
41+
"//vendor/github.com/aws/aws-sdk-go/service/ec2:go_default_library",
3942
"//vendor/github.com/onsi/ginkgo:go_default_library",
4043
"//vendor/github.com/onsi/gomega:go_default_library",
4144
"//vendor/github.com/pkg/errors:go_default_library",
45+
"//vendor/k8s.io/klog/v2:go_default_library",
4246
"//vendor/k8s.io/utils/exec:go_default_library",
4347
],
4448
)

0 commit comments

Comments
 (0)