
Commit f989ec1

test: add options for additional resources and verify volume detach to node drain test (#11526)
1 parent 3a66e8f commit f989ec1

File tree: 3 files changed, +137 -7 lines changed

test/e2e/node_drain.go

Lines changed: 89 additions & 7 deletions
@@ -64,6 +64,16 @@ type NodeDrainTimeoutSpecInput struct {
 	// Allows to inject a function to be run after test namespace is created.
 	// If not specified, this is a no-op.
 	PostNamespaceCreated func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace string)
+
+	// Enables additional verification for volumes blocking machine deletion.
+	// Requires appropriate resources to be added via CreateAdditionalResources.
+	VerifyNodeVolumeDetach bool
+
+	// Allows to overwrite the default function used for unblocking volume detachments.
+	UnblockNodeVolumeDetachment func(ctx context.Context, bootstrapClusterProxy framework.ClusterProxy, cluster *clusterv1.Cluster)
+
+	// Allows to create additional resources.
+	CreateAdditionalResources func(ctx context.Context, clusterProxy framework.ClusterProxy, cluster *clusterv1.Cluster)
 }

 // NodeDrainTimeoutSpec goes through the following steps:
@@ -72,13 +82,16 @@ type NodeDrainTimeoutSpecInput struct {
 // * Deploy MachineDrainRules
 // * Deploy Deployment with unevictable Pods on CP & MD Nodes
 // * Deploy Deployment with evictable Pods with finalizer on CP & MD Nodes
+// * Deploy additional resources if defined in input
 // * Trigger Node drain by scaling down the control plane to 1 and MachineDeployments to 0
 // * Get draining control plane and MachineDeployment Machines
 // * Verify drain of Deployments with order 1
 // * Verify drain of Deployments with order 5
 // * Verify skipped Pods are still there and don't have a deletionTimestamp
 // * Verify Node drains for control plane and MachineDeployment Machines are blocked (only by PDBs)
 // * Set NodeDrainTimeout to 1s to unblock Node drain
+// * Verify machine deletion is blocked by waiting for volume detachment (only if VerifyNodeVolumeDetach is enabled)
+// * Unblock waiting for volume detachment (only if VerifyNodeVolumeDetach is enabled)
 // * Verify scale down succeeded because Node drains were unblocked.
 func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 	var (
@@ -100,6 +113,10 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 	Expect(input.E2EConfig.GetIntervals(specName, "wait-deployment-available")).ToNot(BeNil())
 	Expect(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")).ToNot(BeNil())

+	if input.VerifyNodeVolumeDetach && input.UnblockNodeVolumeDetachment == nil {
+		input.UnblockNodeVolumeDetachment = unblockNodeVolumeDetachmentFunc(input.E2EConfig.GetIntervals(specName, "wait-control-plane"), input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"))
+	}
+
 	// Setup a Namespace where to host objects for this spec and create a watcher for the namespace events.
 	namespace, cancelWatches = framework.SetupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder, input.PostNamespaceCreated)
 	clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult)
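
If VerifyNodeVolumeDetach is set and no UnblockNodeVolumeDetachment is provided, the spec falls back to the timeout-based unblockNodeVolumeDetachmentFunc defined at the bottom of this file. A caller can inject different unblocking logic instead. The sketch below is a hypothetical override, not part of this commit: it removes the fake VolumeAttachments created in node_drain_test.go directly, assuming the gomega dot-import used throughout this package and controller-runtime's client imported as ctrlclient.

	// unblockByRemovingVolumeAttachments is a hypothetical alternative to lowering
	// NodeVolumeDetachTimeout: it deletes the fake VolumeAttachments and drops their
	// blocking finalizer so the objects actually go away.
	func unblockByRemovingVolumeAttachments(ctx context.Context, bootstrapClusterProxy framework.ClusterProxy, cluster *clusterv1.Cluster) {
		workloadClusterClient := bootstrapClusterProxy.GetWorkloadCluster(ctx, cluster.Namespace, cluster.Name).GetClient()

		vaList := &storagev1.VolumeAttachmentList{}
		Expect(workloadClusterClient.List(ctx, vaList)).To(Succeed())
		for i := range vaList.Items {
			va := &vaList.Items[i]
			// Delete marks the object for deletion, but the test finalizer keeps it around ...
			Expect(workloadClusterClient.Delete(ctx, va)).To(Succeed())
			// ... so re-read it and remove the finalizer to let it actually disappear.
			Expect(workloadClusterClient.Get(ctx, ctrlclient.ObjectKeyFromObject(va), va)).To(Succeed())
			va.Finalizers = nil
			Expect(workloadClusterClient.Update(ctx, va)).To(Succeed())
		}
	}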
@@ -147,6 +164,9 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 		Cluster:      cluster,
 		ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
 			topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+			if input.VerifyNodeVolumeDetach {
+				topology.NodeVolumeDetachTimeout = &metav1.Duration{Duration: time.Duration(0)}
+			}
 			if topology.Metadata.Labels == nil {
 				topology.Metadata.Labels = map[string]string{}
 			}
@@ -159,6 +179,9 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 		Cluster:      cluster,
 		ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
 			topology.NodeDrainTimeout = &metav1.Duration{Duration: time.Duration(0)}
+			if input.VerifyNodeVolumeDetach {
+				topology.NodeVolumeDetachTimeout = &metav1.Duration{Duration: time.Duration(0)}
+			}
 			if topology.Metadata.Labels == nil {
 				topology.Metadata.Labels = map[string]string{}
 			}
@@ -174,12 +197,14 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 	workloadClusterProxy := input.BootstrapClusterProxy.GetWorkloadCluster(ctx, cluster.Namespace, cluster.Name)

 	By("Deploy MachineDrainRules.")
-	Expect(input.BootstrapClusterProxy.GetClient().Create(ctx,
-		generateMachineDrainRule(namespace.Name, clusterName, "drain-order-1", 1))).To(Succeed())
-	Expect(input.BootstrapClusterProxy.GetClient().Create(ctx,
-		generateMachineDrainRule(namespace.Name, clusterName, "drain-order-5", 5))).To(Succeed())
-	Expect(input.BootstrapClusterProxy.GetClient().Create(ctx,
-		generateMachineDrainRule(namespace.Name, clusterName, "drain-order-10", 10))).To(Succeed())
+	machineDrainRules := []*clusterv1.MachineDrainRule{
+		generateMachineDrainRule(namespace.Name, clusterName, "drain-order-1", 1),
+		generateMachineDrainRule(namespace.Name, clusterName, "drain-order-5", 5),
+		generateMachineDrainRule(namespace.Name, clusterName, "drain-order-10", 10),
+	}
+	for _, rule := range machineDrainRules {
+		Expect(input.BootstrapClusterProxy.GetClient().Create(ctx, rule)).To(Succeed())
+	}

 	By("Deploy Deployment with unevictable Pods on control plane and MachineDeployment Nodes.")
 	framework.DeployUnevictablePod(ctx, framework.DeployUnevictablePodInput{
@@ -248,6 +273,10 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 		}
 	}

+	if input.CreateAdditionalResources != nil {
+		input.CreateAdditionalResources(ctx, input.BootstrapClusterProxy, cluster)
+	}
+
 	By("Trigger Node drain by scaling down the control plane to 1 and MachineDeployments to 0.")
 	modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
 		ClusterProxy: input.BootstrapClusterProxy,
@@ -432,7 +461,35 @@ func NodeDrainTimeoutSpec(ctx context.Context, inputGetter func() NodeDrainTimeoutSpecInput) {
 		WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
 	})

-	By("Verify scale down succeeded because Node drains were unblocked")
+	if input.VerifyNodeVolumeDetach {
+		By("Verify Node removal for control plane and MachineDeployment Machines is blocked (only by volume detachments)")
+		Eventually(func(g Gomega) {
+			waitingCPMachine := &clusterv1.Machine{}
+			g.Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, drainingCPMachineKey, waitingCPMachine)).To(Succeed())
+
+			condition := conditions.Get(waitingCPMachine, clusterv1.VolumeDetachSucceededCondition)
+			g.Expect(condition).ToNot(BeNil())
+			// Deletion is still blocked because of the attached volume.
+			g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+			g.Expect(condition.Message).To(ContainSubstring("Waiting for node volumes to be detached"))
+		}, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+		for _, machineKey := range drainingMDMachineKeys {
+			Eventually(func(g Gomega) {
+				drainedMDMachine := &clusterv1.Machine{}
+				g.Expect(input.BootstrapClusterProxy.GetClient().Get(ctx, machineKey, drainedMDMachine)).To(Succeed())
+
+				condition := conditions.Get(drainedMDMachine, clusterv1.VolumeDetachSucceededCondition)
+				g.Expect(condition).ToNot(BeNil())
+				// Deletion is still blocked because of the attached volume.
+				g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
+				g.Expect(condition.Message).To(ContainSubstring("Waiting for node volumes to be detached"))
+			}, input.E2EConfig.GetIntervals(specName, "wait-machine-deleted")...).Should(Succeed())
+		}
+
+		By("Executing input.UnblockNodeVolumeDetachment to unblock waiting for volume detachments")
+		input.UnblockNodeVolumeDetachment(ctx, input.BootstrapClusterProxy, cluster)
+	}
+
+	By("Verify scale down succeeded because Node drains and volume detachments were unblocked")
 	// When we scale down the KCP, controlplane machines are deleted one by one, so it requires more time
 	// MD Machine deletion is done in parallel and will be faster.
 	nodeDrainTimeoutKCPInterval := getDrainAndDeleteInterval(input.E2EConfig.GetIntervals(specName, "wait-machine-deleted"), drainTimeout, controlPlaneReplicas)
@@ -641,3 +698,28 @@ func getDrainAndDeleteInterval(deleteInterval []interface{}, drainTimeout *metav1.Duration, replicas int) []interface{} {
 	res := []interface{}{intervalDuration.String(), deleteInterval[1]}
 	return res
 }
+
+func unblockNodeVolumeDetachmentFunc(waitControlPlaneIntervals, waitWorkerNodeIntervals []interface{}) func(ctx context.Context, bootstrapClusterProxy framework.ClusterProxy, cluster *clusterv1.Cluster) {
+	return func(ctx context.Context, bootstrapClusterProxy framework.ClusterProxy, cluster *clusterv1.Cluster) {
+		By("Set NodeVolumeDetachTimeout to 1s to unblock waiting for volume detachments")
+		// Note: This also verifies that KCP & MachineDeployments are still propagating changes to NodeVolumeDetachTimeout down to
+		// Machines that already have a deletionTimestamp.
+		nodeVolumeDetachTimeout := &metav1.Duration{Duration: time.Duration(1) * time.Second}
+		modifyControlPlaneViaClusterAndWait(ctx, modifyControlPlaneViaClusterAndWaitInput{
+			ClusterProxy: bootstrapClusterProxy,
+			Cluster:      cluster,
+			ModifyControlPlaneTopology: func(topology *clusterv1.ControlPlaneTopology) {
+				topology.NodeVolumeDetachTimeout = nodeVolumeDetachTimeout
+			},
+			WaitForControlPlane: waitControlPlaneIntervals,
+		})
+		modifyMachineDeploymentViaClusterAndWait(ctx, modifyMachineDeploymentViaClusterAndWaitInput{
+			ClusterProxy: bootstrapClusterProxy,
+			Cluster:      cluster,
+			ModifyMachineDeploymentTopology: func(topology *clusterv1.MachineDeploymentTopology) {
+				topology.NodeVolumeDetachTimeout = nodeVolumeDetachTimeout
+			},
+			WaitForMachineDeployments: waitWorkerNodeIntervals,
+		})
+	}
+}
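
Both topologies initially set NodeVolumeDetachTimeout to 0, which means "wait for volume detachment indefinitely"; flipping it to 1s makes Machines that are already deleting give up waiting almost immediately. An over-simplified sketch of the controller-side decision this relies on (illustrative only; the real check lives in the Machine controller, and the function name here is hypothetical):

	// nodeVolumeDetachTimeoutExceeded sketches how a NodeVolumeDetachTimeout of 1s
	// unblocks deletion: once VolumeDetachSucceededCondition has been unmet for
	// longer than the timeout, the controller stops waiting for volumes.
	func nodeVolumeDetachTimeoutExceeded(machine *clusterv1.Machine) bool {
		// nil or 0 means: keep waiting forever.
		if machine.Spec.NodeVolumeDetachTimeout == nil || machine.Spec.NodeVolumeDetachTimeout.Duration <= 0 {
			return false
		}
		condition := conditions.Get(machine, clusterv1.VolumeDetachSucceededCondition)
		if condition == nil {
			return false
		}
		return time.Since(condition.LastTransitionTime.Time) >= machine.Spec.NodeVolumeDetachTimeout.Duration
	}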

test/e2e/node_drain_test.go

Lines changed: 44 additions & 0 deletions
@@ -20,8 +20,18 @@ limitations under the License.
 package e2e

 import (
+	"context"
+	"fmt"
+
 	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	storagev1 "k8s.io/api/storage/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/utils/ptr"
+
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	"sigs.k8s.io/cluster-api/test/framework"
 )

 var _ = Describe("When testing Node drain", func() {
@@ -34,6 +44,40 @@ var _ = Describe("When testing Node drain", func() {
 			SkipCleanup:            skipCleanup,
 			Flavor:                 ptr.To("topology"),
 			InfrastructureProvider: ptr.To("docker"),
+			VerifyNodeVolumeDetach: true,
+			CreateAdditionalResources: func(ctx context.Context, clusterProxy framework.ClusterProxy, cluster *clusterv1.Cluster) {
+				workloadClusterClient := clusterProxy.GetWorkloadCluster(ctx, cluster.Namespace, cluster.Name).GetClient()
+
+				nodeList := &corev1.NodeList{}
+				Expect(workloadClusterClient.List(ctx, nodeList)).To(Succeed())
+
+				// Create a fake VolumeAttachment object for each Node without a real backing CSI driver.
+				for _, node := range nodeList.Items {
+					va := generateVolumeAttachment(node)
+					Expect(workloadClusterClient.Create(ctx, va)).To(Succeed())
+					// Set .Status.Attached to true to make the VolumeAttachment block Machine deletion.
+					va.Status.Attached = true
+					Expect(workloadClusterClient.Status().Update(ctx, va)).To(Succeed())
+				}
+			},
 		}
 	})
 })
+
+func generateVolumeAttachment(node corev1.Node) *storagev1.VolumeAttachment {
+	return &storagev1.VolumeAttachment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: fmt.Sprintf("va-%s", node.GetName()),
+			Finalizers: []string{
+				"test.cluster.x-k8s.io/block",
+			},
+		},
+		Spec: storagev1.VolumeAttachmentSpec{
+			Attacher: "manual",
+			NodeName: node.GetName(),
+			Source: storagev1.VolumeAttachmentSource{
+				PersistentVolumeName: ptr.To("foo"),
+			},
+		},
+	}
+}
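
Nothing ever attaches a real volume here: Machine deletion only inspects API state, so a VolumeAttachment with Status.Attached set to true is enough to block it, the test.cluster.x-k8s.io/block finalizer keeps the object alive until the test unblocks it, and the referenced PersistentVolume "foo" never needs to exist. A short illustrative assertion of the state this produces for one node (same workloadClusterClient and node as above; ctrlclient stands for controller-runtime's client package, and VolumeAttachments are cluster-scoped, so the key carries no namespace):

	va := &storagev1.VolumeAttachment{}
	Expect(workloadClusterClient.Get(ctx, ctrlclient.ObjectKey{Name: "va-" + node.GetName()}, va)).To(Succeed())
	// Attached=true is what makes the Machine controller wait before removing the Node.
	Expect(va.Status.Attached).To(BeTrue())
	// The finalizer keeps the fake object around until the test removes it.
	Expect(va.Finalizers).To(ContainElement("test.cluster.x-k8s.io/block"))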

test/framework/convenience.go

Lines changed: 4 additions & 0 deletions

@@ -24,6 +24,7 @@ import (
 	coordinationv1 "k8s.io/api/coordination/v1"
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
+	storagev1 "k8s.io/api/storage/v1"
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	apiextensionsv1beta "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -88,6 +89,9 @@ func TryAddDefaultSchemes(scheme *runtime.Scheme) {
 	// Add coordination to the schema
 	// Note: This is e.g. used to trigger kube-controller-manager restarts by stealing its lease.
 	_ = coordinationv1.AddToScheme(scheme)
+
+	// Add storagev1 to the scheme
+	_ = storagev1.AddToScheme(scheme)
 }

 // ObjectToKind returns the Kind without the package prefix. Pass in a pointer to a struct
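
Without this registration, a controller-runtime client built from TryAddDefaultSchemes would have no mapping for storage.k8s.io/v1 kinds, and the Create/Get/List calls on VolumeAttachments above would fail with "no kind is registered" errors. A minimal illustrative helper (listVolumeAttachments and restConfig are hypothetical names, not framework API):

	import (
		"context"

		storagev1 "k8s.io/api/storage/v1"
		"k8s.io/apimachinery/pkg/runtime"
		"k8s.io/client-go/rest"
		"sigs.k8s.io/cluster-api/test/framework"
		ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
	)

	// listVolumeAttachments shows that a client built on a scheme populated by
	// TryAddDefaultSchemes can decode storagev1 objects.
	func listVolumeAttachments(ctx context.Context, restConfig *rest.Config) (*storagev1.VolumeAttachmentList, error) {
		scheme := runtime.NewScheme()
		framework.TryAddDefaultSchemes(scheme)

		c, err := ctrlclient.New(restConfig, ctrlclient.Options{Scheme: scheme})
		if err != nil {
			return nil, err
		}

		vaList := &storagev1.VolumeAttachmentList{}
		if err := c.List(ctx, vaList); err != nil {
			return nil, err
		}
		return vaList, nil
	}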
