@@ -90,10 +90,50 @@ const (
	// This is the sleep interval specified in the command executed in the pod so that the container is restarted within the expected test run time
	sleepIntervalWithRestart string = "60s"
+
+	// This is the sleep interval specified in the command executed in the pod so that the container completes within the expected test run time
+	sleepIntervalToCompletion string = "5s"
)
func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
	pluginSockDir = filepath.Join(pluginSockDir) + "/"
+
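+	// ResourceValue pairs the allocatable and capacity counts that the node reports for a resource.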
+	type ResourceValue struct {
+		Allocatable int
+		Capacity    int
+	}
+
+	devicePluginGracefulTimeout := 5 * time.Minute // see endpointStopGracePeriod in pkg/kubelet/cm/devicemanager/types.go
+
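+	// getNodeResourceValues reads the local node status and returns the capacity and
+	// allocatable counts it reports for the given extended resource.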
+	var getNodeResourceValues = func(ctx context.Context, resourceName string) ResourceValue {
+		ginkgo.GinkgoHelper()
+		node := getLocalNode(ctx, f)
+
+		// -1 indicates that the resource was not found on the node
+		result := ResourceValue{
+			Allocatable: -1,
+			Capacity:    -1,
+		}
+
+		for key, val := range node.Status.Capacity {
+			resource := string(key)
+			if resource == resourceName {
+				result.Capacity = int(val.Value())
+				break
+			}
+		}
+
+		for key, val := range node.Status.Allocatable {
+			resource := string(key)
+			if resource == resourceName {
+				result.Allocatable = int(val.Value())
+				break
+			}
+		}
+
+		return result
+	}
+
	f.Context("DevicePlugin", f.WithSerial(), f.WithDisruptive(), func() {
		var devicePluginPod, dptemplate *v1.Pod
		var v1alphaPodResources *kubeletpodresourcesv1alpha1.ListPodResourcesResponse
@@ -428,6 +468,55 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
			framework.ExpectNoError(err, "inconsistent device assignment after pod restart")
		})
+		ginkgo.It("will not attempt to admit the succeeded pod after kubelet restart and device plugin removal", func(ctx context.Context) {
+			podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalToCompletion)
+			podSpec := makeBusyboxPod(SampleDeviceResourceName, podRECMD)
+			podSpec.Spec.RestartPolicy = v1.RestartPolicyNever
+			// Make sure the pod will not be garbage collected and will stay through the kubelet restart
+			// after it has reached the terminated state. Using a finalizer makes the test more reliable.
+			podSpec.ObjectMeta.Finalizers = []string{testFinalizer}
+			pod := e2epod.NewPodClient(f).CreateSync(ctx, podSpec)
+
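+			// The pod command above echoes the stub device it was allocated; recover the device ID from the pod log.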
+			deviceIDRE := "stub devices: (Dev-[0-9]+)"
+			devID1, err := parseLog(ctx, f, pod.Name, pod.Name, deviceIDRE)
+			framework.ExpectNoError(err, "getting logs for pod %q", pod.Name)
+
+			gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")), "pod requested a device but started successfully without one")
+
+			pod, err = e2epod.NewPodClient(f).Get(ctx, pod.Name, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+
+			ginkgo.By("Wait for node to be ready")
+			gomega.Expect(e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute)).To(gomega.Succeed())
+
+			ginkgo.By("Waiting for pod to succeed")
+			gomega.Expect(e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace)).To(gomega.Succeed())
+
+			ginkgo.By("Deleting the device plugin")
+			e2epod.NewPodClient(f).DeleteSync(ctx, devicePluginPod.Name, metav1.DeleteOptions{}, time.Minute)
+			waitForContainerRemoval(ctx, devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace)
+
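+			// With the plugin endpoint gone, the sample devices should be reported as unhealthy:
+			// they still count toward the node's capacity, but allocatable should drop to zero.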
+			gomega.Eventually(getNodeResourceValues, devicePluginGracefulTimeout, f.Timeouts.Poll).WithContext(ctx).WithArguments(SampleDeviceResourceName).Should(gomega.Equal(ResourceValue{Allocatable: 0, Capacity: int(expectedSampleDevsAmount)}))
+
+			ginkgo.By("Restarting Kubelet")
+			restartKubelet(true)
+
+			ginkgo.By("Wait for node to be ready again")
+			gomega.Expect(e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute)).To(gomega.Succeed())
+
+			ginkgo.By("Pod should still be in Succeeded state")
+			// This ensures that the pod was admitted successfully.
+			// In the past we had an issue where the kubelet would attempt to re-admit the terminated pod and change its phase to Failed.
+			// There is no indication that the pod was re-admitted, so we just wait for a minute after the node becomes ready.
+			gomega.Consistently(func() v1.PodPhase {
+				pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{})
+				return pod.Status.Phase
+			}, 1*time.Minute, f.Timeouts.Poll).Should(gomega.Equal(v1.PodSucceeded))
+
+			ginkgo.By("Removing the finalizer from the pod so it can be deleted now")
+			e2epod.NewPodClient(f).RemoveFinalizer(context.TODO(), podSpec.Name, testFinalizer)
+		})
+
		// simulate device plugin re-registration, *but not* container and kubelet restart.
		// After the device plugin has re-registered, the list of healthy devices is repopulated based on the devices discovered.
		// Once Pod2 is running we determine the device that was allocated to it. As long as the device allocation succeeds the
@@ -826,6 +915,9 @@ func testDevicePluginNodeReboot(f *framework.Framework, pluginSockDir string) {
			continue
		}

+		ginkgo.By("Removing the finalizer from the pod in case it was used")
+		e2epod.NewPodClient(f).RemoveFinalizer(context.TODO(), p.Name, testFinalizer)
+
		framework.Logf("Deleting pod: %s", p.Name)
		e2epod.NewPodClient(f).DeleteSync(ctx, p.Name, metav1.DeleteOptions{}, 2*time.Minute)
}