@@ -23,6 +23,7 @@ import (
23
23
24
24
v1 "k8s.io/api/core/v1"
25
25
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26
+ "k8s.io/apimachinery/pkg/util/uuid"
26
27
kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
27
28
"k8s.io/kubernetes/test/e2e/framework"
28
29
"k8s.io/kubernetes/test/e2e/framework/gpu"
@@ -33,6 +34,34 @@ import (
33
34
"github.com/prometheus/common/model"
34
35
)
35
36
37
+ // numberOfNVIDIAGPUs returns the number of GPUs advertised by a node
38
+ // This is based on the Device Plugin system and expected to run on a COS based node
39
+ // After the NVIDIA drivers were installed
40
+ // TODO make this generic and not linked to COS only
41
+ func numberOfNVIDIAGPUs (node * v1.Node ) int64 {
42
+ val , ok := node .Status .Capacity [gpu .NVIDIAGPUResourceName ]
43
+ if ! ok {
44
+ return 0
45
+ }
46
+ return val .Value ()
47
+ }
48
+
49
+ // NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
50
+ func NVIDIADevicePlugin () * v1.Pod {
51
+ ds , err := framework .DsFromManifest (gpu .GPUDevicePluginDSYAML )
52
+ framework .ExpectNoError (err )
53
+ p := & v1.Pod {
54
+ ObjectMeta : metav1.ObjectMeta {
55
+ Name : "device-plugin-nvidia-gpu-" + string (uuid .NewUUID ()),
56
+ Namespace : metav1 .NamespaceSystem ,
57
+ },
58
+ Spec : ds .Spec .Template .Spec ,
59
+ }
60
+ // Remove node affinity
61
+ p .Spec .Affinity = nil
62
+ return p
63
+ }
64
+
36
65
// Serial because the test restarts Kubelet
37
66
var _ = framework .KubeDescribe ("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeature:GPUDevicePlugin][Serial] [Disruptive]" , func () {
38
67
f := framework .NewDefaultFramework ("device-plugin-gpus-errors" )
@@ -47,15 +76,15 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
47
76
}
48
77
49
78
ginkgo .By ("Creating the Google Device Plugin pod for NVIDIA GPU in GKE" )
50
- devicePluginPod , err = f .ClientSet .CoreV1 ().Pods (metav1 .NamespaceSystem ).Create (gpu . NVIDIADevicePlugin ())
79
+ devicePluginPod , err = f .ClientSet .CoreV1 ().Pods (metav1 .NamespaceSystem ).Create (NVIDIADevicePlugin ())
51
80
framework .ExpectNoError (err )
52
81
53
82
ginkgo .By ("Waiting for GPUs to become available on the local node" )
54
83
gomega .Eventually (func () bool {
55
- return gpu . NumberOfNVIDIAGPUs (getLocalNode (f )) > 0
84
+ return numberOfNVIDIAGPUs (getLocalNode (f )) > 0
56
85
}, 5 * time .Minute , framework .Poll ).Should (gomega .BeTrue ())
57
86
58
- if gpu . NumberOfNVIDIAGPUs (getLocalNode (f )) < 2 {
87
+ if numberOfNVIDIAGPUs (getLocalNode (f )) < 2 {
59
88
ginkgo .Skip ("Not enough GPUs to execute this test (at least two needed)" )
60
89
}
61
90
})
@@ -95,7 +124,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
95
124
restartKubelet ()
96
125
framework .WaitForAllNodesSchedulable (f .ClientSet , framework .TestContext .NodeSchedulableTimeout )
97
126
gomega .Eventually (func () bool {
98
- return gpu . NumberOfNVIDIAGPUs (getLocalNode (f )) > 0
127
+ return numberOfNVIDIAGPUs (getLocalNode (f )) > 0
99
128
}, 5 * time .Minute , framework .Poll ).Should (gomega .BeTrue ())
100
129
p2 := f .PodClient ().CreateSync (makeBusyboxPod (gpu .NVIDIAGPUResourceName , podRECMD ))
101
130
@@ -110,7 +139,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
110
139
gomega .Eventually (func () bool {
111
140
node , err := f .ClientSet .CoreV1 ().Nodes ().Get (framework .TestContext .NodeName , metav1.GetOptions {})
112
141
framework .ExpectNoError (err )
113
- return gpu . NumberOfNVIDIAGPUs (node ) <= 0
142
+ return numberOfNVIDIAGPUs (node ) <= 0
114
143
}, 10 * time .Minute , framework .Poll ).Should (gomega .BeTrue ())
115
144
ginkgo .By ("Checking that scheduled pods can continue to run even after we delete device plugin." )
116
145
ensurePodContainerRestart (f , p1 .Name , p1 .Name )
0 commit comments