Skip to content

Commit fb6dca0

Browse files
committed
Add handling for GPUs exposed via DRA in GetGpuInfoForMetrics
1 parent 1acc8c2 commit fb6dca0

File tree

2 files changed

+172
-9
lines changed

2 files changed

+172
-9
lines changed

cluster-autoscaler/utils/gpu/gpu.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ limitations under the License.
1717
package gpu
1818

1919
import (
20+
"fmt"
21+
2022
apiv1 "k8s.io/api/core/v1"
2123
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
2224
podutils "k8s.io/autoscaler/cluster-autoscaler/utils/pod"
@@ -56,13 +58,20 @@ func GetGpuInfoForMetrics(gpuConfig *cloudprovider.GpuConfig, availableGPUTypes
5658
if gpuConfig == nil {
5759
return "", MetricsNoGPU
5860
}
61+
5962
resourceName := gpuConfig.ExtendedResourceName
6063
capacity, capacityFound := node.Status.Capacity[resourceName]
6164
// There is no label value; fall back to the generic solution
6265
if gpuConfig.Type == "" && capacityFound && !capacity.IsZero() {
6366
return resourceName.String(), MetricsGenericGPU
6467
}
6568

69+
// GPU is exposed via DRA, so the extended-resource capacity won't be present on the node
70+
if gpuConfig.ExposedViaDra() {
71+
draResourceName := fmt.Sprintf("dra_%s", gpuConfig.DraDriverName)
72+
return draResourceName, validateGpuType(availableGPUTypes, gpuConfig.Type)
73+
}
74+
6675
// GKE-specific label & capacity are present - consistent state
6776
if capacityFound {
6877
return resourceName.String(), validateGpuType(availableGPUTypes, gpuConfig.Type)

cluster-autoscaler/utils/gpu/gpu_test.go

Lines changed: 163 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,18 @@ See the License for the specific language governing permissions and
1414
limitations under the License.
1515
*/
1616

17-
package gpu
17+
package gpu_test
1818

1919
import (
2020
"testing"
2121

2222
apiv1 "k8s.io/api/core/v1"
2323
"k8s.io/apimachinery/pkg/api/resource"
2424
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25+
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
26+
testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
27+
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
28+
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
2529
"k8s.io/autoscaler/cluster-autoscaler/utils/test"
2630

2731
"github.com/stretchr/testify/assert"
@@ -45,9 +49,9 @@ func TestNodeHasGpu(t *testing.T) {
4549
Allocatable: apiv1.ResourceList{},
4650
},
4751
}
48-
nodeGpuReady.Status.Allocatable[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
49-
nodeGpuReady.Status.Capacity[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
50-
assert.True(t, NodeHasGpu(GPULabel, nodeGpuReady))
52+
nodeGpuReady.Status.Allocatable[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
53+
nodeGpuReady.Status.Capacity[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
54+
assert.True(t, gpu.NodeHasGpu(GPULabel, nodeGpuReady))
5155

5256
nodeGpuUnready := &apiv1.Node{
5357
ObjectMeta: metav1.ObjectMeta{
@@ -59,7 +63,7 @@ func TestNodeHasGpu(t *testing.T) {
5963
Allocatable: apiv1.ResourceList{},
6064
},
6165
}
62-
assert.True(t, NodeHasGpu(GPULabel, nodeGpuUnready))
66+
assert.True(t, gpu.NodeHasGpu(GPULabel, nodeGpuUnready))
6367

6468
nodeNoGpu := &apiv1.Node{
6569
ObjectMeta: metav1.ObjectMeta{
@@ -71,14 +75,164 @@ func TestNodeHasGpu(t *testing.T) {
7175
Allocatable: apiv1.ResourceList{},
7276
},
7377
}
74-
assert.False(t, NodeHasGpu(GPULabel, nodeNoGpu))
78+
assert.False(t, gpu.NodeHasGpu(GPULabel, nodeNoGpu))
7579
}
7680

7781
func TestPodRequestsGpu(t *testing.T) {
7882
podNoGpu := test.BuildTestPod("podNoGpu", 0, 1000)
7983
podWithGpu := test.BuildTestPod("pod1AnyGpu", 0, 1000)
80-
podWithGpu.Spec.Containers[0].Resources.Requests[ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
84+
podWithGpu.Spec.Containers[0].Resources.Requests[gpu.ResourceNvidiaGPU] = *resource.NewQuantity(1, resource.DecimalSI)
8185

82-
assert.False(t, PodRequestsGpu(podNoGpu))
83-
assert.True(t, PodRequestsGpu(podWithGpu))
86+
assert.False(t, gpu.PodRequestsGpu(podNoGpu))
87+
assert.True(t, gpu.PodRequestsGpu(podWithGpu))
88+
}
89+
90+
func TestGetGpuInfoForMetrics(t *testing.T) {
91+
knownGpu := "nvidia-tesla-k80"
92+
unknownGpu := "unknown-gpu"
93+
availableGPUTypes := map[string]struct{}{
94+
knownGpu: {},
95+
}
96+
resourceName := apiv1.ResourceName(gpu.ResourceNvidiaGPU)
97+
98+
// Basic node
99+
node := test.BuildTestNode("node", 1000, 1000)
100+
101+
// Node with GPU capacity
102+
nodeWithGpu := test.BuildTestNode("node-with-gpu", 1000, 1000)
103+
nodeWithGpu.Status.Capacity[resourceName] = *resource.NewQuantity(1, resource.DecimalSI)
104+
105+
// Node without GPU capacity
106+
nodeWithoutGpu := test.BuildTestNode("node-without-gpu", 1000, 1000)
107+
108+
// Node group with GPU in template
109+
provider := testprovider.TestCloudProvider{}
110+
templateWithGpu := test.BuildTestNode("template-with-gpu", 1000, 1000)
111+
templateWithGpu.Status.Capacity[resourceName] = *resource.NewQuantity(1, resource.DecimalSI)
112+
nodeGroupWithGpu := provider.BuildNodeGroup("ng-with-gpu", 1, 10, 1, false, false, "n1-standard-1", nil)
113+
114+
// Node group without GPU in template
115+
templateWithoutGpu := test.BuildTestNode("template-without-gpu", 1000, 1000)
116+
nodeGroupWithoutGpu := provider.BuildNodeGroup("ng-without-gpu", 1, 10, 1, false, false, "n1-standard-1", nil)
117+
118+
templates := map[string]*framework.NodeInfo{
119+
nodeGroupWithoutGpu.Id(): framework.NewNodeInfo(templateWithoutGpu, nil),
120+
nodeGroupWithGpu.Id(): framework.NewNodeInfo(templateWithGpu, nil),
121+
}
122+
123+
provider.SetMachineTemplates(templates)
124+
125+
testCases := []struct {
126+
name string
127+
gpuConfig *cloudprovider.GpuConfig
128+
node *apiv1.Node
129+
nodeGroup cloudprovider.NodeGroup
130+
expectedGpuResource string
131+
expectedGpuType string
132+
}{
133+
{
134+
name: "no gpu config",
135+
gpuConfig: nil,
136+
node: node,
137+
nodeGroup: nil,
138+
expectedGpuResource: "",
139+
expectedGpuType: gpu.MetricsNoGPU,
140+
},
141+
{
142+
name: "generic gpu",
143+
gpuConfig: &cloudprovider.GpuConfig{
144+
Type: "",
145+
ExtendedResourceName: resourceName,
146+
},
147+
node: nodeWithGpu,
148+
nodeGroup: nil,
149+
expectedGpuResource: gpu.ResourceNvidiaGPU,
150+
expectedGpuType: gpu.MetricsGenericGPU,
151+
},
152+
{
153+
name: "dra gpu, known type",
154+
gpuConfig: &cloudprovider.GpuConfig{
155+
Type: knownGpu,
156+
DraDriverName: "test-driver",
157+
},
158+
node: nodeWithoutGpu,
159+
nodeGroup: nil,
160+
expectedGpuResource: "dra_test-driver",
161+
expectedGpuType: knownGpu,
162+
},
163+
{
164+
name: "dra gpu, unknown type",
165+
gpuConfig: &cloudprovider.GpuConfig{
166+
Type: unknownGpu,
167+
DraDriverName: "test-driver",
168+
},
169+
node: nodeWithoutGpu,
170+
nodeGroup: nil,
171+
expectedGpuResource: "dra_test-driver",
172+
expectedGpuType: gpu.MetricsUnknownGPU,
173+
},
174+
{
175+
name: "capacity present, known type",
176+
gpuConfig: &cloudprovider.GpuConfig{
177+
Type: knownGpu,
178+
ExtendedResourceName: resourceName,
179+
},
180+
node: nodeWithGpu,
181+
nodeGroup: nil,
182+
expectedGpuResource: gpu.ResourceNvidiaGPU,
183+
expectedGpuType: knownGpu,
184+
},
185+
{
186+
name: "capacity present, unknown type",
187+
gpuConfig: &cloudprovider.GpuConfig{
188+
Type: unknownGpu,
189+
ExtendedResourceName: resourceName,
190+
},
191+
node: nodeWithGpu,
192+
nodeGroup: nil,
193+
expectedGpuResource: gpu.ResourceNvidiaGPU,
194+
expectedGpuType: gpu.MetricsUnknownGPU,
195+
},
196+
{
197+
name: "no capacity, template has gpu",
198+
gpuConfig: &cloudprovider.GpuConfig{
199+
Type: knownGpu,
200+
ExtendedResourceName: resourceName,
201+
},
202+
node: nodeWithoutGpu,
203+
nodeGroup: nodeGroupWithGpu,
204+
expectedGpuResource: gpu.ResourceNvidiaGPU,
205+
expectedGpuType: gpu.MetricsMissingGPU,
206+
},
207+
{
208+
name: "no capacity, template has no gpu",
209+
gpuConfig: &cloudprovider.GpuConfig{
210+
Type: knownGpu,
211+
ExtendedResourceName: resourceName,
212+
},
213+
node: nodeWithoutGpu,
214+
nodeGroup: nodeGroupWithoutGpu,
215+
expectedGpuResource: gpu.ResourceNvidiaGPU,
216+
expectedGpuType: gpu.MetricsUnexpectedLabelGPU,
217+
},
218+
{
219+
name: "no capacity, no node group",
220+
gpuConfig: &cloudprovider.GpuConfig{
221+
Type: knownGpu,
222+
ExtendedResourceName: resourceName,
223+
},
224+
node: nodeWithoutGpu,
225+
nodeGroup: nil,
226+
expectedGpuResource: gpu.ResourceNvidiaGPU,
227+
expectedGpuType: gpu.MetricsUnexpectedLabelGPU,
228+
},
229+
}
230+
231+
for _, tc := range testCases {
232+
t.Run(tc.name, func(t *testing.T) {
233+
gpuResource, gpuType := gpu.GetGpuInfoForMetrics(tc.gpuConfig, availableGPUTypes, tc.node, tc.nodeGroup)
234+
assert.Equal(t, tc.expectedGpuResource, gpuResource)
235+
assert.Equal(t, tc.expectedGpuType, gpuType)
236+
})
237+
}
84238
}

0 commit comments

Comments
 (0)