Commit d529b17

Handle resource utilization calculation for GPUs exposed using DRA
1 parent fb6dca0 commit d529b17

5 files changed (+60, -26 lines)

cluster-autoscaler/cloudprovider/cloud_provider.go

Lines changed: 2 additions & 0 deletions
@@ -104,6 +104,8 @@ type GpuConfig struct {
 	DraDriverName string
 }
 
+// ExposedViaDra determines whether a GPU described in the config
+// is exposed via device plugin or DRA driver
 func (gpu *GpuConfig) ExposedViaDra() bool {
 	return gpu.DraDriverName != ""
 }
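
Illustrative usage of the new helper (not part of the commit; the field values mirror those used in the test changes below): a config that names a DRA driver is treated as DRA-exposed, while a config that only carries an extended resource name is not.

    pluginGpu := &cloudprovider.GpuConfig{ExtendedResourceName: gpu.ResourceNvidiaGPU}
    draGpu := &cloudprovider.GpuConfig{DraDriverName: "gpu.nvidia.com"}
    _ = pluginGpu.ExposedViaDra() // false: no DRA driver name set
    _ = draGpu.ExposedViaDra()    // true: DRA driver name set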

cluster-autoscaler/cloudprovider/gce/dynamicresources.go

Lines changed: 16 additions & 0 deletions
@@ -1,3 +1,19 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 package gce
 
 import apiv1 "k8s.io/api/core/v1"

cluster-autoscaler/processors/customresources/gpu_processor.go

Lines changed: 9 additions & 9 deletions
@@ -50,7 +50,7 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 		_, hasGpuLabel := node.Labels[context.CloudProvider.GPULabel()]
 		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
 		directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
-		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) && !gpuExposedViaDra(context, node) {
+		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
 			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
@@ -72,22 +72,22 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 // GetNodeResourceTargets returns mapping of resource names to their targets.
 // This includes resources which are not yet ready to use and visible in kubernetes.
 func (p *GpuCustomResourcesProcessor) GetNodeResourceTargets(context *context.AutoscalingContext, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) ([]CustomResourceTarget, errors.AutoscalerError) {
-	if gpuExposedViaDra(context, node) {
-		return []CustomResourceTarget{}, nil
-	}
-
-	gpuTarget, err := p.getNodeGpuTarget(context.CloudProvider.GPULabel(), node, nodeGroup)
+	gpuTarget, err := p.GetNodeGpuTarget(context, node, nodeGroup)
 	return []CustomResourceTarget{gpuTarget}, err
 }
 
-// getNodeGpuTarget returns the gpu target of a given node. This includes gpus
+// GetNodeGpuTarget returns the gpu target of a given node. This includes gpus
 // that are not ready to use and visible in kubernetes.
-func (p *GpuCustomResourcesProcessor) getNodeGpuTarget(GPULabel string, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (CustomResourceTarget, errors.AutoscalerError) {
-	gpuLabel, found := node.Labels[GPULabel]
+func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(context *context.AutoscalingContext, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (CustomResourceTarget, errors.AutoscalerError) {
+	gpuLabel, found := node.Labels[context.CloudProvider.GPULabel()]
 	if !found {
 		return CustomResourceTarget{}, nil
 	}
 
+	if gpuExposedViaDra(context, node) {
+		return CustomResourceTarget{}, nil
+	}
+
 	gpuAllocatable, found := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
 	if found && gpuAllocatable.Value() > 0 {
 		return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
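
The gpuExposedViaDra helper referenced above already exists in this package and is not part of the diff. A minimal, hypothetical sketch of such a check, assuming the cloud provider exposes a GetNodeGpuConfig accessor that returns the node's GpuConfig:

    // Hypothetical sketch, not the committed implementation: a node's GPU is
    // considered DRA-exposed when its GpuConfig names a DRA driver.
    func gpuExposedViaDra(context *context.AutoscalingContext, node *apiv1.Node) bool {
    	gpuConfig := context.CloudProvider.GetNodeGpuConfig(node)
    	return gpuConfig != nil && gpuConfig.ExposedViaDra()
    }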

cluster-autoscaler/simulator/utilization/info.go

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ type Info struct {
 // utilization is the sum of requests for it divided by allocatable. It also
 // returns the individual cpu, memory and gpu utilization.
 func Calculate(nodeInfo *framework.NodeInfo, skipDaemonSetPods, skipMirrorPods, draEnabled bool, gpuConfig *cloudprovider.GpuConfig, currentTime time.Time) (utilInfo Info, err error) {
-	if gpuConfig != nil {
+	if gpuConfig != nil && !gpuConfig.ExposedViaDra() {
 		gpuUtil, err := CalculateUtilizationOfResource(nodeInfo, gpuConfig.ExtendedResourceName, skipDaemonSetPods, skipMirrorPods, currentTime)
 		if err != nil {
 			klog.V(3).Infof("node %s has unready GPU resource: %s", nodeInfo.Node().Name, gpuConfig.ExtendedResourceName)

cluster-autoscaler/simulator/utilization/info_test.go

Lines changed: 32 additions & 16 deletions
@@ -82,7 +82,7 @@ func TestCalculate(t *testing.T) {
 	SetNodeReadyState(node, true, time.Time{})
 	nodeInfo := framework.NewTestNodeInfo(node, pod, pod, pod2)
 
-	gpuConfig := getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig := getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err := Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@@ -91,15 +91,15 @@
 	node2 := BuildTestNode("node2", 2000, -1)
 	nodeInfo = framework.NewTestNodeInfo(node2, pod, pod, pod2)
 
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	_, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.Error(t, err)
 
 	node3 := BuildTestNode("node3", 2000, 2000000)
 	SetNodeReadyState(node3, true, time.Time{})
 	nodeInfo = framework.NewTestNodeInfo(node3, pod, podWithInitContainers, podWithLargeNonRestartableInitContainers)
 
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 50.25, utilInfo.Utilization, 0.01)
@@ -113,21 +113,21 @@
 	daemonSetPod4.Annotations = map[string]string{"cluster-autoscaler.kubernetes.io/daemonset-pod": "true"}
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod, pod2, daemonSetPod3, daemonSetPod4)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, true, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.5/10, utilInfo.Utilization, 0.01)
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod2, daemonSetPod3)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
 
 	terminatedPod := BuildTestPod("podTerminated", 100, 200000)
 	terminatedPod.DeletionTimestamp = &metav1.Time{Time: testTime.Add(-10 * time.Minute)}
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod, pod2, terminatedPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@@ -138,19 +138,19 @@
 	}
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod, pod2, mirrorPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, true, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/9.0, utilInfo.Utilization, 0.01)
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod2, mirrorPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, mirrorPod, daemonSetPod3)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, true, true, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 1.0/8.0, utilInfo.Utilization, 0.01)
@@ -161,7 +161,7 @@
 	RequestGpuForPod(gpuPod, 1)
 	TolerateGpuForPod(gpuPod)
 	nodeInfo = framework.NewTestNodeInfo(gpuNode, pod, pod, gpuPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
@@ -170,7 +170,7 @@
 	gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
 	AddGpuLabelToNode(gpuNode)
 	nodeInfo = framework.NewTestNodeInfo(gpuNode, pod, pod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.Zero(t, utilInfo.Utilization)
@@ -182,7 +182,8 @@ func TestCalculateWithDynamicResources(t *testing.T) {
 	gpuNode := BuildTestNode("gpuNode", 1000, 1000)
 	AddGpusToNode(gpuNode, 1)
 	AddGpuLabelToNode(gpuNode)
-	gpuConfig := getGpuConfigFromNode(gpuNode)
+	gpuConfig := getGpuConfigFromNode(gpuNode, false)
+	gpuConfigDra := getGpuConfigFromNode(gpuNode, true)
 	pod1 := BuildTestPod("pod1", 250, 0, WithNodeName("node"))
 	pod2 := BuildTestPod("pod2", 250, 0, WithNodeName("node"))
 	resourceSlice1 := &resourceapi.ResourceSlice{
@@ -345,6 +346,13 @@
 			draEnabled:   true,
 			wantUtilInfo: Info{Utilization: 0, ResourceName: gpuConfig.ExtendedResourceName},
 		},
+		{
+			testName:     "DRA slices and claims present, DRA enabled, DRA GPU config passed -> DRA util returned",
+			nodeInfo:     nodeInfoGpuAndDra,
+			gpuConfig:    gpuConfigDra,
+			draEnabled:   true,
+			wantUtilInfo: Info{DynamicResourceUtil: 0.8, Utilization: 0.8, ResourceName: apiv1.ResourceName("driver.foo.com/node-pool1")},
+		},
 		{
 			testName:   "DRA slices and claims present, DRA enabled, error while calculating DRA util -> error returned",
 			nodeInfo:   nodeInfoIncompleteSlices,
@@ -365,15 +373,23 @@
 	}
 }
 
-func getGpuConfigFromNode(node *apiv1.Node) *cloudprovider.GpuConfig {
+func getGpuConfigFromNode(node *apiv1.Node, dra bool) *cloudprovider.GpuConfig {
 	gpuLabel := "cloud.google.com/gke-accelerator"
 	gpuType, hasGpuLabel := node.Labels[gpuLabel]
 	gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
 	if hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero()) {
+		if !dra {
+			return &cloudprovider.GpuConfig{
+				Label:                gpuLabel,
+				Type:                 gpuType,
+				ExtendedResourceName: gpu.ResourceNvidiaGPU,
+			}
+		}
+
 		return &cloudprovider.GpuConfig{
-			Label:                gpuLabel,
-			Type:                 gpuType,
-			ExtendedResourceName: gpu.ResourceNvidiaGPU,
+			Label:         gpuLabel,
+			Type:          gpuType,
+			DraDriverName: "gpu.nvidia.com",
 		}
 	}
 	return nil
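
For orientation (illustrative only, not part of the commit), the reworked helper yields two different configs for the same labelled GPU node, which is what the new DRA test case exercises; inside a test it could be checked like this, using the same testify assertions the file already relies on:

    // Device-plugin mode: the config carries the extended resource name.
    pluginCfg := getGpuConfigFromNode(gpuNode, false)
    // DRA mode: the config carries a driver name instead of an extended resource.
    draCfg := getGpuConfigFromNode(gpuNode, true)
    assert.False(t, pluginCfg.ExposedViaDra())
    assert.True(t, draCfg.ExposedViaDra())

In the new test case above, the expected ResourceName ("driver.foo.com/node-pool1") comes from the DRA driver and pool used in the test's ResourceSlices rather than from an extended resource.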
