Commit cef695b

Merge pull request #8547 from mtrqq/dra-gpu-processor

Disable GPU resource processor for nodes using DRA for accelerator attachment

2 parents 191c33b + d529b17

10 files changed: +310 −47 lines changed

cluster-autoscaler/cloudprovider/cloud_provider.go

Lines changed: 10 additions & 3 deletions

@@ -98,9 +98,16 @@ const (
 
 // GpuConfig contains the label, type and the resource name for a GPU.
 type GpuConfig struct {
-    Label        string
-    Type         string
-    ResourceName apiv1.ResourceName
+    Label                string
+    Type                 string
+    ExtendedResourceName apiv1.ResourceName
+    DraDriverName        string
+}
+
+// ExposedViaDra determines whether a GPU described in the config
+// is exposed via device plugin or DRA driver.
+func (gpu *GpuConfig) ExposedViaDra() bool {
+    return gpu.DraDriverName != ""
 }
 
 // CloudProvider contains configuration info and functions for interacting with
Lines changed: 31 additions & 0 deletions (new file in package gce)

@@ -0,0 +1,31 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package gce
+
+import apiv1 "k8s.io/api/core/v1"
+
+const (
+    // DraGPUDriver is the name of the driver used to expose NVIDIA GPU resources.
+    DraGPUDriver = "gpu.nvidia.com"
+    // DraGPULabel is the label added to nodes with GPU resources exposed via DRA.
+    DraGPULabel = "cloud.google.com/gke-gpu-dra-driver"
+)
+
+// GpuDraDriverEnabled checks whether the DRA GPU driver is enabled on the node.
+func GpuDraDriverEnabled(node *apiv1.Node) bool {
+    return node.Labels[DraGPULabel] == "true"
+}
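
An illustrative check of the helper (not from the commit; assumes the usual apiv1/metav1 imports): the label alone signals DRA attachment, independent of any nvidia.com/gpu allocatable.

// Illustrative sketch, not part of the commit.
draNode := &apiv1.Node{ObjectMeta: metav1.ObjectMeta{
    Labels: map[string]string{DraGPULabel: "true"},
}}
fmt.Println(GpuDraDriverEnabled(draNode))       // true: label set to "true"
fmt.Println(GpuDraDriverEnabled(&apiv1.Node{})) // false: label absent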

cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go

Lines changed: 13 additions & 2 deletions

@@ -82,9 +82,20 @@ func (gce *GceCloudProvider) GetAvailableGPUTypes() map[string]struct{} {
 }
 
 // GetNodeGpuConfig returns the label, type and resource name for the GPU added to node. If node doesn't have
-// any GPUs, it returns nil.
+// any GPUs, it returns nil. If the node has a GPU attached using DRA, it populates the corresponding field in GpuConfig.
 func (gce *GceCloudProvider) GetNodeGpuConfig(node *apiv1.Node) *cloudprovider.GpuConfig {
-    return gpu.GetNodeGPUFromCloudProvider(gce, node)
+    gpuConfig := gpu.GetNodeGPUFromCloudProvider(gce, node)
+
+    // If GPU devices are exposed using DRA, the extended resource
+    // won't be present in the node allocatable or capacity,
+    // so we overwrite the extended resource name as it won't ever
+    // be there.
+    if GpuDraDriverEnabled(node) {
+        gpuConfig.DraDriverName = DraGPUDriver
+        gpuConfig.ExtendedResourceName = ""
+    }
+
+    return gpuConfig
 }
 
 // NodeGroups returns all node groups configured for this cloud provider.

cluster-autoscaler/cloudprovider/kwok/kwok_provider_test.go

Lines changed: 3 additions & 3 deletions

@@ -486,7 +486,7 @@ func TestGetNodeGpuConfig(t *testing.T) {
 	l := p.GetNodeGpuConfig(nodeWithGPU)
 	assert.NotNil(t, l)
 	assert.Equal(t, "k8s.amazonaws.com/accelerator", l.Label)
-	assert.Equal(t, gpu.ResourceNvidiaGPU, string(l.ResourceName))
+	assert.Equal(t, gpu.ResourceNvidiaGPU, string(l.ExtendedResourceName))
 	assert.Equal(t, "nvidia-tesla-k80", l.Type)
 
 	nodeWithNoAllocatableGPU := &apiv1.Node{
@@ -499,7 +499,7 @@ func TestGetNodeGpuConfig(t *testing.T) {
 	l = p.GetNodeGpuConfig(nodeWithNoAllocatableGPU)
 	assert.NotNil(t, l)
 	assert.Equal(t, "k8s.amazonaws.com/accelerator", l.Label)
-	assert.Equal(t, gpu.ResourceNvidiaGPU, string(l.ResourceName))
+	assert.Equal(t, gpu.ResourceNvidiaGPU, string(l.ExtendedResourceName))
 	assert.Equal(t, "nvidia-tesla-k80", l.Type)
 
 	nodeWithNoGPULabel := &apiv1.Node{
@@ -515,7 +515,7 @@ func TestGetNodeGpuConfig(t *testing.T) {
 	l = p.GetNodeGpuConfig(nodeWithNoGPULabel)
 	assert.NotNil(t, l)
 	assert.Equal(t, "k8s.amazonaws.com/accelerator", l.Label)
-	assert.Equal(t, gpu.ResourceNvidiaGPU, string(l.ResourceName))
+	assert.Equal(t, gpu.ResourceNvidiaGPU, string(l.ExtendedResourceName))
 	assert.Equal(t, "", l.Type)
 
 }

cluster-autoscaler/processors/customresources/gpu_processor.go

Lines changed: 24 additions & 6 deletions

@@ -42,12 +42,14 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 	newReadyNodes := make([]*apiv1.Node, 0)
 	nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
 	for _, node := range readyNodes {
+		if gpuExposedViaDra(context, node) {
+			newReadyNodes = append(newReadyNodes, node)
+			continue
+		}
+
 		_, hasGpuLabel := node.Labels[context.CloudProvider.GPULabel()]
 		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
 		directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
-		// We expect node to have GPU based on label, but it doesn't show up
-		// on node object. Assume the node is still not fully started (installing
-		// GPU drivers).
 		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
@@ -70,18 +72,22 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 // GetNodeResourceTargets returns mapping of resource names to their targets.
 // This includes resources which are not yet ready to use and visible in kubernetes.
 func (p *GpuCustomResourcesProcessor) GetNodeResourceTargets(context *context.AutoscalingContext, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) ([]CustomResourceTarget, errors.AutoscalerError) {
-	gpuTarget, err := p.GetNodeGpuTarget(context.CloudProvider.GPULabel(), node, nodeGroup)
+	gpuTarget, err := p.GetNodeGpuTarget(context, node, nodeGroup)
 	return []CustomResourceTarget{gpuTarget}, err
 }
 
 // GetNodeGpuTarget returns the gpu target of a given node. This includes gpus
 // that are not ready to use and visible in kubernetes.
-func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(GPULabel string, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (CustomResourceTarget, errors.AutoscalerError) {
-	gpuLabel, found := node.Labels[GPULabel]
+func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(context *context.AutoscalingContext, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (CustomResourceTarget, errors.AutoscalerError) {
+	gpuLabel, found := node.Labels[context.CloudProvider.GPULabel()]
 	if !found {
 		return CustomResourceTarget{}, nil
 	}
 
+	if gpuExposedViaDra(context, node) {
+		return CustomResourceTarget{}, nil
+	}
+
 	gpuAllocatable, found := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
 	if found && gpuAllocatable.Value() > 0 {
 		return CustomResourceTarget{gpuLabel, gpuAllocatable.Value()}, nil
@@ -121,3 +127,15 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(GPULabel string, node *ap
 // CleanUp cleans up processor's internal structures.
 func (p *GpuCustomResourcesProcessor) CleanUp() {
 }
+
+func gpuExposedViaDra(ctx *context.AutoscalingContext, node *apiv1.Node) bool {
+	gpuConfig := ctx.CloudProvider.GetNodeGpuConfig(node)
+	if gpuConfig == nil {
+		return false
+	}
+
+	// Devices attached through DRA do not use node allocatable
+	// to confirm their attachment; assume the node is ready here,
+	// as it will be checked in a separate processor.
+	return gpuConfig.ExposedViaDra()
+}
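
Condensed, the readiness rule after this change, ignoring the DirectX case (an illustrative standalone helper, not code from the commit):

// Illustrative sketch, not part of the commit.
func treatGpuNodeAsReady(exposedViaDra, hasGpuLabel bool, gpuAllocatable int64) bool {
    if exposedViaDra {
        // DRA devices never show up in node allocatable, so the
        // allocatable-based unreadiness check does not apply here.
        return true
    }
    // Otherwise a GPU label without non-zero allocatable means the node
    // is likely still starting up (e.g. installing GPU drivers).
    return !hasGpuLabel || gpuAllocatable > 0
}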

cluster-autoscaler/processors/customresources/gpu_processor_test.go

Lines changed: 17 additions & 0 deletions

@@ -25,6 +25,7 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/gce"
 	testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
 	"k8s.io/autoscaler/cluster-autoscaler/context"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
@@ -152,13 +153,28 @@ func TestFilterOutNodesWithUnreadyResources(t *testing.T) {
 	}
 	expectedReadiness[nodeNoGpuUnready.Name] = false
 
+	nodeGPUReadyDra := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "nodeGPUViaDra",
+			Labels: map[string]string{
+				gce.DraGPULabel: "true",
+			},
+			CreationTimestamp: metav1.NewTime(start),
+		},
+		Status: apiv1.NodeStatus{
+			Conditions: []apiv1.NodeCondition{readyCondition},
+		},
+	}
+	expectedReadiness[nodeGPUReadyDra.Name] = true
+
 	initialReadyNodes := []*apiv1.Node{
 		nodeGpuReady,
 		nodeGpuUnready,
 		nodeGpuUnready2,
 		nodeDirectXReady,
 		nodeDirectXUnready,
 		nodeNoGpuReady,
+		nodeGPUReadyDra,
 	}
 	initialAllNodes := []*apiv1.Node{
 		nodeGpuReady,
@@ -168,6 +184,7 @@ func TestFilterOutNodesWithUnreadyResources(t *testing.T) {
 		nodeDirectXUnready,
 		nodeNoGpuReady,
 		nodeNoGpuUnready,
+		nodeGPUReadyDra,
 	}
 
 	processor := GpuCustomResourcesProcessor{}

cluster-autoscaler/simulator/utilization/info.go

Lines changed: 5 additions & 5 deletions

@@ -48,15 +48,15 @@ type Info struct {
 // utilization is the sum of requests for it divided by allocatable. It also
 // returns the individual cpu, memory and gpu utilization.
 func Calculate(nodeInfo *framework.NodeInfo, skipDaemonSetPods, skipMirrorPods, draEnabled bool, gpuConfig *cloudprovider.GpuConfig, currentTime time.Time) (utilInfo Info, err error) {
-	if gpuConfig != nil {
-		gpuUtil, err := CalculateUtilizationOfResource(nodeInfo, gpuConfig.ResourceName, skipDaemonSetPods, skipMirrorPods, currentTime)
+	if gpuConfig != nil && !gpuConfig.ExposedViaDra() {
+		gpuUtil, err := CalculateUtilizationOfResource(nodeInfo, gpuConfig.ExtendedResourceName, skipDaemonSetPods, skipMirrorPods, currentTime)
 		if err != nil {
-			klog.V(3).Infof("node %s has unready GPU resource: %s", nodeInfo.Node().Name, gpuConfig.ResourceName)
+			klog.V(3).Infof("node %s has unready GPU resource: %s", nodeInfo.Node().Name, gpuConfig.ExtendedResourceName)
 			// Return 0 if GPU is unready. This will guarantee we can still scale down a node with unready GPU.
-			return Info{GpuUtil: 0, ResourceName: gpuConfig.ResourceName, Utilization: 0}, nil
+			return Info{GpuUtil: 0, ResourceName: gpuConfig.ExtendedResourceName, Utilization: 0}, nil
 		}
 		// Skips cpu and memory utilization calculation for node with GPU.
-		return Info{GpuUtil: gpuUtil, ResourceName: gpuConfig.ResourceName, Utilization: gpuUtil}, err
+		return Info{GpuUtil: gpuUtil, ResourceName: gpuConfig.ExtendedResourceName, Utilization: gpuUtil}, err
 	}
 
 	if draEnabled && len(nodeInfo.LocalResourceSlices) > 0 {
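
The guard changes which branch a DRA-exposed GpuConfig takes: it now falls through to the DRA ResourceSlice path below instead of reading a nonexistent extended resource. As a standalone truth table (illustrative, not code from the commit):

// Illustrative sketch, not part of the commit.
func utilizationPath(hasGpuConfig, gpuViaDra, draEnabled, hasResourceSlices bool) string {
    switch {
    case hasGpuConfig && !gpuViaDra:
        return "extended-resource GPU utilization"
    case draEnabled && hasResourceSlices:
        return "DRA ResourceSlice utilization"
    default:
        return "cpu/memory utilization"
    }
}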

cluster-autoscaler/simulator/utilization/info_test.go

Lines changed: 33 additions & 17 deletions

@@ -82,7 +82,7 @@ func TestCalculate(t *testing.T) {
 	SetNodeReadyState(node, true, time.Time{})
 	nodeInfo := framework.NewTestNodeInfo(node, pod, pod, pod2)
 
-	gpuConfig := getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig := getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err := Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@@ -91,15 +91,15 @@ func TestCalculate(t *testing.T) {
 	node2 := BuildTestNode("node2", 2000, -1)
 	nodeInfo = framework.NewTestNodeInfo(node2, pod, pod, pod2)
 
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	_, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.Error(t, err)
 
 	node3 := BuildTestNode("node3", 2000, 2000000)
 	SetNodeReadyState(node3, true, time.Time{})
 	nodeInfo = framework.NewTestNodeInfo(node3, pod, podWithInitContainers, podWithLargeNonRestartableInitContainers)
 
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 50.25, utilInfo.Utilization, 0.01)
@@ -113,21 +113,21 @@ func TestCalculate(t *testing.T) {
 	daemonSetPod4.Annotations = map[string]string{"cluster-autoscaler.kubernetes.io/daemonset-pod": "true"}
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod, pod2, daemonSetPod3, daemonSetPod4)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, true, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.5/10, utilInfo.Utilization, 0.01)
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod2, daemonSetPod3)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
 
 	terminatedPod := BuildTestPod("podTerminated", 100, 200000)
 	terminatedPod.DeletionTimestamp = &metav1.Time{Time: testTime.Add(-10 * time.Minute)}
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod, pod2, terminatedPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
@@ -138,19 +138,19 @@ func TestCalculate(t *testing.T) {
 	}
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod, pod2, mirrorPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, true, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/9.0, utilInfo.Utilization, 0.01)
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, pod2, mirrorPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 2.0/10, utilInfo.Utilization, 0.01)
 
 	nodeInfo = framework.NewTestNodeInfo(node, pod, mirrorPod, daemonSetPod3)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, true, true, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 1.0/8.0, utilInfo.Utilization, 0.01)
@@ -161,7 +161,7 @@ func TestCalculate(t *testing.T) {
 	RequestGpuForPod(gpuPod, 1)
 	TolerateGpuForPod(gpuPod)
 	nodeInfo = framework.NewTestNodeInfo(gpuNode, pod, pod, gpuPod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.InEpsilon(t, 1/1, utilInfo.Utilization, 0.01)
@@ -170,7 +170,7 @@ func TestCalculate(t *testing.T) {
 	gpuNode = BuildTestNode("gpu_node", 2000, 2000000)
 	AddGpuLabelToNode(gpuNode)
 	nodeInfo = framework.NewTestNodeInfo(gpuNode, pod, pod)
-	gpuConfig = getGpuConfigFromNode(nodeInfo.Node())
+	gpuConfig = getGpuConfigFromNode(nodeInfo.Node(), false)
 	utilInfo, err = Calculate(nodeInfo, false, false, false, gpuConfig, testTime)
 	assert.NoError(t, err)
 	assert.Zero(t, utilInfo.Utilization)
@@ -182,7 +182,8 @@ func TestCalculateWithDynamicResources(t *testing.T) {
 	gpuNode := BuildTestNode("gpuNode", 1000, 1000)
 	AddGpusToNode(gpuNode, 1)
 	AddGpuLabelToNode(gpuNode)
-	gpuConfig := getGpuConfigFromNode(gpuNode)
+	gpuConfig := getGpuConfigFromNode(gpuNode, false)
+	gpuConfigDra := getGpuConfigFromNode(gpuNode, true)
 	pod1 := BuildTestPod("pod1", 250, 0, WithNodeName("node"))
 	pod2 := BuildTestPod("pod2", 250, 0, WithNodeName("node"))
 	resourceSlice1 := &resourceapi.ResourceSlice{
@@ -343,7 +344,14 @@ func TestCalculateWithDynamicResources(t *testing.T) {
 			nodeInfo:     nodeInfoGpuAndDra,
 			gpuConfig:    gpuConfig,
 			draEnabled:   true,
-			wantUtilInfo: Info{Utilization: 0, ResourceName: gpuConfig.ResourceName},
+			wantUtilInfo: Info{Utilization: 0, ResourceName: gpuConfig.ExtendedResourceName},
+		},
+		{
+			testName:     "DRA slices and claims present, DRA enabled, DRA GPU config passed -> DRA util returned",
+			nodeInfo:     nodeInfoGpuAndDra,
+			gpuConfig:    gpuConfigDra,
+			draEnabled:   true,
+			wantUtilInfo: Info{DynamicResourceUtil: 0.8, Utilization: 0.8, ResourceName: apiv1.ResourceName("driver.foo.com/node-pool1")},
 		},
 		{
 			testName: "DRA slices and claims present, DRA enabled, error while calculating DRA util -> error returned",
@@ -365,15 +373,23 @@ func TestCalculateWithDynamicResources(t *testing.T) {
 	}
 }
 
-func getGpuConfigFromNode(node *apiv1.Node) *cloudprovider.GpuConfig {
+func getGpuConfigFromNode(node *apiv1.Node, dra bool) *cloudprovider.GpuConfig {
 	gpuLabel := "cloud.google.com/gke-accelerator"
 	gpuType, hasGpuLabel := node.Labels[gpuLabel]
 	gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
 	if hasGpuLabel || (hasGpuAllocatable && !gpuAllocatable.IsZero()) {
+		if !dra {
+			return &cloudprovider.GpuConfig{
+				Label:                gpuLabel,
+				Type:                 gpuType,
+				ExtendedResourceName: gpu.ResourceNvidiaGPU,
+			}
+		}
+
 		return &cloudprovider.GpuConfig{
-			Label:        gpuLabel,
-			Type:         gpuType,
-			ResourceName: gpu.ResourceNvidiaGPU,
+			Label:         gpuLabel,
+			Type:          gpuType,
+			DraDriverName: "gpu.nvidia.com",
 		}
 	}
 	return nil
