@@ -42,13 +42,17 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 	newReadyNodes := make([]*apiv1.Node, 0)
 	nodesWithUnreadyGpu := make(map[string]*apiv1.Node)
 	for _, node := range readyNodes {
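+		// GPUs exposed through DRA never appear in allocatable; keep such
+		// nodes ready here and let the DRA processor verify them.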
+		if gpuExposedViaDra(context, node) {
+			newReadyNodes = append(newReadyNodes, node)
+			continue
+		}
+
 		_, hasGpuLabel := node.Labels[context.CloudProvider.GPULabel()]
 		gpuAllocatable, hasGpuAllocatable := node.Status.Allocatable[gpu.ResourceNvidiaGPU]
 		directXAllocatable, hasDirectXAllocatable := node.Status.Allocatable[gpu.ResourceDirectX]
-		// We expect node to have GPU based on label, but it doesn't show up
-		// on node object. Assume the node is still not fully started (installing
-		// GPU drivers).
-		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) {
+		if hasGpuLabel && ((!hasGpuAllocatable || gpuAllocatable.IsZero()) && (!hasDirectXAllocatable || directXAllocatable.IsZero())) && !gpuExposedViaDra(context, node) {
 			klog.V(3).Infof("Overriding status of node %v, which seems to have unready GPU",
 				node.Name)
 			nodesWithUnreadyGpu[node.Name] = kubernetes.GetUnreadyNodeCopy(node, kubernetes.ResourceUnready)
@@ -70,13 +74,19 @@ func (p *GpuCustomResourcesProcessor) FilterOutNodesWithUnreadyResources(context
 // GetNodeResourceTargets returns mapping of resource names to their targets.
 // This includes resources which are not yet ready to use and visible in kubernetes.
 func (p *GpuCustomResourcesProcessor) GetNodeResourceTargets(context *context.AutoscalingContext, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) ([]CustomResourceTarget, errors.AutoscalerError) {
-	gpuTarget, err := p.GetNodeGpuTarget(context.CloudProvider.GPULabel(), node, nodeGroup)
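+	// DRA-exposed GPUs have no allocatable-based target; report no
+	// custom resource targets for such nodes.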
+	if gpuExposedViaDra(context, node) {
+		return []CustomResourceTarget{}, nil
+	}
+
+	gpuTarget, err := p.getNodeGpuTarget(context.CloudProvider.GPULabel(), node, nodeGroup)
 	return []CustomResourceTarget{gpuTarget}, err
 }
 
-// GetNodeGpuTarget returns the gpu target of a given node. This includes gpus
+// getNodeGpuTarget returns the gpu target of a given node. This includes gpus
 // that are not ready to use and visible in kubernetes.
-func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(GPULabel string, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (CustomResourceTarget, errors.AutoscalerError) {
+func (p *GpuCustomResourcesProcessor) getNodeGpuTarget(GPULabel string, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (CustomResourceTarget, errors.AutoscalerError) {
 	gpuLabel, found := node.Labels[GPULabel]
 	if !found {
 		return CustomResourceTarget{}, nil
@@ -121,3 +131,15 @@ func (p *GpuCustomResourcesProcessor) GetNodeGpuTarget(GPULabel string, node *ap
 // CleanUp cleans up processor's internal structures.
 func (p *GpuCustomResourcesProcessor) CleanUp() {
 }
+
+func gpuExposedViaDra(ctx *context.AutoscalingContext, node *apiv1.Node) bool {
+	gpuConfig := ctx.CloudProvider.GetNodeGpuConfig(node)
+	if gpuConfig == nil {
+		return false
+	}
+
+	// GPUs exposed through DRA do not show up in node allocatable, so
+	// allocatable cannot confirm their attachment. Assume the node is
+	// ready here; the dedicated DRA processor performs that check.
+	return gpuConfig.ExposedViaDra()
+}
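
For readers outside the autoscaler tree, here is a minimal, runnable sketch of
the readiness decision this change introduces. Only the GetNodeGpuConfig /
ExposedViaDra semantics come from the diff above; every type and name in the
sketch is an illustrative stand-in, not autoscaler code (the DirectX branch is
omitted for brevity).

package main

import "fmt"

// gpuConfig stands in for the cloud provider's GPU configuration object.
type gpuConfig struct{ viaDra bool }

func (g *gpuConfig) ExposedViaDra() bool { return g.viaDra }

// node stands in for a Kubernetes node as seen by the processor.
type node struct {
	name        string
	gpuLabel    bool  // node carries the provider's GPU label
	allocatable int64 // GPUs visible in node allocatable, 0 if none
	cfg         *gpuConfig
}

// gpuUnready mirrors the filter condition: a GPU-labeled node with no
// allocatable GPUs counts as unready, unless its GPUs are exposed via DRA,
// in which case the separate DRA processor owns the readiness check.
func gpuUnready(n node) bool {
	if n.cfg != nil && n.cfg.ExposedViaDra() {
		return false
	}
	return n.gpuLabel && n.allocatable == 0
}

func main() {
	for _, n := range []node{
		{name: "drivers-installing", gpuLabel: true, allocatable: 0, cfg: &gpuConfig{}},
		{name: "device-plugin-ready", gpuLabel: true, allocatable: 8, cfg: &gpuConfig{}},
		{name: "dra-exposed", gpuLabel: true, allocatable: 0, cfg: &gpuConfig{viaDra: true}},
	} {
		fmt.Printf("%s: unready GPU = %v\n", n.name, gpuUnready(n))
	}
}

Running the sketch reports unready GPU = true only for drivers-installing: the
DRA-exposed node stays ready even though allocatable shows no GPUs.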