@@ -28,8 +28,10 @@ import (
28
28
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29
29
"k8s.io/apimachinery/pkg/labels"
30
30
"k8s.io/apimachinery/pkg/types"
31
+ "k8s.io/apimachinery/pkg/util/sets"
31
32
informerv1 "k8s.io/client-go/informers/core/v1"
32
33
listerv1 "k8s.io/client-go/listers/core/v1"
34
+ "k8s.io/client-go/tools/cache"
33
35
"k8s.io/klog/v2"
34
36
"k8s.io/kubernetes/pkg/scheduler/framework"
35
37
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -64,10 +66,11 @@ func (s *PermitState) Clone() framework.StateData {
64
66
// Manager lists the pod-group operations used for co-scheduling pods.
// PodGroupManager in this file defines methods with these names (the ones
// visible here include Permit, Unreserve, GetPodGroup, GetAssignedPodCount,
// GetCreationTimestamp, CalculateAssignedPods and BackoffPodGroup).
type Manager interface {
65
67
// PreFilter reports an error for a pod that fails its checks. A comment in
// Permit mentions minMember and minResource checks; the implementation is
// not visible in this section.
PreFilter (context.Context , * corev1.Pod ) error
66
68
// Permit returns a Status for the pod; the visible implementation returns
// PodGroupNotFound, Success or Wait.
Permit (context.Context , * framework.CycleState , * corev1.Pod ) Status
69
// Unreserve drops the pod's name from the per-pod-group record of assigned
// pods, and removes the record once it is empty.
+ Unreserve (context.Context , * corev1.Pod )
67
70
// GetPodGroup returns the pod group key formatted as "<namespace>/<name>"
// together with the PodGroup object. Callers in this file treat an empty
// key as "no pod group".
GetPodGroup (context.Context , * corev1.Pod ) (string , * v1alpha1.PodGroup )
71
// GetAssignedPodCount returns how many pod names are recorded as assigned
// under the given pod group key.
+ GetAssignedPodCount (string ) int
68
72
// GetCreationTimestamp returns the creation time of a podGroup or a pod.
GetCreationTimestamp (context.Context , * corev1.Pod , time.Time ) time.Time
69
73
// DeletePermittedPodGroup: implementation not visible in this section;
// presumably evicts the named group from the permitted cache - TODO confirm.
DeletePermittedPodGroup (context.Context , string )
70
// CalculateAssignedPods counts, from the scheduler snapshot, the pods of
// the named group and namespace that have a node name set.
- CalculateAssignedPods (context.Context , string , string ) int
71
74
// ActivateSiblings: implementation not visible in this section.
ActivateSiblings (ctx context.Context , pod * corev1.Pod , state * framework.CycleState )
72
75
// BackoffPodGroup does nothing for a zero duration; the remainder of the
// implementation is not visible in this section.
BackoffPodGroup (string , time.Duration )
73
76
}
@@ -87,9 +90,34 @@ type PodGroupManager struct {
87
90
backedOffPG * gocache.Cache
88
91
// podLister is pod lister
89
92
podLister listerv1.PodLister
93
+ // assignedPodsByPG stores the pods assumed or bound for podgroups
94
+ assignedPodsByPG map [string ]sets.Set [string ]
90
95
sync.RWMutex
91
96
}
92
97
98
+ func AddPodFactory (pgMgr * PodGroupManager ) func (obj interface {}) {
99
+ return func (obj interface {}) {
100
+ p , ok := obj .(* corev1.Pod )
101
+ if ! ok {
102
+ return
103
+ }
104
+ if p .Spec .NodeName == "" {
105
+ return
106
+ }
107
+ pgFullName , _ := pgMgr .GetPodGroup (context .Background (), p )
108
+ if pgFullName == "" {
109
+ return
110
+ }
111
+ pgMgr .RWMutex .Lock ()
112
+ defer pgMgr .RWMutex .Unlock ()
113
+ if assigned , exist := pgMgr .assignedPodsByPG [pgFullName ]; exist {
114
+ assigned .Insert (p .Name )
115
+ } else {
116
+ pgMgr .assignedPodsByPG [pgFullName ] = sets .New (p .Name )
117
+ }
118
+ }
119
+ }
120
+
93
121
// NewPodGroupManager creates a new operation object.
94
122
func NewPodGroupManager (client client.Client , snapshotSharedLister framework.SharedLister , scheduleTimeout * time.Duration , podInformer informerv1.PodInformer ) * PodGroupManager {
95
123
pgMgr := & PodGroupManager {
@@ -99,10 +127,43 @@ func NewPodGroupManager(client client.Client, snapshotSharedLister framework.Sha
99
127
podLister : podInformer .Lister (),
100
128
permittedPG : gocache .New (3 * time .Second , 3 * time .Second ),
101
129
backedOffPG : gocache .New (10 * time .Second , 10 * time .Second ),
130
+ assignedPodsByPG : map [string ]sets.Set [string ]{},
102
131
}
132
+ podInformer .Informer ().AddEventHandler (cache.ResourceEventHandlerFuncs {
133
+ AddFunc : AddPodFactory (pgMgr ),
134
+ DeleteFunc : func (obj interface {}) {
135
+ switch t := obj .(type ) {
136
+ case * corev1.Pod :
137
+ pod := t
138
+ if pod .Spec .NodeName == "" {
139
+ return
140
+ }
141
+ pgMgr .Unreserve (context .Background (), pod )
142
+ return
143
+ case cache.DeletedFinalStateUnknown :
144
+ pod , ok := t .Obj .(* corev1.Pod )
145
+ if ! ok {
146
+ return
147
+ }
148
+ if pod .Spec .NodeName == "" {
149
+ return
150
+ }
151
+ pgMgr .Unreserve (context .Background (), pod )
152
+ return
153
+ default :
154
+ return
155
+ }
156
+ },
157
+ })
103
158
return pgMgr
104
159
}
105
160
161
+ func (pgMgr * PodGroupManager ) GetAssignedPodCount (pgName string ) int {
162
+ pgMgr .RWMutex .RLock ()
163
+ defer pgMgr .RWMutex .RUnlock ()
164
+ return len (pgMgr .assignedPodsByPG [pgName ])
165
+ }
166
+
106
167
func (pgMgr * PodGroupManager ) BackoffPodGroup (pgName string , backoff time.Duration ) {
107
168
if backoff == time .Duration (0 ) {
108
169
return
@@ -222,16 +283,23 @@ func (pgMgr *PodGroupManager) Permit(ctx context.Context, state *framework.Cycle
222
283
return PodGroupNotFound
223
284
}
224
285
225
- assigned := pgMgr .CalculateAssignedPods (ctx , pg .Name , pg .Namespace )
286
+ pgMgr .RWMutex .RLock ()
287
+ defer pgMgr .RWMutex .RUnlock ()
288
+ assigned , exist := pgMgr .assignedPodsByPG [pgFullName ]
289
+ if ! exist {
290
+ assigned = sets.Set [string ]{}
291
+ pgMgr .assignedPodsByPG [pgFullName ] = assigned
292
+ }
293
+ assigned .Insert (pod .Name )
226
294
// The number of pods that have been assigned nodes is calculated from the snapshot.
227
295
// The current pod is not included in the snapshot during the current scheduling cycle.
228
- if int32 (assigned )+ 1 >= pg .Spec .MinMember {
296
+ if len (assigned ) >= int ( pg .Spec .MinMember ) {
229
297
return Success
230
298
}
231
299
232
- if assigned == 0 {
300
+ if len ( assigned ) == 1 {
233
301
// Given we've reached Permit(), it means all PreFilter checks (minMember & minResource)
234
- // already pass through, so if assigned == 0 , it could be due to:
302
+ // already pass through, so if len( assigned) == 1 , it could be due to:
235
303
// - minResource get satisfied
236
304
// - new pods added
237
305
// In either case, we should and only should use this 0-th pod to trigger activating
@@ -244,6 +312,24 @@ func (pgMgr *PodGroupManager) Permit(ctx context.Context, state *framework.Cycle
244
312
return Wait
245
313
}
246
314
315
+ // Unreserve invalidates assigned pod from assignedPodsByPG when schedule or bind failed.
316
+ func (pgMgr * PodGroupManager ) Unreserve (ctx context.Context , pod * corev1.Pod ) {
317
+ pgFullName , _ := pgMgr .GetPodGroup (ctx , pod )
318
+ if pgFullName == "" {
319
+ return
320
+ }
321
+
322
+ pgMgr .RWMutex .Lock ()
323
+ defer pgMgr .RWMutex .Unlock ()
324
+ assigned , exist := pgMgr .assignedPodsByPG [pgFullName ]
325
+ if exist {
326
+ assigned .Delete (pod .Name )
327
+ if len (assigned ) == 0 {
328
+ delete (pgMgr .assignedPodsByPG , pgFullName )
329
+ }
330
+ }
331
+ }
332
+
247
333
// GetCreationTimestamp returns the creation time of a podGroup or a pod.
248
334
func (pgMgr * PodGroupManager ) GetCreationTimestamp (ctx context.Context , pod * corev1.Pod , ts time.Time ) time.Time {
249
335
pgName := util .GetPodGroupLabel (pod )
@@ -275,27 +361,6 @@ func (pgMgr *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod)
275
361
return fmt .Sprintf ("%v/%v" , pod .Namespace , pgName ), & pg
276
362
}
277
363
278
- // CalculateAssignedPods returns the number of pods that has been assigned nodes: assumed or bound.
279
- func (pgMgr * PodGroupManager ) CalculateAssignedPods (ctx context.Context , podGroupName , namespace string ) int {
280
- lh := klog .FromContext (ctx )
281
- nodeInfos , err := pgMgr .snapshotSharedLister .NodeInfos ().List ()
282
- if err != nil {
283
- lh .Error (err , "Cannot get nodeInfos from frameworkHandle" )
284
- return 0
285
- }
286
- var count int
287
- for _ , nodeInfo := range nodeInfos {
288
- for _ , podInfo := range nodeInfo .Pods {
289
- pod := podInfo .Pod
290
- if util .GetPodGroupLabel (pod ) == podGroupName && pod .Namespace == namespace && pod .Spec .NodeName != "" {
291
- count ++
292
- }
293
- }
294
- }
295
-
296
- return count
297
- }
298
-
299
364
// CheckClusterResource checks if resource capacity of the cluster can satisfy <resourceRequest>.
300
365
// It returns an error detailing the resource gap if not satisfied; otherwise returns nil.
301
366
func CheckClusterResource (ctx context.Context , nodeList []* framework.NodeInfo , resourceRequest corev1.ResourceList , desiredPodGroupName string ) error {
0 commit comments