
Commit 1e1b6b0

cache assigned pod count (#708)
Signed-off-by: KunWuLuan <[email protected]>
1 parent f633dd2 commit 1e1b6b0

File tree: 5 files changed, +242 −41 lines changed

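The commit replaces the per-cycle snapshot walk (CalculateAssignedPods) with an in-memory cache: PodGroupManager keeps a set of assigned pod names per PodGroup, fed by the pod informer's add/delete handlers, consulted by Permit and PostFilter, and invalidated by Unreserve. The sketch below illustrates that pattern in isolation, using only the standard library; the names groupCache, Assign, Unassign, and Count are illustrative stand-ins for assignedPodsByPG and its accessors shown in the diffs that follow.

// Minimal sketch of the caching pattern introduced by this commit:
// a per-PodGroup set of assigned pod names guarded by an RWMutex.
package main

import (
	"fmt"
	"sync"
)

type groupCache struct {
	mu       sync.RWMutex
	assigned map[string]map[string]struct{} // podgroup full name -> set of pod names
}

func newGroupCache() *groupCache {
	return &groupCache{assigned: map[string]map[string]struct{}{}}
}

// Assign records a pod as assumed/bound for its pod group
// (the plugin does this from the informer's AddFunc and from Permit).
func (c *groupCache) Assign(pg, pod string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.assigned[pg] == nil {
		c.assigned[pg] = map[string]struct{}{}
	}
	c.assigned[pg][pod] = struct{}{}
}

// Unassign drops a pod again (the informer's DeleteFunc and Unreserve).
func (c *groupCache) Unassign(pg, pod string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if set, ok := c.assigned[pg]; ok {
		delete(set, pod)
		if len(set) == 0 {
			delete(c.assigned, pg)
		}
	}
}

// Count is the cheap O(1) lookup that replaces walking the node snapshot.
func (c *groupCache) Count(pg string) int {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return len(c.assigned[pg])
}

func main() {
	c := newGroupCache()
	c.Assign("default/pg1", "pod-a")
	c.Assign("default/pg1", "pod-b")
	c.Unassign("default/pg1", "pod-a")
	fmt.Println(c.Count("default/pg1")) // prints 1
}

The real implementation stores sets.Set[string] values and reuses the manager's embedded RWMutex, as the core.go diff below shows.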

pkg/coscheduling/core/core.go

Lines changed: 91 additions & 26 deletions
@@ -28,8 +28,10 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/sets"
 	informerv1 "k8s.io/client-go/informers/core/v1"
 	listerv1 "k8s.io/client-go/listers/core/v1"
+	"k8s.io/client-go/tools/cache"
 	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/scheduler/framework"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -64,10 +66,11 @@ func (s *PermitState) Clone() framework.StateData {
 type Manager interface {
 	PreFilter(context.Context, *corev1.Pod) error
 	Permit(context.Context, *framework.CycleState, *corev1.Pod) Status
+	Unreserve(context.Context, *corev1.Pod)
 	GetPodGroup(context.Context, *corev1.Pod) (string, *v1alpha1.PodGroup)
+	GetAssignedPodCount(string) int
 	GetCreationTimestamp(context.Context, *corev1.Pod, time.Time) time.Time
 	DeletePermittedPodGroup(context.Context, string)
-	CalculateAssignedPods(context.Context, string, string) int
 	ActivateSiblings(ctx context.Context, pod *corev1.Pod, state *framework.CycleState)
 	BackoffPodGroup(string, time.Duration)
 }
@@ -87,9 +90,34 @@ type PodGroupManager struct {
 	backedOffPG *gocache.Cache
 	// podLister is pod lister
 	podLister listerv1.PodLister
+	// assignedPodsByPG stores the pods assumed or bound for podgroups
+	assignedPodsByPG map[string]sets.Set[string]
 	sync.RWMutex
 }

+func AddPodFactory(pgMgr *PodGroupManager) func(obj interface{}) {
+	return func(obj interface{}) {
+		p, ok := obj.(*corev1.Pod)
+		if !ok {
+			return
+		}
+		if p.Spec.NodeName == "" {
+			return
+		}
+		pgFullName, _ := pgMgr.GetPodGroup(context.Background(), p)
+		if pgFullName == "" {
+			return
+		}
+		pgMgr.RWMutex.Lock()
+		defer pgMgr.RWMutex.Unlock()
+		if assigned, exist := pgMgr.assignedPodsByPG[pgFullName]; exist {
+			assigned.Insert(p.Name)
+		} else {
+			pgMgr.assignedPodsByPG[pgFullName] = sets.New(p.Name)
+		}
+	}
+}
+
 // NewPodGroupManager creates a new operation object.
 func NewPodGroupManager(client client.Client, snapshotSharedLister framework.SharedLister, scheduleTimeout *time.Duration, podInformer informerv1.PodInformer) *PodGroupManager {
 	pgMgr := &PodGroupManager{
@@ -99,10 +127,43 @@ func NewPodGroupManager(client client.Client, snapshotSharedLister framework.Sha
 		podLister:   podInformer.Lister(),
 		permittedPG: gocache.New(3*time.Second, 3*time.Second),
 		backedOffPG: gocache.New(10*time.Second, 10*time.Second),
+		assignedPodsByPG: map[string]sets.Set[string]{},
 	}
+	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: AddPodFactory(pgMgr),
+		DeleteFunc: func(obj interface{}) {
+			switch t := obj.(type) {
+			case *corev1.Pod:
+				pod := t
+				if pod.Spec.NodeName == "" {
+					return
+				}
+				pgMgr.Unreserve(context.Background(), pod)
+				return
+			case cache.DeletedFinalStateUnknown:
+				pod, ok := t.Obj.(*corev1.Pod)
+				if !ok {
+					return
+				}
+				if pod.Spec.NodeName == "" {
+					return
+				}
+				pgMgr.Unreserve(context.Background(), pod)
+				return
+			default:
+				return
+			}
+		},
+	})
 	return pgMgr
 }

+func (pgMgr *PodGroupManager) GetAssignedPodCount(pgName string) int {
+	pgMgr.RWMutex.RLock()
+	defer pgMgr.RWMutex.RUnlock()
+	return len(pgMgr.assignedPodsByPG[pgName])
+}
+
 func (pgMgr *PodGroupManager) BackoffPodGroup(pgName string, backoff time.Duration) {
 	if backoff == time.Duration(0) {
 		return
@@ -222,16 +283,23 @@ func (pgMgr *PodGroupManager) Permit(ctx context.Context, state *framework.Cycle
 		return PodGroupNotFound
 	}

-	assigned := pgMgr.CalculateAssignedPods(ctx, pg.Name, pg.Namespace)
+	pgMgr.RWMutex.RLock()
+	defer pgMgr.RWMutex.RUnlock()
+	assigned, exist := pgMgr.assignedPodsByPG[pgFullName]
+	if !exist {
+		assigned = sets.Set[string]{}
+		pgMgr.assignedPodsByPG[pgFullName] = assigned
+	}
+	assigned.Insert(pod.Name)
 	// The number of pods that have been assigned nodes is calculated from the snapshot.
 	// The current pod in not included in the snapshot during the current scheduling cycle.
-	if int32(assigned)+1 >= pg.Spec.MinMember {
+	if len(assigned) >= int(pg.Spec.MinMember) {
 		return Success
 	}

-	if assigned == 0 {
+	if len(assigned) == 1 {
 		// Given we've reached Permit(), it's mean all PreFilter checks (minMember & minResource)
-		// already pass through, so if assigned == 0, it could be due to:
+		// already pass through, so if len(assigned) == 1, it could be due to:
 		// - minResource get satisfied
 		// - new pods added
 		// In either case, we should and only should use this 0-th pod to trigger activating
@@ -244,6 +312,24 @@ func (pgMgr *PodGroupManager) Permit(ctx context.Context, state *framework.Cycle
 	return Wait
 }

+// Unreserve invalidates assigned pod from assignedPodsByPG when schedule or bind failed.
+func (pgMgr *PodGroupManager) Unreserve(ctx context.Context, pod *corev1.Pod) {
+	pgFullName, _ := pgMgr.GetPodGroup(ctx, pod)
+	if pgFullName == "" {
+		return
+	}
+
+	pgMgr.RWMutex.Lock()
+	defer pgMgr.RWMutex.Unlock()
+	assigned, exist := pgMgr.assignedPodsByPG[pgFullName]
+	if exist {
+		assigned.Delete(pod.Name)
+		if len(assigned) == 0 {
+			delete(pgMgr.assignedPodsByPG, pgFullName)
+		}
+	}
+}
+
 // GetCreationTimestamp returns the creation time of a podGroup or a pod.
 func (pgMgr *PodGroupManager) GetCreationTimestamp(ctx context.Context, pod *corev1.Pod, ts time.Time) time.Time {
 	pgName := util.GetPodGroupLabel(pod)
@@ -275,27 +361,6 @@ func (pgMgr *PodGroupManager) GetPodGroup(ctx context.Context, pod *corev1.Pod)
 	return fmt.Sprintf("%v/%v", pod.Namespace, pgName), &pg
 }

-// CalculateAssignedPods returns the number of pods that has been assigned nodes: assumed or bound.
-func (pgMgr *PodGroupManager) CalculateAssignedPods(ctx context.Context, podGroupName, namespace string) int {
-	lh := klog.FromContext(ctx)
-	nodeInfos, err := pgMgr.snapshotSharedLister.NodeInfos().List()
-	if err != nil {
-		lh.Error(err, "Cannot get nodeInfos from frameworkHandle")
-		return 0
-	}
-	var count int
-	for _, nodeInfo := range nodeInfos {
-		for _, podInfo := range nodeInfo.Pods {
-			pod := podInfo.Pod
-			if util.GetPodGroupLabel(pod) == podGroupName && pod.Namespace == namespace && pod.Spec.NodeName != "" {
-				count++
-			}
-		}
-	}
-
-	return count
-}
-
 // CheckClusterResource checks if resource capacity of the cluster can satisfy <resourceRequest>.
 // It returns an error detailing the resource gap if not satisfied; otherwise returns nil.
 func CheckClusterResource(ctx context.Context, nodeList []*framework.NodeInfo, resourceRequest corev1.ResourceList, desiredPodGroupName string) error {

pkg/coscheduling/core/core_test.go

Lines changed: 6 additions & 6 deletions
@@ -25,6 +25,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/client-go/informers"
 	clientsetfake "k8s.io/client-go/kubernetes/fake"
 	clicache "k8s.io/client-go/tools/cache"
@@ -170,6 +171,7 @@ func TestPreFilter(t *testing.T) {
 		scheduleTimeout: &scheduleTimeout,
 		permittedPG:     newCache(),
 		backedOffPG:     newCache(),
+		assignedPodsByPG: make(map[string]sets.Set[string]),
 	}

 	informerFactory.Start(ctx.Done())
@@ -264,19 +266,17 @@ func TestPermit(t *testing.T) {
 	informerFactory := informers.NewSharedInformerFactory(cs, 0)
 	podInformer := informerFactory.Core().V1().Pods()

-	pgMgr := &PodGroupManager{
-		client:               client,
-		snapshotSharedLister: tu.NewFakeSharedLister(tt.existingPods, nodes),
-		podLister:            podInformer.Lister(),
-		scheduleTimeout:      &scheduleTimeout,
-	}
+	pgMgr := NewPodGroupManager(client, tu.NewFakeSharedLister(tt.existingPods, nodes), &scheduleTimeout, podInformer)

 	informerFactory.Start(ctx.Done())
 	if !clicache.WaitForCacheSync(ctx.Done(), podInformer.Informer().HasSynced) {
 		t.Fatal("WaitForCacheSync failed")
 	}
+	addFunc := AddPodFactory(pgMgr)
 	for _, p := range tt.existingPods {
 		podInformer.Informer().GetStore().Add(p)
+		// we call the add func here because we cannot ensure existing pods are added before Permit is called
+		addFunc(p)
 	}

 	if got := pgMgr.Permit(ctx, &framework.CycleState{}, tt.pod); got != tt.want {

pkg/coscheduling/coscheduling.go

Lines changed: 2 additions & 1 deletion
@@ -166,7 +166,7 @@ func (cs *Coscheduling) PostFilter(ctx context.Context, state *framework.CycleSt

 	// This indicates there are already enough Pods satisfying the PodGroup,
 	// so don't bother to reject the whole PodGroup.
-	assigned := cs.pgMgr.CalculateAssignedPods(ctx, pg.Name, pod.Namespace)
+	assigned := cs.pgMgr.GetAssignedPodCount(pgName)
 	if assigned >= int(pg.Spec.MinMember) {
 		lh.V(4).Info("Assigned pods", "podGroup", klog.KObj(pg), "assigned", assigned)
 		return &framework.PostFilterResult{}, framework.NewStatus(framework.Unschedulable)
@@ -256,6 +256,7 @@ func (cs *Coscheduling) Unreserve(ctx context.Context, state *framework.CycleSta
 	if pg == nil {
 		return
 	}
+	cs.pgMgr.Unreserve(ctx, pod)
 	cs.frameworkHandler.IterateOverWaitingPods(func(waitingPod framework.WaitingPod) {
 		if waitingPod.GetPod().Namespace == pod.Namespace && util.GetPodGroupLabel(waitingPod.GetPod()) == pg.Name {
 			lh.V(3).Info("Unreserve rejects", "pod", klog.KObj(waitingPod.GetPod()), "podGroup", klog.KObj(pg))

pkg/coscheduling/coscheduling_test.go

Lines changed: 11 additions & 8 deletions
@@ -622,24 +622,27 @@ func TestPostFilter(t *testing.T) {
 	cs := clientsetfake.NewSimpleClientset()
 	informerFactory := informers.NewSharedInformerFactory(cs, 0)
 	podInformer := informerFactory.Core().V1().Pods()
-
+	pgMgr := core.NewPodGroupManager(
+		client,
+		tu.NewFakeSharedLister(tt.existingPods, nodes),
+		&scheduleTimeout,
+		podInformer,
+	)
 	pl := &Coscheduling{
 		frameworkHandler: f,
-		pgMgr: core.NewPodGroupManager(
-			client,
-			tu.NewFakeSharedLister(tt.existingPods, nodes),
-			&scheduleTimeout,
-			podInformer,
-		),
-		scheduleTimeout: &scheduleTimeout,
+		pgMgr:            pgMgr,
+		scheduleTimeout:  &scheduleTimeout,
 	}

 	informerFactory.Start(ctx.Done())
 	if !clicache.WaitForCacheSync(ctx.Done(), podInformer.Informer().HasSynced) {
 		t.Fatal("WaitForCacheSync failed")
 	}
+	addFunc := core.AddPodFactory(pgMgr)
 	for _, p := range tt.existingPods {
 		podInformer.Informer().GetStore().Add(p)
+		// we call the add func here because we cannot ensure existing pods are added before PostFilter is called
+		addFunc(p)
 	}

 	_, got := pl.PostFilter(ctx, framework.NewCycleState(), tt.pod, nodeStatusMap)
