Skip to content

Commit 4fb34df

Browse files
committed
Scheduler: LastOrdinal based on replicas instead of FreeCap
When scaling down and compacting, basing the last ordinal on the free capacity structure leads to a lastOrdinal that is off by one, since `FreeCap` might contain the free capacity for unschedulable pods. We will have to continue including unschedulable pods in FreeCap because a pod with a lower ordinal might become unschedulable for external reasons (for example, its node gets shut down); its vpods then need to be rescheduled, and during that time period we want to consider those pods when compacting. Once all vpods that were on the pod that is gone get rescheduled, FreeCap will only include schedulable pods. Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com>
1 parent 4087c3a commit 4fb34df

File tree

5 files changed

+80
-57
lines changed

5 files changed

+80
-57
lines changed

pkg/scheduler/scheduler.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ type VPodLister func() ([]VPod, error)
4242
// Evictor allows for vreplicas to be evicted.
4343
// For instance, the evictor is used by the statefulset scheduler to
4444
// move vreplicas to pod with a lower ordinal.
45+
//
46+
// pod might be `nil`.
4547
type Evictor func(pod *corev1.Pod, vpod VPod, from *duckv1alpha1.Placement) error
4648

4749
// Scheduler is responsible for placing VPods into real Kubernetes pods

pkg/scheduler/state/state.go

Lines changed: 23 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,14 @@ type StateAccessor interface {
4444
// It is used by for the scheduler and the autoscaler
4545
type State struct {
4646
// free tracks the free capacity of each pod.
47+
//
48+
// Including pods that might not exist anymore, it reflects the free capacity determined by
49+
// placements in the vpod status.
4750
FreeCap []int32
4851

4952
// schedulable pods tracks the pods that aren't being evicted.
5053
SchedulablePods []int32
5154

52-
// LastOrdinal is the ordinal index corresponding to the last statefulset replica
53-
// with placed vpods.
54-
LastOrdinal int32
55-
5655
// Pod capacity.
5756
Capacity int32
5857

@@ -143,14 +142,10 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
143142
return nil, err
144143
}
145144

146-
free := make([]int32, 0)
145+
freeCap := make([]int32, 0)
147146
pending := make(map[types.NamespacedName]int32, 4)
148147
expectedVReplicasByVPod := make(map[types.NamespacedName]int32, len(vpods))
149148
schedulablePods := sets.NewInt32()
150-
last := int32(-1)
151-
152-
// keep track of (vpod key, podname) pairs with existing placements
153-
withPlacement := make(map[types.NamespacedName]map[string]bool)
154149

155150
podSpread := make(map[types.NamespacedName]map[string]int32)
156151

@@ -172,7 +167,7 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
172167
}
173168

174169
for _, p := range schedulablePods.List() {
175-
free, last = s.updateFreeCapacity(logger, free, last, PodNameFromOrdinal(s.statefulSetName, p), 0)
170+
freeCap = s.updateFreeCapacity(logger, freeCap, PodNameFromOrdinal(s.statefulSetName, p), 0)
176171
}
177172

178173
// Getting current state from existing placements for all vpods
@@ -182,16 +177,13 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
182177
pending[vpod.GetKey()] = pendingFromVPod(vpod)
183178
expectedVReplicasByVPod[vpod.GetKey()] = vpod.GetVReplicas()
184179

185-
withPlacement[vpod.GetKey()] = make(map[string]bool)
186180
podSpread[vpod.GetKey()] = make(map[string]int32)
187181

188182
for i := 0; i < len(ps); i++ {
189183
podName := ps[i].PodName
190184
vreplicas := ps[i].VReplicas
191185

192-
free, last = s.updateFreeCapacity(logger, free, last, podName, vreplicas)
193-
194-
withPlacement[vpod.GetKey()][podName] = true
186+
freeCap = s.updateFreeCapacity(logger, freeCap, podName, vreplicas)
195187

196188
pod, err := s.podLister.Get(podName)
197189
if err != nil {
@@ -204,8 +196,17 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
204196
}
205197
}
206198

207-
state := &State{FreeCap: free, SchedulablePods: schedulablePods.List(), LastOrdinal: last, Capacity: s.capacity, Replicas: scale.Spec.Replicas, StatefulSetName: s.statefulSetName, PodLister: s.podLister,
208-
PodSpread: podSpread, Pending: pending, ExpectedVReplicaByVPod: expectedVReplicasByVPod}
199+
state := &State{
200+
FreeCap: freeCap,
201+
SchedulablePods: schedulablePods.List(),
202+
Capacity: s.capacity,
203+
Replicas: scale.Spec.Replicas,
204+
StatefulSetName: s.statefulSetName,
205+
PodLister: s.podLister,
206+
PodSpread: podSpread,
207+
Pending: pending,
208+
ExpectedVReplicaByVPod: expectedVReplicasByVPod,
209+
}
209210

210211
logger.Infow("cluster state info", zap.Any("state", state))
211212

@@ -219,23 +220,19 @@ func pendingFromVPod(vpod scheduler.VPod) int32 {
219220
return int32(math.Max(float64(0), float64(expected-scheduled)))
220221
}
221222

222-
func (s *stateBuilder) updateFreeCapacity(logger *zap.SugaredLogger, free []int32, last int32, podName string, vreplicas int32) ([]int32, int32) {
223+
func (s *stateBuilder) updateFreeCapacity(logger *zap.SugaredLogger, freeCap []int32, podName string, vreplicas int32) []int32 {
223224
ordinal := OrdinalFromPodName(podName)
224-
free = grow(free, ordinal, s.capacity)
225+
freeCap = grow(freeCap, ordinal, s.capacity)
225226

226-
free[ordinal] -= vreplicas
227+
freeCap[ordinal] -= vreplicas
227228

228229
// Assert the pod is not overcommitted
229-
if free[ordinal] < 0 {
230+
if overcommit := freeCap[ordinal]; overcommit < 0 {
230231
// This should not happen anymore. Log as an error but do not interrupt the current scheduling.
231-
logger.Warnw("pod is overcommitted", zap.String("podName", podName), zap.Int32("free", free[ordinal]))
232-
}
233-
234-
if ordinal > last {
235-
last = ordinal
232+
logger.Warnw("pod is overcommitted", zap.String("podName", podName), zap.Int32("overcommit", overcommit))
236233
}
237234

238-
return free, last
235+
return freeCap
239236
}
240237

241238
func (s *State) TotalPending() int32 {
@@ -283,23 +280,16 @@ func (s *State) MarshalJSON() ([]byte, error) {
283280
type S struct {
284281
FreeCap []int32 `json:"freeCap"`
285282
SchedulablePods []int32 `json:"schedulablePods"`
286-
LastOrdinal int32 `json:"lastOrdinal"`
287283
Capacity int32 `json:"capacity"`
288284
Replicas int32 `json:"replicas"`
289-
NumZones int32 `json:"numZones"`
290-
NumNodes int32 `json:"numNodes"`
291-
NodeToZoneMap map[string]string `json:"nodeToZoneMap"`
292285
StatefulSetName string `json:"statefulSetName"`
293286
PodSpread map[string]map[string]int32 `json:"podSpread"`
294-
NodeSpread map[string]map[string]int32 `json:"nodeSpread"`
295-
ZoneSpread map[string]map[string]int32 `json:"zoneSpread"`
296287
Pending map[string]int32 `json:"pending"`
297288
}
298289

299290
sj := S{
300291
FreeCap: s.FreeCap,
301292
SchedulablePods: s.SchedulablePods,
302-
LastOrdinal: s.LastOrdinal,
303293
Capacity: s.Capacity,
304294
Replicas: s.Replicas,
305295
StatefulSetName: s.StatefulSetName,

pkg/scheduler/state/state_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,14 @@ func TestStateBuilder(t *testing.T) {
5858
name: "no vpods",
5959
replicas: int32(0),
6060
vpods: [][]duckv1alpha1.Placement{},
61-
expected: State{Capacity: 10, FreeCap: []int32{}, SchedulablePods: []int32{}, LastOrdinal: -1, StatefulSetName: sfsName, Pending: map[types.NamespacedName]int32{}, ExpectedVReplicaByVPod: map[types.NamespacedName]int32{}},
61+
expected: State{Capacity: 10, FreeCap: []int32{}, SchedulablePods: []int32{}, StatefulSetName: sfsName, Pending: map[types.NamespacedName]int32{}, ExpectedVReplicaByVPod: map[types.NamespacedName]int32{}},
6262
freec: int32(0),
6363
},
6464
{
6565
name: "one vpods",
6666
replicas: int32(1),
6767
vpods: [][]duckv1alpha1.Placement{{{PodName: "statefulset-name-0", VReplicas: 1}}},
68-
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, LastOrdinal: 0, Replicas: 1, StatefulSetName: sfsName,
68+
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, Replicas: 1, StatefulSetName: sfsName,
6969
PodSpread: map[types.NamespacedName]map[string]int32{
7070
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
7171
"statefulset-name-0": 1,
@@ -88,7 +88,7 @@ func TestStateBuilder(t *testing.T) {
8888
{{PodName: "statefulset-name-1", VReplicas: 2}},
8989
{{PodName: "statefulset-name-1", VReplicas: 3}, {PodName: "statefulset-name-0", VReplicas: 1}},
9090
},
91-
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(0), int32(1), int32(2)}, LastOrdinal: 2, Replicas: 3, StatefulSetName: sfsName,
91+
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(0), int32(1), int32(2)}, Replicas: 3, StatefulSetName: sfsName,
9292
PodSpread: map[types.NamespacedName]map[string]int32{
9393
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
9494
"statefulset-name-0": 1,
@@ -124,7 +124,7 @@ func TestStateBuilder(t *testing.T) {
124124
{{PodName: "statefulset-name-1", VReplicas: 2}},
125125
{{PodName: "statefulset-name-1", VReplicas: 3}, {PodName: "statefulset-name-0", VReplicas: 1}},
126126
},
127-
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(1), int32(2)}, LastOrdinal: 2, Replicas: 3, StatefulSetName: sfsName,
127+
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(1), int32(2)}, Replicas: 3, StatefulSetName: sfsName,
128128
PodSpread: map[types.NamespacedName]map[string]int32{
129129
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
130130
"statefulset-name-2": 5,
@@ -157,7 +157,7 @@ func TestStateBuilder(t *testing.T) {
157157
{{PodName: "statefulset-name-1", VReplicas: 0}},
158158
{{PodName: "statefulset-name-1", VReplicas: 0}, {PodName: "statefulset-name-3", VReplicas: 0}},
159159
},
160-
expected: State{Capacity: 10, FreeCap: []int32{int32(9), int32(10), int32(5), int32(10)}, SchedulablePods: []int32{int32(0), int32(1), int32(2), int32(3)}, LastOrdinal: 3, Replicas: 4, StatefulSetName: sfsName,
160+
expected: State{Capacity: 10, FreeCap: []int32{int32(9), int32(10), int32(5), int32(10)}, SchedulablePods: []int32{int32(0), int32(1), int32(2), int32(3)}, Replicas: 4, StatefulSetName: sfsName,
161161
PodSpread: map[types.NamespacedName]map[string]int32{
162162
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
163163
"statefulset-name-0": 1,
@@ -188,7 +188,7 @@ func TestStateBuilder(t *testing.T) {
188188
name: "three vpods but one tainted and one with no zone label",
189189
replicas: int32(1),
190190
vpods: [][]duckv1alpha1.Placement{{{PodName: "statefulset-name-0", VReplicas: 1}}},
191-
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, LastOrdinal: 0, Replicas: 1, StatefulSetName: sfsName,
191+
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, Replicas: 1, StatefulSetName: sfsName,
192192
PodSpread: map[types.NamespacedName]map[string]int32{
193193
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
194194
"statefulset-name-0": 1,
@@ -207,7 +207,7 @@ func TestStateBuilder(t *testing.T) {
207207
name: "one vpod (HA)",
208208
replicas: int32(1),
209209
vpods: [][]duckv1alpha1.Placement{{{PodName: "statefulset-name-0", VReplicas: 1}}},
210-
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, LastOrdinal: 0, Replicas: 1, StatefulSetName: sfsName,
210+
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, Replicas: 1, StatefulSetName: sfsName,
211211
PodSpread: map[types.NamespacedName]map[string]int32{
212212
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
213213
"statefulset-name-0": 1,

pkg/scheduler/statefulset/autoscaler.go

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
"go.uber.org/zap"
2828
v1 "k8s.io/api/core/v1"
29+
apierrors "k8s.io/apimachinery/pkg/api/errors"
2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
"k8s.io/apimachinery/pkg/types"
3132
"k8s.io/utils/integer"
@@ -250,17 +251,8 @@ func (a *autoscaler) mayCompact(logger *zap.SugaredLogger, s *st.State) error {
250251
zap.Any("state", s),
251252
)
252253

253-
// when there is only one pod there is nothing to move or number of pods is just enough!
254-
if s.LastOrdinal < 1 || len(s.SchedulablePods) <= 1 {
255-
return nil
256-
}
257-
258-
// Determine if there is enough free capacity to
259-
// move all vreplicas placed in the last pod to pods with a lower ordinal
260-
freeCapacity := s.FreeCapacity() - s.Free(s.LastOrdinal)
261-
usedInLastPod := s.Capacity - s.Free(s.LastOrdinal)
262-
263-
if freeCapacity >= usedInLastPod {
254+
// Determine if there are vpods that need compaction
255+
if s.Replicas != int32(len(s.FreeCap)) {
264256
a.lastCompactAttempt = time.Now()
265257
err := a.compact(s)
266258
if err != nil {
@@ -285,9 +277,9 @@ func (a *autoscaler) compact(s *st.State) error {
285277
for i := len(placements) - 1; i >= 0; i-- { //start from the last placement
286278
ordinal := st.OrdinalFromPodName(placements[i].PodName)
287279

288-
if ordinal == s.LastOrdinal {
280+
if ordinal >= s.Replicas {
289281
pod, err = s.PodLister.Get(placements[i].PodName)
290-
if err != nil {
282+
if err != nil && !apierrors.IsNotFound(err) {
291283
return fmt.Errorf("failed to get pod %s: %w", placements[i].PodName, err)
292284
}
293285

pkg/scheduler/statefulset/autoscaler_test.go

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -470,9 +470,7 @@ func TestCompactor(t *testing.T) {
470470
{PodName: "statefulset-name-0", VReplicas: int32(8)},
471471
{PodName: "statefulset-name-1", VReplicas: int32(2)}}),
472472
},
473-
wantEvictions: map[types.NamespacedName][]duckv1alpha1.Placement{
474-
{Name: "vpod-1", Namespace: testNs}: {{PodName: "statefulset-name-1", VReplicas: int32(2)}},
475-
},
473+
wantEvictions: nil,
476474
},
477475
{
478476
name: "multiple vpods, with placements in multiple pods, compacted",
@@ -500,8 +498,49 @@ func TestCompactor(t *testing.T) {
500498
{PodName: "statefulset-name-0", VReplicas: int32(2)},
501499
{PodName: "statefulset-name-2", VReplicas: int32(7)}}),
502500
},
501+
wantEvictions: nil,
502+
},
503+
{
504+
name: "multiple vpods, scale down, with placements in multiple pods, compacted",
505+
replicas: int32(2),
506+
vpods: []scheduler.VPod{
507+
tscheduler.NewVPod(testNs, "vpod-1", 10, []duckv1alpha1.Placement{
508+
{PodName: "statefulset-name-0", VReplicas: int32(3)},
509+
{PodName: "statefulset-name-1", VReplicas: int32(7)}}),
510+
tscheduler.NewVPod(testNs, "vpod-2", 10, []duckv1alpha1.Placement{
511+
{PodName: "statefulset-name-0", VReplicas: int32(7)},
512+
{PodName: "statefulset-name-2", VReplicas: int32(3)}}),
513+
},
503514
wantEvictions: map[types.NamespacedName][]duckv1alpha1.Placement{
504-
{Name: "vpod-2", Namespace: testNs}: {{PodName: "statefulset-name-2", VReplicas: int32(7)}},
515+
{Name: "vpod-2", Namespace: testNs}: {{PodName: "statefulset-name-2", VReplicas: int32(3)}},
516+
},
517+
},
518+
{
519+
name: "multiple vpods, scale down multiple, with placements in multiple pods, compacted",
520+
replicas: int32(1),
521+
vpods: []scheduler.VPod{
522+
tscheduler.NewVPod(testNs, "vpod-1", 6, []duckv1alpha1.Placement{
523+
{PodName: "statefulset-name-0", VReplicas: int32(3)},
524+
{PodName: "statefulset-name-1", VReplicas: int32(7)},
525+
{PodName: "statefulset-name-2", VReplicas: int32(6)},
526+
}),
527+
tscheduler.NewVPod(testNs, "vpod-2", 3, []duckv1alpha1.Placement{
528+
{PodName: "statefulset-name-0", VReplicas: int32(7)},
529+
{PodName: "statefulset-name-2", VReplicas: int32(3)},
530+
{PodName: "statefulset-name-3", VReplicas: int32(2)},
531+
{PodName: "statefulset-name-10", VReplicas: int32(1)},
532+
}),
533+
},
534+
wantEvictions: map[types.NamespacedName][]duckv1alpha1.Placement{
535+
{Name: "vpod-1", Namespace: testNs}: {
536+
{PodName: "statefulset-name-2", VReplicas: int32(6)},
537+
{PodName: "statefulset-name-1", VReplicas: int32(7)},
538+
},
539+
{Name: "vpod-2", Namespace: testNs}: {
540+
{PodName: "statefulset-name-10", VReplicas: int32(1)},
541+
{PodName: "statefulset-name-3", VReplicas: int32(2)},
542+
{PodName: "statefulset-name-2", VReplicas: int32(3)},
543+
},
505544
},
506545
},
507546
}

0 commit comments

Comments
 (0)