Skip to content

Commit 3f807e9

Browse files
committed
Scheduler: LastOrdinal based on replicas instead of FreeCap
When scaling down and compacting, basing the last ordinal on the free capacity structure leads to have a lastOrdinal off by one since `FreeCap` might contain the free capacity for unschedulable pods. We will have to continue including unschduelable pods in FreeCap because it might happen that a pod becomes unscheduleble for external reasons like node gets shutdown for pods with lower ordinals and the pod need to be rescheduled and during that time period we want to consider those when compacting; once all vpods that were on that pod that is gone get rescheduled, FreeCap will only include scheduleable pods. Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com>
1 parent 4087c3a commit 3f807e9

File tree

3 files changed

+33
-42
lines changed

3 files changed

+33
-42
lines changed

pkg/scheduler/state/state.go

Lines changed: 18 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,6 @@ type State struct {
4949
// schedulable pods tracks the pods that aren't being evicted.
5050
SchedulablePods []int32
5151

52-
// LastOrdinal is the ordinal index corresponding to the last statefulset replica
53-
// with placed vpods.
54-
LastOrdinal int32
55-
5652
// Pod capacity.
5753
Capacity int32
5854

@@ -143,14 +139,10 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
143139
return nil, err
144140
}
145141

146-
free := make([]int32, 0)
142+
freeCap := make([]int32, 0)
147143
pending := make(map[types.NamespacedName]int32, 4)
148144
expectedVReplicasByVPod := make(map[types.NamespacedName]int32, len(vpods))
149145
schedulablePods := sets.NewInt32()
150-
last := int32(-1)
151-
152-
// keep track of (vpod key, podname) pairs with existing placements
153-
withPlacement := make(map[types.NamespacedName]map[string]bool)
154146

155147
podSpread := make(map[types.NamespacedName]map[string]int32)
156148

@@ -172,7 +164,7 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
172164
}
173165

174166
for _, p := range schedulablePods.List() {
175-
free, last = s.updateFreeCapacity(logger, free, last, PodNameFromOrdinal(s.statefulSetName, p), 0)
167+
freeCap = s.updateFreeCapacity(logger, freeCap, PodNameFromOrdinal(s.statefulSetName, p), 0)
176168
}
177169

178170
// Getting current state from existing placements for all vpods
@@ -182,16 +174,13 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
182174
pending[vpod.GetKey()] = pendingFromVPod(vpod)
183175
expectedVReplicasByVPod[vpod.GetKey()] = vpod.GetVReplicas()
184176

185-
withPlacement[vpod.GetKey()] = make(map[string]bool)
186177
podSpread[vpod.GetKey()] = make(map[string]int32)
187178

188179
for i := 0; i < len(ps); i++ {
189180
podName := ps[i].PodName
190181
vreplicas := ps[i].VReplicas
191182

192-
free, last = s.updateFreeCapacity(logger, free, last, podName, vreplicas)
193-
194-
withPlacement[vpod.GetKey()][podName] = true
183+
freeCap = s.updateFreeCapacity(logger, freeCap, podName, vreplicas)
195184

196185
pod, err := s.podLister.Get(podName)
197186
if err != nil {
@@ -204,8 +193,17 @@ func (s *stateBuilder) State(ctx context.Context) (*State, error) {
204193
}
205194
}
206195

207-
state := &State{FreeCap: free, SchedulablePods: schedulablePods.List(), LastOrdinal: last, Capacity: s.capacity, Replicas: scale.Spec.Replicas, StatefulSetName: s.statefulSetName, PodLister: s.podLister,
208-
PodSpread: podSpread, Pending: pending, ExpectedVReplicaByVPod: expectedVReplicasByVPod}
196+
state := &State{
197+
FreeCap: freeCap,
198+
SchedulablePods: schedulablePods.List(),
199+
Capacity: s.capacity,
200+
Replicas: scale.Spec.Replicas,
201+
StatefulSetName: s.statefulSetName,
202+
PodLister: s.podLister,
203+
PodSpread: podSpread,
204+
Pending: pending,
205+
ExpectedVReplicaByVPod: expectedVReplicasByVPod,
206+
}
209207

210208
logger.Infow("cluster state info", zap.Any("state", state))
211209

@@ -219,23 +217,19 @@ func pendingFromVPod(vpod scheduler.VPod) int32 {
219217
return int32(math.Max(float64(0), float64(expected-scheduled)))
220218
}
221219

222-
func (s *stateBuilder) updateFreeCapacity(logger *zap.SugaredLogger, free []int32, last int32, podName string, vreplicas int32) ([]int32, int32) {
220+
func (s *stateBuilder) updateFreeCapacity(logger *zap.SugaredLogger, free []int32, podName string, vreplicas int32) []int32 {
223221
ordinal := OrdinalFromPodName(podName)
224222
free = grow(free, ordinal, s.capacity)
225223

226224
free[ordinal] -= vreplicas
227225

228226
// Assert the pod is not overcommitted
229-
if free[ordinal] < 0 {
227+
if overcommit := free[ordinal]; overcommit < 0 {
230228
// This should not happen anymore. Log as an error but do not interrupt the current scheduling.
231-
logger.Warnw("pod is overcommitted", zap.String("podName", podName), zap.Int32("free", free[ordinal]))
232-
}
233-
234-
if ordinal > last {
235-
last = ordinal
229+
logger.Warnw("pod is overcommitted", zap.String("podName", podName), zap.Int32("free", overcommit))
236230
}
237231

238-
return free, last
232+
return free
239233
}
240234

241235
func (s *State) TotalPending() int32 {
@@ -283,23 +277,16 @@ func (s *State) MarshalJSON() ([]byte, error) {
283277
type S struct {
284278
FreeCap []int32 `json:"freeCap"`
285279
SchedulablePods []int32 `json:"schedulablePods"`
286-
LastOrdinal int32 `json:"lastOrdinal"`
287280
Capacity int32 `json:"capacity"`
288281
Replicas int32 `json:"replicas"`
289-
NumZones int32 `json:"numZones"`
290-
NumNodes int32 `json:"numNodes"`
291-
NodeToZoneMap map[string]string `json:"nodeToZoneMap"`
292282
StatefulSetName string `json:"statefulSetName"`
293283
PodSpread map[string]map[string]int32 `json:"podSpread"`
294-
NodeSpread map[string]map[string]int32 `json:"nodeSpread"`
295-
ZoneSpread map[string]map[string]int32 `json:"zoneSpread"`
296284
Pending map[string]int32 `json:"pending"`
297285
}
298286

299287
sj := S{
300288
FreeCap: s.FreeCap,
301289
SchedulablePods: s.SchedulablePods,
302-
LastOrdinal: s.LastOrdinal,
303290
Capacity: s.Capacity,
304291
Replicas: s.Replicas,
305292
StatefulSetName: s.StatefulSetName,

pkg/scheduler/state/state_test.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,14 @@ func TestStateBuilder(t *testing.T) {
5858
name: "no vpods",
5959
replicas: int32(0),
6060
vpods: [][]duckv1alpha1.Placement{},
61-
expected: State{Capacity: 10, FreeCap: []int32{}, SchedulablePods: []int32{}, LastOrdinal: -1, StatefulSetName: sfsName, Pending: map[types.NamespacedName]int32{}, ExpectedVReplicaByVPod: map[types.NamespacedName]int32{}},
61+
expected: State{Capacity: 10, FreeCap: []int32{}, SchedulablePods: []int32{}, StatefulSetName: sfsName, Pending: map[types.NamespacedName]int32{}, ExpectedVReplicaByVPod: map[types.NamespacedName]int32{}},
6262
freec: int32(0),
6363
},
6464
{
6565
name: "one vpods",
6666
replicas: int32(1),
6767
vpods: [][]duckv1alpha1.Placement{{{PodName: "statefulset-name-0", VReplicas: 1}}},
68-
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, LastOrdinal: 0, Replicas: 1, StatefulSetName: sfsName,
68+
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, Replicas: 1, StatefulSetName: sfsName,
6969
PodSpread: map[types.NamespacedName]map[string]int32{
7070
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
7171
"statefulset-name-0": 1,
@@ -88,7 +88,7 @@ func TestStateBuilder(t *testing.T) {
8888
{{PodName: "statefulset-name-1", VReplicas: 2}},
8989
{{PodName: "statefulset-name-1", VReplicas: 3}, {PodName: "statefulset-name-0", VReplicas: 1}},
9090
},
91-
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(0), int32(1), int32(2)}, LastOrdinal: 2, Replicas: 3, StatefulSetName: sfsName,
91+
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(0), int32(1), int32(2)}, Replicas: 3, StatefulSetName: sfsName,
9292
PodSpread: map[types.NamespacedName]map[string]int32{
9393
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
9494
"statefulset-name-0": 1,
@@ -124,7 +124,7 @@ func TestStateBuilder(t *testing.T) {
124124
{{PodName: "statefulset-name-1", VReplicas: 2}},
125125
{{PodName: "statefulset-name-1", VReplicas: 3}, {PodName: "statefulset-name-0", VReplicas: 1}},
126126
},
127-
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(1), int32(2)}, LastOrdinal: 2, Replicas: 3, StatefulSetName: sfsName,
127+
expected: State{Capacity: 10, FreeCap: []int32{int32(8), int32(5), int32(5)}, SchedulablePods: []int32{int32(1), int32(2)}, Replicas: 3, StatefulSetName: sfsName,
128128
PodSpread: map[types.NamespacedName]map[string]int32{
129129
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
130130
"statefulset-name-2": 5,
@@ -157,7 +157,7 @@ func TestStateBuilder(t *testing.T) {
157157
{{PodName: "statefulset-name-1", VReplicas: 0}},
158158
{{PodName: "statefulset-name-1", VReplicas: 0}, {PodName: "statefulset-name-3", VReplicas: 0}},
159159
},
160-
expected: State{Capacity: 10, FreeCap: []int32{int32(9), int32(10), int32(5), int32(10)}, SchedulablePods: []int32{int32(0), int32(1), int32(2), int32(3)}, LastOrdinal: 3, Replicas: 4, StatefulSetName: sfsName,
160+
expected: State{Capacity: 10, FreeCap: []int32{int32(9), int32(10), int32(5), int32(10)}, SchedulablePods: []int32{int32(0), int32(1), int32(2), int32(3)}, Replicas: 4, StatefulSetName: sfsName,
161161
PodSpread: map[types.NamespacedName]map[string]int32{
162162
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
163163
"statefulset-name-0": 1,
@@ -188,7 +188,7 @@ func TestStateBuilder(t *testing.T) {
188188
name: "three vpods but one tainted and one with no zone label",
189189
replicas: int32(1),
190190
vpods: [][]duckv1alpha1.Placement{{{PodName: "statefulset-name-0", VReplicas: 1}}},
191-
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, LastOrdinal: 0, Replicas: 1, StatefulSetName: sfsName,
191+
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, Replicas: 1, StatefulSetName: sfsName,
192192
PodSpread: map[types.NamespacedName]map[string]int32{
193193
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
194194
"statefulset-name-0": 1,
@@ -207,7 +207,7 @@ func TestStateBuilder(t *testing.T) {
207207
name: "one vpod (HA)",
208208
replicas: int32(1),
209209
vpods: [][]duckv1alpha1.Placement{{{PodName: "statefulset-name-0", VReplicas: 1}}},
210-
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, LastOrdinal: 0, Replicas: 1, StatefulSetName: sfsName,
210+
expected: State{Capacity: 10, FreeCap: []int32{int32(9)}, SchedulablePods: []int32{int32(0)}, Replicas: 1, StatefulSetName: sfsName,
211211
PodSpread: map[types.NamespacedName]map[string]int32{
212212
{Name: vpodName + "-0", Namespace: vpodNs + "-0"}: {
213213
"statefulset-name-0": 1,

pkg/scheduler/statefulset/autoscaler.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -250,15 +250,17 @@ func (a *autoscaler) mayCompact(logger *zap.SugaredLogger, s *st.State) error {
250250
zap.Any("state", s),
251251
)
252252

253+
lastOrdinal := s.Replicas - 1
254+
253255
// when there is only one pod there is nothing to move or number of pods is just enough!
254-
if s.LastOrdinal < 1 || len(s.SchedulablePods) <= 1 {
256+
if lastOrdinal < 1 || len(s.SchedulablePods) <= 1 {
255257
return nil
256258
}
257259

258260
// Determine if there is enough free capacity to
259261
// move all vreplicas placed in the last pod to pods with a lower ordinal
260-
freeCapacity := s.FreeCapacity() - s.Free(s.LastOrdinal)
261-
usedInLastPod := s.Capacity - s.Free(s.LastOrdinal)
262+
freeCapacity := s.FreeCapacity() - s.Free(lastOrdinal)
263+
usedInLastPod := s.Capacity - s.Free(lastOrdinal)
262264

263265
if freeCapacity >= usedInLastPod {
264266
a.lastCompactAttempt = time.Now()
@@ -280,12 +282,14 @@ func (a *autoscaler) compact(s *st.State) error {
280282
return err
281283
}
282284

285+
lastOrdinal := s.Replicas - 1
286+
283287
for _, vpod := range vpods {
284288
placements := vpod.GetPlacements()
285289
for i := len(placements) - 1; i >= 0; i-- { //start from the last placement
286290
ordinal := st.OrdinalFromPodName(placements[i].PodName)
287291

288-
if ordinal == s.LastOrdinal {
292+
if ordinal == lastOrdinal {
289293
pod, err = s.PodLister.Get(placements[i].PodName)
290294
if err != nil {
291295
return fmt.Errorf("failed to get pod %s: %w", placements[i].PodName, err)

0 commit comments

Comments
 (0)