Commit b82fb50

KUBE-5984 - ensure that scale-ups always occur when there are starved pods (#225)
* KUBE-5984 - ensure that scale-ups always occur when there are starved pods
* Bug fixes and golangci-lint
* Address PR comments (except metrics, to do next)
* Also add a test for scaling up greater than the scaleOnStarve limit
* Add in metrics
* Add docs for Scale On Starve
* refactor scaleOnStarve check

---------

Co-authored-by: Michael Walsh <[email protected]>
1 parent 792da16 commit b82fb50

11 files changed, +643 -66 lines changed


docs/calculations.md

Lines changed: 7 additions & 0 deletions
@@ -84,6 +84,13 @@ when cached capacity exists:
 when cached capacity doesn't exist:
 - Amount to increase by: `1` node
 
+## Scale On Starve
+
+The node groups can also contain an optional boolean parameter `scale_on_starve`. When this is true, the system
+enforces a minimum scale-up of 1 node whenever there is a pod that cannot currently be scheduled. This helps alleviate
+situations where a large pod exceeds the available capacity of any single node, but its total size does not push the
+system as a whole over the scale-up threshold.
+
 
 ## Daemonsets
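A minimal sketch of the rule the added section describes, assuming a simplified model in which only CPU is compared; the function name and the numbers below are illustrative and are not part of the commit:

```go
package main

import (
	"fmt"
	"math"
)

// applyScaleOnStarve (hypothetical helper) floors the scale-up delta at 1 when a
// pending pod is larger than the free capacity of any single node, mirroring the
// rule documented above.
func applyScaleOnStarve(nodesDelta int, scaleOnStarve bool, largestPendingMilliCPU, largestFreeMilliCPU int64) int {
	starved := largestPendingMilliCPU > largestFreeMilliCPU
	if scaleOnStarve && starved {
		return int(math.Max(float64(nodesDelta), 1))
	}
	return nodesDelta
}

func main() {
	// Cluster utilisation is below the scale-up threshold, so the normal
	// calculation yields 0 new nodes, but a 6-CPU pod cannot fit into the
	// 4 CPUs free on the emptiest node: the delta is raised to 1.
	fmt.Println(applyScaleOnStarve(0, true, 6000, 4000)) // prints 1
}
```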

docs/configuration/nodegroup.md

Lines changed: 9 additions & 0 deletions
@@ -16,6 +16,7 @@ node_groups:
     min_nodes: 1
     max_nodes: 30
     dry_mode: false
+    scale_on_starve: false
     taint_upper_capacity_threshold_percent: 40
     taint_lower_capacity_threshold_percent: 10
     slow_node_removal_rate: 2
@@ -98,6 +99,14 @@ Escalator would do in specific scenarios.
 
 Note: this flag is overridden by the `--drymode` command line flag.
 
+### `scale_on_starve`
+
+This flag adds an additional check to the Escalator scaling algorithm that enforces a minimum scale-up of 1 new node
+whenever there is a pod that cannot currently be scheduled because no node has the capacity to run it.
+
+Warning: be extra careful that pod request sizes do not exceed node capacity when this option is enabled, or you may
+find that Escalator always creates the maximum number of nodes.
+
 ### `taint_upper_capacity_threshold_percent`
 
 This option defines the threshold at which Escalator will slowly start tainting nodes. The slow tainting will only occur
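Given the warning above, one simple pre-flight check before enabling `scale_on_starve` is to confirm that the largest pod you expect to run can fit on a single node of the group. A sketch using the Kubernetes `resource` package; the helper and the example sizes are illustrative and not part of Escalator:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// fitsOnNode (illustrative) reports whether a pod's requests could ever be
// satisfied by a single node's allocatable resources.
func fitsOnNode(podCPU, podMem, nodeCPU, nodeMem resource.Quantity) bool {
	return podCPU.Cmp(nodeCPU) <= 0 && podMem.Cmp(nodeMem) <= 0
}

func main() {
	podCPU := resource.MustParse("6")
	podMem := resource.MustParse("40Gi")
	nodeCPU := resource.MustParse("8")    // example node allocatable CPU
	nodeMem := resource.MustParse("30Gi") // example node allocatable memory

	// This pod can never be scheduled on a 30Gi node, so with scale_on_starve
	// enabled Escalator would keep scaling up to max_nodes without placing it.
	fmt.Println(fitsOnNode(podCPU, podMem, nodeCPU, nodeMem)) // prints false
}
```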

pkg/controller/controller.go

Lines changed: 44 additions & 9 deletions
@@ -7,7 +7,6 @@ import (
 	"github.com/atlassian/escalator/pkg/cloudprovider"
 	"github.com/atlassian/escalator/pkg/k8s"
 	"github.com/atlassian/escalator/pkg/metrics"
-
 	"github.com/pkg/errors"
 	log "github.com/sirupsen/logrus"
 	v1 "k8s.io/api/core/v1"
@@ -259,23 +258,31 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	nodeGroup.NodeInfoMap = k8s.CreateNodeNameToInfoMap(pods, allNodes)
 
 	// Calc capacity for untainted nodes
-	memRequest, cpuRequest, err := k8s.CalculatePodsRequestsTotal(pods)
+	podRequests, err := k8s.CalculatePodsRequestedUsage(pods)
 	if err != nil {
 		log.Errorf("Failed to calculate requests: %v", err)
 		return 0, err
 	}
 
-	memCapacity, cpuCapacity, err := k8s.CalculateNodesCapacityTotal(untaintedNodes)
+	nodeCapacity, err := k8s.CalculateNodesCapacity(untaintedNodes, pods)
 	if err != nil {
 		log.Errorf("Failed to calculate capacity: %v", err)
 		return 0, err
 	}
 
 	// Metrics
-	metrics.NodeGroupCPURequest.WithLabelValues(nodegroup).Set(float64(cpuRequest.MilliValue()))
-	metrics.NodeGroupCPUCapacity.WithLabelValues(nodegroup).Set(float64(cpuCapacity.MilliValue()))
-	metrics.NodeGroupMemCapacity.WithLabelValues(nodegroup).Set(float64(memCapacity.MilliValue() / 1000))
-	metrics.NodeGroupMemRequest.WithLabelValues(nodegroup).Set(float64(memRequest.MilliValue() / 1000))
+	metrics.NodeGroupCPURequest.WithLabelValues(nodegroup).Set(float64(podRequests.Total.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupCPUCapacity.WithLabelValues(nodegroup).Set(float64(nodeCapacity.Total.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemCapacity.WithLabelValues(nodegroup).Set(float64(nodeCapacity.Total.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupMemRequest.WithLabelValues(nodegroup).Set(float64(podRequests.Total.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPURequestLargestPendingCPU.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingCPU.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemRequestLargestPendingCPU.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingCPU.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPURequestLargestPendingMem.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingMemory.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemRequestLargestPendingMem.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingMemory.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPUCapacityLargestAvailableCPU.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableCPU.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemCapacityLargestAvailableCPU.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableCPU.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPUCapacityLargestAvailableMem.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableMemory.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemCapacityLargestAvailableMem.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableMemory.GetMemoryQuantity().MilliValue() / 1000))
 
 	// If we ever get into a state where we have less nodes than the minimum
 	if len(untaintedNodes) < nodeGroup.Opts.MinNodes {
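The `LargestPending*` and `LargestAvailable*` gauges set in the hunk above are introduced by this commit; their declarations live in `pkg/metrics`, which is not included in this excerpt. A plausible sketch of how two of them could be declared with the Prometheus client library (the metric name and help strings are assumptions):

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

var (
	// CPU request of the pending pod with the largest CPU ask, per node group.
	NodeGroupCPURequestLargestPendingCPU = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "node_group_cpu_request_largest_pending_cpu", // assumed metric name
			Help: "Milli CPU requested by the largest pending pod (by CPU)",
		},
		[]string{"nodegroup"},
	)
	// Free CPU on the untainted node with the most CPU headroom, per node group.
	NodeGroupCPUCapacityLargestAvailableCPU = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "node_group_cpu_capacity_largest_available_cpu", // assumed metric name
			Help: "Largest available milli CPU on any single untainted node",
		},
		[]string{"nodegroup"},
	)
)

func init() {
	// Registration shown for completeness; Escalator's metrics package may
	// register its collectors differently.
	prometheus.MustRegister(
		NodeGroupCPURequestLargestPendingCPU,
		NodeGroupCPUCapacityLargestAvailableCPU,
	)
}
```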
@@ -296,7 +303,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	// Calc %
 	// both cpu and memory capacity are based on number of untainted nodes
 	// pass number of untainted nodes in to help make decision if it's a scaling-up-from-0
-	cpuPercent, memPercent, err := calcPercentUsage(cpuRequest, memRequest, cpuCapacity, memCapacity, int64(len(untaintedNodes)))
+	cpuPercent, memPercent, err := calcPercentUsage(
+		*podRequests.Total.GetCPUQuantity(),
+		*podRequests.Total.GetMemoryQuantity(),
+		*nodeCapacity.Total.GetCPUQuantity(),
+		*nodeCapacity.Total.GetMemoryQuantity(),
+		int64(len(untaintedNodes)))
 	if err != nil {
 		log.Errorf("Failed to calculate percentages: %v", err)
 		return 0, err
@@ -343,13 +355,24 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 		// if ScaleUpThresholdPercent is our "max target" or "slack capacity"
 		// we want to add enough nodes such that the maxPercentage cluster util
 		// drops back below ScaleUpThresholdPercent
-		nodesDelta, err = calcScaleUpDelta(untaintedNodes, cpuPercent, memPercent, cpuRequest, memRequest, nodeGroup)
+		nodesDelta, err = calcScaleUpDelta(
+			untaintedNodes,
+			cpuPercent,
+			memPercent,
+			*podRequests.Total.GetCPUQuantity(),
+			*podRequests.Total.GetMemoryQuantity(),
+			nodeGroup)
 		if err != nil {
 			log.Errorf("Failed to calculate node delta: %v", err)
 			return nodesDelta, err
 		}
 	}
 
+	if c.isScaleOnStarve(nodeGroup, podRequests, nodeCapacity, untaintedNodes) {
+		log.WithField("nodegroup", nodegroup).Info("Setting scale to minimum of 1 due to a starved pod")
+		nodesDelta = int(math.Max(float64(nodesDelta), 1))
+	}
+
 	log.WithField("nodegroup", nodegroup).Debugf("Delta: %v", nodesDelta)
 
 	scaleOptions := scaleOpts{
@@ -396,6 +419,18 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	return nodesDelta, err
 }
 
+func (c *Controller) isScaleOnStarve(
+	nodeGroup *NodeGroupState,
+	podRequests k8s.PodRequestedUsage,
+	nodeCapacity k8s.NodeAvailableCapacity,
+	untaintedNodes []*v1.Node,
+) bool {
+	return nodeGroup.Opts.ScaleOnStarve &&
+		((!podRequests.LargestPendingCPU.IsEmpty() && podRequests.LargestPendingCPU.MilliCPU > nodeCapacity.LargestAvailableCPU.MilliCPU) ||
+			(!podRequests.LargestPendingMemory.IsEmpty() && podRequests.LargestPendingMemory.Memory > nodeCapacity.LargestAvailableMemory.Memory)) &&
+		len(untaintedNodes) < nodeGroup.Opts.MaxNodes
+}
+
 // RunOnce performs the main autoscaler logic once
 func (c *Controller) RunOnce() error {
 	startTime := time.Now()