Commit b82fb50

KUBE-5984 - ensure that scale-ups always occur when there are starved pods (#225)
* KUBE-5984 - ensure that scale-ups always occur when there are starved pods
* Bug fixes and golangci-lint
* Address PR comments (except metrics, to do next)
* Also add a test for scaling up greater than the scaleOnStarve limit
* Add in metrics
* Add docs for Scale On Starve
* refactor scaleOnStarve check

---------

Co-authored-by: Michael Walsh <[email protected]>
1 parent 792da16 commit b82fb50

11 files changed, +643 -66 lines changed


docs/calculations.md

Lines changed: 7 additions & 0 deletions
@@ -84,6 +84,13 @@ when cached capacity exists:
 when cached capacity doesn't exist:
 - Amount to increase by: `1` node
 
+## Scale On Starve
+
+The node groups can also contain an optional boolean parameter `scale_on_starve`. When this is true, the system
+enforces a minimum scale-up of 1 node whenever there is a pod that cannot currently be scheduled. This helps alleviate
+situations where a large pod exceeds the available capacity of any single node, but its total size does not push the
+system as a whole over the scale-up threshold.
+
 
 ## Daemonsets
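A minimal sketch of the rule the added section describes, assuming a simplified model in which only CPU is compared; the function name and the numbers below are illustrative and are not part of the commit:

```go
package main

import (
	"fmt"
	"math"
)

// applyScaleOnStarve (hypothetical helper) floors the scale-up delta at 1 when a
// pending pod is larger than the free capacity of any single node, mirroring the
// rule documented above.
func applyScaleOnStarve(nodesDelta int, scaleOnStarve bool, largestPendingMilliCPU, largestFreeMilliCPU int64) int {
	starved := largestPendingMilliCPU > largestFreeMilliCPU
	if scaleOnStarve && starved {
		return int(math.Max(float64(nodesDelta), 1))
	}
	return nodesDelta
}

func main() {
	// Cluster utilisation is below the scale-up threshold, so the normal
	// calculation yields 0 new nodes, but a 6-CPU pod cannot fit into the
	// 4 CPUs free on the emptiest node: the delta is raised to 1.
	fmt.Println(applyScaleOnStarve(0, true, 6000, 4000)) // prints 1
}
```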

docs/configuration/nodegroup.md

Lines changed: 9 additions & 0 deletions
@@ -16,6 +16,7 @@ node_groups:
     min_nodes: 1
     max_nodes: 30
     dry_mode: false
+    scale_on_starve: false
     taint_upper_capacity_threshold_percent: 40
     taint_lower_capacity_threshold_percent: 10
     slow_node_removal_rate: 2
@@ -98,6 +99,14 @@ Escalator would do in specific scenarios.
 
 Note: this flag is overridden by the `--drymode` command line flag.
 
+### `scale_on_starve`
+
+This flag adds an additional check to the Escalator scaling algorithm that enforces a minimum scale-up of 1 new node
+whenever there is a pod that cannot currently be scheduled because no node has the capacity to run it.
+
+Warning: be extra careful that pod request sizes do not exceed node capacity when this option is enabled, or you may
+find that Escalator always creates the maximum number of nodes.
+
 ### `taint_upper_capacity_threshold_percent`
 
 This option defines the threshold at which Escalator will slowly start tainting nodes. The slow tainting will only occur
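Given the warning above, one simple pre-flight check before enabling `scale_on_starve` is to confirm that the largest pod you expect to run can fit on a single node of the group. A sketch using the Kubernetes `resource` package; the helper and the example sizes are illustrative and not part of Escalator:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// fitsOnNode (illustrative) reports whether a pod's requests could ever be
// satisfied by a single node's allocatable resources.
func fitsOnNode(podCPU, podMem, nodeCPU, nodeMem resource.Quantity) bool {
	return podCPU.Cmp(nodeCPU) <= 0 && podMem.Cmp(nodeMem) <= 0
}

func main() {
	podCPU := resource.MustParse("6")
	podMem := resource.MustParse("40Gi")
	nodeCPU := resource.MustParse("8")    // example node allocatable CPU
	nodeMem := resource.MustParse("30Gi") // example node allocatable memory

	// This pod can never be scheduled on a 30Gi node, so with scale_on_starve
	// enabled Escalator would keep scaling up to max_nodes without placing it.
	fmt.Println(fitsOnNode(podCPU, podMem, nodeCPU, nodeMem)) // prints false
}
```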

pkg/controller/controller.go

Lines changed: 44 additions & 9 deletions
@@ -7,7 +7,6 @@ import (
 	"github.com/atlassian/escalator/pkg/cloudprovider"
 	"github.com/atlassian/escalator/pkg/k8s"
 	"github.com/atlassian/escalator/pkg/metrics"
-
 	"github.com/pkg/errors"
 	log "github.com/sirupsen/logrus"
 	v1 "k8s.io/api/core/v1"
@@ -259,23 +258,31 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	nodeGroup.NodeInfoMap = k8s.CreateNodeNameToInfoMap(pods, allNodes)
 
 	// Calc capacity for untainted nodes
-	memRequest, cpuRequest, err := k8s.CalculatePodsRequestsTotal(pods)
+	podRequests, err := k8s.CalculatePodsRequestedUsage(pods)
 	if err != nil {
 		log.Errorf("Failed to calculate requests: %v", err)
 		return 0, err
 	}
 
-	memCapacity, cpuCapacity, err := k8s.CalculateNodesCapacityTotal(untaintedNodes)
+	nodeCapacity, err := k8s.CalculateNodesCapacity(untaintedNodes, pods)
 	if err != nil {
 		log.Errorf("Failed to calculate capacity: %v", err)
 		return 0, err
 	}
 
 	// Metrics
-	metrics.NodeGroupCPURequest.WithLabelValues(nodegroup).Set(float64(cpuRequest.MilliValue()))
-	metrics.NodeGroupCPUCapacity.WithLabelValues(nodegroup).Set(float64(cpuCapacity.MilliValue()))
-	metrics.NodeGroupMemCapacity.WithLabelValues(nodegroup).Set(float64(memCapacity.MilliValue() / 1000))
-	metrics.NodeGroupMemRequest.WithLabelValues(nodegroup).Set(float64(memRequest.MilliValue() / 1000))
+	metrics.NodeGroupCPURequest.WithLabelValues(nodegroup).Set(float64(podRequests.Total.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupCPUCapacity.WithLabelValues(nodegroup).Set(float64(nodeCapacity.Total.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemCapacity.WithLabelValues(nodegroup).Set(float64(nodeCapacity.Total.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupMemRequest.WithLabelValues(nodegroup).Set(float64(podRequests.Total.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPURequestLargestPendingCPU.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingCPU.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemRequestLargestPendingCPU.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingCPU.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPURequestLargestPendingMem.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingMemory.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemRequestLargestPendingMem.WithLabelValues(nodegroup).Set(float64(podRequests.LargestPendingMemory.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPUCapacityLargestAvailableCPU.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableCPU.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemCapacityLargestAvailableCPU.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableCPU.GetMemoryQuantity().MilliValue() / 1000))
+	metrics.NodeGroupCPUCapacityLargestAvailableMem.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableMemory.GetCPUQuantity().MilliValue()))
+	metrics.NodeGroupMemCapacityLargestAvailableMem.WithLabelValues(nodegroup).Set(float64(nodeCapacity.LargestAvailableMemory.GetMemoryQuantity().MilliValue() / 1000))
 
 	// If we ever get into a state where we have less nodes than the minimum
 	if len(untaintedNodes) < nodeGroup.Opts.MinNodes {
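The `LargestPending*` and `LargestAvailable*` gauges set in the hunk above are introduced by this commit; their declarations live in `pkg/metrics`, which is not included in this excerpt. A plausible sketch of how two of them could be declared with the Prometheus client library (the metric name and help strings are assumptions):

```go
package metrics

import "github.com/prometheus/client_golang/prometheus"

var (
	// CPU request of the pending pod with the largest CPU ask, per node group.
	NodeGroupCPURequestLargestPendingCPU = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "node_group_cpu_request_largest_pending_cpu", // assumed metric name
			Help: "Milli CPU requested by the largest pending pod (by CPU)",
		},
		[]string{"nodegroup"},
	)
	// Free CPU on the untainted node with the most CPU headroom, per node group.
	NodeGroupCPUCapacityLargestAvailableCPU = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "node_group_cpu_capacity_largest_available_cpu", // assumed metric name
			Help: "Largest available milli CPU on any single untainted node",
		},
		[]string{"nodegroup"},
	)
)

func init() {
	// Registration shown for completeness; Escalator's metrics package may
	// register its collectors differently.
	prometheus.MustRegister(
		NodeGroupCPURequestLargestPendingCPU,
		NodeGroupCPUCapacityLargestAvailableCPU,
	)
}
```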
@@ -296,7 +303,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	// Calc %
 	// both cpu and memory capacity are based on number of untainted nodes
 	// pass number of untainted nodes in to help make decision if it's a scaling-up-from-0
-	cpuPercent, memPercent, err := calcPercentUsage(cpuRequest, memRequest, cpuCapacity, memCapacity, int64(len(untaintedNodes)))
+	cpuPercent, memPercent, err := calcPercentUsage(
+		*podRequests.Total.GetCPUQuantity(),
+		*podRequests.Total.GetMemoryQuantity(),
+		*nodeCapacity.Total.GetCPUQuantity(),
+		*nodeCapacity.Total.GetMemoryQuantity(),
+		int64(len(untaintedNodes)))
 	if err != nil {
 		log.Errorf("Failed to calculate percentages: %v", err)
 		return 0, err
@@ -343,13 +355,24 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 		// if ScaleUpThresholdPercent is our "max target" or "slack capacity"
 		// we want to add enough nodes such that the maxPercentage cluster util
 		// drops back below ScaleUpThresholdPercent
-		nodesDelta, err = calcScaleUpDelta(untaintedNodes, cpuPercent, memPercent, cpuRequest, memRequest, nodeGroup)
+		nodesDelta, err = calcScaleUpDelta(
+			untaintedNodes,
+			cpuPercent,
+			memPercent,
+			*podRequests.Total.GetCPUQuantity(),
+			*podRequests.Total.GetMemoryQuantity(),
+			nodeGroup)
 		if err != nil {
 			log.Errorf("Failed to calculate node delta: %v", err)
 			return nodesDelta, err
 		}
 	}
 
+	if c.isScaleOnStarve(nodeGroup, podRequests, nodeCapacity, untaintedNodes) {
+		log.WithField("nodegroup", nodegroup).Info("Setting scale to minimum of 1 due to a starved pod")
+		nodesDelta = int(math.Max(float64(nodesDelta), 1))
+	}
+
 	log.WithField("nodegroup", nodegroup).Debugf("Delta: %v", nodesDelta)
 
 	scaleOptions := scaleOpts{
@@ -396,6 +419,18 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	return nodesDelta, err
 }
 
+func (c *Controller) isScaleOnStarve(
+	nodeGroup *NodeGroupState,
+	podRequests k8s.PodRequestedUsage,
+	nodeCapacity k8s.NodeAvailableCapacity,
+	untaintedNodes []*v1.Node,
+) bool {
+	return nodeGroup.Opts.ScaleOnStarve &&
+		((!podRequests.LargestPendingCPU.IsEmpty() && podRequests.LargestPendingCPU.MilliCPU > nodeCapacity.LargestAvailableCPU.MilliCPU) ||
+			(!podRequests.LargestPendingMemory.IsEmpty() && podRequests.LargestPendingMemory.Memory > nodeCapacity.LargestAvailableMemory.Memory)) &&
+		len(untaintedNodes) < nodeGroup.Opts.MaxNodes
+}
+
 // RunOnce performs the main autoscaler logic once
 func (c *Controller) RunOnce() error {
 	startTime := time.Now()