7
7
"github.com/atlassian/escalator/pkg/cloudprovider"
8
8
"github.com/atlassian/escalator/pkg/k8s"
9
9
"github.com/atlassian/escalator/pkg/metrics"
10
-
11
10
"github.com/pkg/errors"
12
11
log "github.com/sirupsen/logrus"
13
12
v1 "k8s.io/api/core/v1"
@@ -259,23 +258,31 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
259
258
nodeGroup .NodeInfoMap = k8s .CreateNodeNameToInfoMap (pods , allNodes )
260
259
261
260
// Calc capacity for untainted nodes
262
- memRequest , cpuRequest , err := k8s .CalculatePodsRequestsTotal (pods )
261
+ podRequests , err := k8s .CalculatePodsRequestedUsage (pods )
263
262
if err != nil {
264
263
log .Errorf ("Failed to calculate requests: %v" , err )
265
264
return 0 , err
266
265
}
267
266
268
- memCapacity , cpuCapacity , err := k8s .CalculateNodesCapacityTotal (untaintedNodes )
267
+ nodeCapacity , err := k8s .CalculateNodesCapacity (untaintedNodes , pods )
269
268
if err != nil {
270
269
log .Errorf ("Failed to calculate capacity: %v" , err )
271
270
return 0 , err
272
271
}
273
272
274
273
// Metrics
275
- metrics .NodeGroupCPURequest .WithLabelValues (nodegroup ).Set (float64 (cpuRequest .MilliValue ()))
276
- metrics .NodeGroupCPUCapacity .WithLabelValues (nodegroup ).Set (float64 (cpuCapacity .MilliValue ()))
277
- metrics .NodeGroupMemCapacity .WithLabelValues (nodegroup ).Set (float64 (memCapacity .MilliValue () / 1000 ))
278
- metrics .NodeGroupMemRequest .WithLabelValues (nodegroup ).Set (float64 (memRequest .MilliValue () / 1000 ))
274
+ metrics .NodeGroupCPURequest .WithLabelValues (nodegroup ).Set (float64 (podRequests .Total .GetCPUQuantity ().MilliValue ()))
275
+ metrics .NodeGroupCPUCapacity .WithLabelValues (nodegroup ).Set (float64 (nodeCapacity .Total .GetCPUQuantity ().MilliValue ()))
276
+ metrics .NodeGroupMemCapacity .WithLabelValues (nodegroup ).Set (float64 (nodeCapacity .Total .GetMemoryQuantity ().MilliValue () / 1000 ))
277
+ metrics .NodeGroupMemRequest .WithLabelValues (nodegroup ).Set (float64 (podRequests .Total .GetMemoryQuantity ().MilliValue () / 1000 ))
278
+ metrics .NodeGroupCPURequestLargestPendingCPU .WithLabelValues (nodegroup ).Set (float64 (podRequests .LargestPendingCPU .GetCPUQuantity ().MilliValue ()))
279
+ metrics .NodeGroupMemRequestLargestPendingCPU .WithLabelValues (nodegroup ).Set (float64 (podRequests .LargestPendingCPU .GetMemoryQuantity ().MilliValue () / 1000 ))
280
+ metrics .NodeGroupCPURequestLargestPendingMem .WithLabelValues (nodegroup ).Set (float64 (podRequests .LargestPendingMemory .GetCPUQuantity ().MilliValue ()))
281
+ metrics .NodeGroupMemRequestLargestPendingMem .WithLabelValues (nodegroup ).Set (float64 (podRequests .LargestPendingMemory .GetMemoryQuantity ().MilliValue () / 1000 ))
282
+ metrics .NodeGroupCPUCapacityLargestAvailableCPU .WithLabelValues (nodegroup ).Set (float64 (nodeCapacity .LargestAvailableCPU .GetCPUQuantity ().MilliValue ()))
283
+ metrics .NodeGroupMemCapacityLargestAvailableCPU .WithLabelValues (nodegroup ).Set (float64 (nodeCapacity .LargestAvailableCPU .GetMemoryQuantity ().MilliValue () / 1000 ))
284
+ metrics .NodeGroupCPUCapacityLargestAvailableMem .WithLabelValues (nodegroup ).Set (float64 (nodeCapacity .LargestAvailableMemory .GetCPUQuantity ().MilliValue ()))
285
+ metrics .NodeGroupMemCapacityLargestAvailableMem .WithLabelValues (nodegroup ).Set (float64 (nodeCapacity .LargestAvailableMemory .GetMemoryQuantity ().MilliValue () / 1000 ))
279
286
280
287
// If we ever get into a state where we have less nodes than the minimum
281
288
if len (untaintedNodes ) < nodeGroup .Opts .MinNodes {
@@ -296,7 +303,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
296
303
// Calc %
297
304
// both cpu and memory capacity are based on number of untainted nodes
298
305
// pass number of untainted nodes in to help make decision if it's a scaling-up-from-0
299
- cpuPercent , memPercent , err := calcPercentUsage (cpuRequest , memRequest , cpuCapacity , memCapacity , int64 (len (untaintedNodes )))
306
+ cpuPercent , memPercent , err := calcPercentUsage (
307
+ * podRequests .Total .GetCPUQuantity (),
308
+ * podRequests .Total .GetMemoryQuantity (),
309
+ * nodeCapacity .Total .GetCPUQuantity (),
310
+ * nodeCapacity .Total .GetMemoryQuantity (),
311
+ int64 (len (untaintedNodes )))
300
312
if err != nil {
301
313
log .Errorf ("Failed to calculate percentages: %v" , err )
302
314
return 0 , err
@@ -343,13 +355,24 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
343
355
// if ScaleUpThresholdPercent is our "max target" or "slack capacity"
344
356
// we want to add enough nodes such that the maxPercentage cluster util
345
357
// drops back below ScaleUpThresholdPercent
346
- nodesDelta , err = calcScaleUpDelta (untaintedNodes , cpuPercent , memPercent , cpuRequest , memRequest , nodeGroup )
358
+ nodesDelta , err = calcScaleUpDelta (
359
+ untaintedNodes ,
360
+ cpuPercent ,
361
+ memPercent ,
362
+ * podRequests .Total .GetCPUQuantity (),
363
+ * podRequests .Total .GetMemoryQuantity (),
364
+ nodeGroup )
347
365
if err != nil {
348
366
log .Errorf ("Failed to calculate node delta: %v" , err )
349
367
return nodesDelta , err
350
368
}
351
369
}
352
370
371
+ if c .isScaleOnStarve (nodeGroup , podRequests , nodeCapacity , untaintedNodes ) {
372
+ log .WithField ("nodegroup" , nodegroup ).Info ("Setting scale to minimum of 1 due to a starved pod" )
373
+ nodesDelta = int (math .Max (float64 (nodesDelta ), 1 ))
374
+ }
375
+
353
376
log .WithField ("nodegroup" , nodegroup ).Debugf ("Delta: %v" , nodesDelta )
354
377
355
378
scaleOptions := scaleOpts {
@@ -396,6 +419,18 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
396
419
return nodesDelta , err
397
420
}
398
421
422
+ func (c * Controller ) isScaleOnStarve (
423
+ nodeGroup * NodeGroupState ,
424
+ podRequests k8s.PodRequestedUsage ,
425
+ nodeCapacity k8s.NodeAvailableCapacity ,
426
+ untaintedNodes []* v1.Node ,
427
+ ) bool {
428
+ return nodeGroup .Opts .ScaleOnStarve &&
429
+ ((! podRequests .LargestPendingCPU .IsEmpty () && podRequests .LargestPendingCPU .MilliCPU > nodeCapacity .LargestAvailableCPU .MilliCPU ) ||
430
+ (! podRequests .LargestPendingMemory .IsEmpty () && podRequests .LargestPendingMemory .Memory > nodeCapacity .LargestAvailableMemory .Memory )) &&
431
+ len (untaintedNodes ) < nodeGroup .Opts .MaxNodes
432
+ }
433
+
399
434
// RunOnce performs the main autoscaler logic once
400
435
func (c * Controller ) RunOnce () error {
401
436
startTime := time .Now ()
0 commit comments