Skip to content

Commit be1d9cb

Browse files
committed
Allow 1.5% tolerance in memory capacity when comparing nodegroups
In testing, AWS M5 instances deployed with the same launch configuration and the same AMI can occasionally show a difference of approximately 1% in memory capacity between availability zones. Allow a 1.5% tolerance to leave some buffer above the observed discrepancy, since some examples seen in testing were just over 1% (e.g. 1.05%, 1.1%). Tests are included with capacity values taken from real instances to prevent future regressions.
1 parent 952085e commit be1d9cb

File tree

2 files changed

+66
-20
lines changed

2 files changed

+66
-20
lines changed

cluster-autoscaler/processors/nodegroupset/compare_nodegroups.go

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ const (
3131
// MaxFreeDifferenceRatio describes how free resources (allocatable - daemon and system pods)
3232
// can differ between groups in the same NodeGroupSet
3333
MaxFreeDifferenceRatio = 0.05
34-
// MaxMemoryDifferenceInKiloBytes describes how much memory
35-
// capacity can differ but still be considered equal.
36-
MaxMemoryDifferenceInKiloBytes = 256000
34+
// MaxCapacityMemoryDifferenceRatio describes how Node.Status.Capacity.Memory can differ between
35+
// groups in the same NodeGroupSet
36+
MaxCapacityMemoryDifferenceRatio = 0.015
3737
)
3838

3939
// BasicIgnoredLabels define a set of basic labels that should be ignored when comparing the similarity
@@ -53,21 +53,25 @@ var BasicIgnoredLabels = map[string]bool{
5353
// similar enough to be considered a part of a single NodeGroupSet.
5454
type NodeInfoComparator func(n1, n2 *schedulerframework.NodeInfo) bool
5555

56-
func compareResourceMapsWithTolerance(resources map[apiv1.ResourceName][]resource.Quantity,
56+
func resourceMapsWithinTolerance(resources map[apiv1.ResourceName][]resource.Quantity,
5757
maxDifferenceRatio float64) bool {
5858
for _, qtyList := range resources {
59-
if len(qtyList) != 2 {
60-
return false
61-
}
62-
larger := math.Max(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue()))
63-
smaller := math.Min(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue()))
64-
if larger-smaller > larger*maxDifferenceRatio {
59+
if !resourceListWithinTolerance(qtyList, maxDifferenceRatio) {
6560
return false
6661
}
6762
}
6863
return true
6964
}
7065

66+
func resourceListWithinTolerance(qtyList []resource.Quantity, maxDifferenceRatio float64) bool {
67+
if len(qtyList) != 2 {
68+
return false
69+
}
70+
larger := math.Max(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue()))
71+
smaller := math.Min(float64(qtyList[0].MilliValue()), float64(qtyList[1].MilliValue()))
72+
return larger-smaller <= larger*maxDifferenceRatio
73+
}
74+
7175
func compareLabels(nodes []*schedulerframework.NodeInfo, ignoredLabels map[string]bool) bool {
7276
labels := make(map[string][]string)
7377
for _, node := range nodes {
@@ -131,9 +135,7 @@ func IsCloudProviderNodeInfoSimilar(n1, n2 *schedulerframework.NodeInfo, ignored
131135
}
132136
switch kind {
133137
case apiv1.ResourceMemory:
134-
// For memory capacity we allow a small tolerance
135-
memoryDifference := math.Abs(float64(qtyList[0].Value()) - float64(qtyList[1].Value()))
136-
if memoryDifference > MaxMemoryDifferenceInKiloBytes {
138+
if !resourceListWithinTolerance(qtyList, MaxCapacityMemoryDifferenceRatio) {
137139
return false
138140
}
139141
default:
@@ -147,10 +149,10 @@ func IsCloudProviderNodeInfoSimilar(n1, n2 *schedulerframework.NodeInfo, ignored
147149
}
148150

149151
// For allocatable and free we allow resource quantities to be within a few % of each other
150-
if !compareResourceMapsWithTolerance(allocatable, MaxAllocatableDifferenceRatio) {
152+
if !resourceMapsWithinTolerance(allocatable, MaxAllocatableDifferenceRatio) {
151153
return false
152154
}
153-
if !compareResourceMapsWithTolerance(free, MaxFreeDifferenceRatio) {
155+
if !resourceMapsWithinTolerance(free, MaxFreeDifferenceRatio) {
154156
return false
155157
}
156158

cluster-autoscaler/processors/nodegroupset/compare_nodegroups_test.go

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,16 +101,60 @@ func TestNodesSimilarVariousRequirementsAndPods(t *testing.T) {
101101

102102
func TestNodesSimilarVariousMemoryRequirements(t *testing.T) {
103103
comparator := CreateGenericNodeInfoComparator([]string{})
104-
n1 := BuildTestNode("node1", 1000, MaxMemoryDifferenceInKiloBytes)
104+
n1 := BuildTestNode("node1", 1000, 1000)
105105

106106
// Different memory capacity within tolerance
107-
n2 := BuildTestNode("node2", 1000, MaxMemoryDifferenceInKiloBytes)
108-
n2.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(2*MaxMemoryDifferenceInKiloBytes, resource.DecimalSI)
107+
n2 := BuildTestNode("node2", 1000, 1000)
108+
n2.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(1000-(1000*MaxCapacityMemoryDifferenceRatio)+1, resource.DecimalSI)
109109
checkNodesSimilar(t, n1, n2, comparator, true)
110110

111111
// Different memory capacity exceeds tolerance
112-
n3 := BuildTestNode("node3", 1000, MaxMemoryDifferenceInKiloBytes)
113-
n3.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(2*MaxMemoryDifferenceInKiloBytes+1, resource.DecimalSI)
112+
n3 := BuildTestNode("node3", 1000, 1000)
113+
n3.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(1000-(1000*MaxCapacityMemoryDifferenceRatio)-1, resource.DecimalSI)
114+
checkNodesSimilar(t, n1, n3, comparator, false)
115+
}
116+
117+
func TestNodesSimilarVariousLargeMemoryRequirementsM5XLarge(t *testing.T) {
118+
comparator := CreateGenericNodeInfoComparator([]string{})
119+
120+
// Use realistic memory capacity (taken from real nodes)
121+
// 15944120 KiB ~= 16GiB (m5.xLarge)
122+
q1 := resource.MustParse("16116152Ki")
123+
q2 := resource.MustParse("15944120Ki")
124+
125+
n1 := BuildTestNode("node1", 1000, q1.Value())
126+
127+
// Different memory capacity within tolerance
128+
// Value taken from another m5.xLarge in a different zone
129+
n2 := BuildTestNode("node2", 1000, q2.Value())
130+
checkNodesSimilar(t, n1, n2, comparator, true)
131+
132+
// Different memory capacity exceeds tolerance
133+
// Value of q1 * 1.02
134+
q3 := resource.MustParse("16438475Ki")
135+
n3 := BuildTestNode("node3", 1000, q3.Value())
136+
checkNodesSimilar(t, n1, n3, comparator, false)
137+
}
138+
139+
func TestNodesSimilarVariousLargeMemoryRequirementsM516XLarge(t *testing.T) {
140+
comparator := CreateGenericNodeInfoComparator([]string{})
141+
142+
// Use realistic memory capacity (taken from real nodes)
143+
// 257217528 KiB ~= 256GiB (m5.16xLarge)
144+
q1 := resource.MustParse("259970052Ki")
145+
q2 := resource.MustParse("257217528Ki")
146+
147+
n1 := BuildTestNode("node1", 1000, q1.Value())
148+
149+
// Different memory capacity within tolerance
150+
// Value taken from another m5.16xLarge in a different zone
151+
n2 := BuildTestNode("node2", 1000, q2.Value())
152+
checkNodesSimilar(t, n1, n2, comparator, true)
153+
154+
// Different memory capacity exceeds tolerance
155+
// Value of q1 * 1.02
156+
q3 := resource.MustParse("265169453Ki")
157+
n3 := BuildTestNode("node3", 1000, q3.Value())
114158
checkNodesSimilar(t, n1, n3, comparator, false)
115159
}
116160

0 commit comments

Comments
 (0)