Skip to content

Commit 3bd237c

Browse files
rbtrneaggarwMSthatmattlong
authored
fix: hysteresis in ipampool during rapid pool scaling (#967)
* Fix CNS Scale Up/down fix to avoid cleanup pending ToBeDeleted ips * remove pendingRelease bool flag * Incorporate feedback * incorporate lint failures * fix more lint issues * nit fixes again for lint errors * fixes for lint * fix comment spacing * fix final lints * fix broken ipampoolmonitor test Co-authored-by: neaggarw <[email protected]> Co-authored-by: Matthew Long <[email protected]>
1 parent 4dfefd0 commit 3bd237c

File tree

4 files changed

+279
-103
lines changed

4 files changed

+279
-103
lines changed

cns/fakes/requestcontrollerfake.go

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ func NewRequestControllerFake(cnsService *HTTPServiceFake, scalar nnc.Scaler, su
3737
rc.ip, _, _ = net.ParseCIDR(subnetAddressSpace)
3838

3939
rc.CarveIPConfigsAndAddToStatusAndCNS(numberOfIPConfigs)
40+
rc.cachedCRD.Spec.RequestedIPCount = int64(numberOfIPConfigs)
4041

4142
return rc
4243
}
@@ -62,7 +63,6 @@ func (rc *RequestControllerFake) CarveIPConfigsAndAddToStatusAndCNS(numberOfIPCo
6263
}
6364

6465
rc.fakecns.IPStateManager.AddIPConfigs(cnsIPConfigs)
65-
rc.cachedCRD.Spec.RequestedIPCount = int64(len(cnsIPConfigs))
6666

6767
return cnsIPConfigs
6868
}
@@ -88,7 +88,7 @@ func remove(slice []nnc.IPAssignment, s int) []nnc.IPAssignment {
8888
return append(slice[:s], slice[s+1:]...)
8989
}
9090

91-
func (rc *RequestControllerFake) Reconcile() error {
91+
func (rc *RequestControllerFake) Reconcile(removePendingReleaseIPs bool) error {
9292

9393
diff := int(rc.cachedCRD.Spec.RequestedIPCount) - len(rc.fakecns.GetPodIPConfigState())
9494

@@ -114,12 +114,11 @@ func (rc *RequestControllerFake) Reconcile() error {
114114
rc.cachedCRD.Status.NetworkContainers[0].IPAssignments = remove(rc.cachedCRD.Status.NetworkContainers[0].IPAssignments, index)
115115

116116
}
117+
}
117118

118-
// remove ipconfig from CNS
119+
// remove ipconfig from CNS
120+
if removePendingReleaseIPs {
119121
rc.fakecns.IPStateManager.RemovePendingReleaseIPConfigs(rc.cachedCRD.Spec.IPsNotInUse)
120-
121-
// empty the not in use ip's from the spec
122-
rc.cachedCRD.Spec.IPsNotInUse = []string{}
123122
}
124123

125124
// update

cns/ipampoolmonitor/ipampoolmonitor.go

Lines changed: 13 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ type CNSIPAMPoolMonitor struct {
2222
cachedNNC nnc.NodeNetworkConfig
2323
httpService cns.HTTPService
2424
mu sync.RWMutex
25-
pendingRelease bool
2625
rc singletenantcontroller.RequestController
2726
scalarUnits nnc.Scaler
2827
updatingIpsNotInUseCount int
@@ -31,9 +30,8 @@ type CNSIPAMPoolMonitor struct {
3130
func NewCNSIPAMPoolMonitor(httpService cns.HTTPService, rc singletenantcontroller.RequestController) *CNSIPAMPoolMonitor {
3231
logger.Printf("NewCNSIPAMPoolMonitor: Create IPAM Pool Monitor")
3332
return &CNSIPAMPoolMonitor{
34-
pendingRelease: false,
35-
httpService: httpService,
36-
rc: rc,
33+
httpService: httpService,
34+
rc: rc,
3735
}
3836
}
3937

@@ -80,13 +78,13 @@ func (pm *CNSIPAMPoolMonitor) Reconcile(ctx context.Context) error {
8078
return pm.increasePoolSize(ctx)
8179

8280
// pod count is decreasing
83-
case freeIPConfigCount > pm.MaximumFreeIps:
81+
case freeIPConfigCount >= pm.MaximumFreeIps:
8482
logger.Printf("[ipam-pool-monitor] Decreasing pool size...%s ", msg)
8583
return pm.decreasePoolSize(ctx, pendingReleaseIPCount)
8684

8785
// CRD has reconciled CNS state, and target spec is now the same size as the state
8886
// free to remove the IP's from the CRD
89-
case pm.pendingRelease && int(pm.cachedNNC.Spec.RequestedIPCount) == cnsPodIPConfigCount:
87+
case len(pm.cachedNNC.Spec.IPsNotInUse) != pendingReleaseIPCount:
9088
logger.Printf("[ipam-pool-monitor] Removing Pending Release IP's from CRD...%s ", msg)
9189
return pm.cleanPendingRelease(ctx)
9290

@@ -105,10 +103,7 @@ func (pm *CNSIPAMPoolMonitor) increasePoolSize(ctx context.Context) error {
105103

106104
var err error
107105
var tempNNCSpec nnc.NodeNetworkConfigSpec
108-
tempNNCSpec, err = pm.createNNCSpecForCRD(false)
109-
if err != nil {
110-
return err
111-
}
106+
tempNNCSpec = pm.createNNCSpecForCRD()
112107

113108
// Query the max IP count
114109
maxIPCount := pm.getMaxIPCount()
@@ -187,10 +182,7 @@ func (pm *CNSIPAMPoolMonitor) decreasePoolSize(ctx context.Context, existingPend
187182
}
188183

189184
var tempNNCSpec nnc.NodeNetworkConfigSpec
190-
tempNNCSpec, err = pm.createNNCSpecForCRD(false)
191-
if err != nil {
192-
return err
193-
}
185+
tempNNCSpec = pm.createNNCSpecForCRD()
194186

195187
if newIpsMarkedAsPending {
196188
// cache the updatingPendingRelease so that we dont re-set new IPs to PendingRelease in case UpdateCRD call fails
@@ -212,7 +204,6 @@ func (pm *CNSIPAMPoolMonitor) decreasePoolSize(ctx context.Context, existingPend
212204

213205
// save the updated state to cachedSpec
214206
pm.cachedNNC.Spec = tempNNCSpec
215-
pm.pendingRelease = true
216207

217208
// clear the updatingPendingIpsNotInUse, as we have Updated the CRD
218209
logger.Printf("[ipam-pool-monitor] cleaning the updatingPendingIpsNotInUse, existing length %d", pm.updatingIpsNotInUseCount)
@@ -229,10 +220,7 @@ func (pm *CNSIPAMPoolMonitor) cleanPendingRelease(ctx context.Context) error {
229220

230221
var err error
231222
var tempNNCSpec nnc.NodeNetworkConfigSpec
232-
tempNNCSpec, err = pm.createNNCSpecForCRD(true)
233-
if err != nil {
234-
return err
235-
}
223+
tempNNCSpec = pm.createNNCSpecForCRD()
236224

237225
err = pm.rc.UpdateCRDSpec(ctx, tempNNCSpec)
238226
if err != nil {
@@ -244,31 +232,25 @@ func (pm *CNSIPAMPoolMonitor) cleanPendingRelease(ctx context.Context) error {
244232

245233
// save the updated state to cachedSpec
246234
pm.cachedNNC.Spec = tempNNCSpec
247-
pm.pendingRelease = false
248235
return nil
249236
}
250237

251238
// CNSToCRDSpec translates CNS's map of Ips to be released and requested ip count into a CRD Spec
252-
func (pm *CNSIPAMPoolMonitor) createNNCSpecForCRD(resetNotInUseList bool) (nnc.NodeNetworkConfigSpec, error) {
239+
func (pm *CNSIPAMPoolMonitor) createNNCSpecForCRD() nnc.NodeNetworkConfigSpec {
253240
var (
254241
spec nnc.NodeNetworkConfigSpec
255242
)
256243

257244
// DUpdate the count from cached spec
258245
spec.RequestedIPCount = pm.cachedNNC.Spec.RequestedIPCount
259246

260-
// Discard the ToBeDeleted list if requested. This happens if DNC has cleaned up the pending ips and CNS has also updated its state.
261-
if resetNotInUseList == true {
262-
spec.IPsNotInUse = make([]string, 0)
263-
} else {
264-
// Get All Pending IPs from CNS and populate it again.
265-
pendingIps := pm.httpService.GetPendingReleaseIPConfigs()
266-
for _, pendingIp := range pendingIps {
267-
spec.IPsNotInUse = append(spec.IPsNotInUse, pendingIp.ID)
268-
}
247+
// Get All Pending IPs from CNS and populate it again.
248+
pendingIPs := pm.httpService.GetPendingReleaseIPConfigs()
249+
for _, pendingIP := range pendingIPs {
250+
spec.IPsNotInUse = append(spec.IPsNotInUse, pendingIP.ID)
269251
}
270252

271-
return spec, nil
253+
return spec
272254
}
273255

274256
// UpdatePoolLimitsTransacted called by request controller on reconcile to set the batch size limits

0 commit comments

Comments
 (0)