Skip to content

Commit 85ecd0e

Browse files
committed
Azure: per VMSS, incremental VMSS VMs cache
Azure's cloud provider mediates VMSS VMs API accesses through a cache that holds and refreshes all VMSS together. Because of that, we hit the VMSSVM.List API more often than necessary: an instance's cache miss or expiration should only require re-listing a single VMSS, while it is currently O(n) relative to the number of attached Scale Sets. Under heavy pressure (clusters with many attached VMSS that can't all be listed in one sequence of successive API calls) the controller manager might get stuck trying to re-list everything from scratch, then aborting the whole operation, then retrying and re-triggering API rate limits, which affects the whole Subscription. This patch replaces the global VMSS VMs cache with per-VMSS VMs caches. Refreshes (VMSS VMs lists) are scoped to the single relevant VMSS; under severe throttling the various caches can be refreshed incrementally. Signed-off-by: Benjamin Pineau <[email protected]>
1 parent 43fbe17 commit 85ecd0e

File tree

2 files changed

+215
-84
lines changed

2 files changed

+215
-84
lines changed

staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go

Lines changed: 99 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ type vmssMetaInfo struct {
6161
resourceGroup string
6262
}
6363

64+
// nodeIdentity identifies a node within a subscription.
65+
type nodeIdentity struct {
66+
resourceGroup string
67+
vmssName string
68+
nodeName string
69+
}
70+
6471
// scaleSet implements VMSet interface for Azure scale set.
6572
type scaleSet struct {
6673
*Cloud
@@ -70,7 +77,7 @@ type scaleSet struct {
7077
availabilitySet VMSet
7178

7279
vmssCache *azcache.TimedCache
73-
vmssVMCache *azcache.TimedCache
80+
vmssVMCache *sync.Map // [resourcegroup/vmssname]*azcache.TimedCache
7481
availabilitySetNodesCache *azcache.TimedCache
7582
}
7683

@@ -80,6 +87,7 @@ func newScaleSet(az *Cloud) (VMSet, error) {
8087
ss := &scaleSet{
8188
Cloud: az,
8289
availabilitySet: newAvailabilitySet(az),
90+
vmssVMCache: &sync.Map{},
8391
}
8492

8593
if !ss.DisableAvailabilitySetNodes {
@@ -94,11 +102,6 @@ func newScaleSet(az *Cloud) (VMSet, error) {
94102
return nil, err
95103
}
96104

97-
ss.vmssVMCache, err = ss.newVMSSVirtualMachinesCache()
98-
if err != nil {
99-
return nil, err
100-
}
101-
102105
return ss, nil
103106
}
104107

@@ -139,12 +142,17 @@ func (ss *scaleSet) getVMSS(vmssName string, crt azcache.AzureCacheReadType) (*c
139142
return vmss, nil
140143
}
141144

142-
// getVmssVM gets virtualMachineScaleSetVM by nodeName from cache.
143-
// It returns cloudprovider.InstanceNotFound if node does not belong to any scale sets.
144-
func (ss *scaleSet) getVmssVM(nodeName string, crt azcache.AzureCacheReadType) (string, string, *compute.VirtualMachineScaleSetVM, error) {
145+
// getVmssVMByNodeIdentity finds the virtualMachineScaleSetVM for a nodeIdentity, using the cache of the node's parent VMSS.
146+
// Returns cloudprovider.InstanceNotFound if the node does not belong to the scale set named in nodeIdentity.
147+
func (ss *scaleSet) getVmssVMByNodeIdentity(node *nodeIdentity, crt azcache.AzureCacheReadType) (string, string, *compute.VirtualMachineScaleSetVM, error) {
148+
cacheKey, cache, err := ss.getVMSSVMCache(node.resourceGroup, node.vmssName)
149+
if err != nil {
150+
return "", "", nil, err
151+
}
152+
145153
getter := func(nodeName string, crt azcache.AzureCacheReadType) (string, string, *compute.VirtualMachineScaleSetVM, bool, error) {
146154
var found bool
147-
cached, err := ss.vmssVMCache.Get(vmssVirtualMachinesKey, crt)
155+
cached, err := cache.Get(cacheKey, crt)
148156
if err != nil {
149157
return "", "", nil, found, err
150158
}
@@ -159,19 +167,19 @@ func (ss *scaleSet) getVmssVM(nodeName string, crt azcache.AzureCacheReadType) (
159167
return "", "", nil, found, nil
160168
}
161169

162-
_, err := getScaleSetVMInstanceID(nodeName)
170+
_, err = getScaleSetVMInstanceID(node.nodeName)
163171
if err != nil {
164172
return "", "", nil, err
165173
}
166174

167-
vmssName, instanceID, vm, found, err := getter(nodeName, crt)
175+
vmssName, instanceID, vm, found, err := getter(node.nodeName, crt)
168176
if err != nil {
169177
return "", "", nil, err
170178
}
171179

172180
if !found {
173-
klog.V(2).Infof("Couldn't find VMSS VM with nodeName %s, refreshing the cache", nodeName)
174-
vmssName, instanceID, vm, found, err = getter(nodeName, azcache.CacheReadTypeForceRefresh)
181+
klog.V(2).Infof("Couldn't find VMSS VM with nodeName %s, refreshing the cache", node.nodeName)
182+
vmssName, instanceID, vm, found, err = getter(node.nodeName, azcache.CacheReadTypeForceRefresh)
175183
if err != nil {
176184
return "", "", nil, err
177185
}
@@ -187,6 +195,17 @@ func (ss *scaleSet) getVmssVM(nodeName string, crt azcache.AzureCacheReadType) (
187195
return vmssName, instanceID, vm, nil
188196
}
189197

198+
// getVmssVM gets virtualMachineScaleSetVM by nodeName from cache.
199+
// Returns cloudprovider.InstanceNotFound if nodeName does not belong to any scale set.
200+
func (ss *scaleSet) getVmssVM(nodeName string, crt azcache.AzureCacheReadType) (string, string, *compute.VirtualMachineScaleSetVM, error) {
201+
node, err := ss.getNodeIdentityByNodeName(nodeName, crt)
202+
if err != nil {
203+
return "", "", nil, err
204+
}
205+
206+
return ss.getVmssVMByNodeIdentity(node, crt)
207+
}
208+
190209
// GetPowerStatusByNodeName returns the power state of the specified node.
191210
func (ss *scaleSet) GetPowerStatusByNodeName(name string) (powerState string, err error) {
192211
managedByAS, err := ss.isNodeManagedByAvailabilitySet(name, azcache.CacheReadTypeUnsafe)
@@ -222,8 +241,13 @@ func (ss *scaleSet) GetPowerStatusByNodeName(name string) (powerState string, er
222241
// getVmssVMByInstanceID gets the virtualMachineScaleSetVM with the given instance ID from the cache of the given scale set.
223242
// The node must belong to one of the scale sets.
224243
func (ss *scaleSet) getVmssVMByInstanceID(resourceGroup, scaleSetName, instanceID string, crt azcache.AzureCacheReadType) (*compute.VirtualMachineScaleSetVM, error) {
244+
cacheKey, cache, err := ss.getVMSSVMCache(resourceGroup, scaleSetName)
245+
if err != nil {
246+
return nil, err
247+
}
248+
225249
getter := func(crt azcache.AzureCacheReadType) (vm *compute.VirtualMachineScaleSetVM, found bool, err error) {
226-
cached, err := ss.vmssVMCache.Get(vmssVirtualMachinesKey, crt)
250+
cached, err := cache.Get(cacheKey, crt)
227251
if err != nil {
228252
return nil, false, err
229253
}
@@ -590,6 +614,66 @@ func (ss *scaleSet) listScaleSets(resourceGroup string) ([]string, error) {
590614
return ssNames, nil
591615
}
592616

617+
// getNodeIdentityByNodeName uses the VMSS cache to find a node's resource group and VMSS name, returned in a nodeIdentity.
618+
func (ss *scaleSet) getNodeIdentityByNodeName(nodeName string, crt azcache.AzureCacheReadType) (*nodeIdentity, error) {
619+
getter := func(nodeName string, crt azcache.AzureCacheReadType) (*nodeIdentity, error) {
620+
node := &nodeIdentity{
621+
nodeName: nodeName,
622+
}
623+
624+
cached, err := ss.vmssCache.Get(vmssKey, crt)
625+
if err != nil {
626+
return nil, err
627+
}
628+
629+
vmsses := cached.(*sync.Map)
630+
vmsses.Range(func(key, value interface{}) bool {
631+
v := value.(*vmssEntry)
632+
if v.vmss.Name == nil {
633+
return true
634+
}
635+
636+
vmssPrefix := *v.vmss.Name
637+
if v.vmss.VirtualMachineProfile != nil &&
638+
v.vmss.VirtualMachineProfile.OsProfile != nil &&
639+
v.vmss.VirtualMachineProfile.OsProfile.ComputerNamePrefix != nil {
640+
vmssPrefix = *v.vmss.VirtualMachineProfile.OsProfile.ComputerNamePrefix
641+
}
642+
643+
if strings.EqualFold(vmssPrefix, nodeName[:len(nodeName)-6]) {
644+
node.vmssName = *v.vmss.Name
645+
node.resourceGroup = v.resourceGroup
646+
return false
647+
}
648+
649+
return true
650+
})
651+
return node, nil
652+
}
653+
654+
if _, err := getScaleSetVMInstanceID(nodeName); err != nil {
655+
return nil, err
656+
}
657+
658+
node, err := getter(nodeName, crt)
659+
if err != nil {
660+
return nil, err
661+
}
662+
if node.vmssName != "" {
663+
return node, nil
664+
}
665+
666+
klog.V(2).Infof("Couldn't find VMSS for node %s, refreshing the cache", nodeName)
667+
node, err = getter(nodeName, azcache.CacheReadTypeForceRefresh)
668+
if err != nil {
669+
return nil, err
670+
}
671+
if node.vmssName == "" {
672+
return nil, cloudprovider.InstanceNotFound
673+
}
674+
return node, nil
675+
}
676+
593677
// listScaleSetVMs lists VMs belonging to the specified scale set.
594678
func (ss *scaleSet) listScaleSetVMs(scaleSetName, resourceGroup string) ([]compute.VirtualMachineScaleSetVM, error) {
595679
ctx, cancel := getContextWithCancel()

0 commit comments

Comments
 (0)