Skip to content

Commit 4997972

Browse files
committed
Avoid unwanted VMSS VMs caches invalidations
`fetchAutoAsgs()` is called at regular intervals, fetches a list of VMSS, then call `Register()` to cache each of those. That registration function will tell the caller wether that vmss' cache is outdated (when the provided VMSS, supposedly fresh, is different than the one held in cache) and will replace existing cache entry by the provided VMSS (which in effect will require a forced refresh since that ScaleSet struct is passed by fetchAutoAsgs with a nil lastRefresh time and an empty instanceCache). To detect changes, `Register()` uses an `reflect.DeepEqual()` between the provided and the cached VMSS. Which does always find them different: cached VMSS were enriched with instances lists (while the provided one is blank, fresh from a simple vmss.list call). That DeepEqual is also fragile due to the compared structs containing mutexes (that may be held or not) and refresh timestamps, attributes that shoudln't be relevant to the comparison. As a consequence, all Register() calls causes indirect cache invalidations and a costly refresh (VMSS VMS List). The number of Register() calls is directly proportional to the number of VMSS attached to the cluster, and can easily triggers ARM API throttling. With a large number of VMSS, that throttling prevents `fetchAutoAsgs` to ever succeed (and cluster-autoscaler to start). ie.: ``` I0807 16:55:25.875907 153 azure_scale_set.go:344] GetScaleSetVms: starts I0807 16:55:25.875915 153 azure_scale_set.go:350] GetScaleSetVms: scaleSet.Name: a-testvmss-10, vmList: [] E0807 16:55:25.875919 153 azure_scale_set.go:352] VirtualMachineScaleSetVMsClient.List failed for a-testvmss-10: &{true 0 2020-08-07 17:10:25.875447854 +0000 UTC m=+913.985215807 azure cloud provider throttled for operation VMSSVMList with reason "client throttled"} E0807 16:55:25.875928 153 azure_manager.go:538] Failed to regenerate ASG cache: Retriable: true, RetryAfter: 899s, HTTPStatusCode: 0, RawError: azure cloud provider throttled for operation VMSSVMList with reason "client throttled" F0807 16:55:25.875934 153 azure_cloud_provider.go:167] Failed to create Azure Manager: Retriable: true, RetryAfter: 899s, HTTPStatusCode: 0, RawError: azure cloud provider throttled for operation VMSSVMList with reason "client throttled" goroutine 28 [running]: ``` From [`ScaleSet` struct attributes](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go#L74-L89) (manager, sizes, mutexes, refreshes timestamps) only sizes are relevant to that comparison. `curSize` is not strictly necessary, but comparing it will provide early instance caches refreshs.
1 parent 0f595ff commit 4997972

File tree

4 files changed

+15
-9
lines changed

4 files changed

+15
-9
lines changed

cluster-autoscaler/cloudprovider/azure/azure_cache.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ limitations under the License.
1717
package azure
1818

1919
import (
20-
"reflect"
2120
"regexp"
2221
"strings"
2322
"sync"
@@ -62,7 +61,9 @@ func (m *asgCache) Register(asg cloudprovider.NodeGroup) bool {
6261

6362
for i := range m.registeredAsgs {
6463
if existing := m.registeredAsgs[i]; strings.EqualFold(existing.Id(), asg.Id()) {
65-
if reflect.DeepEqual(existing, asg) {
64+
e := existing.(*ScaleSet)
65+
a := asg.(*ScaleSet)
66+
if e.minSize == a.minSize && e.maxSize == a.maxSize && e.curSize == a.curSize {
6667
return false
6768
}
6869

@@ -181,7 +182,7 @@ func (m *asgCache) regenerate() error {
181182

182183
m.instanceToAsg = newCache
183184

184-
// Incalidating unowned instance cache.
185+
// Invalidating unowned instance cache.
185186
m.invalidateUnownedInstanceCache()
186187

187188
return nil

cluster-autoscaler/cloudprovider/azure/azure_manager.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -513,7 +513,7 @@ func (m *AzureManager) buildAsgFromSpec(spec string) (cloudprovider.NodeGroup, e
513513
case vmTypeStandard:
514514
return NewAgentPool(s, m)
515515
case vmTypeVMSS:
516-
return NewScaleSet(s, m)
516+
return NewScaleSet(s, m, -1)
517517
case vmTypeAKS:
518518
return NewAKSAgentPool(s, m)
519519
default:
@@ -700,7 +700,12 @@ func (m *AzureManager) listScaleSets(filter []labelAutoDiscoveryConfig) ([]cloud
700700
continue
701701
}
702702

703-
asg, err := NewScaleSet(spec, m)
703+
curSize := int64(-1)
704+
if scaleSet.Sku != nil && scaleSet.Sku.Capacity != nil {
705+
curSize = *scaleSet.Sku.Capacity
706+
}
707+
708+
asg, err := NewScaleSet(spec, m, curSize)
704709
if err != nil {
705710
klog.Warningf("ignoring nodegroup %q %s", *scaleSet.Name, err)
706711
continue

cluster-autoscaler/cloudprovider/azure/azure_manager_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -751,7 +751,7 @@ func TestListScalesets(t *testing.T) {
751751
minSize: 5,
752752
maxSize: 50,
753753
manager: manager,
754-
curSize: -1,
754+
curSize: 3,
755755
sizeRefreshPeriod: defaultVmssSizeRefreshPeriod,
756756
}},
757757
},
@@ -857,7 +857,7 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) {
857857
minSize: minVal,
858858
maxSize: maxVal,
859859
manager: manager,
860-
curSize: -1,
860+
curSize: 3,
861861
sizeRefreshPeriod: defaultVmssSizeRefreshPeriod,
862862
}}
863863
assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs)

cluster-autoscaler/cloudprovider/azure/azure_scale_set.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,15 @@ type ScaleSet struct {
8989
}
9090

9191
// NewScaleSet creates a new NewScaleSet.
92-
func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager) (*ScaleSet, error) {
92+
func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (*ScaleSet, error) {
9393
scaleSet := &ScaleSet{
9494
azureRef: azureRef{
9595
Name: spec.Name,
9696
},
9797
minSize: spec.MinSize,
9898
maxSize: spec.MaxSize,
9999
manager: az,
100-
curSize: -1,
100+
curSize: curSize,
101101
}
102102

103103
if az.config.VmssCacheTTL != 0 {

0 commit comments

Comments
 (0)