Skip to content

Commit f9b93ec

Browse files
adding env variable EnableLabelPrediction (#8324)
* adding env variable EnableLabelPrediction * addressing comments * adding ut test and nil scenario * adding ephemeral storage ut * changing default value to true
1 parent 0d14eca commit f9b93ec

File tree

5 files changed

+146
-43
lines changed

5 files changed

+146
-43
lines changed

cluster-autoscaler/cloudprovider/azure/azure_config.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@ type Config struct {
103103

104104
// EnableFastDeleteOnFailedProvisioning defines whether to enable the experimental faster VMSS instance deletion on failed provisioning
105105
EnableFastDeleteOnFailedProvisioning bool `json:"enableFastDeleteOnFailedProvisioning,omitempty" yaml:"enableFastDeleteOnFailedProvisioning,omitempty"`
106+
107+
// EnableLabelPredictionsOnTemplate defines whether to enable label predictions on the template when scaling from zero
108+
EnableLabelPredictionsOnTemplate bool `json:"enableLabelPredictionsOnTemplate,omitempty" yaml:"enableLabelPredictionsOnTemplate,omitempty"`
106109
}
107110

108111
// These are only here for backward compatibility. Their equivalent exists in providerazure.Config with a different name.
@@ -133,6 +136,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
133136
cfg.VMType = providerazureconsts.VMTypeVMSS
134137
cfg.MaxDeploymentsCount = int64(defaultMaxDeploymentsCount)
135138
cfg.StrictCacheUpdates = false
139+
cfg.EnableLabelPredictionsOnTemplate = true
136140

137141
// Config file overrides defaults
138142
if configReader != nil {
@@ -308,6 +312,9 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
308312
if _, err = assignBoolFromEnvIfExists(&cfg.EnableFastDeleteOnFailedProvisioning, "AZURE_ENABLE_FAST_DELETE_ON_FAILED_PROVISIONING"); err != nil {
309313
return nil, err
310314
}
315+
if _, err = assignBoolFromEnvIfExists(&cfg.EnableLabelPredictionsOnTemplate, "AZURE_ENABLE_LABEL_PREDICTIONS_ON_TEMPLATE"); err != nil {
316+
return nil, err
317+
}
311318

312319
// Nonstatic defaults
313320
cfg.VMType = strings.ToLower(cfg.VMType)

cluster-autoscaler/cloudprovider/azure/azure_scale_set.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ type ScaleSet struct {
8989
dedicatedHost bool
9090

9191
enableFastDeleteOnFailedProvisioning bool
92+
93+
enableLabelPredictionsOnTemplate bool
9294
}
9395

9496
// NewScaleSet creates a new ScaleSet.
@@ -108,10 +110,11 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64, d
108110
instancesRefreshJitter: az.config.VmssVmsCacheJitter,
109111
},
110112

111-
enableForceDelete: az.config.EnableForceDelete,
112-
enableDynamicInstanceList: az.config.EnableDynamicInstanceList,
113-
enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage,
114-
dedicatedHost: dedicatedHost,
113+
enableForceDelete: az.config.EnableForceDelete,
114+
enableDynamicInstanceList: az.config.EnableDynamicInstanceList,
115+
enableDetailedCSEMessage: az.config.EnableDetailedCSEMessage,
116+
enableLabelPredictionsOnTemplate: az.config.EnableLabelPredictionsOnTemplate,
117+
dedicatedHost: dedicatedHost,
115118
}
116119

117120
if az.config.VmssVirtualMachinesCacheTTLInSeconds != 0 {
@@ -662,7 +665,7 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*framework.NodeInfo, error) {
662665
if err != nil {
663666
return nil, err
664667
}
665-
node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager, scaleSet.enableDynamicInstanceList)
668+
node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager, scaleSet.enableDynamicInstanceList, scaleSet.enableLabelPredictionsOnTemplate)
666669
if err != nil {
667670
return nil, err
668671
}

cluster-autoscaler/cloudprovider/azure/azure_template.go

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ func buildNodeTemplateFromVMPool(vmsPool armcontainerservice.AgentPool, location
211211
}, nil
212212
}
213213

214-
func buildNodeFromTemplate(nodeGroupName string, template NodeTemplate, manager *AzureManager, enableDynamicInstanceList bool) (*apiv1.Node, error) {
214+
func buildNodeFromTemplate(nodeGroupName string, template NodeTemplate, manager *AzureManager, enableDynamicInstanceList bool, enableLabelPrediction bool) (*apiv1.Node, error) {
215215
node := apiv1.Node{}
216216
nodeName := fmt.Sprintf("%s-asg-%d", nodeGroupName, rand.Int63())
217217

@@ -272,7 +272,7 @@ func buildNodeFromTemplate(nodeGroupName string, template NodeTemplate, manager
272272
node.Status.Allocatable = node.Status.Capacity
273273

274274
if template.VMSSNodeTemplate != nil {
275-
node = processVMSSTemplate(template, nodeName, node)
275+
node = processVMSSTemplate(template, nodeName, node, enableLabelPrediction)
276276
} else if template.VMPoolNodeTemplate != nil {
277277
node = processVMPoolTemplate(template, nodeName, node)
278278
} else {
@@ -298,7 +298,7 @@ func processVMPoolTemplate(template NodeTemplate, nodeName string, node apiv1.No
298298
return node
299299
}
300300

301-
func processVMSSTemplate(template NodeTemplate, nodeName string, node apiv1.Node) apiv1.Node {
301+
func processVMSSTemplate(template NodeTemplate, nodeName string, node apiv1.Node, enableLabelPrediction bool) apiv1.Node {
302302
// NodeLabels
303303
if template.VMSSNodeTemplate.Tags != nil {
304304
for k, v := range template.VMSSNodeTemplate.Tags {
@@ -324,45 +324,50 @@ func processVMSSTemplate(template NodeTemplate, nodeName string, node apiv1.Node
324324
labels = extractLabelsFromTags(template.VMSSNodeTemplate.Tags)
325325
}
326326

327-
// Add the agentpool label, its value should come from the VMSS poolName tag
328-
// NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one
329-
// We will have to live with both labels for a while
330-
if node.Labels[legacyPoolNameTag] != "" {
331-
labels[legacyAgentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
332-
labels[agentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
333-
}
334-
if node.Labels[poolNameTag] != "" {
335-
labels[legacyAgentPoolNodeLabelKey] = node.Labels[poolNameTag]
336-
labels[agentPoolNodeLabelKey] = node.Labels[poolNameTag]
337-
}
338-
339-
// Add the storage profile and storage tier labels for vmss node
340-
if template.VMSSNodeTemplate.OSDisk != nil {
341-
// ephemeral
342-
if template.VMSSNodeTemplate.OSDisk.DiffDiskSettings != nil && template.VMSSNodeTemplate.OSDisk.DiffDiskSettings.Option == compute.Local {
343-
labels[legacyStorageProfileNodeLabelKey] = "ephemeral"
344-
labels[storageProfileNodeLabelKey] = "ephemeral"
345-
} else {
346-
labels[legacyStorageProfileNodeLabelKey] = "managed"
347-
labels[storageProfileNodeLabelKey] = "managed"
327+
// This is a best-effort attempt to match AKS system labels;
328+
// this prediction needs to be constantly worked on and maintained to keep up with the changes in AKS
329+
if enableLabelPrediction {
330+
// Add the agentpool label, its value should come from the VMSS poolName tag
331+
// NOTE: The plan is for agentpool label to be deprecated in favor of the aks-prefixed one
332+
// We will have to live with both labels for a while
333+
if node.Labels[legacyPoolNameTag] != "" {
334+
labels[legacyAgentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
335+
labels[agentPoolNodeLabelKey] = node.Labels[legacyPoolNameTag]
336+
}
337+
if node.Labels[poolNameTag] != "" {
338+
labels[legacyAgentPoolNodeLabelKey] = node.Labels[poolNameTag]
339+
labels[agentPoolNodeLabelKey] = node.Labels[poolNameTag]
348340
}
349-
if template.VMSSNodeTemplate.OSDisk.ManagedDisk != nil {
350-
labels[legacyStorageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
351-
labels[storageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
341+
342+
// Add the storage profile and storage tier labels for vmss node
343+
if template.VMSSNodeTemplate.OSDisk != nil {
344+
// ephemeral
345+
if template.VMSSNodeTemplate.OSDisk.DiffDiskSettings != nil && template.VMSSNodeTemplate.OSDisk.DiffDiskSettings.Option == compute.Local {
346+
labels[legacyStorageProfileNodeLabelKey] = "ephemeral"
347+
labels[storageProfileNodeLabelKey] = "ephemeral"
348+
} else {
349+
labels[legacyStorageProfileNodeLabelKey] = "managed"
350+
labels[storageProfileNodeLabelKey] = "managed"
351+
}
352+
if template.VMSSNodeTemplate.OSDisk.ManagedDisk != nil {
353+
labels[legacyStorageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
354+
labels[storageTierNodeLabelKey] = string(template.VMSSNodeTemplate.OSDisk.ManagedDisk.StorageAccountType)
355+
}
352356
}
353-
// Add ephemeral-storage value
354-
if template.VMSSNodeTemplate.OSDisk.DiskSizeGB != nil {
355-
node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(int(*template.VMSSNodeTemplate.OSDisk.DiskSizeGB)*1024*1024*1024), resource.DecimalSI)
356-
klog.V(4).Infof("OS Disk Size from template is: %d", *template.VMSSNodeTemplate.OSDisk.DiskSizeGB)
357-
klog.V(4).Infof("Setting ephemeral storage to: %v", node.Status.Capacity[apiv1.ResourceEphemeralStorage])
357+
358+
// If we are on GPU-enabled SKUs, append the accelerator
359+
// label so that CA makes better decision when scaling from zero for GPU pools
360+
if isNvidiaEnabledSKU(template.SkuName) {
361+
labels[GPULabel] = "nvidia"
362+
labels[legacyGPULabel] = "nvidia"
358363
}
359364
}
360365

361-
// If we are on GPU-enabled SKUs, append the accelerator
362-
// label so that CA makes better decision when scaling from zero for GPU pools
363-
if isNvidiaEnabledSKU(template.SkuName) {
364-
labels[GPULabel] = "nvidia"
365-
labels[legacyGPULabel] = "nvidia"
366+
// Add ephemeral-storage value
367+
if template.VMSSNodeTemplate.OSDisk != nil && template.VMSSNodeTemplate.OSDisk.DiskSizeGB != nil {
368+
node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(int64(int(*template.VMSSNodeTemplate.OSDisk.DiskSizeGB)*1024*1024*1024), resource.DecimalSI)
369+
klog.V(4).Infof("OS Disk Size from template is: %d", *template.VMSSNodeTemplate.OSDisk.DiskSizeGB)
370+
klog.V(4).Infof("Setting ephemeral storage to: %v", node.Status.Capacity[apiv1.ResourceEphemeralStorage])
366371
}
367372

368373
// Extract allocatables from tags

cluster-autoscaler/cloudprovider/azure/azure_template_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,3 +291,91 @@ func makeTaintSet(taints []apiv1.Taint) map[apiv1.Taint]bool {
291291
}
292292
return set
293293
}
294+
295+
// TestBuildNodeFromTemplateWithLabelPrediction builds a node from a VMSS
// template with label prediction enabled and verifies that the agentpool and
// storageprofile labels (both the legacy and the kubernetes.azure.com-prefixed
// keys) are populated from the poolName tag and the OS disk settings.
func TestBuildNodeFromTemplateWithLabelPrediction(t *testing.T) {
296+
poolName := "testpool"
297+
testSkuName := "Standard_DS2_v2"
298+
testNodeName := "test-node"
299+
300+
vmss := compute.VirtualMachineScaleSet{
301+
Response: autorest.Response{},
302+
Sku: &compute.Sku{Name: &testSkuName},
303+
Plan: nil,
304+
VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{
305+
VirtualMachineProfile: &compute.VirtualMachineScaleSetVMProfile{
306+
StorageProfile: &compute.VirtualMachineScaleSetStorageProfile{
307+
OsDisk: &compute.VirtualMachineScaleSetOSDisk{
308+
DiffDiskSettings: nil, // This makes it managed
309+
ManagedDisk: &compute.VirtualMachineScaleSetManagedDiskParameters{
310+
StorageAccountType: compute.StorageAccountTypesPremiumLRS,
311+
},
312+
},
313+
},
314+
},
315+
},
316+
Tags: map[string]*string{
317+
"poolName": &poolName,
318+
},
319+
Zones: &[]string{"1", "2"},
320+
Location: to.StringPtr("westus"),
321+
}
322+
323+
template, err := buildNodeTemplateFromVMSS(vmss, map[string]string{}, "")
324+
assert.NoError(t, err)
325+
326+
manager := &AzureManager{}
// Last two arguments: enableDynamicInstanceList=false, enableLabelPrediction=true.
327+
node, err := buildNodeFromTemplate(testNodeName, template, manager, false, true)
328+
assert.NoError(t, err)
329+
assert.NotNil(t, node)
330+
331+
// Verify label prediction labels are added
// The pool labels must mirror the poolName tag; the storage profile is
// expected to be "managed" because DiffDiskSettings is nil in the fixture.
332+
assert.Equal(t, poolName, node.Labels["agentpool"])
333+
assert.Equal(t, poolName, node.Labels["kubernetes.azure.com/agentpool"])
334+
assert.Equal(t, "managed", node.Labels["storageprofile"])
335+
assert.Equal(t, "managed", node.Labels["kubernetes.azure.com/storageprofile"])
336+
}
337+
338+
// TestBuildNodeFromTemplateWithEphemeralStorage builds a node from a VMSS
// template whose OS disk declares a size and verifies that the node's
// ephemeral-storage capacity is derived from that size even though label
// prediction is disabled for this call.
func TestBuildNodeFromTemplateWithEphemeralStorage(t *testing.T) {
339+
poolName := "testpool"
340+
testSkuName := "Standard_DS2_v2"
341+
testNodeName := "test-node"
342+
diskSizeGB := int32(128)
343+
344+
vmss := compute.VirtualMachineScaleSet{
345+
Response: autorest.Response{},
346+
Sku: &compute.Sku{Name: &testSkuName},
347+
Plan: nil,
348+
VirtualMachineScaleSetProperties: &compute.VirtualMachineScaleSetProperties{
349+
VirtualMachineProfile: &compute.VirtualMachineScaleSetVMProfile{
350+
StorageProfile: &compute.VirtualMachineScaleSetStorageProfile{
351+
OsDisk: &compute.VirtualMachineScaleSetOSDisk{
352+
DiskSizeGB: &diskSizeGB,
353+
DiffDiskSettings: nil, // This makes it managed
354+
ManagedDisk: &compute.VirtualMachineScaleSetManagedDiskParameters{
355+
StorageAccountType: compute.StorageAccountTypesPremiumLRS,
356+
},
357+
},
358+
},
359+
},
360+
},
361+
Tags: map[string]*string{
362+
"poolName": &poolName,
363+
},
364+
Zones: &[]string{"1", "2"},
365+
Location: to.StringPtr("westus"),
366+
}
367+
368+
template, err := buildNodeTemplateFromVMSS(vmss, map[string]string{}, "")
369+
assert.NoError(t, err)
370+
371+
manager := &AzureManager{}
// Last two arguments: enableDynamicInstanceList=false, enableLabelPrediction=false.
// The ephemeral-storage capacity must still be set with prediction turned off.
372+
node, err := buildNodeFromTemplate(testNodeName, template, manager, false, false)
373+
assert.NoError(t, err)
374+
assert.NotNil(t, node)
375+
376+
// Verify ephemeral storage is set correctly
// Expected value is the disk size in GB multiplied by 1024^3, compared via its string form.
377+
expectedEphemeralStorage := resource.NewQuantity(int64(diskSizeGB)*1024*1024*1024, resource.DecimalSI)
378+
ephemeralStorage, exists := node.Status.Capacity[apiv1.ResourceEphemeralStorage]
379+
assert.True(t, exists)
380+
assert.Equal(t, expectedEphemeralStorage.String(), ephemeralStorage.String())
381+
}

cluster-autoscaler/cloudprovider/azure/azure_vms_pool.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ func (vmPool *VMPool) TemplateNodeInfo() (*framework.NodeInfo, error) {
469469
if err != nil {
470470
return nil, err
471471
}
472-
node, err := buildNodeFromTemplate(vmPool.agentPoolName, template, vmPool.manager, vmPool.manager.config.EnableDynamicInstanceList)
472+
node, err := buildNodeFromTemplate(vmPool.agentPoolName, template, vmPool.manager, vmPool.manager.config.EnableDynamicInstanceList, false)
473473
if err != nil {
474474
return nil, err
475475
}

0 commit comments

Comments
 (0)