Draft
Changes from all commits (30 commits)
bb0c70c
autosacle hack
mweibel Jun 24, 2024
728e361
propagate deleteMachine annotation
mweibel Jun 25, 2024
a950d03
fix: ampm remove finalizer and avoid recreation if VMSS is deleting
mweibel Jul 2, 2024
53fc763
try adding vmss update
mweibel Jul 5, 2024
995b9a9
add some logs
mweibel Jul 5, 2024
f992a33
try setting ready when updating or not the same amount of replicas
mweibel Jul 12, 2024
3334a6c
delete annotated machines first
mweibel Jul 29, 2024
2f62956
dont delete an already deleting vmss vm
mweibel Jul 31, 2024
731da6f
delete annotated machines first
mweibel Jun 25, 2024
fbc9ba6
fix fux
mweibel Jul 31, 2024
dc58154
fix wrong merge/rebase resolution
mweibel Jul 31, 2024
5f218e7
fix deleting logic
mweibel Aug 7, 2024
604421c
clear failure reason/message, improve logging
mweibel Aug 8, 2024
636b3ec
more verbosity
mweibel Aug 8, 2024
f5344a0
add some debug log to updates
mweibel Aug 8, 2024
e7178ae
more logging
mweibel Aug 9, 2024
808195c
common method
mweibel Aug 9, 2024
8848113
fix: use patch instead of put for vmss updates
mweibel Sep 26, 2024
27d4479
temp disable vmss vm update
mweibel Sep 26, 2024
d8f516d
add missing storage/os profile to update call
mweibel Sep 27, 2024
6d1945a
Revert "add missing storage/os profile to update call"
mweibel Sep 27, 2024
b521434
Revert "temp disable vmss vm update"
mweibel Sep 27, 2024
f754931
Revert "fix: use patch instead of put for vmss updates"
mweibel Sep 27, 2024
71fe9de
reset osProfile if no model changes and no updates to custom data
mweibel Sep 27, 2024
aea7ac9
remove all props when no model changes and no custom data change
mweibel Sep 27, 2024
b71127b
adjust properties based on model changes
mweibel Oct 2, 2024
7711797
set shouldPatchCustomData regardless of externally managed state
mweibel Oct 4, 2024
fd31011
always set custom data hash annotation
mweibel Oct 4, 2024
1d17010
testing just removing customData
mweibel Oct 7, 2024
733b53f
Revert "testing just removing customData"
mweibel Oct 7, 2024
2 changes: 2 additions & 0 deletions api/v1beta1/types.go
@@ -63,6 +63,8 @@ const (
PatchFuture string = "PATCH"
// PutFuture is a future that was derived from a PUT request.
PutFuture string = "PUT"
// PostFuture is a future that was derived from a POST request.
PostFuture string = "POST"
// DeleteFuture is a future that was derived from a DELETE request.
DeleteFuture string = "DELETE"
)
4 changes: 4 additions & 0 deletions azure/converters/vmss.go
@@ -154,6 +154,10 @@ func SDKToVMSSVM(sdkInstance armcompute.VirtualMachineScaleSetVM) *azure.VMSSVM
instance.AvailabilityZone = *sdkInstance.Zones[0]
}

if sdkInstance.Properties != nil && sdkInstance.Properties.LatestModelApplied != nil {
instance.LatestModelApplied = *sdkInstance.Properties.LatestModelApplied
}

return &instance
}

17 changes: 6 additions & 11 deletions azure/scope/machinepool.go
@@ -221,10 +221,7 @@ func (m *MachinePoolScope) ScaleSetSpec(ctx context.Context) azure.ResourceSpecG
}

if m.cache != nil {
if m.HasReplicasExternallyManaged(ctx) {
spec.ShouldPatchCustomData = m.cache.HasBootstrapDataChanges
log.V(4).Info("has bootstrap data changed?", "shouldPatchCustomData", spec.ShouldPatchCustomData)
}
spec.ShouldPatchCustomData = m.cache.HasBootstrapDataChanges
spec.VMSSExtensionSpecs = m.VMSSExtensionSpecs()
spec.SKU = m.cache.VMSKU
spec.VMImage = m.cache.VMImage
@@ -614,10 +611,10 @@ func (m *MachinePoolScope) setProvisioningStateAndConditions(v infrav1.Provision
} else {
conditions.MarkFalse(m.AzureMachinePool, infrav1.ScaleSetDesiredReplicasCondition, infrav1.ScaleSetScaleDownReason, clusterv1.ConditionSeverityInfo, "")
}
m.SetNotReady()
m.SetReady()
case v == infrav1.Updating:
conditions.MarkFalse(m.AzureMachinePool, infrav1.ScaleSetModelUpdatedCondition, infrav1.ScaleSetModelOutOfDateReason, clusterv1.ConditionSeverityInfo, "")
m.SetNotReady()
m.SetReady()
case v == infrav1.Creating:
conditions.MarkFalse(m.AzureMachinePool, infrav1.ScaleSetRunningCondition, infrav1.ScaleSetCreatingReason, clusterv1.ConditionSeverityInfo, "")
m.SetNotReady()
@@ -705,11 +702,9 @@ func (m *MachinePoolScope) Close(ctx context.Context) error {
if err := m.updateReplicasAndProviderIDs(ctx); err != nil {
return errors.Wrap(err, "failed to update replicas and providerIDs")
}
if m.HasReplicasExternallyManaged(ctx) {
if err := m.updateCustomDataHash(ctx); err != nil {
// ignore errors to calculating the custom data hash since it's not absolutely crucial.
log.V(4).Error(err, "unable to update custom data hash, ignoring.")
}
if err := m.updateCustomDataHash(ctx); err != nil {
// ignore errors to calculating the custom data hash since it's not absolutely crucial.
log.V(4).Error(err, "unable to update custom data hash, ignoring.")
}
}

@@ -128,7 +128,7 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
return orderRandom
}
}()
log = ctrl.LoggerFrom(ctx).V(4)
log = ctrl.LoggerFrom(ctx).V(2).WithValues("method", "selectMachinesToDelete")
deleteAnnotatedMachines = order(getDeleteAnnotatedMachines(machinesByProviderID))
failedMachines = order(getFailedMachines(machinesByProviderID))
deletingMachines = order(getDeletingMachines(machinesByProviderID))
33 changes: 33 additions & 0 deletions azure/services/scalesets/client.go
@@ -36,6 +36,8 @@ type Client interface {

CreateOrUpdateAsync(ctx context.Context, spec azure.ResourceSpecGetter, resumeToken string, parameters interface{}) (result interface{}, poller *runtime.Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], err error)
DeleteAsync(ctx context.Context, spec azure.ResourceSpecGetter, resumeToken string) (poller *runtime.Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], err error)

BeginUpdateInstances(ctx context.Context, spec azure.ResourceSpecGetter, vmInstanceIDs armcompute.VirtualMachineScaleSetVMInstanceRequiredIDs, resumeToken string) (*runtime.Poller[armcompute.VirtualMachineScaleSetsClientUpdateInstancesResponse], error)
}

// AzureClient contains the Azure go-sdk Client.
@@ -206,3 +208,34 @@ func (ac *AzureClient) DeleteAsync(ctx context.Context, spec azure.ResourceSpecG
// if the operation completed, return a nil poller.
return nil, err
}

// BeginUpdateInstances - Upgrades one or more virtual machines to the latest SKU set in the VM scale set model.
// If the operation fails it returns an *azcore.ResponseError type.
//
// Parameters
// - spec - The ResourceSpecGetter used for the name and resource group of the virtual machine scale set.
// - vmInstanceIDs - A list of virtual machine instance IDs from the VM scale set.
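// - resumeToken - if non-empty, the token used to resume a previously started long-running update-instances operation.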
func (ac *AzureClient) BeginUpdateInstances(ctx context.Context, spec azure.ResourceSpecGetter, vmInstanceIDs armcompute.VirtualMachineScaleSetVMInstanceRequiredIDs, resumeToken string) (poller *runtime.Poller[armcompute.VirtualMachineScaleSetsClientUpdateInstancesResponse], err error) {
ctx, _, done := tele.StartSpanWithLogger(ctx, "scalesets.AzureClient.BeginUpdateInstances")
defer done()

opts := &armcompute.VirtualMachineScaleSetsClientBeginUpdateInstancesOptions{ResumeToken: resumeToken}
poller, err = ac.scalesets.BeginUpdateInstances(ctx, spec.ResourceGroupName(), spec.ResourceName(), vmInstanceIDs, opts)
if err != nil {
return nil, err
}

ctx, cancel := context.WithTimeout(ctx, ac.apiCallTimeout)
defer cancel()

pollOpts := &runtime.PollUntilDoneOptions{Frequency: async.DefaultPollerFrequency}
_, err = poller.PollUntilDone(ctx, pollOpts)
if err != nil {
// if an error occurs, return the Poller.
// this means the long-running operation didn't finish in the specified timeout.
return poller, err
}

// if the operation completed, return a nil poller.
return nil, err
}
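
A minimal caller sketch for the new method (reviewer illustration, not part of the diff; it mirrors the commented-out wiring in scalesets.go further down and assumes the same spec/resumeToken plumbing as the other async client methods):

// Roll every instance to the latest scale set model; "*" is Azure's shorthand for all instance IDs.
target := "*"
ids := armcompute.VirtualMachineScaleSetVMInstanceRequiredIDs{InstanceIDs: []*string{&target}}
poller, err := client.BeginUpdateInstances(ctx, spec, ids, resumeToken)
if poller != nil && azure.IsContextDeadlineExceededOrCanceledError(err) {
	// The operation outlived the API call timeout: persist a future from the poller and requeue,
	// as the service layer already does for PUT/DELETE futures.
}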
15 changes: 15 additions & 0 deletions azure/services/scalesets/mock_scalesets/client_mock.go

Some generated files are not rendered by default.

60 changes: 60 additions & 0 deletions azure/services/scalesets/scalesets.go
@@ -117,6 +117,17 @@ func (s *Service) Reconcile(ctx context.Context) (retErr error) {
if !ok {
return errors.Errorf("%T is not an armcompute.VirtualMachineScaleSet", result)
}
//
//if existing != nil {
// if existingVmss, ok := existing.(armcompute.VirtualMachineScaleSet); ok {
// vmssEqual := cmp.Equal(vmss, existingVmss)
// if !vmssEqual {
// log.Info("updated VMSS", "diff", cmp.Diff(existingVmss, vmss))
// } else {
// log.Info("VMSS equal, probably not updated?")
// }
// }
//}

fetchedVMSS := converters.SDKToVMSS(vmss, scaleSetSpec.VMSSInstances)
if err := s.Scope.ReconcileReplicas(ctx, &fetchedVMSS); err != nil {
@@ -130,6 +141,55 @@ func (s *Service) Reconcile(ctx context.Context) (retErr error) {
}
s.Scope.SetProviderID(providerID)
s.Scope.SetVMSSState(&fetchedVMSS)

//
//azLatestModelApplied := true
//for _, instance := range scaleSetSpec.VMSSInstances {
// if instance.Properties.LatestModelApplied != nil && !*instance.Properties.LatestModelApplied {
// azLatestModelApplied = false
// break
// }
//}
//
//log = log.WithValues("rg", spec.ResourceGroupName(), "resourceName", spec.ResourceName())
//if !azLatestModelApplied {
// log.V(3).Info("latest model not applied")
//
// resourceName := spec.ResourceName()
// futureType := infrav1.PostFuture
// // Check for an ongoing long-running operation.
// resumeToken := ""
// if future := s.Scope.GetLongRunningOperationState(resourceName, serviceName, futureType); future != nil {
// log.V(4).Info("found future")
// t, err := converters.FutureToResumeToken(*future)
// if err != nil {
// s.Scope.DeleteLongRunningOperationState(resourceName, serviceName, futureType)
// return errors.Wrap(err, "could not decode future data, resetting long-running operation state")
// }
// resumeToken = t
// }
// target := "*"
// poller, err := s.Client.BeginUpdateInstances(ctx, spec, armcompute.VirtualMachineScaleSetVMInstanceRequiredIDs{
// InstanceIDs: []*string{&target},
// }, resumeToken)
// if poller != nil && azure.IsContextDeadlineExceededOrCanceledError(err) {
// log.Info("context deadline exceeded or canceled for long running op", "err", err)
//
// future, err := converters.PollerToFuture(poller, futureType, serviceName, resourceName, spec.ResourceGroupName())
// if err != nil {
// return errors.Wrap(err, "failed to convert poller to future")
// }
// s.Scope.SetLongRunningOperationState(future)
// return azure.WithTransientError(azure.NewOperationNotDoneError(future), s.Scope.DefaultedReconcilerRequeue())
// }
// log.Info("latest model update operation done")
//
// // Once the operation is done, delete the long-running operation state. Even if the operation ended with
// // an error, clear out any lingering state to try the operation again.
// s.Scope.DeleteLongRunningOperationState(resourceName, serviceName, futureType)
//} else {
// log.V(3).Info("latest model applied, not updating anything")
//}
}

return err
26 changes: 23 additions & 3 deletions azure/services/scalesets/spec.go
@@ -92,6 +92,9 @@ func (s *ScaleSetSpec) OwnerResourceName() string {
}

func (s *ScaleSetSpec) existingParameters(ctx context.Context, existing interface{}) (parameters interface{}, err error) {
ctx, log, done := tele.StartSpanWithLogger(ctx, "scalesets.ScaleSetSpec.existingParameters")
defer done()

existingVMSS, ok := existing.(armcompute.VirtualMachineScaleSet)
if !ok {
return nil, errors.Errorf("%T is not an armcompute.VirtualMachineScaleSet", existing)
@@ -112,7 +115,7 @@ func (s *ScaleSetSpec) existingParameters(ctx context.Context, existing interfac
vmss.Properties.VirtualMachineProfile.NetworkProfile = nil
vmss.ID = existingVMSS.ID

hasModelChanges := hasModelModifyingDifferences(&existingInfraVMSS, vmss)
hasModelChanges := hasModelModifyingDifferences(ctx, &existingInfraVMSS, vmss)
isFlex := s.OrchestrationMode == infrav1.FlexibleOrchestrationMode
updated := true
if !isFlex {
Expand All @@ -131,6 +134,23 @@ func (s *ScaleSetSpec) existingParameters(ctx context.Context, existing interfac
return nil, nil
}

// if there are no model changes and no change in custom data, get rid of all properties to avoid unnecessary VMSS model
// updates.
if !hasModelChanges && !s.ShouldPatchCustomData {
log.Info("### removing virtual machine profile")
vmss.Properties.VirtualMachineProfile = nil
} else {
log.Info("### not removing virtual machine profile", "hasModelChanges", hasModelChanges, "shouldPatchCustomData", s.ShouldPatchCustomData)
}

log.Info("updating VMSS",
"name", s.Name,
"capacity", vmss.SKU.Capacity,
"existingCapacity", existingInfraVMSS.Capacity,
"hasModelChanges", hasModelChanges,
"shouldPatchCustomData", s.ShouldPatchCustomData,
)

return vmss, nil
}
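
The new profile-stripping branch above, restated as a sketch for reviewers (illustrative only; `sendProfile` is not a name used in this PR): the profile is sent only when the model or the bootstrap custom data actually changed, which, per the comment in the diff, avoids unnecessary VMSS model updates (and therefore instances reporting LatestModelApplied=false).

// Hypothetical restatement of the guard above, not code from the PR.
sendProfile := hasModelChanges || s.ShouldPatchCustomData
if !sendProfile {
	vmss.Properties.VirtualMachineProfile = nil
}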

@@ -280,9 +300,9 @@ func (s *ScaleSetSpec) Parameters(ctx context.Context, existing interface{}) (pa
return vmss, nil
}

func hasModelModifyingDifferences(infraVMSS *azure.VMSS, vmss armcompute.VirtualMachineScaleSet) bool {
func hasModelModifyingDifferences(ctx context.Context, infraVMSS *azure.VMSS, vmss armcompute.VirtualMachineScaleSet) bool {
other := converters.SDKToVMSS(vmss, []armcompute.VirtualMachineScaleSetVM{})
return infraVMSS.HasModelChanges(other)
return infraVMSS.HasModelChanges(ctx, other)
}

func (s *ScaleSetSpec) generateExtensions(ctx context.Context) ([]armcompute.VirtualMachineScaleSetExtension, error) {
27 changes: 22 additions & 5 deletions azure/types.go
@@ -17,9 +17,12 @@ limitations under the License.
package azure

import (
"context"
"reflect"
"strings"

"sigs.k8s.io/cluster-api-provider-azure/util/tele"

"github.com/google/go-cmp/cmp"
infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
@@ -104,6 +107,7 @@ type (
State infrav1.ProvisioningState `json:"vmState,omitempty"`
BootstrappingState infrav1.ProvisioningState `json:"bootstrappingState,omitempty"`
OrchestrationMode infrav1.OrchestrationModeType `json:"orchestrationMode,omitempty"`
LatestModelApplied bool `json:"latestModelApplied,omitempty"`
}

// VMSS defines a virtual machine scale set.
Expand All @@ -122,11 +126,24 @@ type (
)

// HasModelChanges returns true if the spec fields which will mutate the Azure VMSS model are different.
func (vmss VMSS) HasModelChanges(other VMSS) bool {
equal := cmp.Equal(vmss.Image, other.Image) &&
cmp.Equal(vmss.Identity, other.Identity) &&
cmp.Equal(vmss.Zones, other.Zones) &&
cmp.Equal(vmss.Sku, other.Sku)
func (vmss VMSS) HasModelChanges(ctx context.Context, other VMSS) bool {
ctx, log, done := tele.StartSpanWithLogger(ctx, "vmss.HasModelChanges")
defer done()

imgEqual := cmp.Equal(vmss.Image, other.Image)
identityEqual := cmp.Equal(vmss.Identity, other.Identity)
zonesEqual := cmp.Equal(vmss.Zones, other.Zones)
skuEqual := cmp.Equal(vmss.Sku, other.Sku)

equal := imgEqual &&
identityEqual &&
zonesEqual &&
skuEqual

if !equal {
log.Info("VMSS has changed", "vmssName", vmss.Name, "imgEqual", imgEqual, "identityEqual", identityEqual, "zonesEqual", zonesEqual, "skuEqual", skuEqual)
}

return !equal
}

3 changes: 2 additions & 1 deletion azure/types_test.go
@@ -17,6 +17,7 @@ limitations under the License.
package azure

import (
"context"
"testing"

. "github.com/onsi/gomega"
@@ -131,7 +132,7 @@ func TestVMSS_HasModelChanges(t *testing.T) {
t.Run(c.Name, func(t *testing.T) {
l, r := c.Factory()
g := NewWithT(t)
g.Expect(l.HasModelChanges(r)).To(Equal(c.HasModelChanges))
g.Expect(l.HasModelChanges(context.Background(), r)).To(Equal(c.HasModelChanges))
})
}
}
2 changes: 1 addition & 1 deletion config/capz/manager_image_patch.yaml
@@ -8,5 +8,5 @@ spec:
spec:
containers:
# Change the value of image field below to your controller image URL
- image: gcr.io/k8s-staging-cluster-api-azure/cluster-api-azure-controller:main
- image: localhost:5000/cluster-api-azure-controller-amd64:dev
name: manager
18 changes: 16 additions & 2 deletions exp/controllers/azuremachinepoolmachine_controller.go
@@ -294,8 +294,17 @@ func (ampmr *AzureMachinePoolMachineController) reconcileNormal(ctx context.Cont
ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeWarning, "FailedVMState", "Azure scale set VM is in failed state")
machineScope.SetFailureReason(capierrors.UpdateMachineError)
machineScope.SetFailureMessage(errors.Errorf("Azure VM state is %s", state))
case infrav1.Succeeded:
ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeNormal, "ProvisioningSucceeded", "Azure scale set VM is in succeeded state")
// clear failure reason/message in case it was there
// TODO(mw): not sure if this works, we also need to probably clear the failure reason/message in another way.
if machineScope.AzureMachinePoolMachine.Status.FailureReason != nil {
machineScope.SetFailureReason(capierrors.MachineStatusError(""))
machineScope.SetFailureMessage(errors.New(""))
}
case infrav1.Deleting:
log.V(4).Info("deleting machine because state is Deleting", "machine", machineScope.Name())
ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeNormal, "Deleting", "Azure scale set VM is in deleting state")
log.V(2).Info("deleting machine because state is Deleting", "machine", machineScope.Name())
if err := ampmr.Client.Delete(ctx, machineScope.Machine); err != nil {
return reconcile.Result{}, errors.Wrap(err, "machine failed to be deleted when deleting")
}
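
A possible direction for the TODO in the Succeeded case above (a sketch only, not part of the diff; whether the scope's status patch helper persists empty strings or drops the fields is exactly the open question the TODO raises):

// Hypothetical alternative: nil the status pointers directly so the terminal-failure markers
// are removed from status rather than written back as empty strings.
machineScope.AzureMachinePoolMachine.Status.FailureReason = nil
machineScope.AzureMachinePoolMachine.Status.FailureMessage = nil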
@@ -329,14 +338,19 @@ func (ampmr *AzureMachinePoolMachineController) reconcileDelete(ctx context.Cont
controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
return reconcile.Result{}, nil
}

if !machineScope.AzureMachinePool.ObjectMeta.DeletionTimestamp.IsZero() {
log.Info("Skipping VMSS VM deletion as VMSS delete will delete individual instances")

controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
return reconcile.Result{}, nil
}

if machineScope.ProvisioningState() == infrav1.Deleting || machineScope.ProvisioningState() == infrav1.Deleted {
log.V(2).Info(fmt.Sprintf("Skipping VMSS VM deletion as VMSS VM is already %s", machineScope.ProvisioningState()))
controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
return reconcile.Result{}, nil
}

log.Info("Deleting AzureMachinePoolMachine")

// deleting a single machine