diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go
index 444bc50ca66..fc2fdab61eb 100644
--- a/api/v1beta1/types.go
+++ b/api/v1beta1/types.go
@@ -63,6 +63,8 @@ const (
 	PatchFuture string = "PATCH"
 	// PutFuture is a future that was derived from a PUT request.
 	PutFuture string = "PUT"
+	// PostFuture is a future that was derived from a POST request.
+	PostFuture string = "POST"
 	// DeleteFuture is a future that was derived from a DELETE request.
 	DeleteFuture string = "DELETE"
 )
diff --git a/azure/converters/vmss.go b/azure/converters/vmss.go
index 52257bfdbb8..0c7ad0d46be 100644
--- a/azure/converters/vmss.go
+++ b/azure/converters/vmss.go
@@ -154,6 +154,10 @@ func SDKToVMSSVM(sdkInstance armcompute.VirtualMachineScaleSetVM) *azure.VMSSVM
 		instance.AvailabilityZone = *sdkInstance.Zones[0]
 	}
 
+	if sdkInstance.Properties != nil && sdkInstance.Properties.LatestModelApplied != nil {
+		instance.LatestModelApplied = *sdkInstance.Properties.LatestModelApplied
+	}
+
 	return &instance
 }
diff --git a/azure/scope/machinepool.go b/azure/scope/machinepool.go
index 611a6aa5c57..602dc9b9301 100644
--- a/azure/scope/machinepool.go
+++ b/azure/scope/machinepool.go
@@ -221,10 +221,7 @@ func (m *MachinePoolScope) ScaleSetSpec(ctx context.Context) azure.ResourceSpecG
 	}
 
 	if m.cache != nil {
-		if m.HasReplicasExternallyManaged(ctx) {
-			spec.ShouldPatchCustomData = m.cache.HasBootstrapDataChanges
-			log.V(4).Info("has bootstrap data changed?", "shouldPatchCustomData", spec.ShouldPatchCustomData)
-		}
+		spec.ShouldPatchCustomData = m.cache.HasBootstrapDataChanges
 		spec.VMSSExtensionSpecs = m.VMSSExtensionSpecs()
 		spec.SKU = m.cache.VMSKU
 		spec.VMImage = m.cache.VMImage
@@ -614,10 +611,10 @@ func (m *MachinePoolScope) setProvisioningStateAndConditions(v infrav1.Provision
 		} else {
 			conditions.MarkFalse(m.AzureMachinePool, infrav1.ScaleSetDesiredReplicasCondition, infrav1.ScaleSetScaleDownReason, clusterv1.ConditionSeverityInfo, "")
 		}
-		m.SetNotReady()
+		m.SetReady()
 	case v == infrav1.Updating:
 		conditions.MarkFalse(m.AzureMachinePool, infrav1.ScaleSetModelUpdatedCondition, infrav1.ScaleSetModelOutOfDateReason, clusterv1.ConditionSeverityInfo, "")
-		m.SetNotReady()
+		m.SetReady()
 	case v == infrav1.Creating:
 		conditions.MarkFalse(m.AzureMachinePool, infrav1.ScaleSetRunningCondition, infrav1.ScaleSetCreatingReason, clusterv1.ConditionSeverityInfo, "")
 		m.SetNotReady()
@@ -705,11 +702,9 @@ func (m *MachinePoolScope) Close(ctx context.Context) error {
 		if err := m.updateReplicasAndProviderIDs(ctx); err != nil {
 			return errors.Wrap(err, "failed to update replicas and providerIDs")
 		}
-		if m.HasReplicasExternallyManaged(ctx) {
-			if err := m.updateCustomDataHash(ctx); err != nil {
-				// ignore errors to calculating the custom data hash since it's not absolutely crucial.
-				log.V(4).Error(err, "unable to update custom data hash, ignoring.")
-			}
+		if err := m.updateCustomDataHash(ctx); err != nil {
+			// ignore errors when calculating the custom data hash since it's not absolutely crucial.
+			log.V(4).Error(err, "unable to update custom data hash, ignoring.")
 		}
 	}
 
diff --git a/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go b/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go
index 30afa56a8ab..a7304f090e2 100644
--- a/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go
+++ b/azure/scope/strategies/machinepool_deployments/machinepool_deployment_strategy.go
@@ -128,7 +128,7 @@ func (rollingUpdateStrategy rollingUpdateStrategy) SelectMachinesToDelete(ctx co
 			return orderRandom
 		}
 	}()
-	log = ctrl.LoggerFrom(ctx).V(4)
+	log = ctrl.LoggerFrom(ctx).V(2).WithValues("method", "selectMachinesToDelete")
 	deleteAnnotatedMachines = order(getDeleteAnnotatedMachines(machinesByProviderID))
 	failedMachines = order(getFailedMachines(machinesByProviderID))
 	deletingMachines = order(getDeletingMachines(machinesByProviderID))
diff --git a/azure/services/scalesets/client.go b/azure/services/scalesets/client.go
index 3bbc94990a3..d43e3e98eb9 100644
--- a/azure/services/scalesets/client.go
+++ b/azure/services/scalesets/client.go
@@ -36,6 +36,8 @@ type Client interface {
 	CreateOrUpdateAsync(ctx context.Context, spec azure.ResourceSpecGetter, resumeToken string, parameters interface{}) (result interface{}, poller *runtime.Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], err error)
 	DeleteAsync(ctx context.Context, spec azure.ResourceSpecGetter, resumeToken string) (poller *runtime.Poller[armcompute.VirtualMachineScaleSetsClientDeleteResponse], err error)
+
+	BeginUpdateInstances(ctx context.Context, spec azure.ResourceSpecGetter, vmInstanceIDs armcompute.VirtualMachineScaleSetVMInstanceRequiredIDs, resumeToken string) (*runtime.Poller[armcompute.VirtualMachineScaleSetsClientUpdateInstancesResponse], error)
 }
 
 // AzureClient contains the Azure go-sdk Client.
@@ -206,3 +208,35 @@ func (ac *AzureClient) DeleteAsync(ctx context.Context, spec azure.ResourceSpecG
 	// if the operation completed, return a nil poller.
 	return nil, err
 }
+
+// BeginUpdateInstances - Upgrades one or more virtual machines to the latest SKU set in the VM scale set model.
+// If the operation fails it returns an *azcore.ResponseError type.
+//
+// Parameters
+//   - spec - the ResourceSpecGetter providing the name and resource group of the virtual machine scale set.
+//   - vmInstanceIDs - a list of virtual machine instance IDs from the VM scale set.
+//   - resumeToken - token used to resume a previously started long-running operation; pass "" to start a new one.
+func (ac *AzureClient) BeginUpdateInstances(ctx context.Context, spec azure.ResourceSpecGetter, vmInstanceIDs armcompute.VirtualMachineScaleSetVMInstanceRequiredIDs, resumeToken string) (poller *runtime.Poller[armcompute.VirtualMachineScaleSetsClientUpdateInstancesResponse], err error) {
+	ctx, _, done := tele.StartSpanWithLogger(ctx, "scalesets.AzureClient.BeginUpdateInstances")
+	defer done()
+
+	opts := &armcompute.VirtualMachineScaleSetsClientBeginUpdateInstancesOptions{ResumeToken: resumeToken}
+	poller, err = ac.scalesets.BeginUpdateInstances(ctx, spec.ResourceGroupName(), spec.ResourceName(), vmInstanceIDs, opts)
+	if err != nil {
+		return nil, err
+	}
+
+	ctx, cancel := context.WithTimeout(ctx, ac.apiCallTimeout)
+	defer cancel()
+
+	pollOpts := &runtime.PollUntilDoneOptions{Frequency: async.DefaultPollerFrequency}
+	_, err = poller.PollUntilDone(ctx, pollOpts)
+	if err != nil {
+		// if an error occurs, return the Poller.
+		// this means the long-running operation didn't finish in the specified timeout.
+		return poller, err
+	}
+
+	// if the operation completed, return a nil poller.
+	return nil, err
+}
diff --git a/azure/services/scalesets/mock_scalesets/client_mock.go b/azure/services/scalesets/mock_scalesets/client_mock.go
index fdad562f8c9..83e4aee8954 100644
--- a/azure/services/scalesets/mock_scalesets/client_mock.go
+++ b/azure/services/scalesets/mock_scalesets/client_mock.go
@@ -58,6 +58,21 @@ func (m *MockClient) EXPECT() *MockClientMockRecorder {
 	return m.recorder
 }
 
+// BeginUpdateInstances mocks base method.
+func (m *MockClient) BeginUpdateInstances(ctx context.Context, spec azure.ResourceSpecGetter, vmInstanceIDs armcompute.VirtualMachineScaleSetVMInstanceRequiredIDs, resumeToken string) (*runtime.Poller[armcompute.VirtualMachineScaleSetsClientUpdateInstancesResponse], error) {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "BeginUpdateInstances", ctx, spec, vmInstanceIDs, resumeToken)
+	ret0, _ := ret[0].(*runtime.Poller[armcompute.VirtualMachineScaleSetsClientUpdateInstancesResponse])
+	ret1, _ := ret[1].(error)
+	return ret0, ret1
+}
+
+// BeginUpdateInstances indicates an expected call of BeginUpdateInstances.
+func (mr *MockClientMockRecorder) BeginUpdateInstances(ctx, spec, vmInstanceIDs, resumeToken any) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "BeginUpdateInstances", reflect.TypeOf((*MockClient)(nil).BeginUpdateInstances), ctx, spec, vmInstanceIDs, resumeToken)
+}
+
 // CreateOrUpdateAsync mocks base method.
 func (m *MockClient) CreateOrUpdateAsync(ctx context.Context, spec azure.ResourceSpecGetter, resumeToken string, parameters any) (any, *runtime.Poller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse], error) {
 	m.ctrl.T.Helper()
diff --git a/azure/services/scalesets/spec.go b/azure/services/scalesets/spec.go
index c014d7db5c2..1a249f3c3a8 100644
--- a/azure/services/scalesets/spec.go
+++ b/azure/services/scalesets/spec.go
@@ -92,6 +92,9 @@ func (s *ScaleSetSpec) OwnerResourceName() string {
 }
 
 func (s *ScaleSetSpec) existingParameters(ctx context.Context, existing interface{}) (parameters interface{}, err error) {
+	ctx, log, done := tele.StartSpanWithLogger(ctx, "scalesets.ScaleSetSpec.existingParameters")
+	defer done()
+
 	existingVMSS, ok := existing.(armcompute.VirtualMachineScaleSet)
 	if !ok {
 		return nil, errors.Errorf("%T is not an armcompute.VirtualMachineScaleSet", existing)
@@ -112,7 +115,7 @@ func (s *ScaleSetSpec) existingParameters(ctx context.Context, existing interfac
 	vmss.Properties.VirtualMachineProfile.NetworkProfile = nil
 	vmss.ID = existingVMSS.ID
 
-	hasModelChanges := hasModelModifyingDifferences(&existingInfraVMSS, vmss)
+	hasModelChanges := hasModelModifyingDifferences(ctx, &existingInfraVMSS, vmss)
 	isFlex := s.OrchestrationMode == infrav1.FlexibleOrchestrationMode
 	updated := true
 	if !isFlex {
@@ -131,6 +134,23 @@ func (s *ScaleSetSpec) existingParameters(ctx context.Context, existing interfac
 		return nil, nil
 	}
 
+	// if there are no model changes and no change in custom data, get rid of all properties to avoid unnecessary VMSS model
+	// updates.
+	if !hasModelChanges && !s.ShouldPatchCustomData {
+		log.V(4).Info("removing virtual machine profile to avoid an unnecessary VMSS model update")
+		vmss.Properties.VirtualMachineProfile = nil
+	} else {
+		log.V(4).Info("not removing virtual machine profile", "hasModelChanges", hasModelChanges, "shouldPatchCustomData", s.ShouldPatchCustomData)
+	}
+
+	log.V(4).Info("updating VMSS",
+		"name", s.Name,
+		"capacity", vmss.SKU.Capacity,
+		"existingCapacity", existingInfraVMSS.Capacity,
+		"hasModelChanges", hasModelChanges,
+		"shouldPatchCustomData", s.ShouldPatchCustomData,
+	)
+
 	return vmss, nil
 }
 
@@ -280,9 +300,9 @@ func (s *ScaleSetSpec) Parameters(ctx context.Context, existing interface{}) (pa
 	return vmss, nil
 }
 
-func hasModelModifyingDifferences(infraVMSS *azure.VMSS, vmss armcompute.VirtualMachineScaleSet) bool {
+func hasModelModifyingDifferences(ctx context.Context, infraVMSS *azure.VMSS, vmss armcompute.VirtualMachineScaleSet) bool {
 	other := converters.SDKToVMSS(vmss, []armcompute.VirtualMachineScaleSetVM{})
-	return infraVMSS.HasModelChanges(other)
+	return infraVMSS.HasModelChanges(ctx, other)
 }
 
 func (s *ScaleSetSpec) generateExtensions(ctx context.Context) ([]armcompute.VirtualMachineScaleSetExtension, error) {
diff --git a/azure/types.go b/azure/types.go
index 2e040edcc2b..5cb6e57aaca 100644
--- a/azure/types.go
+++ b/azure/types.go
@@ -17,9 +17,12 @@ limitations under the License.
 package azure
 
 import (
+	"context"
 	"reflect"
 	"strings"
 
+	"sigs.k8s.io/cluster-api-provider-azure/util/tele"
+
 	"github.com/google/go-cmp/cmp"
 	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
 	azureutil "sigs.k8s.io/cluster-api-provider-azure/util/azure"
@@ -104,6 +107,7 @@ type (
 		State              infrav1.ProvisioningState     `json:"vmState,omitempty"`
 		BootstrappingState infrav1.ProvisioningState     `json:"bootstrappingState,omitempty"`
 		OrchestrationMode  infrav1.OrchestrationModeType `json:"orchestrationMode,omitempty"`
+		LatestModelApplied bool                          `json:"latestModelApplied,omitempty"`
 	}
 
 	// VMSS defines a virtual machine scale set.
@@ -122,11 +126,24 @@ type (
 )
 
 // HasModelChanges returns true if the spec fields which will mutate the Azure VMSS model are different.
-func (vmss VMSS) HasModelChanges(other VMSS) bool {
-	equal := cmp.Equal(vmss.Image, other.Image) &&
-		cmp.Equal(vmss.Identity, other.Identity) &&
-		cmp.Equal(vmss.Zones, other.Zones) &&
-		cmp.Equal(vmss.Sku, other.Sku)
+func (vmss VMSS) HasModelChanges(ctx context.Context, other VMSS) bool {
+	_, log, done := tele.StartSpanWithLogger(ctx, "vmss.HasModelChanges")
+	defer done()
+
+	imgEqual := cmp.Equal(vmss.Image, other.Image)
+	identityEqual := cmp.Equal(vmss.Identity, other.Identity)
+	zonesEqual := cmp.Equal(vmss.Zones, other.Zones)
+	skuEqual := cmp.Equal(vmss.Sku, other.Sku)
+
+	equal := imgEqual &&
+		identityEqual &&
+		zonesEqual &&
+		skuEqual
+
+	if !equal {
+		log.Info("VMSS has changed", "vmssName", vmss.Name, "imgEqual", imgEqual, "identityEqual", identityEqual, "zonesEqual", zonesEqual, "skuEqual", skuEqual)
+	}
+
 	return !equal
 }
diff --git a/azure/types_test.go b/azure/types_test.go
index f84b79761b6..4f425fcf677 100644
--- a/azure/types_test.go
+++ b/azure/types_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package azure
 
 import (
+	"context"
 	"testing"
 
 	. "github.com/onsi/gomega"
@@ -131,7 +132,7 @@ func TestVMSS_HasModelChanges(t *testing.T) {
 		t.Run(c.Name, func(t *testing.T) {
 			l, r := c.Factory()
 			g := NewWithT(t)
-			g.Expect(l.HasModelChanges(r)).To(Equal(c.HasModelChanges))
+			g.Expect(l.HasModelChanges(context.Background(), r)).To(Equal(c.HasModelChanges))
 		})
 	}
 }
diff --git a/exp/controllers/azuremachinepoolmachine_controller.go b/exp/controllers/azuremachinepoolmachine_controller.go
index 4401bad6e6b..8303fe1c6f6 100644
--- a/exp/controllers/azuremachinepoolmachine_controller.go
+++ b/exp/controllers/azuremachinepoolmachine_controller.go
@@ -294,8 +294,17 @@ func (ampmr *AzureMachinePoolMachineController) reconcileNormal(ctx context.Cont
 		ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeWarning, "FailedVMState", "Azure scale set VM is in failed state")
 		machineScope.SetFailureReason(capierrors.UpdateMachineError)
 		machineScope.SetFailureMessage(errors.Errorf("Azure VM state is %s", state))
+	case infrav1.Succeeded:
+		ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeNormal, "ProvisioningSucceeded", "Azure scale set VM is in succeeded state")
+		// clear failure reason/message in case it was there
+		// TODO(mw): not sure if this works, we also need to probably clear the failure reason/message in another way.
+		if machineScope.AzureMachinePoolMachine.Status.FailureReason != nil {
+			machineScope.SetFailureReason(capierrors.MachineStatusError(""))
+			machineScope.SetFailureMessage(errors.New(""))
+		}
 	case infrav1.Deleting:
-		log.V(4).Info("deleting machine because state is Deleting", "machine", machineScope.Name())
+		ampmr.Recorder.Eventf(machineScope.AzureMachinePoolMachine, corev1.EventTypeNormal, "Deleting", "Azure scale set VM is in deleting state")
+		log.V(2).Info("deleting machine because state is Deleting", "machine", machineScope.Name())
 		if err := ampmr.Client.Delete(ctx, machineScope.Machine); err != nil {
 			return reconcile.Result{}, errors.Wrap(err, "machine failed to be deleted when deleting")
 		}
@@ -329,7 +338,6 @@ func (ampmr *AzureMachinePoolMachineController) reconcileDelete(ctx context.Cont
 		controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
 		return reconcile.Result{}, nil
 	}
-
 	if !machineScope.AzureMachinePool.ObjectMeta.DeletionTimestamp.IsZero() {
 		log.Info("Skipping VMSS VM deletion as VMSS delete will delete individual instances")
 
@@ -337,6 +345,12 @@ func (ampmr *AzureMachinePoolMachineController) reconcileDelete(ctx context.Cont
 		return reconcile.Result{}, nil
 	}
 
+	if machineScope.ProvisioningState() == infrav1.Deleting || machineScope.ProvisioningState() == infrav1.Deleted {
+		log.V(2).Info(fmt.Sprintf("Skipping VMSS VM deletion as VMSS VM is already %s", machineScope.ProvisioningState()))
+		controllerutil.RemoveFinalizer(machineScope.AzureMachinePoolMachine, infrav1exp.AzureMachinePoolMachineFinalizer)
+		return reconcile.Result{}, nil
+	}
+
 	log.Info("Deleting AzureMachinePoolMachine")
 
 	// deleting a single machine