diff --git a/api/v1beta1/types_class.go b/api/v1beta1/types_class.go index 452eab517f8..d6b853dba20 100644 --- a/api/v1beta1/types_class.go +++ b/api/v1beta1/types_class.go @@ -48,6 +48,7 @@ type AzureClusterClassSpec struct { // - GermanCloud: "AzureGermanCloud" // - PublicCloud: "AzurePublicCloud" // - USGovernmentCloud: "AzureUSGovernmentCloud" + // - StackCloud: "AzureStackCloud" // // Note that values other than the default must also be accompanied by corresponding changes to the // aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. ASO currently does @@ -186,6 +187,7 @@ type AzureManagedControlPlaneClassSpec struct { // - PublicCloud: "AzurePublicCloud" // - USGovernmentCloud: "AzureUSGovernmentCloud" // + // // Note that values other than the default must also be accompanied by corresponding changes to the // aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. ASO currently does // not support referring to multiple different clouds in a single installation. The following fields must diff --git a/azure/defaults.go b/azure/defaults.go index 02e5508fa5c..f17ae2d4394 100644 --- a/azure/defaults.go +++ b/azure/defaults.go @@ -27,6 +27,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" "github.com/Azure/azure-sdk-for-go/sdk/tracing/azotel" + azureautorest "github.com/Azure/go-autorest/autorest/azure" "go.opentelemetry.io/otel" "sigs.k8s.io/cluster-api-provider-azure/util/tele" @@ -44,6 +45,8 @@ const ( ChinaCloudName = "AzureChinaCloud" // USGovernmentCloudName is the name of the Azure US Government cloud. USGovernmentCloudName = "AzureUSGovernmentCloud" + // StackCloudName is the name for Azure Stack hybrid cloud environments. 
+ StackCloudName = "AzureStackCloud" ) const ( @@ -109,6 +112,16 @@ const ( CustomHeaderPrefix = "infrastructure.cluster.x-k8s.io/custom-header-" ) +const ( + // StackAPIVersionProfile is the API version profile to set for ARM clients. See: + // https://learn.microsoft.com/en-us/azure-stack/user/azure-stack-profiles-azure-resource-manager-versions?view=azs-2408#overview-of-the-2020-09-01-hybrid-profile + StackAPIVersionProfile = "2020-06-01" + + // StackDiskAPIVersionProfile is the API Version to set for the disk client. + // API Version Profile "2020-06-01" is not supported for disks. + StackDiskAPIVersionProfile = "2018-06-01" +) + var ( // LinuxBootstrapExtensionCommand is the command the VM bootstrap extension will execute to verify Linux nodes bootstrap completes successfully. LinuxBootstrapExtensionCommand = fmt.Sprintf("for i in $(seq 1 %d); do test -f %s && break; if [ $i -eq %d ]; then echo 'Error joining node to cluster: kubeadm init or join failed. To debug, check the cloud-init, kubelet, or other bootstrap logs: https://capz.sigs.k8s.io/self-managed/troubleshooting.html#checking-cloud-init-logs-ubuntu'; exit 1; else sleep %d; fi; done", bootstrapExtensionRetries, bootstrapSentinelFile, bootstrapExtensionRetries, bootstrapExtensionSleep) @@ -367,6 +380,21 @@ func ARMClientOptions(azureEnvironment string, extraPolicies ...policy.Policy) ( opts.Cloud = cloud.AzureChina case USGovernmentCloudName: opts.Cloud = cloud.AzureGovernment + case StackCloudName: + cloudEnv, err := azureautorest.EnvironmentFromName(azureEnvironment) + if err != nil { + return nil, fmt.Errorf("unable to get Azure Stack cloud environment: %w", err) + } + opts.APIVersion = StackAPIVersionProfile + opts.Cloud = cloud.Configuration{ + ActiveDirectoryAuthorityHost: cloudEnv.ActiveDirectoryEndpoint, + Services: map[cloud.ServiceName]cloud.ServiceConfiguration{ + cloud.ResourceManager: { + Audience: cloudEnv.TokenAudience, + Endpoint: cloudEnv.ResourceManagerEndpoint, + }, + }, + } case "": 
// No cloud name provided, so leave at defaults. default: diff --git a/azure/errors.go b/azure/errors.go index 0d719e80037..8e77269acd4 100644 --- a/azure/errors.go +++ b/azure/errors.go @@ -34,6 +34,12 @@ func ResourceNotFound(err error) bool { return errors.As(err, &rerr) && rerr.StatusCode == http.StatusNotFound } +// BadRequest parses an error to check if its status code is Bad Request (400). +func BadRequest(err error) bool { + var rerr *azcore.ResponseError + return errors.As(err, &rerr) && rerr.StatusCode == http.StatusBadRequest +} + // VMDeletedError is returned when a virtual machine is deleted outside of capz. type VMDeletedError struct { ProviderID string diff --git a/azure/scope/cluster.go b/azure/scope/cluster.go index f7f035f2e7b..3a530d7722a 100644 --- a/azure/scope/cluster.go +++ b/azure/scope/cluster.go @@ -559,7 +559,7 @@ func (s *ClusterScope) VNetSpec() azure.ASOResourceSpecGetter[*asonetworkv1api20 // PrivateDNSSpec returns the private dns zone spec. func (s *ClusterScope) PrivateDNSSpec() (zoneSpec azure.ResourceSpecGetter, linkSpec, recordSpec []azure.ResourceSpecGetter) { - if s.IsAPIServerPrivate() { + if s.IsAPIServerPrivate() && !s.IsAzureStack() { resourceGroup := s.ResourceGroup() if s.AzureCluster.Spec.NetworkSpec.PrivateDNSZoneResourceGroup != "" { resourceGroup = s.AzureCluster.Spec.NetworkSpec.PrivateDNSZoneResourceGroup @@ -1251,3 +1251,8 @@ func (s *ClusterScope) getLastAppliedSecurityRules(nsgName string) map[string]in } return lastAppliedSecurityRules } + +// IsAzureStack returns true if the cluster is running on Azure Stack. 
+func (s *ClusterScope) IsAzureStack() bool { + return strings.EqualFold(s.Environment.Name, azure.StackCloudName) +} diff --git a/azure/scope/machine.go b/azure/scope/machine.go index 67635826d8b..f7f674b4589 100644 --- a/azure/scope/machine.go +++ b/azure/scope/machine.go @@ -150,7 +150,8 @@ func (m *MachineScope) InitMachineCache(ctx context.Context) error { } m.cache.availabilitySetSKU, err = skuCache.Get(ctx, string(armcompute.AvailabilitySetSKUTypesAligned), resourceskus.AvailabilitySets) - if err != nil { + // Resource SKU API for availability sets may not be available in Azure Stack environments. + if err != nil && !strings.EqualFold(m.CloudEnvironment(), azure.StackCloudName) { return errors.Wrapf(err, "failed to get availability set SKU %s in compute api", string(armcompute.AvailabilitySetSKUTypesAligned)) } } @@ -497,12 +498,13 @@ func (m *MachineScope) AvailabilitySetSpec() azure.ResourceSpecGetter { } spec := &availabilitysets.AvailabilitySetSpec{ - Name: availabilitySetName, - ResourceGroup: m.NodeResourceGroup(), - ClusterName: m.ClusterName(), - Location: m.Location(), - SKU: nil, - AdditionalTags: m.AdditionalTags(), + Name: availabilitySetName, + ResourceGroup: m.NodeResourceGroup(), + ClusterName: m.ClusterName(), + Location: m.Location(), + CloudEnvironment: m.CloudEnvironment(), + SKU: nil, + AdditionalTags: m.AdditionalTags(), } if m.cache != nil { diff --git a/azure/services/availabilitysets/spec.go b/azure/services/availabilitysets/spec.go index ea522da07ee..4d411edf184 100644 --- a/azure/services/availabilitysets/spec.go +++ b/azure/services/availabilitysets/spec.go @@ -19,24 +19,27 @@ package availabilitysets import ( "context" "strconv" + "strings" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" "github.com/pkg/errors" "k8s.io/utils/ptr" infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-azure/azure" "sigs.k8s.io/cluster-api-provider-azure/azure/converters" 
"sigs.k8s.io/cluster-api-provider-azure/azure/services/resourceskus" ) // AvailabilitySetSpec defines the specification for an availability set. type AvailabilitySetSpec struct { - Name string - ResourceGroup string - ClusterName string - Location string - SKU *resourceskus.SKU - AdditionalTags infrav1.Tags + Name string + ResourceGroup string + ClusterName string + Location string + CloudEnvironment string + SKU *resourceskus.SKU + AdditionalTags infrav1.Tags } // ResourceName returns the name of the availability set. @@ -64,20 +67,10 @@ func (s *AvailabilitySetSpec) Parameters(_ context.Context, existing interface{} return nil, nil } - if s.SKU == nil { - return nil, errors.New("unable to get required availability set SKU from machine cache") - } - - var faultDomainCount *int32 - faultDomainCountStr, ok := s.SKU.GetCapability(resourceskus.MaximumPlatformFaultDomainCount) - if !ok { - return nil, errors.Errorf("unable to get required availability set SKU capability %s", resourceskus.MaximumPlatformFaultDomainCount) - } - count, err := strconv.ParseInt(faultDomainCountStr, 10, 32) + faultDomainCount, err := getFaultDomainCount(s.SKU, s.CloudEnvironment) if err != nil { - return nil, errors.Wrapf(err, "unable to parse availability set fault domain count") + return nil, err } - faultDomainCount = ptr.To[int32](int32(count)) asParams := armcompute.AvailabilitySet{ SKU: &armcompute.SKU{ @@ -98,3 +91,27 @@ func (s *AvailabilitySetSpec) Parameters(_ context.Context, existing interface{} return asParams, nil } + +func getFaultDomainCount(sku *resourceskus.SKU, cloudEnvironment string) (*int32, error) { + // Azure Stack environments may not implement the resource SKU API + // for availability sets. Use a default value instead. 
+ if strings.EqualFold(cloudEnvironment, azure.StackCloudName) { + return ptr.To(int32(2)), nil + } + + if sku == nil { + return nil, errors.New("unable to get required availability set SKU from machine cache") + } + + var faultDomainCount *int32 + faultDomainCountStr, ok := sku.GetCapability(resourceskus.MaximumPlatformFaultDomainCount) + if !ok { + return nil, errors.Errorf("unable to get required availability set SKU capability %s", resourceskus.MaximumPlatformFaultDomainCount) + } + count, err := strconv.ParseInt(faultDomainCountStr, 10, 32) + if err != nil { + return nil, errors.Wrapf(err, "unable to parse availability set fault domain count") + } + faultDomainCount = ptr.To(int32(count)) + return faultDomainCount, nil +} diff --git a/azure/services/disks/client.go b/azure/services/disks/client.go index 58cdb4345fc..edfa947581c 100644 --- a/azure/services/disks/client.go +++ b/azure/services/disks/client.go @@ -18,6 +18,7 @@ package disks import ( "context" + "strings" "time" "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" @@ -38,6 +39,9 @@ type azureClient struct { // newClient creates a new disks client from an authorizer. 
func newClient(auth azure.Authorizer, apiCallTimeout time.Duration) (*azureClient, error) { opts, err := azure.ARMClientOptions(auth.CloudEnvironment()) if err != nil { return nil, errors.Wrap(err, "failed to create disks client options") } + if strings.EqualFold(auth.CloudEnvironment(), azure.StackCloudName) { + opts.APIVersion = azure.StackDiskAPIVersionProfile + } diff --git a/azure/services/publicips/publicips.go b/azure/services/publicips/publicips.go index 0854f89f583..be7e456a7ad 100644 --- a/azure/services/publicips/publicips.go +++ b/azure/services/publicips/publicips.go @@ -18,6 +18,7 @@ package publicips import ( "context" + "strings" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4" "github.com/pkg/errors" @@ -151,6 +152,12 @@ func (s *Service) Delete(ctx context.Context) error { // isIPManaged returns true if the IP has an owned tag with the cluster name as value, // meaning that the IP's lifecycle is managed. func (s *Service) isIPManaged(ctx context.Context, spec azure.ResourceSpecGetter) (bool, error) { + if strings.EqualFold(s.Scope.CloudEnvironment(), azure.StackCloudName) { + // Azure Stack does not yet support getting tags with scope, + // so assume IPs are managed. + return true, nil + } + scope := azure.PublicIPID(s.Scope.SubscriptionID(), spec.ResourceGroupName(), spec.ResourceName()) result, err := s.TagsGetter.GetAtScope(ctx, scope) if err != nil { diff --git a/azure/services/virtualmachines/client.go b/azure/services/virtualmachines/client.go index 1e2bbea08d4..119f546b33f 100644 --- a/azure/services/virtualmachines/client.go +++ b/azure/services/virtualmachines/client.go @@ -109,14 +109,21 @@ func (ac *AzureClient) CreateOrUpdateAsync(ctx context.Context, spec azure.Resou // request to Azure and if accepted without error, the func will return a Poller which can be used to track the ongoing // progress of the operation. 
func (ac *AzureClient) DeleteAsync(ctx context.Context, spec azure.ResourceSpecGetter, resumeToken string) (poller *runtime.Poller[armcompute.VirtualMachinesClientDeleteResponse], err error) { - ctx, _, done := tele.StartSpanWithLogger(ctx, "virtualmachines.AzureClient.Delete") + ctx, log, done := tele.StartSpanWithLogger(ctx, "virtualmachines.AzureClient.Delete") defer done() forceDelete := ptr.To(true) opts := &armcompute.VirtualMachinesClientBeginDeleteOptions{ResumeToken: resumeToken, ForceDeletion: forceDelete} poller, err = ac.virtualmachines.BeginDelete(ctx, spec.ResourceGroupName(), spec.ResourceName(), opts) if err != nil { - return nil, err + if azure.BadRequest(err) { + log.Info("Failed to Begin VM Delete with Force Deletion, retrying without the force flag") + opts.ForceDeletion = ptr.To(false) + poller, err = ac.virtualmachines.BeginDelete(ctx, spec.ResourceGroupName(), spec.ResourceName(), opts) + } + if err != nil { + return nil, err + } } ctx, cancel := context.WithTimeout(ctx, ac.apiCallTimeout) diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml index 9c7cdabe38c..27db4c7950f 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml @@ -92,6 +92,7 @@ spec: - GermanCloud: "AzureGermanCloud" - PublicCloud: "AzurePublicCloud" - USGovernmentCloud: "AzureUSGovernmentCloud" + - StackCloud: "AzureStackCloud" Note that values other than the default must also be accompanied by corresponding changes to the aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. 
ASO currently does diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml index 2b5acaaec48..aa2614d0a72 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclustertemplates.yaml @@ -65,6 +65,7 @@ spec: - GermanCloud: "AzureGermanCloud" - PublicCloud: "AzurePublicCloud" - USGovernmentCloud: "AzureUSGovernmentCloud" + - StackCloud: "AzureStackCloud" Note that values other than the default must also be accompanied by corresponding changes to the aso-controller-settings Secret to configure ASO to refer to the non-Public cloud. ASO currently does diff --git a/controllers/azuremachine_reconciler.go b/controllers/azuremachine_reconciler.go index 544ccc02694..294f45922e3 100644 --- a/controllers/azuremachine_reconciler.go +++ b/controllers/azuremachine_reconciler.go @@ -18,6 +18,7 @@ package controllers import ( "context" + "strings" "github.com/pkg/errors" @@ -101,10 +102,19 @@ func newAzureMachineService(machineScope *scope.MachineScope) (*azureMachineServ virtualmachinesSvc, roleAssignmentsSvc, vmextensionsSvc, - tagsSvc, }, skuCache: cache, } + + // The tags service fails in Azure Stack because the current SDK implementation + // will throw an error when trying to get tags at scope on Azure Stack environments. + // This means tags can only be provided on Azure Stack machines at creation time + // and will not be reconciled day-2. Once the get-tags-at-scope SDK issue is + // addressed, this change can be reverted to add tagsSvc in all environments. + if !strings.EqualFold(machineScope.CloudEnvironment(), azure.StackCloudName) { + ams.services = append(ams.services, tagsSvc) + } + ams.Reconcile = ams.reconcile ams.Pause = ams.pause ams.Delete = ams.delete