From 927833cda05d4bf1ae99799dfc625e639c131030 Mon Sep 17 00:00:00 2001 From: Gong Zhang Date: Fri, 26 Sep 2025 14:23:02 +0800 Subject: [PATCH 1/2] Bump VMOP and add NodeAutoPlacement Feature Gate - Bump VMOP including Node AF/AAF support - Add NodeAutoPlacement Feature Gate (cherry picked from commit 700c8aee46af17b6a2eae9bd4722300104c96629) --- config/manager/manager.yaml | 2 +- feature/feature.go | 6 ++++++ go.mod | 4 ++-- go.sum | 4 ++-- test/go.mod | 4 ++-- test/go.sum | 4 ++-- 6 files changed, 15 insertions(+), 9 deletions(-) diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 401dd765e5..102217c078 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -21,7 +21,7 @@ spec: - "--diagnostics-address=${CAPI_DIAGNOSTICS_ADDRESS:=:8443}" - "--insecure-diagnostics=${CAPI_INSECURE_DIAGNOSTICS:=false}" - --v=4 - - "--feature-gates=MultiNetworks=${EXP_MULTI_NETWORKS:=false},NodeAntiAffinity=${EXP_NODE_ANTI_AFFINITY:=false},NamespaceScopedZones=${EXP_NAMESPACE_SCOPED_ZONES:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false}" + - "--feature-gates=MultiNetworks=${EXP_MULTI_NETWORKS:=false},NodeAntiAffinity=${EXP_NODE_ANTI_AFFINITY:=false},NamespaceScopedZones=${EXP_NAMESPACE_SCOPED_ZONES:=false},NodeAutoPlacement=${EXP_NODE_AUTO_PLACEMENT:=false},PriorityQueue=${EXP_PRIORITY_QUEUE:=false}" image: controller:latest imagePullPolicy: IfNotPresent name: manager diff --git a/feature/feature.go b/feature/feature.go index a233d351c7..1799aaeb68 100644 --- a/feature/feature.go +++ b/feature/feature.go @@ -44,6 +44,11 @@ const ( // alpha: v1.11 NamespaceScopedZones featuregate.Feature = "NamespaceScopedZones" + // NodeAutoPlacement is a feature gate for the NodeAutoPlacement functionality for supervisor. 
+ // + // alpha: v1.15 + NodeAutoPlacement featuregate.Feature = "NodeAutoPlacement" + // PriorityQueue is a feature gate that controls if the controller uses the controller-runtime PriorityQueue // instead of the default queue implementation. // @@ -61,6 +66,7 @@ var defaultCAPVFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ // Every feature should be initiated here: NodeAntiAffinity: {Default: false, PreRelease: featuregate.Alpha}, NamespaceScopedZones: {Default: false, PreRelease: featuregate.Alpha}, + NodeAutoPlacement: {Default: false, PreRelease: featuregate.Alpha}, PriorityQueue: {Default: false, PreRelease: featuregate.Alpha}, MultiNetworks: {Default: false, PreRelease: featuregate.Alpha}, } diff --git a/go.mod b/go.mod index d8a4b971c4..800024e12e 100644 --- a/go.mod +++ b/go.mod @@ -4,13 +4,13 @@ go 1.24.0 replace sigs.k8s.io/cluster-api => sigs.k8s.io/cluster-api v1.11.1 -replace github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels => github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels v0.0.0-20240404200847-de75746a9505 +replace github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels => github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels v1.9.1-0.20250908141901-a9e1dfbc0045 require ( github.com/vmware-tanzu/net-operator-api v0.0.0-20240326163340-1f32d6bf7f9d github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20241112044858-9da8637c1b0d // The version of vm-operator should be kept in sync with the manifests at: config/deployments/integration-tests - github.com/vmware-tanzu/vm-operator/api v1.8.6 + github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250908141901-a9e1dfbc0045 github.com/vmware-tanzu/vm-operator/external/ncp v0.0.0-20240404200847-de75746a9505 github.com/vmware/govmomi v0.51.0 ) diff --git a/go.sum b/go.sum index 39b1d7d876..c207a0d683 100644 --- a/go.sum +++ b/go.sum @@ -221,8 +221,8 @@ github.com/vmware-tanzu/net-operator-api v0.0.0-20240326163340-1f32d6bf7f9d h1:c 
github.com/vmware-tanzu/net-operator-api v0.0.0-20240326163340-1f32d6bf7f9d/go.mod h1:JbFOh22iDsT5BowJe0GgpMI5e2/S7cWaJlv9LdURVQM= github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20241112044858-9da8637c1b0d h1:z9lrzKVtNlujduv9BilzPxuge/LE2F0N1ms3TP4JZvw= github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20241112044858-9da8637c1b0d/go.mod h1:Q4JzNkNMvjo7pXtlB5/R3oME4Nhah7fAObWgghVmtxk= -github.com/vmware-tanzu/vm-operator/api v1.8.6 h1:NIndORjcnSmIlQsCMIewpIwg/ocRVDh2lYjOroTVLrU= -github.com/vmware-tanzu/vm-operator/api v1.8.6/go.mod h1:HHA2SNI9B5Yqtyp5t+Gt9WTWBi/fIkM6+MukDDSf11A= +github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250908141901-a9e1dfbc0045 h1:zME8crazIAWVJGboJpSLl+qcRYQ8yA6hPQojz28gY5M= +github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250908141901-a9e1dfbc0045/go.mod h1:hkc/QZCSHcosWWMPS6VWWR12WenZcNE3BaTJ/8A8sNE= github.com/vmware-tanzu/vm-operator/external/ncp v0.0.0-20240404200847-de75746a9505 h1:y4wXx1FUFqqSgJ/xUOEM1DLS2Uu0KaeLADWpzpioGTU= github.com/vmware-tanzu/vm-operator/external/ncp v0.0.0-20240404200847-de75746a9505/go.mod h1:5rqRJ9zGR+KnKbkGx373WgN8xJpvAj99kHnfoDYRO5I= github.com/vmware/govmomi v0.51.0 h1:n3RLS9aw/irTOKbiIyJzAb6rOat4YOVv/uDoRsNTSQI= diff --git a/test/go.mod b/test/go.mod index bdf2f03505..066205da23 100644 --- a/test/go.mod +++ b/test/go.mod @@ -8,12 +8,12 @@ replace sigs.k8s.io/cluster-api/test => sigs.k8s.io/cluster-api/test v1.11.1 replace sigs.k8s.io/cluster-api-provider-vsphere => ../ -replace github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels => github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels v0.0.0-20240404200847-de75746a9505 +replace github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels => github.com/vmware-tanzu/vm-operator/pkg/constants/testlabels v1.9.1-0.20250908141901-a9e1dfbc0045 require ( github.com/vmware-tanzu/net-operator-api v0.0.0-20240326163340-1f32d6bf7f9d // The version of vm-operator should be kept in sync with the manifests at: 
config/deployments/integration-tests - github.com/vmware-tanzu/vm-operator/api v1.8.6 + github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250908141901-a9e1dfbc0045 github.com/vmware/govmomi v0.51.0 ) diff --git a/test/go.sum b/test/go.sum index 78fe7f6a5a..2959ebfbd8 100644 --- a/test/go.sum +++ b/test/go.sum @@ -338,8 +338,8 @@ github.com/vmware-tanzu/net-operator-api v0.0.0-20240326163340-1f32d6bf7f9d h1:c github.com/vmware-tanzu/net-operator-api v0.0.0-20240326163340-1f32d6bf7f9d/go.mod h1:JbFOh22iDsT5BowJe0GgpMI5e2/S7cWaJlv9LdURVQM= github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20241112044858-9da8637c1b0d h1:z9lrzKVtNlujduv9BilzPxuge/LE2F0N1ms3TP4JZvw= github.com/vmware-tanzu/nsx-operator/pkg/apis v0.0.0-20241112044858-9da8637c1b0d/go.mod h1:Q4JzNkNMvjo7pXtlB5/R3oME4Nhah7fAObWgghVmtxk= -github.com/vmware-tanzu/vm-operator/api v1.8.6 h1:NIndORjcnSmIlQsCMIewpIwg/ocRVDh2lYjOroTVLrU= -github.com/vmware-tanzu/vm-operator/api v1.8.6/go.mod h1:HHA2SNI9B5Yqtyp5t+Gt9WTWBi/fIkM6+MukDDSf11A= +github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250908141901-a9e1dfbc0045 h1:zME8crazIAWVJGboJpSLl+qcRYQ8yA6hPQojz28gY5M= +github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250908141901-a9e1dfbc0045/go.mod h1:hkc/QZCSHcosWWMPS6VWWR12WenZcNE3BaTJ/8A8sNE= github.com/vmware-tanzu/vm-operator/external/ncp v0.0.0-20240404200847-de75746a9505 h1:y4wXx1FUFqqSgJ/xUOEM1DLS2Uu0KaeLADWpzpioGTU= github.com/vmware-tanzu/vm-operator/external/ncp v0.0.0-20240404200847-de75746a9505/go.mod h1:5rqRJ9zGR+KnKbkGx373WgN8xJpvAj99kHnfoDYRO5I= github.com/vmware/govmomi v0.51.0 h1:n3RLS9aw/irTOKbiIyJzAb6rOat4YOVv/uDoRsNTSQI= From 597af80bc4b538d2fb1d36bd16806a8b6da163ad Mon Sep 17 00:00:00 2001 From: Sagar Muchhal Date: Mon, 29 Sep 2025 15:24:17 -0700 Subject: [PATCH 2/2] Initial impl for VSphereMachine AAF changes Signed-off-by: Sagar Muchhal --- pkg/services/vmoperator/constants.go | 2 + pkg/services/vmoperator/vmopmachine.go | 112 +++++++++++++++++++++++-- 2 files changed, 108 insertions(+), 6 
deletions(-) diff --git a/pkg/services/vmoperator/constants.go b/pkg/services/vmoperator/constants.go index 011082a06c..2be4fc55a6 100644 --- a/pkg/services/vmoperator/constants.go +++ b/pkg/services/vmoperator/constants.go @@ -19,6 +19,8 @@ package vmoperator const ( kubeTopologyZoneLabelKey = "topology.kubernetes.io/zone" + kubeHostNameLabelKey = "kubernetes.io/hostname" + nodePoolLabelKey = "node-pool" // ControlPlaneVMClusterModuleGroupName is the name used for the control plane Cluster Module. ControlPlaneVMClusterModuleGroupName = "control-plane-group" diff --git a/pkg/services/vmoperator/vmopmachine.go b/pkg/services/vmoperator/vmopmachine.go index 840b166406..763f8e787d 100644 --- a/pkg/services/vmoperator/vmopmachine.go +++ b/pkg/services/vmoperator/vmopmachine.go @@ -41,6 +41,7 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/v1beta1" vmwarev1 "sigs.k8s.io/cluster-api-provider-vsphere/apis/vmware/v1beta1" + "sigs.k8s.io/cluster-api-provider-vsphere/feature" capvcontext "sigs.k8s.io/cluster-api-provider-vsphere/pkg/context" "sigs.k8s.io/cluster-api-provider-vsphere/pkg/context/vmware" infrautilv1 "sigs.k8s.io/cluster-api-provider-vsphere/pkg/util" @@ -171,10 +172,6 @@ func (v *VmopMachineService) ReconcileNormal(ctx context.Context, machineCtx cap return false, errors.New("received unexpected SupervisorMachineContext type") } - if supervisorMachineCtx.Machine.Spec.FailureDomain != "" { - supervisorMachineCtx.VSphereMachine.Spec.FailureDomain = ptr.To(supervisorMachineCtx.Machine.Spec.FailureDomain) - } - // If debug logging is enabled, report the number of vms in the cluster before and after the reconcile if log.V(5).Enabled() { vms, err := v.getVirtualMachinesInCluster(ctx, supervisorMachineCtx) @@ -188,6 +185,96 @@ func (v *VmopMachineService) ReconcileNormal(ctx context.Context, machineCtx cap // Set the VM state. 
Will get reset throughout the reconcile supervisorMachineCtx.VSphereMachine.Status.VMStatus = vmwarev1.VirtualMachineStatePending + // TODO: add check for control plane machine + var vmAffinitySpec *vmoprv1.VirtualMachineAffinitySpec + if feature.Gates.Enabled(feature.NodeAutoPlacement) && + supervisorMachineCtx.Machine.Spec.FailureDomain == "" && + len(supervisorMachineCtx.VSphereCluster.Status.FailureDomains) > 1 { + // Check for the presence of a VirtualMachineGroup with the same name and namespace as the Cluster + vmOperatorVMGroup := &vmoprv1.VirtualMachineGroup{} + key := client.ObjectKey{ + Namespace: supervisorMachineCtx.Cluster.Namespace, + Name: supervisorMachineCtx.Cluster.Name, + } + err := v.Client.Get(ctx, key, vmOperatorVMGroup) + if err != nil { + if !apierrors.IsNotFound(err) { + return false, err + } + if apierrors.IsNotFound(err) { + log.V(4).Info("VirtualMachineGroup not found, requeueing") + return true, nil + } + } + + // Check the VirtualMachineGroup for a zone label keyed by "capv/" plus the MachineDeployment name + nodePool := supervisorMachineCtx.Machine.Labels[clusterv1.MachineDeploymentNameLabel] + if zone, ok := vmOperatorVMGroup.Labels[fmt.Sprintf("capv/%s", nodePool)]; ok && zone != "" { + supervisorMachineCtx.VSphereMachine.Spec.FailureDomain = ptr.To(zone) + } + + // Fetch the MachineDeployment objects for the Cluster and generate the list of names + // to define the anti-affinity for the VM object. 
+ mdList := &clusterv1.MachineDeploymentList{} + if err := v.Client.List(ctx, mdList, + client.InNamespace(supervisorMachineCtx.Cluster.Namespace), + client.MatchingLabels{ + clusterv1.ClusterNameLabel: supervisorMachineCtx.Cluster.Name, + }); err != nil { + return false, err + } + + antiAffineMDNames := []string{} + for _, md := range mdList.Items { + if md.Spec.Template.Spec.FailureDomain == "" && md.Name != nodePool { + antiAffineMDNames = append(antiAffineMDNames, md.Name) + } + } + + vmAffinitySpec = &vmoprv1.VirtualMachineAffinitySpec{ + VMAffinity: &vmoprv1.VirtualMachineAffinityVMAffinitySpec{ + RequiredDuringSchedulingIgnoredDuringExecution: []vmoprv1.VMAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + nodePoolLabelKey: nodePool, + }, + }, + TopologyKey: kubeTopologyZoneLabelKey, + }, + }, + }, + VMAntiAffinity: &vmoprv1.VirtualMachineAntiAffinityVMAffinitySpec{ + PreferredDuringSchedulingIgnoredDuringExecution: []vmoprv1.VMAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + nodePoolLabelKey: nodePool, + }, + }, + TopologyKey: kubeHostNameLabelKey, + }, + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: nodePoolLabelKey, + Operator: metav1.LabelSelectorOpIn, + Values: antiAffineMDNames, + }, + }, + }, + TopologyKey: kubeTopologyZoneLabelKey, + }, + }, + }, + } + } + + if supervisorMachineCtx.Machine.Spec.FailureDomain != "" { + supervisorMachineCtx.VSphereMachine.Spec.FailureDomain = ptr.To(supervisorMachineCtx.Machine.Spec.FailureDomain) + } + // Check for the presence of an existing object vmOperatorVM := &vmoprv1.VirtualMachine{} key, err := virtualMachineObjectKey(supervisorMachineCtx.Machine.Name, supervisorMachineCtx.Machine.Namespace, supervisorMachineCtx.VSphereMachine.Spec.NamingStrategy) @@ -208,7 +295,7 @@ func (v *VmopMachineService) ReconcileNormal(ctx context.Context, machineCtx cap } // Reconcile 
the VM Operator VirtualMachine. - if err := v.reconcileVMOperatorVM(ctx, supervisorMachineCtx, vmOperatorVM); err != nil { + if err := v.reconcileVMOperatorVM(ctx, supervisorMachineCtx, vmOperatorVM, vmAffinitySpec); err != nil { v1beta1conditions.MarkFalse(supervisorMachineCtx.VSphereMachine, infrav1.VMProvisionedCondition, vmwarev1.VMCreationFailedReason, clusterv1beta1.ConditionSeverityWarning, "failed to create or update VirtualMachine: %v", err) v1beta2conditions.Set(supervisorMachineCtx.VSphereMachine, metav1.Condition{ @@ -378,7 +465,8 @@ func (v *VmopMachineService) GetHostInfo(ctx context.Context, machineCtx capvcon return vmOperatorVM.Status.Host, nil } -func (v *VmopMachineService) reconcileVMOperatorVM(ctx context.Context, supervisorMachineCtx *vmware.SupervisorMachineContext, vmOperatorVM *vmoprv1.VirtualMachine) error { +// reconcileVMOperatorVM reconciles the VM Operator VirtualMachine; a non-nil vmAffinitySpec is applied only when the VM does not already have Spec.Affinity set. +func (v *VmopMachineService) reconcileVMOperatorVM(ctx context.Context, supervisorMachineCtx *vmware.SupervisorMachineContext, vmOperatorVM *vmoprv1.VirtualMachine, vmAffinitySpec *vmoprv1.VirtualMachineAffinitySpec) error { // All Machine resources should define the version of Kubernetes to use. if supervisorMachineCtx.Machine.Spec.Version == "" { return errors.Errorf( @@ -494,6 +582,15 @@ func (v *VmopMachineService) reconcileVMOperatorVM(ctx context.Context, supervis vmOperatorVM = typedModified } + if vmAffinitySpec != nil { + if vmOperatorVM.Spec.Affinity == nil { + vmOperatorVM.Spec.Affinity = vmAffinitySpec + } + if vmOperatorVM.Spec.GroupName == "" { + vmOperatorVM.Spec.GroupName = supervisorMachineCtx.GetCluster().Name + } + } + // Make sure the VSphereMachine owns the VM Operator VirtualMachine. 
if err := ctrlutil.SetControllerReference(supervisorMachineCtx.VSphereMachine, vmOperatorVM, v.Client.Scheme()); err != nil { return errors.Wrapf(err, "failed to mark %s %s/%s as owner of %s %s/%s", @@ -800,6 +897,9 @@ func getVMLabels(supervisorMachineCtx *vmware.SupervisorMachineContext, vmLabels // resources associated with the target cluster. vmLabels[clusterv1.ClusterNameLabel] = supervisorMachineCtx.GetClusterContext().Cluster.Name + // Ensure the VM has the machine deployment name label + vmLabels[nodePoolLabelKey] = supervisorMachineCtx.Machine.Labels[clusterv1.MachineDeploymentNameLabel] + return vmLabels }