Commit a84f383

[release/v1.59] Fix vSphere anti-affinity (#1781)
* Fix vSphere anti-affinity (#1779)
  Signed-off-by: Waleed Malik <[email protected]>

* Update image for vsphere anti-affinity prow job
  Signed-off-by: Waleed Malik <[email protected]>

---------

Signed-off-by: Waleed Malik <[email protected]>
1 parent 9e086c4 commit a84f383

6 files changed: +156 -69 lines changed
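For orientation before the per-file diffs: the fix revolves around keeping a vSphere DRS anti-affinity rule in sync with the VMs that belong to one MachineSet. The following is a condensed, hypothetical govmomi sketch of how such a rule is built and applied. It mirrors the types and calls used in pkg/cloudprovider/provider/vsphere/rule.go below, but the package and function names here are illustrative, not code from the commit.

// Hypothetical sketch; not the project's exact code.
package example

import (
    "context"
    "fmt"

    "github.com/vmware/govmomi/object"
    "github.com/vmware/govmomi/vim25/types"
    "k8s.io/utils/ptr"
)

// applyAntiAffinityRule builds a ClusterAntiAffinityRuleSpec for the given VM
// references and applies it to the cluster via a reconfigure task, then waits
// for the task and checks its final state (the same pattern rule.go switches
// to in this commit, replacing the old polling-based waitForRule).
func applyAntiAffinityRule(ctx context.Context, cluster *object.ClusterComputeResource, name string, vms []types.ManagedObjectReference) error {
    rule := &types.ClusterAntiAffinityRuleSpec{
        ClusterRuleInfo: types.ClusterRuleInfo{
            Enabled:     ptr.To(true),
            Mandatory:   ptr.To(false),
            Name:        name,
            UserCreated: ptr.To(true),
        },
        Vm: vms,
    }
    spec := &types.ClusterConfigSpecEx{
        RulesSpec: []types.ClusterRuleSpec{{
            ArrayUpdateSpec: types.ArrayUpdateSpec{Operation: types.ArrayUpdateOperationAdd},
            Info:            rule,
        }},
    }
    task, err := cluster.Reconfigure(ctx, spec, true)
    if err != nil {
        return err
    }
    taskResult, err := task.WaitForResult(ctx)
    if err != nil {
        return fmt.Errorf("waiting for cluster %v reconfiguration: %w", cluster.Name(), err)
    }
    if taskResult.State != types.TaskInfoStateSuccess {
        return fmt.Errorf("cluster %v reconfiguration task was not successful", cluster.Name())
    }
    return nil
}

Updating an existing rule uses types.ArrayUpdateOperationEdit instead of Add, which is what the real code below does when it finds the rule already present.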

.prow/provider-vsphere.yaml

Lines changed: 32 additions & 0 deletions
@@ -140,3 +140,35 @@ presubmits:
               cpu: 2
             limits:
               memory: 7Gi
+
+  - name: pull-machine-controller-e2e-vsphere-anti-affinity
+    always_run: false
+    decorate: true
+    clone_uri: "ssh://[email protected]/kubermatic/machine-controller.git"
+    labels:
+      preset-hetzner: "true"
+      preset-e2e-ssh: "true"
+      preset-vsphere: "true"
+      preset-rhel: "true"
+      preset-goproxy: "true"
+      preset-kind-volume-mounts: "true"
+      preset-docker-mirror: "true"
+      preset-kubeconfig-ci: "true"
+    spec:
+      containers:
+        - image: quay.io/kubermatic/build:go-1.22-node-18-kind-0.22-5
+          command:
+            - "./hack/ci/run-e2e-tests.sh"
+          args:
+            - "TestVsphereAntiAffinityProvisioningE2E"
+          env:
+            - name: CLOUD_PROVIDER
+              value: vsphere
+          securityContext:
+            privileged: true
+          resources:
+            requests:
+              memory: 7Gi
+              cpu: 2
+            limits:
+              memory: 7Gi

pkg/cloudprovider/provider/vsphere/helper.go

Lines changed: 0 additions & 5 deletions
@@ -103,12 +103,10 @@ func createClonedVM(ctx context.Context, log *zap.SugaredLogger, vmName string,
     if err := clonedVMTask.Wait(ctx); err != nil {
         return nil, fmt.Errorf("error when waiting for result of clone task: %w", err)
     }
-
     virtualMachine, err := session.Finder.VirtualMachine(ctx, vmName)
     if err != nil {
         return nil, fmt.Errorf("failed to get virtual machine object after cloning: %w", err)
     }
-
     vmDevices, err := virtualMachine.Device(ctx)
     if err != nil {
         return nil, fmt.Errorf("failed to list devices of template VM: %w", err)
@@ -138,7 +136,6 @@ func createClonedVM(ctx context.Context, log *zap.SugaredLogger, vmName string,
 
     guestInfoUserData = "guestinfo.ignition.config.data"
     guestInfoUserDataEncoding = "guestinfo.ignition.config.data.encoding"
-
     for _, item := range mvm.Config.VAppConfig.GetVmConfigInfo().Property {
         switch item.Id {
         case guestInfoUserData:
@@ -170,7 +167,6 @@ func createClonedVM(ctx context.Context, log *zap.SugaredLogger, vmName string,
     }
 
     diskUUIDEnabled := true
-
     var deviceSpecs []types.BaseVirtualDeviceConfigSpec
     if config.DiskSizeGB != nil {
         disks, err := getDisksFromVM(ctx, virtualMachine)
@@ -221,7 +217,6 @@ func createClonedVM(ctx context.Context, log *zap.SugaredLogger, vmName string,
     if err := removeFloppyDevice(ctx, virtualMachine); err != nil {
         return nil, fmt.Errorf("failed to remove floppy device: %w", err)
     }
-
     return virtualMachine, nil
 }
 
pkg/cloudprovider/provider/vsphere/provider.go

Lines changed: 7 additions & 11 deletions
@@ -23,7 +23,6 @@ import (
     "net/url"
     "os"
     "strings"
-    "sync"
 
     "github.com/vmware/govmomi/find"
     "github.com/vmware/govmomi/object"
@@ -49,7 +48,6 @@ import (
 
 type provider struct {
     configVarResolver *providerconfig.ConfigVarResolver
-    mutex             sync.Mutex
 }
 
 // New returns a VSphere provider.
@@ -384,8 +382,7 @@ func (p *provider) create(ctx context.Context, log *zap.SugaredLogger, machine *
     }
 
     if config.VMAntiAffinity {
-        machineSetName := machine.Name[:strings.LastIndex(machine.Name, "-")]
-        if err := p.createOrUpdateVMAntiAffinityRule(ctx, session, machineSetName, config); err != nil {
+        if err := p.createOrUpdateVMAntiAffinityRule(ctx, log, session, machine, config); err != nil {
             return nil, fmt.Errorf("failed to add VM to anti affinity rule: %w", err)
         }
     }
@@ -452,6 +449,12 @@ func (p *provider) Cleanup(ctx context.Context, log *zap.SugaredLogger, machine
         return false, fmt.Errorf("failed to delete tags: %w", err)
     }
 
+    if config.VMAntiAffinity {
+        if err := p.createOrUpdateVMAntiAffinityRule(ctx, log, session, machine, config); err != nil {
+            return false, fmt.Errorf("failed to update VMs in anti-affinity rule: %w", err)
+        }
+    }
+
     powerState, err := virtualMachine.PowerState(ctx)
     if err != nil {
         return false, fmt.Errorf("failed to get virtual machine power state: %w", err)
@@ -507,13 +510,6 @@ func (p *provider) Cleanup(ctx context.Context, log *zap.SugaredLogger, machine
         return false, fmt.Errorf("failed to destroy vm %s: %w", virtualMachine.Name(), err)
     }
 
-    if config.VMAntiAffinity {
-        machineSetName := machine.Name[:strings.LastIndex(machine.Name, "-")]
-        if err := p.createOrUpdateVMAntiAffinityRule(ctx, session, machineSetName, config); err != nil {
-            return false, fmt.Errorf("failed to add VM to anti affinity rule: %w", err)
-        }
-    }
-
     if pc.OperatingSystem != providerconfigtypes.OperatingSystemFlatcar {
         filemanager := datastore.NewFileManager(session.Datacenter, false)
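Behaviorally, the Cleanup path above now refreshes the anti-affinity rule before the VM is powered off and destroyed, instead of re-adding the just-deleted VM to the rule afterwards; because the Machine carries a DeletionTimestamp at that point, the rule helper drops it from the membership. A simplified, hypothetical outline of that ordering (the types and the helper are the ones from this provider package; the power-off/destroy steps are elided):

// Hypothetical outline, not the commit's exact Cleanup body.
func cleanupAntiAffinityFirst(ctx context.Context, log *zap.SugaredLogger, p *provider, session *Session, machine *clusterv1alpha1.Machine, config *Config) error {
    if config.VMAntiAffinity {
        // The machine is being deleted here, so createOrUpdateVMAntiAffinityRule
        // excludes it from the rule's membership.
        if err := p.createOrUpdateVMAntiAffinityRule(ctx, log, session, machine, config); err != nil {
            return fmt.Errorf("failed to update VMs in anti-affinity rule: %w", err)
        }
    }
    // ...power off and destroy the virtual machine afterwards.
    return nil
}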

pkg/cloudprovider/provider/vsphere/rule.go

Lines changed: 45 additions & 53 deletions
@@ -20,64 +20,75 @@ import (
     "context"
     "errors"
     "fmt"
-    "reflect"
     "strings"
-    "time"
+    "sync"
 
-    "github.com/aws/smithy-go/ptr"
     "github.com/vmware/govmomi/find"
     "github.com/vmware/govmomi/object"
     "github.com/vmware/govmomi/vim25/mo"
     "github.com/vmware/govmomi/vim25/types"
+    "go.uber.org/zap"
+
+    clusterv1alpha1 "github.com/kubermatic/machine-controller/pkg/apis/cluster/v1alpha1"
+
+    "k8s.io/utils/ptr"
 )
 
+var lock sync.Mutex
+
 // createOrUpdateVMAntiAffinityRule creates or updates an anti affinity rule with the name in the given cluster.
 // VMs are attached to the rule based on their folder path and name prefix in vsphere.
 // A minimum of two VMs is required.
-func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, session *Session, name string, config *Config) error {
-    p.mutex.Lock()
-    defer p.mutex.Unlock()
-
+func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, log *zap.SugaredLogger, session *Session, machine *clusterv1alpha1.Machine, config *Config) error {
+    lock.Lock()
+    defer lock.Unlock()
     cluster, err := session.Finder.ClusterComputeResource(ctx, config.Cluster)
     if err != nil {
        return err
     }
 
+    machineSetName := machine.Name[:strings.LastIndex(machine.Name, "-")]
     vmsInFolder, err := session.Finder.VirtualMachineList(ctx, strings.Join([]string{config.Folder, "*"}, "/"))
     if err != nil {
        if errors.Is(err, &find.NotFoundError{}) {
-            return removeVMAntiAffinityRule(ctx, session, config.Cluster, name)
+            return removeVMAntiAffinityRule(ctx, session, config.Cluster, machineSetName)
        }
        return err
     }
 
     var ruleVMRef []types.ManagedObjectReference
     for _, vm := range vmsInFolder {
-        if strings.HasPrefix(vm.Name(), name) {
+        // Only add VMs with the same machineSetName to the rule and exclude the machine itself if it is being deleted
+        if strings.HasPrefix(vm.Name(), machineSetName) && !(vm.Name() == machine.Name && machine.DeletionTimestamp != nil) {
            ruleVMRef = append(ruleVMRef, vm.Reference())
        }
     }
 
-    // minimum of two vms required
-    if len(ruleVMRef) < 2 {
-        return removeVMAntiAffinityRule(ctx, session, config.Cluster, name)
+    if len(ruleVMRef) == 0 {
+        log.Debugf("No VMs in folder %s with name prefix %s found", config.Folder, machineSetName)
+        return removeVMAntiAffinityRule(ctx, session, config.Cluster, machineSetName)
+    } else if len(ruleVMRef) < 2 {
+        // DRS rule must have at least two virtual machine members
+        log.Debugf("Not enough VMs in folder %s to create anti-affinity rule", config.Folder)
+        return nil
     }
 
-    info, err := findClusterAntiAffinityRuleByName(ctx, cluster, name)
+    info, err := findClusterAntiAffinityRuleByName(ctx, cluster, machineSetName)
     if err != nil {
        return err
     }
 
+    log.Debugf("Creating or updating anti-affinity rule for VMs %v in cluster %s", ruleVMRef, config.Cluster)
     operation := types.ArrayUpdateOperationEdit
 
     //create new rule
     if info == nil {
        info = &types.ClusterAntiAffinityRuleSpec{
            ClusterRuleInfo: types.ClusterRuleInfo{
-                Enabled:     ptr.Bool(true),
-                Mandatory:   ptr.Bool(false),
-                Name:        name,
-                UserCreated: ptr.Bool(true),
+                Enabled:     ptr.To(true),
+                Mandatory:   ptr.To(false),
+                Name:        machineSetName,
+                UserCreated: ptr.To(true),
            },
        }
        operation = types.ArrayUpdateOperationAdd
@@ -95,49 +106,22 @@ func (p *provider) createOrUpdateVMAntiAffinityRule(ctx context.Context, session
        },
     }
 
+    log.Debugf("Performing %q for anti-affinity rule for VMs %v in cluster %s", operation, ruleVMRef, config.Cluster)
     task, err := cluster.Reconfigure(ctx, spec, true)
     if err != nil {
        return err
     }
 
-    err = task.Wait(ctx)
+    taskResult, err := task.WaitForResult(ctx)
     if err != nil {
-        return err
+        return fmt.Errorf("error waiting for cluster %v reconfiguration to complete", cluster.Name())
     }
-
-    return waitForRule(ctx, cluster, info)
-}
-
-// waitForRule checks periodically the vsphere api for the ClusterAntiAffinityRule and returns error if the rule was not found after a timeout.
-func waitForRule(ctx context.Context, cluster *object.ClusterComputeResource, rule *types.ClusterAntiAffinityRuleSpec) error {
-    timeout := time.NewTimer(10 * time.Second)
-    ticker := time.NewTicker(500 * time.Millisecond)
-    defer timeout.Stop()
-    defer ticker.Stop()
-
-    for {
-        select {
-        case <-timeout.C:
-
-            info, err := findClusterAntiAffinityRuleByName(ctx, cluster, rule.Name)
-            if err != nil {
-                return err
-            }
-
-            if !reflect.DeepEqual(rule, info) {
-                return fmt.Errorf("expected anti affinity changes not found in vsphere")
-            }
-        case <-ticker.C:
-            info, err := findClusterAntiAffinityRuleByName(ctx, cluster, rule.Name)
-            if err != nil {
-                return err
-            }
-
-            if reflect.DeepEqual(rule, info) {
-                return nil
-            }
-        }
+    if taskResult.State != types.TaskInfoStateSuccess {
+        return fmt.Errorf("cluster %v reconfiguration task was not successful", cluster.Name())
     }
+    log.Debugf("Successfully created/updated anti-affinity rule for machineset %v against machine %v", machineSetName, machine.Name)
+
+    return nil
 }
 
 // removeVMAntiAffinityRule removes an anti affinity rule with the name in the given cluster.
@@ -172,7 +156,15 @@ func removeVMAntiAffinityRule(ctx context.Context, session *Session, clusterPath
     if err != nil {
        return err
     }
-    return task.Wait(ctx)
+
+    taskResult, err := task.WaitForResult(ctx)
+    if err != nil {
+        return fmt.Errorf("error waiting for cluster %v reconfiguration to complete", cluster.Name())
+    }
+    if taskResult.State != types.TaskInfoStateSuccess {
+        return fmt.Errorf("cluster %v reconfiguration task was not successful", cluster.Name())
+    }
+    return nil
 }
 
 func findClusterAntiAffinityRuleByName(ctx context.Context, cluster *object.ClusterComputeResource, name string) (*types.ClusterAntiAffinityRuleSpec, error) {
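To make the selection logic above easier to follow outside the diff: the rule name and its membership are derived from the Machine name prefix (the MachineSet name), and a machine that is being deleted is skipped. Below is a condensed, hypothetical helper showing just that selection; Session, Config, and the imports are the ones already used by rule.go, and antiAffinityMembers is an illustrative name, not code from the commit.

// Hypothetical helper mirroring the membership selection in
// createOrUpdateVMAntiAffinityRule: collect the references of all VMs in the
// configured folder that share the MachineSet name prefix, skipping the
// machine itself while it is being deleted.
func antiAffinityMembers(ctx context.Context, session *Session, config *Config, machine *clusterv1alpha1.Machine) ([]types.ManagedObjectReference, error) {
    machineSetName := machine.Name[:strings.LastIndex(machine.Name, "-")]
    vmsInFolder, err := session.Finder.VirtualMachineList(ctx, strings.Join([]string{config.Folder, "*"}, "/"))
    if err != nil {
        return nil, err
    }
    var refs []types.ManagedObjectReference
    for _, vm := range vmsInFolder {
        beingDeleted := vm.Name() == machine.Name && machine.DeletionTimestamp != nil
        if strings.HasPrefix(vm.Name(), machineSetName) && !beingDeleted {
            refs = append(refs, vm.Reference())
        }
    }
    return refs, nil
}

With zero matching VMs the real code removes the rule; with exactly one it leaves vSphere untouched, since a DRS anti-affinity rule needs at least two members.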

test/e2e/provisioning/all_e2e_test.go

Lines changed: 18 additions & 0 deletions
@@ -66,6 +66,7 @@ const (
     LinodeManifest              = "./testdata/machinedeployment-linode.yaml"
     VMwareCloudDirectorManifest = "./testdata/machinedeployment-vmware-cloud-director.yaml"
     VSPhereManifest             = "./testdata/machinedeployment-vsphere.yaml"
+    VSPhereAntiAffinityManifest = "./testdata/machinedeployment-vsphere-anti-affinity.yaml"
     VSPhereMultipleNICManifest  = "./testdata/machinedeployment-vsphere-multiple-nic.yaml"
     VSPhereDSCManifest          = "./testdata/machinedeployment-vsphere-datastore-cluster.yaml"
     VSPhereResourcePoolManifest = "./testdata/machinedeployment-vsphere-resource-pool.yaml"
@@ -855,6 +856,23 @@ func TestVsphereMultipleNICProvisioningE2E(t *testing.T) {
     runScenarios(t, selector, params, VSPhereMultipleNICManifest, fmt.Sprintf("vs-%s", *testRunIdentifier))
 }
 
+// TestVsphereAntiAffinityProvisioningE2E - is the same as the TestVsphereProvisioning suit but has anti-affinity rules applied to the VMs.
+func TestVsphereAntiAffinityProvisioningE2E(t *testing.T) {
+    t.Parallel()
+
+    params := getVSphereTestParams(t)
+
+    scenario := scenario{
+        name:              "VSphere Anti-Affinity provisioning",
+        osName:            "ubuntu",
+        containerRuntime:  defaultContainerRuntime,
+        kubernetesVersion: defaultKubernetesVersion,
+        executor:          verifyCreateAndDelete,
+    }
+
+    testScenario(t, scenario, *testRunIdentifier, params, VSPhereAntiAffinityManifest, false)
+}
+
 // TestVsphereDatastoreClusterProvisioning - is the same as the TestVsphereProvisioning suite but specifies a DatastoreCluster
 // instead of the Datastore in the provider specs.
 func TestVsphereDatastoreClusterProvisioningE2E(t *testing.T) {

test/e2e/provisioning/testdata/machinedeployment-vsphere-anti-affinity.yaml

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+apiVersion: "cluster.k8s.io/v1alpha1"
+kind: MachineDeployment
+metadata:
+  name: << MACHINE_NAME >>
+  namespace: kube-system
+  annotations:
+    k8c.io/operating-system-profile: osp-<< OS_NAME >>
+spec:
+  replicas: 3
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+  selector:
+    matchLabels:
+      name: << MACHINE_NAME >>
+  template:
+    metadata:
+      labels:
+        name: << MACHINE_NAME >>
+    spec:
+      providerSpec:
+        value:
+          sshPublicKeys:
+            - "<< YOUR_PUBLIC_KEY >>"
+          cloudProvider: "vsphere"
+          cloudProviderSpec:
+            templateVMName: '<< OS_Image_Template >>'
+            username: '<< VSPHERE_USERNAME >>'
+            vsphereURL: '<< VSPHERE_ADDRESS >>'
+            datacenter: 'Hamburg'
+            folder: '/Hamburg/vm/Kubermatic-ci'
+            password: << VSPHERE_PASSWORD >>
+            # example: 'https://your-vcenter:8443'. '/sdk' gets appended automatically
+            cluster: Kubermatic
+            vmAntiAffinity: true
+            datastore: vsan
+            cpus: 2
+            MemoryMB: 4096
+            diskSizeGB: << DISK_SIZE >>
+            allowInsecure: true
+          operatingSystem: "<< OS_NAME >>"
+          operatingSystemSpec:
+            distUpgradeOnBoot: false
+            disableAutoUpdate: true
+            attachSubscription: false
+            # 'rhelSubscriptionManagerUser' is only used for rhel os and can be set via env var `RHEL_SUBSCRIPTION_MANAGER_USER`
+            rhelSubscriptionManagerUser: "<< RHEL_SUBSCRIPTION_MANAGER_USER >>"
+            # 'rhelSubscriptionManagerPassword' is only used for rhel os and can be set via env var `RHEL_SUBSCRIPTION_MANAGER_PASSWORD`
+            rhelSubscriptionManagerPassword: "<< RHEL_SUBSCRIPTION_MANAGER_PASSWORD >>"
+            rhsmOfflineToken: "<< REDHAT_SUBSCRIPTIONS_OFFLINE_TOKEN >>"
+      versions:
+        kubelet: "<< KUBERNETES_VERSION >>"
