Skip to content

Commit d23a9c1

Browse files
committed
Do not watch guest.net / multi priority queues
This patch further restricts the async watcher from monitoring guest.net due to the same churn in VKS nodes that caused guest.ipStack to be so busy. This is okay since summary.guest will still trigger a reconcile when a VM gets its primary IP.

Additionally, this patch introduces multiple priority queues for a VM's lifecycle. When a VM is going to be reconciled, it will now fall into one of the following buckets:

- priorityLow int = handler.LowPriority // 0
- priorityCreating int = 100
- priorityPowerStateChange int = 90
- priorityWaitingForIP int = 90
- priorityDeleting int = 80
- priorityWaitingForDiskPromo int = 70

Thus, it no longer matters if async signal is sending thousands of VMs a minute, because VMs that are being created, deleted, waiting on a power state change, waiting for an IP, or waiting for disk promotion will all be moved to the head of the line.
1 parent b6d62a4 commit d23a9c1

File tree

21 files changed

+1861
-151
lines changed

21 files changed

+1861
-151
lines changed

.golangci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ linters:
177177
pkg: github.com/vmware-tanzu/vm-operator/pkg/exit
178178
- alias: pkglog
179179
pkg: github.com/vmware-tanzu/vm-operator/pkg/log
180+
- alias: pkgnil
181+
pkg: github.com/vmware-tanzu/vm-operator/pkg/util/nil
180182
- alias: ctxop
181183
pkg: github.com/vmware-tanzu/vm-operator/pkg/context/operation
182184
- alias: pkgmgr

controllers/virtualmachine/virtualmachine/virtualmachine_controller.go

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ import (
2626
vmopv1 "github.com/vmware-tanzu/vm-operator/api/v1alpha5"
2727
byokv1 "github.com/vmware-tanzu/vm-operator/external/byok/api/v1alpha1"
2828
cnsv1alpha1 "github.com/vmware-tanzu/vm-operator/external/vsphere-csi-driver/api/v1alpha1"
29-
"github.com/vmware-tanzu/vm-operator/pkg/conditions"
29+
pkgcond "github.com/vmware-tanzu/vm-operator/pkg/conditions"
3030
pkgcfg "github.com/vmware-tanzu/vm-operator/pkg/config"
3131
pkgconst "github.com/vmware-tanzu/vm-operator/pkg/constants"
3232
pkgctx "github.com/vmware-tanzu/vm-operator/pkg/context"
@@ -84,13 +84,21 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err
8484
proberManager)
8585

8686
builder := ctrl.NewControllerManagedBy(mgr).
87-
For(controlledType).
87+
Named(strings.ToLower(controlledTypeName)).
8888
WithOptions(controller.Options{
8989
MaxConcurrentReconciles: ctx.MaxConcurrentReconciles,
9090
SkipNameValidation: SkipNameValidation,
9191
LogConstructor: pkglog.ControllerLogConstructor(controllerNameShort, controlledType, mgr.GetScheme()),
9292
})
9393

94+
// Watch VirtualMachines.
95+
builder = builder.Watches(
96+
controlledType,
97+
&kubeutil.EnqueueRequestForObject{
98+
Logger: ctrl.Log.WithName("vmqueue"),
99+
GetPriority: kubeutil.GetVirtualMachineReconcilePriority,
100+
})
101+
94102
builder = builder.Watches(&vmopv1.VirtualMachineClass{},
95103
handler.EnqueueRequestsFromMapFunc(classToVMMapperFn(ctx, r.Client)))
96104

@@ -105,7 +113,10 @@ func AddToManager(ctx *pkgctx.ControllerManagerContext, mgr manager.Manager) err
105113
if pkgcfg.FromContext(ctx).AsyncSignalEnabled {
106114
builder = builder.WatchesRawSource(source.Channel(
107115
cource.FromContextWithBuffer(ctx, "VirtualMachine", 100),
108-
&handler.EnqueueRequestForObject{}))
116+
&kubeutil.EnqueueRequestForObject{
117+
Logger: ctrl.Log.WithName("asyncvmqueue"),
118+
GetPriority: kubeutil.GetVirtualMachineReconcilePriority,
119+
}))
109120
}
110121

111122
if pkgcfg.FromContext(ctx).Features.FastDeploy {
@@ -392,7 +403,7 @@ func requeueDelay(
392403
// Create VMs on the provider. Do not queue immediately to avoid exponential
393404
// backoff.
394405
if ignoredCreateErr(err) ||
395-
!conditions.IsTrue(ctx.VM, vmopv1.VirtualMachineConditionCreated) {
406+
!pkgcond.IsTrue(ctx.VM, vmopv1.VirtualMachineConditionCreated) {
396407

397408
return pkgcfg.FromContext(ctx).CreateVMRequeueDelay
398409
}
@@ -661,7 +672,7 @@ func (r *Reconciler) isVMICacheReady(ctx *pkgctx.VirtualMachineContext) bool {
661672
}
662673

663674
// Assert the image hardware is ready.
664-
if !conditions.IsTrue(
675+
if !pkgcond.IsTrue(
665676
vmic,
666677
vmopv1.VirtualMachineImageCacheConditionHardwareReady) {
667678

@@ -695,7 +706,7 @@ func (r *Reconciler) isVMICacheReady(ctx *pkgctx.VirtualMachineContext) bool {
695706
}
696707

697708
// Assert the cached disks are ready.
698-
if locStatus == nil || !conditions.IsTrue(
709+
if locStatus == nil || !pkgcond.IsTrue(
699710
locStatus,
700711
vmopv1.ReadyConditionType) {
701712

controllers/virtualmachinepublishrequest/virtualmachinepublishrequest_controller.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ import (
4545
"github.com/vmware-tanzu/vm-operator/pkg/patch"
4646
"github.com/vmware-tanzu/vm-operator/pkg/providers"
4747
"github.com/vmware-tanzu/vm-operator/pkg/record"
48-
pkgutil "github.com/vmware-tanzu/vm-operator/pkg/util"
4948
kubeutil "github.com/vmware-tanzu/vm-operator/pkg/util/kube"
49+
pkgnil "github.com/vmware-tanzu/vm-operator/pkg/util/nil"
5050
)
5151

5252
const (
@@ -500,7 +500,7 @@ func (r *Reconciler) checkIsTargetValid(ctx *pkgctx.VirtualMachinePublishRequest
500500
return fmt.Errorf("failed to get item %q from library: %w", targetItemName, err)
501501
}
502502

503-
if !pkgutil.IsNil(item) {
503+
if !pkgnil.IsNil(item) {
504504
objKey := client.ObjectKey{Name: vmPubReq.Spec.Target.Location.Name, Namespace: vmPubReq.Namespace}
505505
ctx.Logger.Info("target item already exists in the content library",
506506
"library", objKey, "itemName", targetItemName)
@@ -992,7 +992,7 @@ func (r *Reconciler) findCorrelatedItemIDByName(ctx *pkgctx.VirtualMachinePublis
992992
return "", fmt.Errorf("failed to get item %q from library: %w", targetItemName, err)
993993
}
994994

995-
if !pkgutil.IsNil(item) {
995+
if !pkgnil.IsNil(item) {
996996
// Item already exists in the content library, check if it is created from
997997
// this VirtualMachinePublishRequest from its description.
998998
// If VC forgets the task, or the task hadn't proceeded far enough to be submitted to VC

docs/concepts/workloads/vm-controller.md

Lines changed: 134 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,96 @@
22

33
The `VirtualMachine` controller is responsible for reconciling `VirtualMachine` objects.
44

5-
## Reconcile
5+
## Priority
6+
7+
The VirtualMachine controller leverages controller-runtime's [priority queue feature](https://github.com/kubernetes-sigs/controller-runtime/issues/2374) to intelligently order reconciliation requests based on the operational state of each VM. Priority queues allow the controller to process more urgent operations (like VM creation or deletion) ahead of routine maintenance operations (like resync events), improving system responsiveness and resource utilization.
8+
9+
### Priority queues
10+
11+
Controller-runtime's priority queue implementation keeps pending items ordered by priority, where items with **higher numerical values have higher priority** and are dequeued first. When a Kubernetes event (create, update, delete, or generic) triggers a reconciliation, the controller can assign a specific priority value to that request. This ensures that critical operations don't get stuck behind less urgent work, especially during periods of high activity like controller restarts when many resync events are generated simultaneously.
12+
13+
The priority queue feature addresses the common problem of event storms during resyncs, where hundreds or thousands of objects might be queued at once. By assigning lower priorities to resync events and higher priorities to user-initiated changes, the system remains responsive to important operations.
14+
15+
### Priority levels
16+
17+
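The VirtualMachine controller implements five state-based priority levels plus a default level, listed below from highest to lowest priority value (the order in which the conditions are checked is described under "Priority evaluation"):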
18+
19+
| Priority Level | Value | VM State | Description |
20+
|----------------|-------|----------|-------------|
21+
| **Creating** | 100 | `VirtualMachineConditionCreated` is not `True` | Highest priority. Assigned to VMs that are being created for the first time. This ensures new VM requests are processed quickly. |
22+
| **PowerStateChange** | 99 | `spec.powerState` ≠ `status.powerState` | Second highest priority. Assigned when a VM needs a power state transition (power on, power off, or suspend). Power operations are time-sensitive and user-visible. |
23+
| **Deleting** | 98 | `deletionTimestamp` is set | Assigned to VMs being deleted. Cleanup operations have high priority to free resources promptly. |
24+
| **WaitingForIP** | 97 | Powered on, networking enabled, but no IP address assigned | Assigned when a VM is powered on with networking enabled but hasn't yet received an IP address (neither IPv4 nor IPv6). This ensures the controller actively monitors for network readiness. |
25+
| **WaitingForDiskPromo** | 96 | `spec.promoteDisksMode` ≠ `Disabled` and `VirtualMachineDiskPromotionSynced` is not `True` | Assigned when disk promotion is configured but not yet completed. Ensures timely processing of disk promotion operations. |
26+
| **Default** | -1 to -4 | All other cases | Used for routine reconciliations of stable VMs. The default value varies by event type (see below), with all defaults being negative numbers to indicate lower priority than the specific VM state priorities. |
27+
28+
The **Default** priority level varies based on the Kubernetes event type that triggered the reconciliation:
29+
30+
| Event Type | Default Priority | When Applied |
31+
|------------|------------------|--------------|
32+
| Create | -1 | Events from initial controller cache sync |
33+
| Update | -2 | Update events where the object's resource version is unchanged between the old and new objects |
34+
| Delete | -3 | Delete events (though typically overridden by state-based priorities) |
35+
| Generic | -4 | Generic events like periodic resyncs |
36+
37+
This tiered default system ensures that even among routine operations, there's an inherent ordering: create events from cache sync are processed before unchanged updates, which are processed before generic resyncs.
38+
39+
### Priority evaluation
40+
41+
The controller evaluates priorities using a cascade of conditions, with each check returning immediately if matched:
42+
43+
1. **Annotation Override**: If the VM has the `vmoperator.vmware.com.protected/reconcile-priority` annotation set to a valid integer, that value is used directly, bypassing all other checks.
44+
45+
!!! note "Privileged users only"
46+
47+
Only privileged users, such as the VM Operator service account, may override the reconcile priority via annotation.
48+
49+
2. **Non-VM Objects**: If the object is not a `VirtualMachine`, the default priority is returned.
50+
51+
3. **Deletion**: If the VM has a `deletionTimestamp`, return `PriorityVirtualMachineDeleting`.
52+
53+
4. **Creation**: If the `VirtualMachineConditionCreated` condition is not `True`, return `PriorityVirtualMachineCreating`.
54+
55+
5. **Power State Change**: If `spec.powerState` doesn't match `status.powerState`, return `PriorityVirtualMachinePowerStateChange`.
56+
57+
6. **Waiting for IP**: If the VM is powered on (`status.powerState == PoweredOn`) and networking is enabled (`spec.network.disabled != true`), but no IP address is assigned (`status.network.primaryIP4` and `status.network.primaryIP6` are both empty), return `PriorityVirtualMachineWaitingForIP`.
58+
59+
7. **Disk Promotion**: If `spec.promoteDisksMode` is not `Disabled` and the `VirtualMachineDiskPromotionSynced` condition is not `True`, return `PriorityVirtualMachineWaitingForDiskPromo`.
60+
61+
8. **Default**: If none of the above conditions match, return the default priority.
62+
63+
### Priority assignment
64+
65+
Different Kubernetes event types receive different priority handling:
66+
67+
- **Create Events**: During initial controller startup (when processing the initial list of objects from the API server), create events are assigned the default priority. For new objects created after startup, the priority function is evaluated.
68+
69+
- **Update Events**: If the resource version hasn't changed between old and new objects, the default priority is used. Otherwise, the priority function evaluates the VM's current state.
70+
71+
- **Delete Events**: Always evaluated through the priority function, typically resulting in `PriorityVirtualMachineDeleting`.
72+
73+
- **Generic Events**: Always evaluated through the priority function based on the VM's current state.
74+
75+
### Priority examples
76+
77+
**Scenario 1: New VM Creation**
78+
When a user creates a new VirtualMachine resource, the reconcile request receives `PriorityVirtualMachineCreating (100)`, ensuring it's processed ahead of routine updates to existing VMs.
79+
80+
**Scenario 2: Power State Change on Existing VM**
81+
When a user changes `spec.powerState` from `PoweredOff` to `PoweredOn`, the reconcile request receives `PriorityVirtualMachinePowerStateChange (99)`, placing it near the front of the queue.
82+
83+
**Scenario 3: Waiting for IP Address**
84+
After a VM is powered on and networking is configured, but the IP hasn't been assigned yet, subsequent reconciliations receive `PriorityVirtualMachineWaitingForIP (97)`, ensuring the controller actively monitors for IP assignment completion.
85+
86+
**Scenario 4: Routine Reconciliation**
87+
For a stable, running VM with no pending changes, reconcile requests (like periodic resyncs) receive the default priority, allowing more urgent operations to be processed first.
88+
89+
**Scenario 5: Controller Restart with Many VMs**
90+
When the controller starts and processes its initial cache sync with 1,000 VMs, all initial reconciliations receive the default (lower) priority. If during this time a user creates a new VM or requests a power state change, those operations receive higher priority values and are processed first, maintaining system responsiveness.
91+
92+
## Workflows
93+
94+
### Overview
695

796
The main `Reconcile` function orchestrates the entire VirtualMachine reconciliation process, handling both creation/updates and deletions:
897

@@ -36,7 +125,7 @@ flowchart TD
36125
ReturnResult --> End
37126
```
38127

39-
## ReconcileDelete
128+
### Delete
40129

41130
The `ReconcileDelete` method handles the deletion of VirtualMachine resources, ensuring proper cleanup of the underlying vSphere VM and associated resources:
42131

@@ -72,12 +161,10 @@ flowchart TD
72161
Success --> End
73162
```
74163

75-
### ReconcileNormal
164+
### Normal
76165

77166
The `ReconcileNormal` method handles the creation and updating of VirtualMachine resources. It manages finalizers, checks for paused annotations, and orchestrates the main reconcile logic including fast deploy support and VMI cache readiness.
78167

79-
#### High-Level Reconcile Normal Flow
80-
81168
```mermaid
82169
flowchart TD
83170
Start([Start ReconcileNormal]) --> CheckPause{Has pause<br />annotation?}
@@ -108,7 +195,9 @@ flowchart TD
108195
SkipReconcile --> Return
109196
```
110197

111-
#### VM Provider CreateOrUpdate Decision Logic
198+
## vSphere provider
199+
200+
### CreateOrUpdate
112201

113202
The VM provider determines whether to create or update a VM based on whether it exists in vSphere:
114203

@@ -143,7 +232,7 @@ flowchart TD
143232
ReturnErr --> End
144233
```
145234

146-
#### VM Creation Workflow with Fast Deploy
235+
### Fast deploy
147236

148237
When creating a VM, the system supports both traditional content library deployment and fast deploy optimization:
149238

@@ -180,7 +269,41 @@ flowchart TD
180269
Success --> End
181270
```
182271

183-
#### VM Update Workflow
272+
#### VMI Cache Integration (Fast Deploy)
273+
274+
When Fast Deploy is enabled, the controller integrates with VirtualMachineImageCache resources to optimize VM creation:
275+
276+
```mermaid
277+
flowchart TD
278+
Start([Check VMI Cache Ready]) --> HasLabel{VM has VMI<br />cache label?}
279+
HasLabel -->|No| Ready[Return ready - no cache needed]
280+
HasLabel -->|Yes| GetCache[Get VMI cache object]
281+
282+
GetCache --> CacheExists{Cache<br />exists?}
283+
CacheExists -->|No| NotReady[Return not ready]
284+
CacheExists -->|Yes| CheckOVF{OVF condition<br />true?}
285+
286+
CheckOVF -->|No| NotReady
287+
CheckOVF -->|Yes| HasLocation{VM has location<br />annotation?}
288+
289+
HasLocation -->|No| RemoveLabel[Remove VMI cache label]
290+
HasLocation -->|Yes| FindLocation[Find matching location status]
291+
292+
RemoveLabel --> Ready
293+
FindLocation --> LocationReady{Location files<br />ready?}
294+
295+
LocationReady -->|No| NotReady
296+
LocationReady -->|Yes| RemoveAnnotations[Remove cache label & annotation]
297+
RemoveAnnotations --> Ready
298+
299+
Ready --> End([End - Ready])
300+
NotReady --> End2([End - Not Ready])
301+
```
302+
303+
This comprehensive workflow documentation shows how the VirtualMachine controller orchestrates VM lifecycle management, including the sophisticated fast deploy optimization that uses cached VM images for faster provisioning.
304+
305+
306+
### Update
184307

185308
When updating an existing VM, the system reconciles various aspects in a specific order:
186309

@@ -230,7 +353,7 @@ flowchart TD
230353
Success --> End
231354
```
232355

233-
#### Status Reconciliation Details
356+
### Status
234357

235358
The status reconciliation updates the VM's observed state from vSphere:
236359

@@ -274,7 +397,7 @@ flowchart TD
274397
Success --> End([End])
275398
```
276399

277-
#### Config Reconciliation Process
400+
### Config
278401

279402
Config reconciliation ensures the VM configuration matches the desired spec:
280403

@@ -302,7 +425,7 @@ flowchart TD
302425
Success --> End([End])
303426
```
304427

305-
#### Power State Reconciliation
428+
### Power state
306429

307430
Power state reconciliation manages VM power operations:
308431

@@ -349,37 +472,3 @@ flowchart TD
349472
Success --> End
350473
```
351474

352-
### VMI Cache Integration (Fast Deploy)
353-
354-
When Fast Deploy is enabled, the controller integrates with VirtualMachineImageCache resources to optimize VM creation:
355-
356-
#### VMI Cache Readiness Check
357-
358-
```mermaid
359-
flowchart TD
360-
Start([Check VMI Cache Ready]) --> HasLabel{VM has VMI<br />cache label?}
361-
HasLabel -->|No| Ready[Return ready - no cache needed]
362-
HasLabel -->|Yes| GetCache[Get VMI cache object]
363-
364-
GetCache --> CacheExists{Cache<br />exists?}
365-
CacheExists -->|No| NotReady[Return not ready]
366-
CacheExists -->|Yes| CheckOVF{OVF condition<br />true?}
367-
368-
CheckOVF -->|No| NotReady
369-
CheckOVF -->|Yes| HasLocation{VM has location<br />annotation?}
370-
371-
HasLocation -->|No| RemoveLabel[Remove VMI cache label]
372-
HasLocation -->|Yes| FindLocation[Find matching location status]
373-
374-
RemoveLabel --> Ready
375-
FindLocation --> LocationReady{Location files<br />ready?}
376-
377-
LocationReady -->|No| NotReady
378-
LocationReady -->|Yes| RemoveAnnotations[Remove cache label & annotation]
379-
RemoveAnnotations --> Ready
380-
381-
Ready --> End([End - Ready])
382-
NotReady --> End2([End - Not Ready])
383-
```
384-
385-
This comprehensive workflow documentation shows how the VirtualMachine controller orchestrates VM lifecycle management, including the sophisticated fast deploy optimization that uses cached VM images for faster provisioning.

pkg/constants/constants.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,4 +236,8 @@ const (
236236

237237
// VCFAIDAnnotationKey is the annotation key for the VCFA ID for GOSC customization.
238238
VCFAIDAnnotationKey = "vmoperator.vmware.com/vcfa-id"
239+
240+
// ReconcilePriorityAnnotationKey is the annotation key that specifies the
241+
// reconcile priority for an object.
242+
ReconcilePriorityAnnotationKey = "vmoperator.vmware.com.protected/reconcile-priority"
239243
)

0 commit comments

Comments
 (0)