
Commit 7a817bc

fix(failuredomain): Set a hash if none set
1 parent 1e70278 commit 7a817bc

File tree

2 files changed: +88, -12 lines


pkg/controllers/failuredomainrollout/controller.go

Lines changed: 87 additions & 11 deletions
@@ -52,31 +52,40 @@ func (r *Reconciler) SetupWithManager(
 
 func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
     logger := ctrl.LoggerFrom(ctx).WithValues("cluster", req.NamespacedName)
+    logger.Info("Starting failure domain rollout reconciliation")
 
     var cluster clusterv1.Cluster
     if err := r.Get(ctx, req.NamespacedName, &cluster); err != nil {
         if apierrors.IsNotFound(err) {
-            logger.V(5).Info("Cluster not found, skipping reconciliation")
+            logger.Info("Cluster not found, skipping reconciliation")
             return ctrl.Result{}, nil
         }
         return ctrl.Result{}, fmt.Errorf("failed to get Cluster %s: %w", req.NamespacedName, err)
     }
 
+    logger.Info(
+        "Successfully retrieved cluster",
+        "hasTopology",
+        cluster.Spec.Topology != nil,
+        "failureDomainCount",
+        len(cluster.Status.FailureDomains),
+    )
+
     // Skip if cluster is not using topology
     if cluster.Spec.Topology == nil {
-        logger.V(5).Info("Cluster is not using topology, skipping reconciliation")
+        logger.Info("Cluster is not using topology, skipping reconciliation")
         return ctrl.Result{}, nil
     }
 
     // Skip if cluster doesn't have a control plane reference
     if cluster.Spec.ControlPlaneRef == nil {
-        logger.V(5).Info("Cluster has no control plane reference, skipping reconciliation")
+        logger.Info("Cluster has no control plane reference, skipping reconciliation")
         return ctrl.Result{}, nil
     }
 
     // Skip if cluster doesn't have failure domains
     if len(cluster.Status.FailureDomains) == 0 {
-        logger.V(5).Info("Cluster has no failure domains, skipping reconciliation")
+        logger.Info("Cluster has no failure domains, skipping reconciliation")
         return ctrl.Result{}, nil
     }
 
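A note on the logger.V(5) to logger.Info conversions that recur throughout this diff: with logr, the logging API that controller-runtime's ctrl.LoggerFrom returns, V-level messages are suppressed unless the configured verbosity is at least that level, so these messages were previously invisible in a default deployment. A minimal, self-contained illustration of that semantic (the zap-backed setup here is an assumption for the demo, not taken from this repository):

package main

import (
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

func main() {
	// Default zap options: verbosity 0, so only V(0) messages are emitted.
	ctrl.SetLogger(zap.New())
	logger := ctrl.Log.WithName("demo")

	logger.V(5).Info("debug detail")  // dropped unless verbosity is raised to 5 or higher
	logger.Info("operational detail") // shorthand for V(0); emitted at the default level
}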
@@ -88,13 +97,16 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
     }
     if err := r.Get(ctx, kcpKey, &kcp); err != nil {
         if apierrors.IsNotFound(err) {
-            logger.V(5).Info("KubeAdmControlPlane not found, skipping reconciliation")
+            logger.Info("KubeAdmControlPlane not found, skipping reconciliation")
             return ctrl.Result{}, nil
         }
         return ctrl.Result{}, fmt.Errorf("failed to get KubeAdmControlPlane %s: %w", kcpKey, err)
     }
 
+    logger.Info("Successfully retrieved KubeAdmControlPlane", "kcpName", kcp.Name, "replicas", kcp.Spec.Replicas)
+
     // Check if we need to trigger a rollout
+    logger.Info("Evaluating rollout necessity")
     needsRollout, reason, err := r.shouldTriggerRollout(ctx, &cluster, &kcp)
     if err != nil {
         return ctrl.Result{}, fmt.Errorf("failed to determine if rollout is needed: %w", err)
@@ -117,14 +129,36 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
         // Store the current failure domain hash
         fdHash := r.calculateFailureDomainHash(cluster.Status.FailureDomains)
         kcpCopy.Annotations[FailureDomainHashAnnotation] = fdHash
+        logger.Info("Updating KubeAdmControlPlane with rollout trigger", "failureDomainHash", fdHash)
 
         if err := r.Update(ctx, kcpCopy); err != nil {
             return ctrl.Result{}, fmt.Errorf("failed to update KubeAdmControlPlane %s: %w", kcpKey, err)
         }
 
         logger.Info("Successfully triggered rollout", "rolloutAfter", now.Format(time.RFC3339))
+    } else {
+        logger.Info("No rollout needed", "reason", "failure domain configuration is stable")
+
+        // Check if we need to store the hash for the first time (when no previous hash exists)
+        currentHash := r.calculateFailureDomainHash(cluster.Status.FailureDomains)
+        _, hasStoredHash := kcp.Annotations[FailureDomainHashAnnotation]
+
+        if !hasStoredHash {
+            logger.Info("Storing initial failure domain hash", "failureDomainHash", currentHash)
+            kcpCopy := kcp.DeepCopy()
+            if kcpCopy.Annotations == nil {
+                kcpCopy.Annotations = make(map[string]string)
+            }
+            kcpCopy.Annotations[FailureDomainHashAnnotation] = currentHash
+
+            if err := r.Update(ctx, kcpCopy); err != nil {
+                return ctrl.Result{}, fmt.Errorf("failed to store initial failure domain hash %s: %w", kcpKey, err)
+            }
+            logger.Info("Successfully stored initial failure domain hash")
+        }
     }
 
+    logger.Info("Completed failure domain rollout reconciliation")
     return ctrl.Result{}, nil
 }
 
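The new else branch above is the substance of the commit: when a KubeadmControlPlane carries no FailureDomainHashAnnotation yet, the controller now persists the current hash, giving later reconciles a baseline to diff against. As the next hunk shows, shouldTriggerRollout deliberately returns false when no hash is stored, so without this write the comparison would never arm. The helper calculateFailureDomainHash is not part of this diff; the following is a plausible sketch only, assuming it covers each failure domain's name and ControlPlane flag, with sha256 over sorted keys chosen here for determinism rather than taken from the repository:

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"sort"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// calculateFailureDomainHash (sketch): hash the properties this controller
// reacts to, iterating in sorted order because Go map order is randomized.
func calculateFailureDomainHash(fds clusterv1.FailureDomains) string {
	names := make([]string, 0, len(fds))
	for name := range fds {
		names = append(names, name)
	}
	sort.Strings(names)

	h := sha256.New()
	for _, name := range names {
		fmt.Fprintf(h, "%s=%t;", name, fds[name].ControlPlane)
	}
	return hex.EncodeToString(h.Sum(nil))
}

func main() {
	fds := clusterv1.FailureDomains{"fd-a": {ControlPlane: true}}
	fmt.Println(calculateFailureDomainHash(fds))
}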
@@ -138,54 +172,71 @@ func (r *Reconciler) shouldTriggerRollout(
 
     // Calculate current failure domain hash
     currentFDHash := r.calculateFailureDomainHash(cluster.Status.FailureDomains)
+    logger.Info("Calculated current failure domain hash", "hash", currentFDHash)
 
     // Get the stored hash from annotations
     storedFDHash, hasStoredHash := kcp.Annotations[FailureDomainHashAnnotation]
+    logger.Info("Retrieved stored failure domain hash", "hash", storedFDHash, "hasStoredHash", hasStoredHash)
 
     // If no stored hash, this is the first time - store the hash but don't trigger rollout
     if !hasStoredHash {
-        logger.V(5).Info("No previous failure domain hash found, storing current hash")
+        logger.Info("No previous failure domain hash found, storing current hash")
         return false, "", nil
     }
 
     // If hashes are the same, no changes detected
     if currentFDHash == storedFDHash {
-        logger.V(5).Info("No failure domain changes detected")
+        logger.Info("No failure domain changes detected")
         return false, "", nil
     }
 
+    logger.Info(
+        "Failure domain hash changed, analyzing impact",
+        "currentHash",
+        currentFDHash,
+        "storedHash",
+        storedFDHash,
+    )
+
     // Get the current KCP machines to understand current placement
     currentlyUsedFailureDomains, err := r.getCurrentlyUsedFailureDomains(ctx, cluster)
     if err != nil {
         return false, "", fmt.Errorf("failed to get currently used failure domains: %w", err)
     }
 
-    logger.V(5).Info("Analyzing failure domain changes",
+    logger.Info("Analyzing failure domain changes",
         "currentlyUsedFailureDomains", currentlyUsedFailureDomains,
         "availableFailureDomains", getAvailableFailureDomains(cluster.Status.FailureDomains))
 
     // Check if any currently used failure domain is disabled or removed
+    logger.Info("Checking if used failure domains are still valid")
     for _, usedFD := range currentlyUsedFailureDomains {
         if fd, exists := cluster.Status.FailureDomains[usedFD]; !exists {
+            logger.Info("Found removed failure domain in use", "failureDomain", usedFD)
             return true, fmt.Sprintf("failure domain %s is removed", usedFD), nil
         } else if !fd.ControlPlane {
+            logger.Info("Found disabled failure domain in use", "failureDomain", usedFD)
             return true, fmt.Sprintf("failure domain %s is disabled for control plane", usedFD), nil
         }
     }
+    logger.Info("All used failure domains are still valid")
 
     // Check if the distribution could be improved
     availableFDs := getAvailableFailureDomains(cluster.Status.FailureDomains)
+    logger.Info("Checking if distribution can be improved", "availableFailureDomains", availableFDs)
     shouldImprove, err := r.shouldImproveDistribution(ctx, cluster, kcp, availableFDs)
     if err != nil {
         return false, "", fmt.Errorf("failed to check distribution improvement: %w", err)
     }
     if shouldImprove {
+        logger.Info("Distribution can be improved, triggering rollout")
         return true, "failure domain distribution could be improved for better fault tolerance", nil
     }
+    logger.Info("Distribution is already optimal, no improvement needed")
 
     // If we reach here, failure domains changed but no meaningful impact
     // (e.g., new failure domains added but existing ones still valid)
-    logger.V(5).Info("Failure domains changed but no meaningful impact detected")
+    logger.Info("Failure domains changed but no meaningful impact detected")
     return false, "", nil
 }
 
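For concreteness, the status field this decision tree inspects has the shape shown in the hand-written example below (illustrative data, not from the commit); getAvailableFailureDomains is not shown in the diff and is assumed to be a filter on the ControlPlane flag:

package main

import (
	"fmt"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

func main() {
	// A cluster reporting three failure domains, one ineligible for control planes.
	fds := clusterv1.FailureDomains{
		"fd-a": {ControlPlane: true},
		"fd-b": {ControlPlane: true},
		"fd-c": {ControlPlane: false},
	}

	// Assumed equivalent of getAvailableFailureDomains:
	var available []string
	for name, fd := range fds {
		if fd.ControlPlane {
			available = append(available, name)
		}
	}
	fmt.Println(available) // [fd-a fd-b], in some order

	// Deleting "fd-a" from the map, or flipping its ControlPlane flag to false
	// while a machine still occupies it, is what trips the rollout branches above.
}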
@@ -200,11 +251,13 @@ func (r *Reconciler) shouldImproveDistribution(
 
     // Skip if no replicas specified
     if kcp.Spec.Replicas == nil {
+        logger.Info("No replicas specified, skipping distribution analysis")
         return false, nil
     }
 
     replicas := int(*kcp.Spec.Replicas)
     availableCount := len(availableFDs)
+    logger.Info("Starting distribution analysis", "replicas", replicas, "availableFailureDomains", availableCount)
 
     // Get actual machine distribution
     var machines clusterv1.MachineList
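The counting loop between this hunk and the next falls outside the diff context; judging by the names used below, it presumably builds currentDistribution along these lines (a sketch of elided code, not the repository's actual lines):

// Count control plane machines per failure domain (sketch of elided code).
currentDistribution := make(map[string]int)
for _, machine := range machines.Items {
	if machine.Spec.FailureDomain != nil {
		currentDistribution[*machine.Spec.FailureDomain]++
	}
}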
@@ -224,12 +277,27 @@ func (r *Reconciler) shouldImproveDistribution(
     }
 
     currentUsedCount := len(currentDistribution)
+    logger.Info(
+        "Current failure domain distribution",
+        "distribution",
+        currentDistribution,
+        "usedFailureDomains",
+        currentUsedCount,
+    )
 
     // If we have more available FDs than we're currently using, we might be able to improve
     if availableCount > currentUsedCount {
+        logger.Info("More failure domains available than currently used, analyzing improvement potential")
         // Calculate ideal distribution: spread replicas as evenly as possible
         baseReplicasPerFD := replicas / availableCount
         extraReplicas := replicas % availableCount
+        logger.Info(
+            "Calculated ideal distribution",
+            "baseReplicasPerFD",
+            baseReplicasPerFD,
+            "extraReplicas",
+            extraReplicas,
+        )
 
         // Check if current distribution is suboptimal
         // Ideal: some FDs have (baseReplicasPerFD + 1), others have baseReplicasPerFD
@@ -241,7 +309,7 @@ func (r *Reconciler) shouldImproveDistribution(
         // Check if any FD has more than the ideal maximum
         for fd, count := range currentDistribution {
             if count > maxIdealPerFD {
-                logger.V(5).Info("Found suboptimal distribution",
+                logger.Info("Found suboptimal distribution",
                     "failureDomain", fd,
                     "currentCount", count,
                     "maxIdeal", maxIdealPerFD,
@@ -300,7 +368,7 @@ func (r *Reconciler) shouldImproveDistribution(
         if canImprove {
             for _, fd := range availableFDs {
                 if _, used := currentDistribution[fd]; !used {
-                    logger.V(5).Info("Found unused failure domain that could improve distribution",
+                    logger.Info("Found unused failure domain that could improve distribution",
                         "unusedFailureDomain", fd,
                         "currentDistribution", currentDistribution,
                         "minCount", minCount,
@@ -312,8 +380,11 @@ func (r *Reconciler) shouldImproveDistribution(
                 }
             }
         }
+    } else {
+        logger.Info("Using all available failure domains or distribution cannot be improved")
     }
 
+    logger.Info("No distribution improvement possible")
     return false, nil
 }
 
@@ -322,6 +393,8 @@ func (r *Reconciler) getCurrentlyUsedFailureDomains(
     ctx context.Context,
     cluster *clusterv1.Cluster,
 ) ([]string, error) {
+    logger := ctrl.LoggerFrom(ctx).WithValues("cluster", client.ObjectKeyFromObject(cluster))
+
     var machines clusterv1.MachineList
     if err := r.List(ctx, &machines, client.InNamespace(cluster.Namespace), client.MatchingLabels{
         clusterv1.ClusterNameLabel: cluster.Name,
@@ -330,6 +403,8 @@ func (r *Reconciler) getCurrentlyUsedFailureDomains(
         return nil, fmt.Errorf("failed to list control plane machines: %w", err)
     }
 
+    logger.Info("Retrieved control plane machines", "machineCount", len(machines.Items))
+
     usedFailureDomains := make(map[string]struct{})
     for _, machine := range machines.Items {
         if machine.Spec.FailureDomain != nil {
@@ -341,6 +416,7 @@ func (r *Reconciler) getCurrentlyUsedFailureDomains(
     for fd := range usedFailureDomains {
         result = append(result, fd)
     }
+    logger.Info("Identified currently used failure domains", "usedFailureDomains", result)
     return result, nil
 }
 

pkg/controllers/failuredomainrollout/doc.go

Lines changed: 1 addition & 1 deletion
@@ -15,6 +15,6 @@
 //
 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters,verbs=get;list;watch
 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters/status,verbs=get
-// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=kubeadmcontrolplanes,verbs=get;list;watch;update;patch
+// +kubebuilder:rbac:groups=controlplane.cluster.x-k8s.io,resources=kubeadmcontrolplanes,verbs=get;list;watch;update;patch
 // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines,verbs=get;list;watch
 package failuredomainrollout
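The doc.go change corrects the API group in the RBAC marker: KubeadmControlPlane is served under controlplane.cluster.x-k8s.io, not cluster.x-k8s.io, so the old rule granted verbs on a resource that does not exist in that group and the Update calls added in the controller above would be rejected by RBAC. A quick way to confirm the group, assuming the project uses Cluster API's v1beta1 types:

package main

import (
	"fmt"

	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
)

func main() {
	// Prints "controlplane.cluster.x-k8s.io/v1beta1".
	fmt.Println(controlplanev1.GroupVersion.String())
}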
