@@ -52,31 +52,40 @@ func (r *Reconciler) SetupWithManager(
5252
5353func (r * Reconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
5454 logger := ctrl .LoggerFrom (ctx ).WithValues ("cluster" , req .NamespacedName )
55+ logger .Info ("Starting failure domain rollout reconciliation" )
5556
5657 var cluster clusterv1.Cluster
5758 if err := r .Get (ctx , req .NamespacedName , & cluster ); err != nil {
5859 if apierrors .IsNotFound (err ) {
59- logger .V ( 5 ). Info ("Cluster not found, skipping reconciliation" )
60+ logger .Info ("Cluster not found, skipping reconciliation" )
6061 return ctrl.Result {}, nil
6162 }
6263 return ctrl.Result {}, fmt .Errorf ("failed to get Cluster %s: %w" , req .NamespacedName , err )
6364 }
6465
66+ logger .Info (
67+ "Successfully retrieved cluster" ,
68+ "hasTopology" ,
69+ cluster .Spec .Topology != nil ,
70+ "failureDomainCount" ,
71+ len (cluster .Status .FailureDomains ),
72+ )
73+
6574 // Skip if cluster is not using topology
6675 if cluster .Spec .Topology == nil {
67- logger .V ( 5 ). Info ("Cluster is not using topology, skipping reconciliation" )
76+ logger .Info ("Cluster is not using topology, skipping reconciliation" )
6877 return ctrl.Result {}, nil
6978 }
7079
7180 // Skip if cluster doesn't have a control plane reference
7281 if cluster .Spec .ControlPlaneRef == nil {
73- logger .V ( 5 ). Info ("Cluster has no control plane reference, skipping reconciliation" )
82+ logger .Info ("Cluster has no control plane reference, skipping reconciliation" )
7483 return ctrl.Result {}, nil
7584 }
7685
7786 // Skip if cluster doesn't have failure domains
7887 if len (cluster .Status .FailureDomains ) == 0 {
79- logger .V ( 5 ). Info ("Cluster has no failure domains, skipping reconciliation" )
88+ logger .Info ("Cluster has no failure domains, skipping reconciliation" )
8089 return ctrl.Result {}, nil
8190 }
8291
@@ -88,13 +97,16 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
8897 }
8998 if err := r .Get (ctx , kcpKey , & kcp ); err != nil {
9099 if apierrors .IsNotFound (err ) {
91- logger .V ( 5 ). Info ("KubeAdmControlPlane not found, skipping reconciliation" )
100+ logger .Info ("KubeAdmControlPlane not found, skipping reconciliation" )
92101 return ctrl.Result {}, nil
93102 }
94103 return ctrl.Result {}, fmt .Errorf ("failed to get KubeAdmControlPlane %s: %w" , kcpKey , err )
95104 }
96105
106+ logger .Info ("Successfully retrieved KubeAdmControlPlane" , "kcpName" , kcp .Name , "replicas" , kcp .Spec .Replicas )
107+
97108 // Check if we need to trigger a rollout
109+ logger .Info ("Evaluating rollout necessity" )
98110 needsRollout , reason , err := r .shouldTriggerRollout (ctx , & cluster , & kcp )
99111 if err != nil {
100112 return ctrl.Result {}, fmt .Errorf ("failed to determine if rollout is needed: %w" , err )
@@ -117,14 +129,36 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
117129 // Store the current failure domain hash
118130 fdHash := r .calculateFailureDomainHash (cluster .Status .FailureDomains )
119131 kcpCopy .Annotations [FailureDomainHashAnnotation ] = fdHash
132+ logger .Info ("Updating KubeAdmControlPlane with rollout trigger" , "failureDomainHash" , fdHash )
120133
121134 if err := r .Update (ctx , kcpCopy ); err != nil {
122135 return ctrl.Result {}, fmt .Errorf ("failed to update KubeAdmControlPlane %s: %w" , kcpKey , err )
123136 }
124137
125138 logger .Info ("Successfully triggered rollout" , "rolloutAfter" , now .Format (time .RFC3339 ))
139+ } else {
140+ logger .Info ("No rollout needed" , "reason" , "failure domain configuration is stable" )
141+
142+ // Check if we need to store the hash for the first time (when no previous hash exists)
143+ currentHash := r .calculateFailureDomainHash (cluster .Status .FailureDomains )
144+ _ , hasStoredHash := kcp .Annotations [FailureDomainHashAnnotation ]
145+
146+ if ! hasStoredHash {
147+ logger .Info ("Storing initial failure domain hash" , "failureDomainHash" , currentHash )
148+ kcpCopy := kcp .DeepCopy ()
149+ if kcpCopy .Annotations == nil {
150+ kcpCopy .Annotations = make (map [string ]string )
151+ }
152+ kcpCopy .Annotations [FailureDomainHashAnnotation ] = currentHash
153+
154+ if err := r .Update (ctx , kcpCopy ); err != nil {
155+ return ctrl.Result {}, fmt .Errorf ("failed to store initial failure domain hash %s: %w" , kcpKey , err )
156+ }
157+ logger .Info ("Successfully stored initial failure domain hash" )
158+ }
126159 }
127160
161+ logger .Info ("Completed failure domain rollout reconciliation" )
128162 return ctrl.Result {}, nil
129163}
130164
@@ -138,54 +172,71 @@ func (r *Reconciler) shouldTriggerRollout(
138172
139173 // Calculate current failure domain hash
140174 currentFDHash := r .calculateFailureDomainHash (cluster .Status .FailureDomains )
175+ logger .Info ("Calculated current failure domain hash" , "hash" , currentFDHash )
141176
142177 // Get the stored hash from annotations
143178 storedFDHash , hasStoredHash := kcp .Annotations [FailureDomainHashAnnotation ]
179+ logger .Info ("Retrieved stored failure domain hash" , "hash" , storedFDHash , "hasStoredHash" , hasStoredHash )
144180
145181 // If no stored hash, this is the first time - store the hash but don't trigger rollout
146182 if ! hasStoredHash {
147- logger .V ( 5 ). Info ("No previous failure domain hash found, storing current hash" )
183+ logger .Info ("No previous failure domain hash found, storing current hash" )
148184 return false , "" , nil
149185 }
150186
151187 // If hashes are the same, no changes detected
152188 if currentFDHash == storedFDHash {
153- logger .V ( 5 ). Info ("No failure domain changes detected" )
189+ logger .Info ("No failure domain changes detected" )
154190 return false , "" , nil
155191 }
156192
193+ logger .Info (
194+ "Failure domain hash changed, analyzing impact" ,
195+ "currentHash" ,
196+ currentFDHash ,
197+ "storedHash" ,
198+ storedFDHash ,
199+ )
200+
157201 // Get the current KCP machines to understand current placement
158202 currentlyUsedFailureDomains , err := r .getCurrentlyUsedFailureDomains (ctx , cluster )
159203 if err != nil {
160204 return false , "" , fmt .Errorf ("failed to get currently used failure domains: %w" , err )
161205 }
162206
163- logger .V ( 5 ). Info ("Analyzing failure domain changes" ,
207+ logger .Info ("Analyzing failure domain changes" ,
164208 "currentlyUsedFailureDomains" , currentlyUsedFailureDomains ,
165209 "availableFailureDomains" , getAvailableFailureDomains (cluster .Status .FailureDomains ))
166210
167211 // Check if any currently used failure domain is disabled or removed
212+ logger .Info ("Checking if used failure domains are still valid" )
168213 for _ , usedFD := range currentlyUsedFailureDomains {
169214 if fd , exists := cluster .Status .FailureDomains [usedFD ]; ! exists {
215+ logger .Info ("Found removed failure domain in use" , "failureDomain" , usedFD )
170216 return true , fmt .Sprintf ("failure domain %s is removed" , usedFD ), nil
171217 } else if ! fd .ControlPlane {
218+ logger .Info ("Found disabled failure domain in use" , "failureDomain" , usedFD )
172219 return true , fmt .Sprintf ("failure domain %s is disabled for control plane" , usedFD ), nil
173220 }
174221 }
222+ logger .Info ("All used failure domains are still valid" )
175223
176224 // Check if the distribution could be improved
177225 availableFDs := getAvailableFailureDomains (cluster .Status .FailureDomains )
226+ logger .Info ("Checking if distribution can be improved" , "availableFailureDomains" , availableFDs )
178227 shouldImprove , err := r .shouldImproveDistribution (ctx , cluster , kcp , availableFDs )
179228 if err != nil {
180229 return false , "" , fmt .Errorf ("failed to check distribution improvement: %w" , err )
181230 }
182231 if shouldImprove {
232+ logger .Info ("Distribution can be improved, triggering rollout" )
183233 return true , "failure domain distribution could be improved for better fault tolerance" , nil
184234 }
235+ logger .Info ("Distribution is already optimal, no improvement needed" )
185236
186237 // If we reach here, failure domains changed but no meaningful impact
187238 // (e.g., new failure domains added but existing ones still valid)
188- logger .V ( 5 ). Info ("Failure domains changed but no meaningful impact detected" )
239+ logger .Info ("Failure domains changed but no meaningful impact detected" )
189240 return false , "" , nil
190241}
191242
@@ -200,11 +251,13 @@ func (r *Reconciler) shouldImproveDistribution(
200251
201252 // Skip if no replicas specified
202253 if kcp .Spec .Replicas == nil {
254+ logger .Info ("No replicas specified, skipping distribution analysis" )
203255 return false , nil
204256 }
205257
206258 replicas := int (* kcp .Spec .Replicas )
207259 availableCount := len (availableFDs )
260+ logger .Info ("Starting distribution analysis" , "replicas" , replicas , "availableFailureDomains" , availableCount )
208261
209262 // Get actual machine distribution
210263 var machines clusterv1.MachineList
@@ -224,12 +277,27 @@ func (r *Reconciler) shouldImproveDistribution(
224277 }
225278
226279 currentUsedCount := len (currentDistribution )
280+ logger .Info (
281+ "Current failure domain distribution" ,
282+ "distribution" ,
283+ currentDistribution ,
284+ "usedFailureDomains" ,
285+ currentUsedCount ,
286+ )
227287
228288 // If we have more available FDs than we're currently using, we might be able to improve
229289 if availableCount > currentUsedCount {
290+ logger .Info ("More failure domains available than currently used, analyzing improvement potential" )
230291 // Calculate ideal distribution: spread replicas as evenly as possible
231292 baseReplicasPerFD := replicas / availableCount
232293 extraReplicas := replicas % availableCount
294+ logger .Info (
295+ "Calculated ideal distribution" ,
296+ "baseReplicasPerFD" ,
297+ baseReplicasPerFD ,
298+ "extraReplicas" ,
299+ extraReplicas ,
300+ )
233301
234302 // Check if current distribution is suboptimal
235303 // Ideal: some FDs have (baseReplicasPerFD + 1), others have baseReplicasPerFD
@@ -241,7 +309,7 @@ func (r *Reconciler) shouldImproveDistribution(
241309 // Check if any FD has more than the ideal maximum
242310 for fd , count := range currentDistribution {
243311 if count > maxIdealPerFD {
244- logger .V ( 5 ). Info ("Found suboptimal distribution" ,
312+ logger .Info ("Found suboptimal distribution" ,
245313 "failureDomain" , fd ,
246314 "currentCount" , count ,
247315 "maxIdeal" , maxIdealPerFD ,
@@ -300,7 +368,7 @@ func (r *Reconciler) shouldImproveDistribution(
300368 if canImprove {
301369 for _ , fd := range availableFDs {
302370 if _ , used := currentDistribution [fd ]; ! used {
303- logger .V ( 5 ). Info ("Found unused failure domain that could improve distribution" ,
371+ logger .Info ("Found unused failure domain that could improve distribution" ,
304372 "unusedFailureDomain" , fd ,
305373 "currentDistribution" , currentDistribution ,
306374 "minCount" , minCount ,
@@ -312,8 +380,11 @@ func (r *Reconciler) shouldImproveDistribution(
312380 }
313381 }
314382 }
383+ } else {
384+ logger .Info ("Using all available failure domains or distribution cannot be improved" )
315385 }
316386
387+ logger .Info ("No distribution improvement possible" )
317388 return false , nil
318389}
319390
@@ -322,6 +393,8 @@ func (r *Reconciler) getCurrentlyUsedFailureDomains(
322393 ctx context.Context ,
323394 cluster * clusterv1.Cluster ,
324395) ([]string , error ) {
396+ logger := ctrl .LoggerFrom (ctx ).WithValues ("cluster" , client .ObjectKeyFromObject (cluster ))
397+
325398 var machines clusterv1.MachineList
326399 if err := r .List (ctx , & machines , client .InNamespace (cluster .Namespace ), client.MatchingLabels {
327400 clusterv1 .ClusterNameLabel : cluster .Name ,
@@ -330,6 +403,8 @@ func (r *Reconciler) getCurrentlyUsedFailureDomains(
330403 return nil , fmt .Errorf ("failed to list control plane machines: %w" , err )
331404 }
332405
406+ logger .Info ("Retrieved control plane machines" , "machineCount" , len (machines .Items ))
407+
333408 usedFailureDomains := make (map [string ]struct {})
334409 for _ , machine := range machines .Items {
335410 if machine .Spec .FailureDomain != nil {
@@ -341,6 +416,7 @@ func (r *Reconciler) getCurrentlyUsedFailureDomains(
341416 for fd := range usedFailureDomains {
342417 result = append (result , fd )
343418 }
419+ logger .Info ("Identified currently used failure domains" , "usedFailureDomains" , result )
344420 return result , nil
345421}
346422
0 commit comments