@@ -29,6 +29,7 @@ import (
29
29
k8serrors "k8s.io/apimachinery/pkg/api/errors"
30
30
"k8s.io/apimachinery/pkg/labels"
31
31
clientset "k8s.io/client-go/kubernetes"
32
+ "k8s.io/client-go/util/retry"
32
33
"k8s.io/klog/v2"
33
34
34
35
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -265,23 +266,70 @@ func (b *buildReconciler) updateMachineOSBuild(ctx context.Context, old, current
265
266
}
266
267
267
268
if ! oldState .IsBuildFailure () && curState .IsBuildFailure () {
268
- klog .Infof ("MachineOSBuild %s failed, leaving ephemeral objects in place for inspection" , current .Name )
269
+ klog .Infof ("MachineOSBuild %s failed, leaving ephemeral objects in place for inspection and setting BuildDegraded condition" , current .Name )
270
+
271
+ // Before setting BuildDegraded, check if another MachineOSBuild is active or if this one has been superseded
272
+ mosbList , err := b .getMachineOSBuildsForMachineOSConfig (mosc )
273
+ if err != nil {
274
+ return fmt .Errorf ("could not get MachineOSBuilds for MachineOSConfig %q: %w" , mosc .Name , err )
275
+ }
276
+
277
+ // Check if there are any newer or active builds that would supersede this failure
278
+ hasActiveBuild := false
279
+ isCurrentBuildStale := false
280
+ for _ , mosb := range mosbList {
281
+ // Skip the current failed build
282
+ if mosb .Name == current .Name {
283
+ continue
284
+ }
285
+ mosbState := ctrlcommon .NewMachineOSBuildState (mosb )
286
+ // Check if there's an active build (building, prepared, or succeeded)
287
+ if mosbState .IsBuilding () || mosbState .IsBuildPrepared () || mosbState .IsBuildSuccess () {
288
+ hasActiveBuild = true
289
+ klog .Infof ("Found active MachineOSBuild %s, skipping BuildDegraded condition for failed build %s" , mosb .Name , current .Name )
290
+ break
291
+ }
292
+ }
293
+
294
+ // Also check if the failed MachineOSBuild is no longer referenced by the MachineOSConfig
295
+ if mosc .Status .CurrentImagePullSpec != "" && ! isMachineOSBuildCurrentForMachineOSConfigWithPullspec (mosc , current ) {
296
+ isCurrentBuildStale = true
297
+ klog .Infof ("Failed MachineOSBuild %s is no longer current for MachineOSConfig %s, skipping BuildDegraded condition" , current .Name , mosc .Name )
298
+ }
299
+
300
+ // Only set BuildDegraded if there are no active builds and this build is still current
301
+ if hasActiveBuild || isCurrentBuildStale {
302
+ klog .Infof ("Skipping BuildDegraded condition for failed MachineOSBuild %s (hasActiveBuild=%v, isStale=%v)" , current .Name , hasActiveBuild , isCurrentBuildStale )
303
+ return nil
304
+ }
305
+
269
306
mcp , err := b .machineConfigPoolLister .Get (mosc .Spec .MachineConfigPool .Name )
270
307
if err != nil {
271
308
return fmt .Errorf ("could not get MachineConfigPool from MachineOSConfig %q: %w" , mosc .Name , err )
272
309
}
273
310
274
- // Just so I don't have to remove the mcp right now :P
275
- klog .Infof ("Target MachineConfigPool %q" , mcp .Name )
276
-
277
- // Implement code to degrade MCP
278
- return nil
311
+ // Set BuildDegraded condition
312
+ buildError := getBuildErrorFromMOSB (current )
313
+ return b .syncBuildFailureStatus (ctx , mcp , buildError , current .Name )
279
314
}
280
315
281
316
// If the build was successful, clean up the build objects and propagate the
282
317
// final image pushspec onto the MachineOSConfig object.
318
+ // Also clear BuildDegraded condition if it was set due to a previously failed build
283
319
if ! oldState .IsBuildSuccess () && curState .IsBuildSuccess () {
284
320
klog .Infof ("MachineOSBuild %s succeeded, cleaning up all ephemeral objects used for the build" , current .Name )
321
+
322
+ mcp , err := b .machineConfigPoolLister .Get (mosc .Spec .MachineConfigPool .Name )
323
+ if err != nil {
324
+ return fmt .Errorf ("could not get MachineConfigPool from MachineOSConfig %q: %w" , mosc .Name , err )
325
+ }
326
+
327
+ // Clear BuildDegraded condition if set
328
+ if err := b .syncBuildSuccessStatus (ctx , mcp ); err != nil {
329
+ klog .Errorf ("Failed to clear BuildDegraded condition for pool %s: %v" , mcp .Name , err )
330
+ }
331
+
332
+ // Clean up ephemeral objects
285
333
if err := imagebuilder .NewJobImageBuilder (b .kubeclient , b .mcfgclient , current , mosc ).Clean (ctx ); err != nil {
286
334
return err
287
335
}
@@ -412,6 +460,16 @@ func (b *buildReconciler) startBuild(ctx context.Context, mosb *mcfgv1.MachineOS
412
460
return fmt .Errorf ("could not update MachineOSConfig %q status for MachineOSBuild %q: %w" , mosc .Name , mosb .Name , err )
413
461
}
414
462
463
+ // Initialize BuildDegraded condition to False when build starts
464
+ mcp , err := b .machineConfigPoolLister .Get (mosc .Spec .MachineConfigPool .Name )
465
+ if err != nil {
466
+ return fmt .Errorf ("could not get MachineConfigPool from MachineOSConfig %q: %w" , mosc .Name , err )
467
+ }
468
+
469
+ if err := b .initializeBuildDegradedCondition (ctx , mcp ); err != nil {
470
+ klog .Errorf ("Failed to initialize BuildDegraded condition for pool %s: %v" , mcp .Name , err )
471
+ }
472
+
415
473
return nil
416
474
}
417
475
@@ -506,9 +564,10 @@ func (b *buildReconciler) createNewMachineOSBuildOrReuseExisting(ctx context.Con
506
564
return fmt .Errorf ("could not get MachineConfigPool %s for MachineOSConfig %s: %w" , mosc .Spec .MachineConfigPool .Name , mosc .Name , err )
507
565
}
508
566
509
- // TODO: Consider what we should do in the event of a degraded MachineConfigPool.
510
- if ctrlcommon .IsPoolAnyDegraded (mcp ) {
511
- return fmt .Errorf ("MachineConfigPool %s is degraded" , mcp .Name )
567
+ // Allow builds to retry when pool is degraded only due to BuildDegraded,
568
+ // but prevent builds for other types of degradation (NodeDegraded, RenderDegraded)
569
+ if b .shouldPreventBuildDueToDegradation (mcp ) {
570
+ return fmt .Errorf ("MachineConfigPool %s is degraded due to non-build issues" , mcp .Name )
512
571
}
513
572
514
573
// TODO: Consider using a ConfigMap lister to get this value instead of the API server.
@@ -952,6 +1011,11 @@ func (b *buildReconciler) deleteMOSBImage(ctx context.Context, mosb *mcfgv1.Mach
952
1011
return err
953
1012
}
954
1013
if err := b .imageclient .ImageV1 ().ImageStreamTags (ns ).Delete (context .TODO (), img , metav1.DeleteOptions {}); err != nil {
1014
+ // If the ImageStreamTag doesn't exist (e.g., build failed before pushing), ignore the error
1015
+ if k8serrors .IsNotFound (err ) {
1016
+ klog .Infof ("ImageStreamTag %s not found for MachineOSBuild %s, skipping deletion (likely failed build)" , img , mosb .Name )
1017
+ return nil
1018
+ }
955
1019
return fmt .Errorf ("could not delete image %s from internal registry for MachineOSBuild %s: %w" , image , mosb .Name , err )
956
1020
}
957
1021
return nil
@@ -1549,6 +1613,29 @@ func (b *buildReconciler) deleteImage(ctx context.Context, pullspec string, mosb
1549
1613
return b .imagepruner .DeleteImage (ctx , pullspec , secret , cc )
1550
1614
}
1551
1615
1616
+ // Determines if builds should be prevented due to pool degradation.
1617
+ // Returns true if pool has ANY degraded condition other than BuildDegraded,
1618
+ // but false if degraded ONLY due to BuildDegraded (to allow retry attempts).
1619
+ func (b * buildReconciler ) shouldPreventBuildDueToDegradation (mcp * mcfgv1.MachineConfigPool ) bool {
1620
+ // Check for ALL degradation conditions except BuildDegraded that should prevent new builds
1621
+ // We intentionally exclude BuildDegraded to allow retries
1622
+ // We check specific conditions rather than overall Degraded since Degraded=True could be due to BuildDegraded alone
1623
+ nonBuildDegradedTypes := []mcfgv1.MachineConfigPoolConditionType {
1624
+ mcfgv1 .MachineConfigPoolNodeDegraded ,
1625
+ mcfgv1 .MachineConfigPoolRenderDegraded ,
1626
+ mcfgv1 .MachineConfigPoolPinnedImageSetsDegraded ,
1627
+ mcfgv1 .MachineConfigPoolSynchronizerDegraded ,
1628
+ }
1629
+
1630
+ for _ , condType := range nonBuildDegradedTypes {
1631
+ if apihelpers .IsMachineConfigPoolConditionTrue (mcp .Status .Conditions , condType ) {
1632
+ return true
1633
+ }
1634
+ }
1635
+
1636
+ return false
1637
+ }
1638
+
1552
1639
// reconcileImageRebuild calls RequiresRebuild to see if an MC changes the kernel args, ext, or osimageurl.
1553
1640
// if it does, we build a new image in our new MOSB
1554
1641
func (b * buildReconciler ) reconcileImageRebuild (oldMCP , curMCP * mcfgv1.MachineConfigPool ) (bool , error ) {
@@ -1564,3 +1651,65 @@ func (b *buildReconciler) reconcileImageRebuild(oldMCP, curMCP *mcfgv1.MachineCo
1564
1651
1565
1652
return ctrlcommon .RequiresRebuild (curr , des ), nil
1566
1653
}
1654
+
1655
+ // Clears BuildDegraded condition when a new build starts (allowing retry after failure)
1656
+ func (b * buildReconciler ) initializeBuildDegradedCondition (ctx context.Context , pool * mcfgv1.MachineConfigPool ) error {
1657
+ // Check if BuildDegraded condition is already False - if so, no update needed
1658
+ if apihelpers .IsMachineConfigPoolConditionFalse (pool .Status .Conditions , mcfgv1 .MachineConfigPoolImageBuildDegraded ) {
1659
+ return nil
1660
+ }
1661
+
1662
+ // Clear BuildDegraded condition (even if it was True from previous failure) when new build starts
1663
+ return retry .RetryOnConflict (retry .DefaultRetry , func () error {
1664
+ // Get fresh copy for update to avoid conflicts
1665
+ currentPool , err := b .mcfgclient .MachineconfigurationV1 ().MachineConfigPools ().Get (ctx , pool .Name , metav1.GetOptions {})
1666
+ if err != nil {
1667
+ return err
1668
+ }
1669
+
1670
+ buildDegraded := apihelpers .NewMachineConfigPoolCondition (mcfgv1 .MachineConfigPoolImageBuildDegraded , corev1 .ConditionFalse , "BuildStarted" , "Build started for pool " + currentPool .Name )
1671
+ apihelpers .SetMachineConfigPoolCondition (& currentPool .Status , * buildDegraded )
1672
+
1673
+ _ , err = b .mcfgclient .MachineconfigurationV1 ().MachineConfigPools ().UpdateStatus (ctx , currentPool , metav1.UpdateOptions {})
1674
+ return err
1675
+ })
1676
+ }
1677
+
1678
+ // Clears BuildDegraded condition when build succeeds
1679
+ func (b * buildReconciler ) syncBuildSuccessStatus (ctx context.Context , pool * mcfgv1.MachineConfigPool ) error {
1680
+ return retry .RetryOnConflict (retry .DefaultRetry , func () error {
1681
+ // Get fresh copy for update to avoid conflicts
1682
+ currentPool , err := b .mcfgclient .MachineconfigurationV1 ().MachineConfigPools ().Get (ctx , pool .Name , metav1.GetOptions {})
1683
+ if err != nil {
1684
+ return err
1685
+ }
1686
+
1687
+ buildDegraded := apihelpers .NewMachineConfigPoolCondition (mcfgv1 .MachineConfigPoolImageBuildDegraded , corev1 .ConditionFalse , "BuildSucceeded" , "Build succeeded for pool " + currentPool .Name )
1688
+ apihelpers .SetMachineConfigPoolCondition (& currentPool .Status , * buildDegraded )
1689
+
1690
+ _ , err = b .mcfgclient .MachineconfigurationV1 ().MachineConfigPools ().UpdateStatus (ctx , currentPool , metav1.UpdateOptions {})
1691
+ return err
1692
+ })
1693
+ }
1694
+
1695
+ // Sets BuildDegraded condition when build fails
1696
+ func (b * buildReconciler ) syncBuildFailureStatus (ctx context.Context , pool * mcfgv1.MachineConfigPool , buildErr error , mosbName string ) error {
1697
+ updateErr := retry .RetryOnConflict (retry .DefaultRetry , func () error {
1698
+ // Get fresh copy for update to avoid conflicts
1699
+ currentPool , err := b .mcfgclient .MachineconfigurationV1 ().MachineConfigPools ().Get (ctx , pool .Name , metav1.GetOptions {})
1700
+ if err != nil {
1701
+ return err
1702
+ }
1703
+
1704
+ // The message content may be truncated https://github.com/kubernetes/apimachinery/blob/f5dd29d6ada12819a4a6ddc97d5bdf812f8a1cad/pkg/apis/meta/v1/types.go#L1619-L1635
1705
+ buildDegraded := apihelpers .NewMachineConfigPoolCondition (mcfgv1 .MachineConfigPoolImageBuildDegraded , corev1 .ConditionTrue , "BuildFailed" , fmt .Sprintf ("Failed to build OS image for pool %s (MachineOSBuild: %s): %v" , currentPool .Name , mosbName , buildErr ))
1706
+ apihelpers .SetMachineConfigPoolCondition (& currentPool .Status , * buildDegraded )
1707
+
1708
+ _ , updateErr := b .mcfgclient .MachineconfigurationV1 ().MachineConfigPools ().UpdateStatus (ctx , currentPool , metav1.UpdateOptions {})
1709
+ return updateErr
1710
+ })
1711
+ if updateErr != nil {
1712
+ klog .Errorf ("Error updating MachineConfigPool %s BuildDegraded status: %v" , pool .Name , updateErr )
1713
+ }
1714
+ return buildErr
1715
+ }
0 commit comments