Skip to content

Commit 67d8fd6

Browse files
Merge pull request #5251 from umohnani8/mcp-d2
MCO-1536: Add new ImageBuildDegraded status option to MCP
2 parents 58dbcb8 + fff5a04 commit 67d8fd6

File tree

8 files changed

+435
-41
lines changed

8 files changed

+435
-41
lines changed

cmd/machine-config-controller/start.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ func createControllers(ctx *ctrlcommon.ControllerContext) []ctrlcommon.Controlle
239239
ctx.KubeInformerFactory.Core().V1().Nodes(),
240240
ctx.KubeInformerFactory.Core().V1().Pods(),
241241
ctx.OCLInformerFactory.Machineconfiguration().V1().MachineOSConfigs(),
242+
ctx.OCLInformerFactory.Machineconfiguration().V1().MachineOSBuilds(),
242243
ctx.ConfigInformerFactory.Config().V1().Schedulers(),
243244
ctx.ClientBuilder.KubeClientOrDie("node-update-controller"),
244245
ctx.ClientBuilder.MachineConfigClientOrDie("node-update-controller"),

pkg/controller/build/helpers.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,3 +289,15 @@ func extractNSAndNameWithTag(imageRef string) (string, string, error) {
289289

290290
return parts[1], parts[2], nil
291291
}
292+
293+
var errUnknownBuildFailure = fmt.Errorf("build failed for unknown reason")
294+
295+
// Extracts meaningful error from MachineOSBuild
296+
func getBuildErrorFromMOSB(mosb *mcfgv1.MachineOSBuild) error {
297+
for _, condition := range mosb.Status.Conditions {
298+
if condition.Type == "Failed" && condition.Status == metav1.ConditionTrue {
299+
return fmt.Errorf("%s: %s", condition.Reason, condition.Message)
300+
}
301+
}
302+
return errUnknownBuildFailure
303+
}

pkg/controller/build/reconciler.go

Lines changed: 158 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
k8serrors "k8s.io/apimachinery/pkg/api/errors"
3030
"k8s.io/apimachinery/pkg/labels"
3131
clientset "k8s.io/client-go/kubernetes"
32+
"k8s.io/client-go/util/retry"
3233
"k8s.io/klog/v2"
3334

3435
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -265,23 +266,70 @@ func (b *buildReconciler) updateMachineOSBuild(ctx context.Context, old, current
265266
}
266267

267268
if !oldState.IsBuildFailure() && curState.IsBuildFailure() {
268-
klog.Infof("MachineOSBuild %s failed, leaving ephemeral objects in place for inspection", current.Name)
269+
klog.Infof("MachineOSBuild %s failed, leaving ephemeral objects in place for inspection and setting BuildDegraded condition", current.Name)
270+
271+
// Before setting BuildDegraded, check if another MachineOSBuild is active or if this one has been superseded
272+
mosbList, err := b.getMachineOSBuildsForMachineOSConfig(mosc)
273+
if err != nil {
274+
return fmt.Errorf("could not get MachineOSBuilds for MachineOSConfig %q: %w", mosc.Name, err)
275+
}
276+
277+
// Check if there are any newer or active builds that would supersede this failure
278+
hasActiveBuild := false
279+
isCurrentBuildStale := false
280+
for _, mosb := range mosbList {
281+
// Skip the current failed build
282+
if mosb.Name == current.Name {
283+
continue
284+
}
285+
mosbState := ctrlcommon.NewMachineOSBuildState(mosb)
286+
// Check if there's an active build (building, prepared, or succeeded)
287+
if mosbState.IsBuilding() || mosbState.IsBuildPrepared() || mosbState.IsBuildSuccess() {
288+
hasActiveBuild = true
289+
klog.Infof("Found active MachineOSBuild %s, skipping BuildDegraded condition for failed build %s", mosb.Name, current.Name)
290+
break
291+
}
292+
}
293+
294+
// Also check if the failed MachineOSBuild is no longer referenced by the MachineOSConfig
295+
if mosc.Status.CurrentImagePullSpec != "" && !isMachineOSBuildCurrentForMachineOSConfigWithPullspec(mosc, current) {
296+
isCurrentBuildStale = true
297+
klog.Infof("Failed MachineOSBuild %s is no longer current for MachineOSConfig %s, skipping BuildDegraded condition", current.Name, mosc.Name)
298+
}
299+
300+
// Only set BuildDegraded if there are no active builds and this build is still current
301+
if hasActiveBuild || isCurrentBuildStale {
302+
klog.Infof("Skipping BuildDegraded condition for failed MachineOSBuild %s (hasActiveBuild=%v, isStale=%v)", current.Name, hasActiveBuild, isCurrentBuildStale)
303+
return nil
304+
}
305+
269306
mcp, err := b.machineConfigPoolLister.Get(mosc.Spec.MachineConfigPool.Name)
270307
if err != nil {
271308
return fmt.Errorf("could not get MachineConfigPool from MachineOSConfig %q: %w", mosc.Name, err)
272309
}
273310

274-
// Just so I don't have to remove the mcp right now :P
275-
klog.Infof("Target MachineConfigPool %q", mcp.Name)
276-
277-
// Implement code to degrade MCP
278-
return nil
311+
// Set BuildDegraded condition
312+
buildError := getBuildErrorFromMOSB(current)
313+
return b.syncBuildFailureStatus(ctx, mcp, buildError, current.Name)
279314
}
280315

281316
// If the build was successful, clean up the build objects and propagate the
282317
// final image pushspec onto the MachineOSConfig object.
318+
// Also clear BuildDegraded condition if it was set due to a previously failed build
283319
if !oldState.IsBuildSuccess() && curState.IsBuildSuccess() {
284320
klog.Infof("MachineOSBuild %s succeeded, cleaning up all ephemeral objects used for the build", current.Name)
321+
322+
mcp, err := b.machineConfigPoolLister.Get(mosc.Spec.MachineConfigPool.Name)
323+
if err != nil {
324+
return fmt.Errorf("could not get MachineConfigPool from MachineOSConfig %q: %w", mosc.Name, err)
325+
}
326+
327+
// Clear BuildDegraded condition if set
328+
if err := b.syncBuildSuccessStatus(ctx, mcp); err != nil {
329+
klog.Errorf("Failed to clear BuildDegraded condition for pool %s: %v", mcp.Name, err)
330+
}
331+
332+
// Clean up ephemeral objects
285333
if err := imagebuilder.NewJobImageBuilder(b.kubeclient, b.mcfgclient, current, mosc).Clean(ctx); err != nil {
286334
return err
287335
}
@@ -412,6 +460,16 @@ func (b *buildReconciler) startBuild(ctx context.Context, mosb *mcfgv1.MachineOS
412460
return fmt.Errorf("could not update MachineOSConfig %q status for MachineOSBuild %q: %w", mosc.Name, mosb.Name, err)
413461
}
414462

463+
// Initialize BuildDegraded condition to False when build starts
464+
mcp, err := b.machineConfigPoolLister.Get(mosc.Spec.MachineConfigPool.Name)
465+
if err != nil {
466+
return fmt.Errorf("could not get MachineConfigPool from MachineOSConfig %q: %w", mosc.Name, err)
467+
}
468+
469+
if err := b.initializeBuildDegradedCondition(ctx, mcp); err != nil {
470+
klog.Errorf("Failed to initialize BuildDegraded condition for pool %s: %v", mcp.Name, err)
471+
}
472+
415473
return nil
416474
}
417475

@@ -506,9 +564,10 @@ func (b *buildReconciler) createNewMachineOSBuildOrReuseExisting(ctx context.Con
506564
return fmt.Errorf("could not get MachineConfigPool %s for MachineOSConfig %s: %w", mosc.Spec.MachineConfigPool.Name, mosc.Name, err)
507565
}
508566

509-
// TODO: Consider what we should do in the event of a degraded MachineConfigPool.
510-
if ctrlcommon.IsPoolAnyDegraded(mcp) {
511-
return fmt.Errorf("MachineConfigPool %s is degraded", mcp.Name)
567+
// Allow builds to retry when pool is degraded only due to BuildDegraded,
568+
// but prevent builds for other types of degradation (NodeDegraded, RenderDegraded)
569+
if b.shouldPreventBuildDueToDegradation(mcp) {
570+
return fmt.Errorf("MachineConfigPool %s is degraded due to non-build issues", mcp.Name)
512571
}
513572

514573
// TODO: Consider using a ConfigMap lister to get this value instead of the API server.
@@ -952,6 +1011,11 @@ func (b *buildReconciler) deleteMOSBImage(ctx context.Context, mosb *mcfgv1.Mach
9521011
return err
9531012
}
9541013
if err := b.imageclient.ImageV1().ImageStreamTags(ns).Delete(context.TODO(), img, metav1.DeleteOptions{}); err != nil {
1014+
// If the ImageStreamTag doesn't exist (e.g., build failed before pushing), ignore the error
1015+
if k8serrors.IsNotFound(err) {
1016+
klog.Infof("ImageStreamTag %s not found for MachineOSBuild %s, skipping deletion (likely failed build)", img, mosb.Name)
1017+
return nil
1018+
}
9551019
return fmt.Errorf("could not delete image %s from internal registry for MachineOSBuild %s: %w", image, mosb.Name, err)
9561020
}
9571021
return nil
@@ -1549,6 +1613,29 @@ func (b *buildReconciler) deleteImage(ctx context.Context, pullspec string, mosb
15491613
return b.imagepruner.DeleteImage(ctx, pullspec, secret, cc)
15501614
}
15511615

1616+
// Determines if builds should be prevented due to pool degradation.
1617+
// Returns true if pool has ANY degraded condition other than BuildDegraded,
1618+
// but false if degraded ONLY due to BuildDegraded (to allow retry attempts).
1619+
func (b *buildReconciler) shouldPreventBuildDueToDegradation(mcp *mcfgv1.MachineConfigPool) bool {
1620+
// Check for ALL degradation conditions except BuildDegraded that should prevent new builds
1621+
// We intentionally exclude BuildDegraded to allow retries
1622+
// We check specific conditions rather than overall Degraded since Degraded=True could be due to BuildDegraded alone
1623+
nonBuildDegradedTypes := []mcfgv1.MachineConfigPoolConditionType{
1624+
mcfgv1.MachineConfigPoolNodeDegraded,
1625+
mcfgv1.MachineConfigPoolRenderDegraded,
1626+
mcfgv1.MachineConfigPoolPinnedImageSetsDegraded,
1627+
mcfgv1.MachineConfigPoolSynchronizerDegraded,
1628+
}
1629+
1630+
for _, condType := range nonBuildDegradedTypes {
1631+
if apihelpers.IsMachineConfigPoolConditionTrue(mcp.Status.Conditions, condType) {
1632+
return true
1633+
}
1634+
}
1635+
1636+
return false
1637+
}
1638+
15521639
// reconcileImageRebuild calls RequiresRebuild to see if an MC changes the kernel args, ext, or osimageurl.
15531640
// if it does, we build a new image in our new MOSB
15541641
func (b *buildReconciler) reconcileImageRebuild(oldMCP, curMCP *mcfgv1.MachineConfigPool) (bool, error) {
@@ -1564,3 +1651,65 @@ func (b *buildReconciler) reconcileImageRebuild(oldMCP, curMCP *mcfgv1.MachineCo
15641651

15651652
return ctrlcommon.RequiresRebuild(curr, des), nil
15661653
}
1654+
1655+
// Clears BuildDegraded condition when a new build starts (allowing retry after failure)
1656+
func (b *buildReconciler) initializeBuildDegradedCondition(ctx context.Context, pool *mcfgv1.MachineConfigPool) error {
1657+
// Check if BuildDegraded condition is already False - if so, no update needed
1658+
if apihelpers.IsMachineConfigPoolConditionFalse(pool.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) {
1659+
return nil
1660+
}
1661+
1662+
// Clear BuildDegraded condition (even if it was True from previous failure) when new build starts
1663+
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
1664+
// Get fresh copy for update to avoid conflicts
1665+
currentPool, err := b.mcfgclient.MachineconfigurationV1().MachineConfigPools().Get(ctx, pool.Name, metav1.GetOptions{})
1666+
if err != nil {
1667+
return err
1668+
}
1669+
1670+
buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildStarted", "Build started for pool "+currentPool.Name)
1671+
apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)
1672+
1673+
_, err = b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})
1674+
return err
1675+
})
1676+
}
1677+
1678+
// Clears BuildDegraded condition when build succeeds
1679+
func (b *buildReconciler) syncBuildSuccessStatus(ctx context.Context, pool *mcfgv1.MachineConfigPool) error {
1680+
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
1681+
// Get fresh copy for update to avoid conflicts
1682+
currentPool, err := b.mcfgclient.MachineconfigurationV1().MachineConfigPools().Get(ctx, pool.Name, metav1.GetOptions{})
1683+
if err != nil {
1684+
return err
1685+
}
1686+
1687+
buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildSucceeded", "Build succeeded for pool "+currentPool.Name)
1688+
apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)
1689+
1690+
_, err = b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})
1691+
return err
1692+
})
1693+
}
1694+
1695+
// Sets BuildDegraded condition when build fails
1696+
func (b *buildReconciler) syncBuildFailureStatus(ctx context.Context, pool *mcfgv1.MachineConfigPool, buildErr error, mosbName string) error {
1697+
updateErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
1698+
// Get fresh copy for update to avoid conflicts
1699+
currentPool, err := b.mcfgclient.MachineconfigurationV1().MachineConfigPools().Get(ctx, pool.Name, metav1.GetOptions{})
1700+
if err != nil {
1701+
return err
1702+
}
1703+
1704+
// The message content may be truncated https://github.com/kubernetes/apimachinery/blob/f5dd29d6ada12819a4a6ddc97d5bdf812f8a1cad/pkg/apis/meta/v1/types.go#L1619-L1635
1705+
buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionTrue, "BuildFailed", fmt.Sprintf("Failed to build OS image for pool %s (MachineOSBuild: %s): %v", currentPool.Name, mosbName, buildErr))
1706+
apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)
1707+
1708+
_, updateErr := b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})
1709+
return updateErr
1710+
})
1711+
if updateErr != nil {
1712+
klog.Errorf("Error updating MachineConfigPool %s BuildDegraded status: %v", pool.Name, updateErr)
1713+
}
1714+
return buildErr
1715+
}

0 commit comments

Comments
 (0)