@@ -822,19 +822,31 @@ func (dn *Daemon) updateOnClusterLayering(oldConfig, newConfig *mcfgv1.MachineCo
822822 }
823823
824824 if dn .nodeWriter != nil {
825- // Refetch node from lister to get fresh state before checking guard.
826- // This prevents overwriting Degraded/Unreconcilable states that were just set.
827- freshNode , err := dn .nodeLister .Get (dn .name )
828- if err != nil {
829- return fmt .Errorf ("error fetching fresh node state: %w" , err )
830- }
831- state , err := getNodeAnnotationExt (freshNode , constants .MachineConfigDaemonStateAnnotationKey , true )
825+ // First check dn.node (from informer cache)
826+ state , err := getNodeAnnotationExt (dn .node , constants .MachineConfigDaemonStateAnnotationKey , true )
832827 if err != nil {
833828 return err
834829 }
830+
835831 if state != constants .MachineConfigDaemonStateDegraded && state != constants .MachineConfigDaemonStateUnreconcilable {
836- if err := dn .nodeWriter .SetWorking (); err != nil {
837- return fmt .Errorf ("error setting node's state to Working: %w" , err )
832+ // Second check: fetch DIRECTLY from API server to bypass stale informer cache.
833+ // This prevents a race condition where SetDegraded() was called but the informer
834+ // cache hasn't synced yet, causing SetWorking() to overwrite the Degraded state.
835+ // See: https://issues.redhat.com/browse/OCPBUGS-71227
836+ freshNode , err := dn .kubeClient .CoreV1 ().Nodes ().Get (context .TODO (), dn .name , metav1.GetOptions {})
837+ if err != nil {
838+ return fmt .Errorf ("error fetching fresh node state from API: %w" , err )
839+ }
840+
841+ state , err = getNodeAnnotationExt (freshNode , constants .MachineConfigDaemonStateAnnotationKey , true )
842+ if err != nil {
843+ return err
844+ }
845+
846+ if state != constants .MachineConfigDaemonStateDegraded && state != constants .MachineConfigDaemonStateUnreconcilable {
847+ if err := dn .nodeWriter .SetWorking (); err != nil {
848+ return fmt .Errorf ("error setting node's state to Working: %w" , err )
849+ }
838850 }
839851 }
840852 }
@@ -1074,13 +1086,31 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
10741086 oldConfig = canonicalizeEmptyMC (oldConfig )
10751087
10761088 if dn .nodeWriter != nil {
1089+ // First check dn.node (from informer cache)
10771090 state , err := getNodeAnnotationExt (dn .node , constants .MachineConfigDaemonStateAnnotationKey , true )
10781091 if err != nil {
10791092 return err
10801093 }
1094+
10811095 if state != constants .MachineConfigDaemonStateDegraded && state != constants .MachineConfigDaemonStateUnreconcilable {
1082- if err := dn .nodeWriter .SetWorking (); err != nil {
1083- return fmt .Errorf ("error setting node's state to Working: %w" , err )
1096+ // Second check: fetch DIRECTLY from API server to bypass stale informer cache.
1097+ // This prevents a race condition where SetDegraded() was called but the informer
1098+ // cache hasn't synced yet, causing SetWorking() to overwrite the Degraded state.
1099+ // See: https://issues.redhat.com/browse/OCPBUGS-71227
1100+ freshNode , err := dn .kubeClient .CoreV1 ().Nodes ().Get (context .TODO (), dn .name , metav1.GetOptions {})
1101+ if err != nil {
1102+ return fmt .Errorf ("error fetching fresh node state from API: %w" , err )
1103+ }
1104+
1105+ state , err = getNodeAnnotationExt (freshNode , constants .MachineConfigDaemonStateAnnotationKey , true )
1106+ if err != nil {
1107+ return err
1108+ }
1109+
1110+ if state != constants .MachineConfigDaemonStateDegraded && state != constants .MachineConfigDaemonStateUnreconcilable {
1111+ if err := dn .nodeWriter .SetWorking (); err != nil {
1112+ return fmt .Errorf ("error setting node's state to Working: %w" , err )
1113+ }
10841114 }
10851115 }
10861116 }
0 commit comments