Skip to content

Commit 63e080a

Browse files
fix pivot error
1 parent 0b05692 commit 63e080a

File tree

1 file changed

+41
-11
lines changed

1 file changed

+41
-11
lines changed

pkg/daemon/update.go

Lines changed: 41 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -822,19 +822,31 @@ func (dn *Daemon) updateOnClusterLayering(oldConfig, newConfig *mcfgv1.MachineCo
822822
}
823823

824824
if dn.nodeWriter != nil {
825-
// Refetch node from lister to get fresh state before checking guard.
826-
// This prevents overwriting Degraded/Unreconcilable states that were just set.
827-
freshNode, err := dn.nodeLister.Get(dn.name)
828-
if err != nil {
829-
return fmt.Errorf("error fetching fresh node state: %w", err)
830-
}
831-
state, err := getNodeAnnotationExt(freshNode, constants.MachineConfigDaemonStateAnnotationKey, true)
825+
// First check dn.node (from informer cache)
826+
state, err := getNodeAnnotationExt(dn.node, constants.MachineConfigDaemonStateAnnotationKey, true)
832827
if err != nil {
833828
return err
834829
}
830+
835831
if state != constants.MachineConfigDaemonStateDegraded && state != constants.MachineConfigDaemonStateUnreconcilable {
836-
if err := dn.nodeWriter.SetWorking(); err != nil {
837-
return fmt.Errorf("error setting node's state to Working: %w", err)
832+
// Second check: fetch DIRECTLY from API server to bypass stale informer cache.
833+
// This prevents a race condition where SetDegraded() was called but the informer
834+
// cache hasn't synced yet, causing SetWorking() to overwrite the Degraded state.
835+
// See: https://issues.redhat.com/browse/OCPBUGS-71227
836+
freshNode, err := dn.kubeClient.CoreV1().Nodes().Get(context.TODO(), dn.name, metav1.GetOptions{})
837+
if err != nil {
838+
return fmt.Errorf("error fetching fresh node state from API: %w", err)
839+
}
840+
841+
state, err = getNodeAnnotationExt(freshNode, constants.MachineConfigDaemonStateAnnotationKey, true)
842+
if err != nil {
843+
return err
844+
}
845+
846+
if state != constants.MachineConfigDaemonStateDegraded && state != constants.MachineConfigDaemonStateUnreconcilable {
847+
if err := dn.nodeWriter.SetWorking(); err != nil {
848+
return fmt.Errorf("error setting node's state to Working: %w", err)
849+
}
838850
}
839851
}
840852
}
@@ -1074,13 +1086,31 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig, skipCertifi
10741086
oldConfig = canonicalizeEmptyMC(oldConfig)
10751087

10761088
if dn.nodeWriter != nil {
1089+
// First check dn.node (from informer cache)
10771090
state, err := getNodeAnnotationExt(dn.node, constants.MachineConfigDaemonStateAnnotationKey, true)
10781091
if err != nil {
10791092
return err
10801093
}
1094+
10811095
if state != constants.MachineConfigDaemonStateDegraded && state != constants.MachineConfigDaemonStateUnreconcilable {
1082-
if err := dn.nodeWriter.SetWorking(); err != nil {
1083-
return fmt.Errorf("error setting node's state to Working: %w", err)
1096+
// Second check: fetch DIRECTLY from API server to bypass stale informer cache.
1097+
// This prevents a race condition where SetDegraded() was called but the informer
1098+
// cache hasn't synced yet, causing SetWorking() to overwrite the Degraded state.
1099+
// See: https://issues.redhat.com/browse/OCPBUGS-71227
1100+
freshNode, err := dn.kubeClient.CoreV1().Nodes().Get(context.TODO(), dn.name, metav1.GetOptions{})
1101+
if err != nil {
1102+
return fmt.Errorf("error fetching fresh node state from API: %w", err)
1103+
}
1104+
1105+
state, err = getNodeAnnotationExt(freshNode, constants.MachineConfigDaemonStateAnnotationKey, true)
1106+
if err != nil {
1107+
return err
1108+
}
1109+
1110+
if state != constants.MachineConfigDaemonStateDegraded && state != constants.MachineConfigDaemonStateUnreconcilable {
1111+
if err := dn.nodeWriter.SetWorking(); err != nil {
1112+
return fmt.Errorf("error setting node's state to Working: %w", err)
1113+
}
10841114
}
10851115
}
10861116
}

0 commit comments

Comments
 (0)