Skip to content

Commit 029b72b

Browse files
committed
adding lock to node data map
1 parent 1866891 commit 029b72b

File tree

2 files changed

+78
-33
lines changed

2 files changed

+78
-33
lines changed

pkg/controller/nodelifecycle/node_lifecycle_controller.go

Lines changed: 74 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,43 @@ type nodeHealthData struct {
169169
lease *coordv1beta1.Lease
170170
}
171171

172+
func (n *nodeHealthData) deepCopy() *nodeHealthData {
173+
if n == nil {
174+
return nil
175+
}
176+
return &nodeHealthData{
177+
probeTimestamp: n.probeTimestamp,
178+
readyTransitionTimestamp: n.readyTransitionTimestamp,
179+
status: n.status.DeepCopy(),
180+
lease: n.lease.DeepCopy(),
181+
}
182+
}
183+
184+
type nodeHealthMap struct {
185+
lock sync.RWMutex
186+
nodeHealths map[string]*nodeHealthData
187+
}
188+
189+
func newNodeHealthMap() *nodeHealthMap {
190+
return &nodeHealthMap{
191+
nodeHealths: make(map[string]*nodeHealthData),
192+
}
193+
}
194+
195+
// getDeepCopy - returns copy of node health data.
196+
// It prevents data being changed after retrieving it from the map.
197+
func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData {
198+
n.lock.RLock()
199+
defer n.lock.RUnlock()
200+
return n.nodeHealths[name].deepCopy()
201+
}
202+
203+
func (n *nodeHealthMap) set(name string, data *nodeHealthData) {
204+
n.lock.Lock()
205+
defer n.lock.Unlock()
206+
n.nodeHealths[name] = data
207+
}
208+
172209
// Controller is the controller that manages node's life cycle.
173210
type Controller struct {
174211
taintManager *scheduler.NoExecuteTaintManager
@@ -186,7 +223,7 @@ type Controller struct {
186223

187224
knownNodeSet map[string]*v1.Node
188225
// per Node map storing last observed health together with a local time when it was observed.
189-
nodeHealthMap map[string]*nodeHealthData
226+
nodeHealthMap *nodeHealthMap
190227

191228
// Lock to access evictor workers
192229
evictorLock sync.Mutex
@@ -305,7 +342,7 @@ func NewNodeLifecycleController(
305342
kubeClient: kubeClient,
306343
now: metav1.Now,
307344
knownNodeSet: make(map[string]*v1.Node),
308-
nodeHealthMap: make(map[string]*nodeHealthData),
345+
nodeHealthMap: newNodeHealthMap(),
309346
recorder: recorder,
310347
nodeMonitorPeriod: nodeMonitorPeriod,
311348
nodeStartupGracePeriod: nodeStartupGracePeriod,
@@ -722,6 +759,11 @@ func (nc *Controller) monitorNodeHealth() error {
722759
}
723760

724761
decisionTimestamp := nc.now()
762+
nodeHealthData := nc.nodeHealthMap.getDeepCopy(node.Name)
763+
if nodeHealthData == nil {
764+
klog.Errorf("Skipping %v node processing: health data doesn't exist.", node.Name)
765+
continue
766+
}
725767
if currentReadyCondition != nil {
726768
// Check eviction timeout against decisionTimestamp
727769
switch observedReadyCondition.Status {
@@ -740,12 +782,12 @@ func (nc *Controller) monitorNodeHealth() error {
740782
)
741783
}
742784
} else {
743-
if decisionTimestamp.After(nc.nodeHealthMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
785+
if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
744786
if nc.evictPods(node) {
745787
klog.V(2).Infof("Node is NotReady. Adding Pods on Node %s to eviction queue: %v is later than %v + %v",
746788
node.Name,
747789
decisionTimestamp,
748-
nc.nodeHealthMap[node.Name].readyTransitionTimestamp,
790+
nodeHealthData.readyTransitionTimestamp,
749791
nc.podEvictionTimeout,
750792
)
751793
}
@@ -766,12 +808,12 @@ func (nc *Controller) monitorNodeHealth() error {
766808
)
767809
}
768810
} else {
769-
if decisionTimestamp.After(nc.nodeHealthMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
811+
if decisionTimestamp.After(nodeHealthData.probeTimestamp.Add(nc.podEvictionTimeout)) {
770812
if nc.evictPods(node) {
771813
klog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v",
772814
node.Name,
773815
decisionTimestamp,
774-
nc.nodeHealthMap[node.Name].readyTransitionTimestamp,
816+
nodeHealthData.readyTransitionTimestamp,
775817
nc.podEvictionTimeout-gracePeriod,
776818
)
777819
}
@@ -849,6 +891,11 @@ func legacyIsMasterNode(nodeName string) bool {
849891
// tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to
850892
// which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred.
851893
func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
894+
nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
895+
defer func() {
896+
nc.nodeHealthMap.set(node.Name, nodeHealth)
897+
}()
898+
852899
var gracePeriod time.Duration
853900
var observedReadyCondition v1.NodeCondition
854901
_, currentReadyCondition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady)
@@ -863,10 +910,10 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
863910
LastTransitionTime: node.CreationTimestamp,
864911
}
865912
gracePeriod = nc.nodeStartupGracePeriod
866-
if _, found := nc.nodeHealthMap[node.Name]; found {
867-
nc.nodeHealthMap[node.Name].status = &node.Status
913+
if nodeHealth != nil {
914+
nodeHealth.status = &node.Status
868915
} else {
869-
nc.nodeHealthMap[node.Name] = &nodeHealthData{
916+
nodeHealth = &nodeHealthData{
870917
status: &node.Status,
871918
probeTimestamp: node.CreationTimestamp,
872919
readyTransitionTimestamp: node.CreationTimestamp,
@@ -877,7 +924,6 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
877924
observedReadyCondition = *currentReadyCondition
878925
gracePeriod = nc.nodeMonitorGracePeriod
879926
}
880-
savedNodeHealth, found := nc.nodeHealthMap[node.Name]
881927
// There are following cases to check:
882928
// - both saved and new status have no Ready Condition set - we leave everything as it is,
883929
// - saved status have no Ready Condition, but current one does - Controller was restarted with Node data already present in etcd,
@@ -894,29 +940,29 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
894940
// if that's the case, but it does not seem necessary.
895941
var savedCondition *v1.NodeCondition
896942
var savedLease *coordv1beta1.Lease
897-
if found {
898-
_, savedCondition = nodeutil.GetNodeCondition(savedNodeHealth.status, v1.NodeReady)
899-
savedLease = savedNodeHealth.lease
943+
if nodeHealth != nil {
944+
_, savedCondition = nodeutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
945+
savedLease = nodeHealth.lease
900946
}
901947

902-
if !found {
948+
if nodeHealth == nil {
903949
klog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name)
904-
savedNodeHealth = &nodeHealthData{
950+
nodeHealth = &nodeHealthData{
905951
status: &node.Status,
906952
probeTimestamp: nc.now(),
907953
readyTransitionTimestamp: nc.now(),
908954
}
909955
} else if savedCondition == nil && currentReadyCondition != nil {
910956
klog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name)
911-
savedNodeHealth = &nodeHealthData{
957+
nodeHealth = &nodeHealthData{
912958
status: &node.Status,
913959
probeTimestamp: nc.now(),
914960
readyTransitionTimestamp: nc.now(),
915961
}
916962
} else if savedCondition != nil && currentReadyCondition == nil {
917963
klog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name)
918964
// TODO: figure out what to do in this case. For now we do the same thing as above.
919-
savedNodeHealth = &nodeHealthData{
965+
nodeHealth = &nodeHealthData{
920966
status: &node.Status,
921967
probeTimestamp: nc.now(),
922968
readyTransitionTimestamp: nc.now(),
@@ -929,14 +975,14 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
929975
klog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition, currentReadyCondition)
930976
transitionTime = nc.now()
931977
} else {
932-
transitionTime = savedNodeHealth.readyTransitionTimestamp
978+
transitionTime = nodeHealth.readyTransitionTimestamp
933979
}
934980
if klog.V(5) {
935-
klog.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, savedNodeHealth.status, node.Status)
981+
klog.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, nodeHealth.status, node.Status)
936982
} else {
937983
klog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name)
938984
}
939-
savedNodeHealth = &nodeHealthData{
985+
nodeHealth = &nodeHealthData{
940986
status: &node.Status,
941987
probeTimestamp: nc.now(),
942988
readyTransitionTimestamp: transitionTime,
@@ -950,13 +996,12 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
950996
// take no action.
951997
observedLease, _ = nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
952998
if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
953-
savedNodeHealth.lease = observedLease
954-
savedNodeHealth.probeTimestamp = nc.now()
999+
nodeHealth.lease = observedLease
1000+
nodeHealth.probeTimestamp = nc.now()
9551001
}
9561002
}
957-
nc.nodeHealthMap[node.Name] = savedNodeHealth
9581003

959-
if nc.now().After(savedNodeHealth.probeTimestamp.Add(gracePeriod)) {
1004+
if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) {
9601005
// NodeReady condition or lease was last set longer ago than gracePeriod, so
9611006
// update it to Unknown (regardless of its current value) in the master.
9621007

@@ -984,7 +1029,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
9841029
})
9851030
} else {
9861031
klog.V(4).Infof("node %v hasn't been updated for %+v. Last %v is: %+v",
987-
node.Name, nc.now().Time.Sub(savedNodeHealth.probeTimestamp.Time), nodeConditionType, currentCondition)
1032+
node.Name, nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), nodeConditionType, currentCondition)
9881033
if currentCondition.Status != v1.ConditionUnknown {
9891034
currentCondition.Status = v1.ConditionUnknown
9901035
currentCondition.Reason = "NodeStatusUnknown"
@@ -1001,9 +1046,9 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
10011046
klog.Errorf("Error updating node %s: %v", node.Name, err)
10021047
return gracePeriod, observedReadyCondition, currentReadyCondition, err
10031048
}
1004-
nc.nodeHealthMap[node.Name] = &nodeHealthData{
1049+
nodeHealth = &nodeHealthData{
10051050
status: &node.Status,
1006-
probeTimestamp: nc.nodeHealthMap[node.Name].probeTimestamp,
1051+
probeTimestamp: nodeHealth.probeTimestamp,
10071052
readyTransitionTimestamp: nc.now(),
10081053
lease: observedLease,
10091054
}
@@ -1086,10 +1131,10 @@ func (nc *Controller) handleDisruption(zoneToNodeConditions map[string][]*v1.Nod
10861131
// When exiting disruption mode update probe timestamps on all Nodes.
10871132
now := nc.now()
10881133
for i := range nodes {
1089-
v := nc.nodeHealthMap[nodes[i].Name]
1134+
v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name)
10901135
v.probeTimestamp = now
10911136
v.readyTransitionTimestamp = now
1092-
nc.nodeHealthMap[nodes[i].Name] = v
1137+
nc.nodeHealthMap.set(nodes[i].Name, v)
10931138
}
10941139
// We reset all rate limiters to settings appropriate for the given state.
10951140
for k := range nc.zoneStates {

pkg/controller/nodelifecycle/node_lifecycle_controller_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3252,19 +3252,19 @@ func TestTryUpdateNodeHealth(t *testing.T) {
32523252

32533253
for _, test := range tests {
32543254
t.Run(test.name, func(t *testing.T) {
3255-
nodeController.nodeHealthMap[test.node.Name] = &nodeHealthData{
3255+
nodeController.nodeHealthMap.set(test.node.Name, &nodeHealthData{
32563256
status: &test.node.Status,
32573257
probeTimestamp: test.node.CreationTimestamp,
32583258
readyTransitionTimestamp: test.node.CreationTimestamp,
3259-
}
3259+
})
32603260
_, _, currentReadyCondition, err := nodeController.tryUpdateNodeHealth(test.node)
32613261
if err != nil {
32623262
t.Fatalf("unexpected error: %v", err)
32633263
}
3264-
_, savedReadyCondition := nodeutil.GetNodeCondition(nodeController.nodeHealthMap[test.node.Name].status, v1.NodeReady)
3264+
_, savedReadyCondition := nodeutil.GetNodeCondition(nodeController.nodeHealthMap.getDeepCopy(test.node.Name).status, v1.NodeReady)
32653265
savedStatus := getStatus(savedReadyCondition)
32663266
currentStatus := getStatus(currentReadyCondition)
3267-
if currentStatus != savedStatus {
3267+
if !apiequality.Semantic.DeepEqual(currentStatus, savedStatus) {
32683268
t.Errorf("expected %v, got %v", savedStatus, currentStatus)
32693269
}
32703270
})

0 commit comments

Comments
 (0)