Skip to content

Commit ea2e16c

Browse files
authored
Fix concurrent map read write fatal error. (#1476)
* Fix concurrent map read write fatal error.
  Signed-off-by: litaixun <[email protected]>
* Fix UT.
  Signed-off-by: litaixun <[email protected]>
---------
Signed-off-by: litaixun <[email protected]>
1 parent ce0a434 commit ea2e16c

File tree

2 files changed: +19 -1 lines changed

pkg/scheduler/nodes.go

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -111,5 +111,18 @@ func (m *nodeManager) GetNode(nodeID string) (*device.NodeInfo, error) {
111 111
func (m *nodeManager) ListNodes() (map[string]*device.NodeInfo, error) {
112 112
m.mutex.RLock()
113 113
defer m.mutex.RUnlock()
114-
return m.nodes, nil
114+
nodesCopy := make(map[string]*device.NodeInfo, len(m.nodes))
115+
for nodeID, nodeInfo := range m.nodes {
116+
if nodeInfo == nil || nodeInfo.Node == nil {
117+
klog.Warningf("ListNodes nodes copy step skip node(%s) because of nil NodeInfo or NodeInfo.Node", nodeID)
118+
continue
119+
}
120+
nodeInfoCopy := &device.NodeInfo{
121+
ID: nodeInfo.ID,
122+
Node: nodeInfo.Node.DeepCopy(),
123+
Devices: append([]device.DeviceInfo{}, nodeInfo.Devices...),
124+
}
125+
nodesCopy[nodeID] = nodeInfoCopy
126+
}
127+
return nodesCopy, nil
115 128
}

pkg/scheduler/scheduler_test.go

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,11 @@ func Test_getNodesUsage(t *testing.T) {
47 47
nodeMage := newNodeManager()
48 48
nodeMage.addNode("node1", &device.NodeInfo{
49 49
ID: "node1",
50+
Node: &corev1.Node{
51+
ObjectMeta: metav1.ObjectMeta{
52+
Name: "node1",
53+
},
54+
},
50 55
Devices: []device.DeviceInfo{
51 56
{
52 57
ID: "GPU0",

0 commit comments

Comments
 (0)