Skip to content

Commit 7476f46

Browse files
committed
memorymanager: fix checkpoint file comparison
For a resource within a group, such as memory, we should validate the total `Free` and total `Reserved` sizes of the expected `machineState` against the state restored from the checkpoint file after kubelet start. If the total `Free` and total `Reserved` sizes are equal, the restored state is valid. The old comparison, however, was done by reflection. There are times when the memory accounting is equal but the allocations across the NUMA nodes vary. In such cases we still need to consider the states equal. Signed-off-by: Talor Itzhak <[email protected]>
1 parent 6709317 commit 7476f46

File tree

1 file changed

+24
-13
lines changed

1 file changed

+24
-13
lines changed

pkg/kubelet/cm/memorymanager/policy_static.go

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -681,27 +681,38 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
681681
return false
682682
}
683683

684-
if memoryState1.TotalMemSize != memoryState2.TotalMemSize || memoryState1.SystemReserved != memoryState2.SystemReserved || memoryState1.Allocatable != memoryState2.Allocatable {
685-
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
684+
if memoryState1.TotalMemSize != memoryState2.TotalMemSize {
685+
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
686686
return false
687687
}
688688

689-
totalFree1 := uint64(0)
690-
totalReserved1 := uint64(0)
691-
totalFree2 := uint64(0)
692-
totalReserved2 := uint64(0)
693-
for _, nodeId := range nodeState1.Cells {
694-
totalFree1 += ms1[nodeId].MemoryMap[resourceName].Free
695-
totalReserved1 += ms1[nodeId].MemoryMap[resourceName].Reserved
696-
totalFree2 += ms2[nodeId].MemoryMap[resourceName].Free
697-
totalReserved2 += ms2[nodeId].MemoryMap[resourceName].Reserved
689+
if memoryState1.SystemReserved != memoryState2.SystemReserved {
690+
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
691+
return false
698692
}
699693

700-
if totalFree1 != totalFree2 || totalReserved1 != totalReserved2 {
701-
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
694+
if memoryState1.Allocatable != memoryState2.Allocatable {
695+
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
702696
return false
703697
}
704698

699+
tmpState1 := state.MemoryTable{}
700+
tmpState2 := state.MemoryTable{}
701+
for _, nodeID := range nodeState1.Cells {
702+
tmpState1.Free += ms1[nodeID].MemoryMap[resourceName].Free
703+
tmpState1.Reserved += ms1[nodeID].MemoryMap[resourceName].Reserved
704+
tmpState2.Free += ms2[nodeID].MemoryMap[resourceName].Free
705+
tmpState2.Reserved += ms2[nodeID].MemoryMap[resourceName].Reserved
706+
}
707+
708+
if tmpState1.Free != tmpState2.Free {
709+
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
710+
return false
711+
}
712+
if tmpState1.Reserved != tmpState2.Reserved {
713+
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
714+
return false
715+
}
705716
}
706717
}
707718
return true

0 commit comments

Comments
 (0)