Skip to content

Commit f153edf

Browse files
authored
Merge pull request kubernetes#123443 from Tal-or/mm_consistent_memory_numa_alloc
memorymanager: avoid violating NUMA node memory allocation rule
2 parents dd820a1 + c019114 commit f153edf

File tree

2 files changed

+226
-0
lines changed

2 files changed

+226
-0
lines changed

pkg/kubelet/cm/memorymanager/policy_static.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
157157
bestHint = extendedHint
158158
}
159159

160+
// the best hint might violate the NUMA allocation rule on which
161+
// NUMA node cannot have both single and cross NUMA node allocations
162+
// https://kubernetes.io/blog/2021/08/11/kubernetes-1-22-feature-memory-manager-moves-to-beta/#single-vs-cross-numa-node-allocation
163+
if isAffinityViolatingNUMAAllocations(machineState, bestHint.NUMANodeAffinity) {
164+
return fmt.Errorf("[memorymanager] preferred hint violates NUMA node allocation")
165+
}
166+
160167
var containerBlocks []state.Block
161168
maskBits := bestHint.NUMANodeAffinity.GetBits()
162169
for resourceName, requestedSize := range requestedResources {
@@ -992,3 +999,26 @@ func isNUMAAffinitiesEqual(numaAffinity1, numaAffinity2 []int) bool {
992999

9931000
return bitMask1.IsEqual(bitMask2)
9941001
}
1002+
1003+
func isAffinityViolatingNUMAAllocations(machineState state.NUMANodeMap, mask bitmask.BitMask) bool {
1004+
maskBits := mask.GetBits()
1005+
singleNUMAHint := len(maskBits) == 1
1006+
for _, nodeID := range mask.GetBits() {
1007+
// the node was never used for the memory allocation
1008+
if machineState[nodeID].NumberOfAssignments == 0 {
1009+
continue
1010+
}
1011+
if singleNUMAHint {
1012+
continue
1013+
}
1014+
// the node used for the single NUMA memory allocation, it cannot be used for the multi NUMA node allocation
1015+
if len(machineState[nodeID].Cells) == 1 {
1016+
return true
1017+
}
1018+
// the node already used with a different group of nodes, it cannot be used within the current hint
1019+
if !areGroupsEqual(machineState[nodeID].Cells, maskBits) {
1020+
return true
1021+
}
1022+
}
1023+
return false
1024+
}

pkg/kubelet/cm/memorymanager/policy_static_test.go

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1765,6 +1765,126 @@ func TestStaticPolicyAllocate(t *testing.T) {
17651765
pod: getPod("pod1", "container1", requirementsGuaranteed),
17661766
topologyHint: &topologymanager.TopologyHint{Preferred: true},
17671767
},
1768+
{
1769+
description: "should validate NUMA node can not have both single and cross NUMA node memory allocations",
1770+
assignments: state.ContainerMemoryAssignments{
1771+
"pod1": map[string][]state.Block{
1772+
"container1": {
1773+
{
1774+
NUMAAffinity: []int{0},
1775+
Type: v1.ResourceMemory,
1776+
Size: 1024 * mb,
1777+
},
1778+
},
1779+
},
1780+
},
1781+
expectedAssignments: state.ContainerMemoryAssignments{
1782+
"pod1": map[string][]state.Block{
1783+
"container1": {
1784+
{
1785+
NUMAAffinity: []int{0},
1786+
Type: v1.ResourceMemory,
1787+
Size: 1024 * mb,
1788+
},
1789+
},
1790+
},
1791+
},
1792+
machineState: state.NUMANodeMap{
1793+
0: &state.NUMANodeState{
1794+
MemoryMap: map[v1.ResourceName]*state.MemoryTable{
1795+
v1.ResourceMemory: {
1796+
Allocatable: 1536 * mb,
1797+
Free: 512 * mb,
1798+
Reserved: 1024 * mb,
1799+
SystemReserved: 512 * mb,
1800+
TotalMemSize: 2176 * mb,
1801+
},
1802+
hugepages1Gi: {
1803+
Allocatable: gb,
1804+
Free: gb,
1805+
Reserved: 0,
1806+
SystemReserved: 0,
1807+
TotalMemSize: gb,
1808+
},
1809+
},
1810+
Cells: []int{0},
1811+
NumberOfAssignments: 1,
1812+
},
1813+
1: &state.NUMANodeState{
1814+
MemoryMap: map[v1.ResourceName]*state.MemoryTable{
1815+
v1.ResourceMemory: {
1816+
Allocatable: 512 * mb,
1817+
Free: 512 * mb,
1818+
Reserved: 0,
1819+
SystemReserved: 512 * mb,
1820+
TotalMemSize: 2176 * mb,
1821+
},
1822+
hugepages1Gi: {
1823+
Allocatable: gb,
1824+
Free: gb,
1825+
Reserved: 0,
1826+
SystemReserved: 0,
1827+
TotalMemSize: gb,
1828+
},
1829+
},
1830+
Cells: []int{1},
1831+
NumberOfAssignments: 0,
1832+
},
1833+
},
1834+
expectedMachineState: state.NUMANodeMap{
1835+
0: &state.NUMANodeState{
1836+
MemoryMap: map[v1.ResourceName]*state.MemoryTable{
1837+
v1.ResourceMemory: {
1838+
Allocatable: 1536 * mb,
1839+
Free: 512 * mb,
1840+
Reserved: 1024 * mb,
1841+
SystemReserved: 512 * mb,
1842+
TotalMemSize: 2176 * mb,
1843+
},
1844+
hugepages1Gi: {
1845+
Allocatable: gb,
1846+
Free: gb,
1847+
Reserved: 0,
1848+
SystemReserved: 0,
1849+
TotalMemSize: gb,
1850+
},
1851+
},
1852+
Cells: []int{0},
1853+
NumberOfAssignments: 1,
1854+
},
1855+
1: &state.NUMANodeState{
1856+
MemoryMap: map[v1.ResourceName]*state.MemoryTable{
1857+
v1.ResourceMemory: {
1858+
Allocatable: 512 * mb,
1859+
Free: 512 * mb,
1860+
Reserved: 0,
1861+
SystemReserved: 512 * mb,
1862+
TotalMemSize: 2176 * mb,
1863+
},
1864+
hugepages1Gi: {
1865+
Allocatable: gb,
1866+
Free: gb,
1867+
Reserved: 0,
1868+
SystemReserved: 0,
1869+
TotalMemSize: gb,
1870+
},
1871+
},
1872+
Cells: []int{1},
1873+
NumberOfAssignments: 0,
1874+
},
1875+
},
1876+
systemReserved: systemReservedMemory{
1877+
0: map[v1.ResourceName]uint64{
1878+
v1.ResourceMemory: 512 * mb,
1879+
},
1880+
1: map[v1.ResourceName]uint64{
1881+
v1.ResourceMemory: 512 * mb,
1882+
},
1883+
},
1884+
pod: getPod("pod2", "container1", requirementsGuaranteed),
1885+
topologyHint: &topologymanager.TopologyHint{NUMANodeAffinity: newNUMAAffinity(0, 1), Preferred: true},
1886+
expectedError: fmt.Errorf("[memorymanager] preferred hint violates NUMA node allocation"),
1887+
},
17681888
}
17691889

17701890
for _, testCase := range testCases {
@@ -3776,3 +3896,79 @@ func Test_getPodRequestedResources(t *testing.T) {
37763896
})
37773897
}
37783898
}
3899+
3900+
func Test_isAffinityViolatingNUMAAllocations(t *testing.T) {
3901+
testsCases := []struct {
3902+
description string
3903+
machineState map[int]*state.NUMANodeState
3904+
topologyHint *topologymanager.TopologyHint
3905+
isViolationExpected bool
3906+
}{
3907+
{
3908+
description: "violating NUMA allocations because given affinity asks for NUMA ID 1 which is on different cells group",
3909+
machineState: map[int]*state.NUMANodeState{
3910+
0: {
3911+
NumberOfAssignments: 1,
3912+
Cells: []int{0, 1},
3913+
},
3914+
1: {
3915+
NumberOfAssignments: 1,
3916+
Cells: []int{0, 1},
3917+
},
3918+
2: {
3919+
NumberOfAssignments: 1,
3920+
Cells: []int{2},
3921+
},
3922+
3: {
3923+
NumberOfAssignments: 0,
3924+
Cells: []int{3},
3925+
},
3926+
},
3927+
topologyHint: &topologymanager.TopologyHint{
3928+
NUMANodeAffinity: newNUMAAffinity(1, 2),
3929+
},
3930+
isViolationExpected: true,
3931+
},
3932+
{
3933+
description: "violating NUMA allocations because given affinity with multiple nodes asks for NUMA ID 1 which is used for a single NUMA node memory allocation",
3934+
machineState: map[int]*state.NUMANodeState{
3935+
0: {
3936+
NumberOfAssignments: 0,
3937+
Cells: []int{0, 1},
3938+
},
3939+
1: {
3940+
NumberOfAssignments: 1,
3941+
Cells: []int{1},
3942+
},
3943+
},
3944+
topologyHint: &topologymanager.TopologyHint{
3945+
NUMANodeAffinity: newNUMAAffinity(0, 1),
3946+
},
3947+
isViolationExpected: true,
3948+
},
3949+
{
3950+
description: "valid affinity, no prior assignments",
3951+
machineState: map[int]*state.NUMANodeState{
3952+
0: {
3953+
NumberOfAssignments: 0,
3954+
Cells: []int{0},
3955+
},
3956+
1: {
3957+
NumberOfAssignments: 0,
3958+
Cells: []int{1},
3959+
},
3960+
},
3961+
topologyHint: &topologymanager.TopologyHint{
3962+
NUMANodeAffinity: newNUMAAffinity(0, 1),
3963+
},
3964+
isViolationExpected: false,
3965+
},
3966+
}
3967+
for _, tc := range testsCases {
3968+
t.Run(tc.description, func(t *testing.T) {
3969+
if isAffinityViolatingNUMAAllocations(tc.machineState, tc.topologyHint.NUMANodeAffinity) != tc.isViolationExpected {
3970+
t.Errorf("isAffinityViolatingNUMAAllocations with affinity %v expected to return %t, got %t", tc.topologyHint.NUMANodeAffinity.GetBits(), tc.isViolationExpected, !tc.isViolationExpected)
3971+
}
3972+
})
3973+
}
3974+
}

0 commit comments

Comments
 (0)