Skip to content

Commit 4d4d4bd

Browse files
committed
Ensure devicemanager TopologyHints are regenerated after kubelet restart
This patch also includes test to make sure the newly added logic works as expected.
1 parent 9dc116e commit 4d4d4bd

File tree

3 files changed

+163
-15
lines changed

3 files changed

+163
-15
lines changed

pkg/kubelet/cm/devicemanager/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ go_test(
6060
"//staging/src/k8s.io/api/core/v1:go_default_library",
6161
"//staging/src/k8s.io/apimachinery/pkg/api/resource:go_default_library",
6262
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
63+
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
6364
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
6465
"//staging/src/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
6566
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",

pkg/kubelet/cm/devicemanager/topology_hints.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,21 @@ func (m *ManagerImpl) GetTopologyHints(pod v1.Pod, container v1.Container) map[s
4646
continue
4747
}
4848

49+
// Short circuit to regenerate the same hints if there are already
50+
// devices allocated to the Container. This might happen after a
51+
// kubelet restart, for example.
52+
allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
53+
if allocated.Len() > 0 {
54+
if allocated.Len() != requested {
55+
klog.Errorf("[devicemanager] Resource '%v' already allocated to (pod %v, container %v) with different number than request: requested: %d, allocated: %d", resource, string(pod.UID), container.Name, requested, allocated.Len())
56+
deviceHints[resource] = []topologymanager.TopologyHint{}
57+
continue
58+
}
59+
klog.Infof("[devicemanager] Regenerating TopologyHints for resource '%v' already allocated to (pod %v, container %v)", resource, string(pod.UID), container.Name)
60+
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, requested)
61+
continue
62+
}
63+
4964
// Get the list of available devices, for which TopologyHints should be generated.
5065
available := m.getAvailableDevices(resource)
5166
if available.Len() < requested {

pkg/kubelet/cm/devicemanager/topology_hints_test.go

Lines changed: 147 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
v1 "k8s.io/api/core/v1"
2525
"k8s.io/apimachinery/pkg/api/resource"
26+
"k8s.io/apimachinery/pkg/types"
2627
"k8s.io/apimachinery/pkg/util/sets"
2728
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
2829
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
@@ -52,13 +53,17 @@ func makeSocketMask(sockets ...int) bitmask.BitMask {
5253
func TestGetTopologyHints(t *testing.T) {
5354
tcases := []struct {
5455
description string
56+
podUID string
57+
containerName string
5558
request map[string]string
5659
devices map[string][]pluginapi.Device
57-
allocatedDevices map[string][]string
60+
allocatedDevices map[string]map[string]map[string][]string
5861
expectedHints map[string][]topologymanager.TopologyHint
5962
}{
6063
{
61-
description: "Single Request, no alignment",
64+
description: "Single Request, no alignment",
65+
podUID: "fakePod",
66+
containerName: "fakeContainer",
6267
request: map[string]string{
6368
"testdevice": "1",
6469
},
@@ -73,7 +78,9 @@ func TestGetTopologyHints(t *testing.T) {
7378
},
7479
},
7580
{
76-
description: "Single Request, only one with alignment",
81+
description: "Single Request, only one with alignment",
82+
podUID: "fakePod",
83+
containerName: "fakeContainer",
7784
request: map[string]string{
7885
"testdevice": "1",
7986
},
@@ -97,7 +104,9 @@ func TestGetTopologyHints(t *testing.T) {
97104
},
98105
},
99106
{
100-
description: "Single Request, one device per socket",
107+
description: "Single Request, one device per socket",
108+
podUID: "fakePod",
109+
containerName: "fakeContainer",
101110
request: map[string]string{
102111
"testdevice": "1",
103112
},
@@ -125,7 +134,9 @@ func TestGetTopologyHints(t *testing.T) {
125134
},
126135
},
127136
{
128-
description: "Request for 2, one device per socket",
137+
description: "Request for 2, one device per socket",
138+
podUID: "fakePod",
139+
containerName: "fakeContainer",
129140
request: map[string]string{
130141
"testdevice": "2",
131142
},
@@ -145,7 +156,9 @@ func TestGetTopologyHints(t *testing.T) {
145156
},
146157
},
147158
{
148-
description: "Request for 2, 2 devices per socket",
159+
description: "Request for 2, 2 devices per socket",
160+
podUID: "fakePod",
161+
containerName: "fakeContainer",
149162
request: map[string]string{
150163
"testdevice": "2",
151164
},
@@ -175,7 +188,9 @@ func TestGetTopologyHints(t *testing.T) {
175188
},
176189
},
177190
{
178-
description: "Request for 2, optimal on 1 NUMA node, forced cross-NUMA",
191+
description: "Request for 2, optimal on 1 NUMA node, forced cross-NUMA",
192+
podUID: "fakePod",
193+
containerName: "fakeContainer",
179194
request: map[string]string{
180195
"testdevice": "2",
181196
},
@@ -187,8 +202,12 @@ func TestGetTopologyHints(t *testing.T) {
187202
makeNUMADevice("Dev4", 1),
188203
},
189204
},
190-
allocatedDevices: map[string][]string{
191-
"testdevice": {"Dev1", "Dev2"},
205+
allocatedDevices: map[string]map[string]map[string][]string{
206+
"fakePod": {
207+
"fakeOtherContainer": {
208+
"testdevice": {"Dev1", "Dev2"},
209+
},
210+
},
192211
},
193212
expectedHints: map[string][]topologymanager.TopologyHint{
194213
"testdevice": {
@@ -200,7 +219,9 @@ func TestGetTopologyHints(t *testing.T) {
200219
},
201220
},
202221
{
203-
description: "2 device types, mixed configuration",
222+
description: "2 device types, mixed configuration",
223+
podUID: "fakePod",
224+
containerName: "fakeContainer",
204225
request: map[string]string{
205226
"testdevice1": "2",
206227
"testdevice2": "1",
@@ -243,6 +264,110 @@ func TestGetTopologyHints(t *testing.T) {
243264
},
244265
},
245266
},
267+
{
268+
description: "Single device type, more requested than available",
269+
podUID: "fakePod",
270+
containerName: "fakeContainer",
271+
request: map[string]string{
272+
"testdevice": "6",
273+
},
274+
devices: map[string][]pluginapi.Device{
275+
"testdevice": {
276+
makeNUMADevice("Dev1", 0),
277+
makeNUMADevice("Dev2", 0),
278+
makeNUMADevice("Dev3", 1),
279+
makeNUMADevice("Dev4", 1),
280+
},
281+
},
282+
expectedHints: map[string][]topologymanager.TopologyHint{
283+
"testdevice": {},
284+
},
285+
},
286+
{
287+
description: "Single device type, all already allocated to container",
288+
podUID: "fakePod",
289+
containerName: "fakeContainer",
290+
request: map[string]string{
291+
"testdevice": "2",
292+
},
293+
devices: map[string][]pluginapi.Device{
294+
"testdevice": {
295+
makeNUMADevice("Dev1", 0),
296+
makeNUMADevice("Dev2", 0),
297+
},
298+
},
299+
allocatedDevices: map[string]map[string]map[string][]string{
300+
"fakePod": {
301+
"fakeContainer": {
302+
"testdevice": {"Dev1", "Dev2"},
303+
},
304+
},
305+
},
306+
expectedHints: map[string][]topologymanager.TopologyHint{
307+
"testdevice": {
308+
{
309+
NUMANodeAffinity: makeSocketMask(0),
310+
Preferred: true,
311+
},
312+
{
313+
NUMANodeAffinity: makeSocketMask(0, 1),
314+
Preferred: false,
315+
},
316+
},
317+
},
318+
},
319+
{
320+
description: "Single device type, less already allocated to container than requested",
321+
podUID: "fakePod",
322+
containerName: "fakeContainer",
323+
request: map[string]string{
324+
"testdevice": "4",
325+
},
326+
devices: map[string][]pluginapi.Device{
327+
"testdevice": {
328+
makeNUMADevice("Dev1", 0),
329+
makeNUMADevice("Dev2", 0),
330+
makeNUMADevice("Dev3", 1),
331+
makeNUMADevice("Dev4", 1),
332+
},
333+
},
334+
allocatedDevices: map[string]map[string]map[string][]string{
335+
"fakePod": {
336+
"fakeContainer": {
337+
"testdevice": {"Dev1", "Dev2"},
338+
},
339+
},
340+
},
341+
expectedHints: map[string][]topologymanager.TopologyHint{
342+
"testdevice": {},
343+
},
344+
},
345+
{
346+
description: "Single device type, more already allocated to container than requested",
347+
podUID: "fakePod",
348+
containerName: "fakeContainer",
349+
request: map[string]string{
350+
"testdevice": "2",
351+
},
352+
devices: map[string][]pluginapi.Device{
353+
"testdevice": {
354+
makeNUMADevice("Dev1", 0),
355+
makeNUMADevice("Dev2", 0),
356+
makeNUMADevice("Dev3", 1),
357+
makeNUMADevice("Dev4", 1),
358+
},
359+
},
360+
allocatedDevices: map[string]map[string]map[string][]string{
361+
"fakePod": {
362+
"fakeContainer": {
363+
"testdevice": {"Dev1", "Dev2", "Dev3", "Dev4"},
364+
},
365+
},
366+
},
367+
expectedHints: map[string][]topologymanager.TopologyHint{
368+
"testdevice": {},
369+
},
370+
},
246371
}
247372

248373
for _, tc := range tcases {
@@ -252,14 +377,16 @@ func TestGetTopologyHints(t *testing.T) {
252377
}
253378

254379
pod := makePod(resourceList)
380+
pod.UID = types.UID(tc.podUID)
381+
pod.Spec.Containers[0].Name = tc.containerName
255382

256383
m := ManagerImpl{
257384
allDevices: make(map[string]map[string]pluginapi.Device),
258385
healthyDevices: make(map[string]sets.String),
259386
allocatedDevices: make(map[string]sets.String),
260387
podDevices: make(podDevices),
261388
sourcesReady: &sourcesReadyStub{},
262-
activePods: func() []*v1.Pod { return []*v1.Pod{} },
389+
activePods: func() []*v1.Pod { return []*v1.Pod{pod} },
263390
numaNodes: []int{0, 1},
264391
}
265392

@@ -273,11 +400,16 @@ func TestGetTopologyHints(t *testing.T) {
273400
}
274401
}
275402

276-
for r := range tc.allocatedDevices {
277-
m.allocatedDevices[r] = sets.NewString()
403+
for p := range tc.allocatedDevices {
404+
for c := range tc.allocatedDevices[p] {
405+
for r, devices := range tc.allocatedDevices[p][c] {
406+
m.podDevices.insert(p, c, r, sets.NewString(devices...), nil)
278407

279-
for _, d := range tc.allocatedDevices[r] {
280-
m.allocatedDevices[r].Insert(d)
408+
m.allocatedDevices[r] = sets.NewString()
409+
for _, d := range devices {
410+
m.allocatedDevices[r].Insert(d)
411+
}
412+
}
281413
}
282414
}
283415

0 commit comments

Comments
 (0)