Skip to content

Commit cb9fdc4

Browse files
author
nolancon
committed
Device Manager - Refactor allocatePodResources
- allocatePodResources logic altered to allow for container by container device allocation. - New type PodReusableDevices - New field in devicemanager devicesToReuse
1 parent 0a9bd03 commit cb9fdc4

File tree

2 files changed

+34
-40
lines changed

2 files changed

+34
-40
lines changed

pkg/kubelet/cm/devicemanager/manager.go

Lines changed: 33 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ type ManagerImpl struct {
105105

106106
// Store of Topology Affinties that the Device Manager can query.
107107
topologyAffinityStore topologymanager.Store
108+
109+
// devicesToReuse contains devices that can be reused as they have been allocated to
110+
// init containers.
111+
devicesToReuse PodReusableDevices
108112
}
109113

110114
type endpointInfo struct {
@@ -114,6 +118,9 @@ type endpointInfo struct {
114118

115119
type sourcesReadyStub struct{}
116120

121+
// PodReusableDevices is a map by pod name of devices to reuse.
122+
type PodReusableDevices map[string]map[string]sets.String
123+
117124
func (s *sourcesReadyStub) AddSource(source string) {}
118125
func (s *sourcesReadyStub) AllReady() bool { return true }
119126

@@ -147,6 +154,7 @@ func newManagerImpl(socketPath string, numaNodeInfo cputopology.NUMANodeInfo, to
147154
podDevices: make(podDevices),
148155
numaNodes: numaNodes,
149156
topologyAffinityStore: topologyAffinityStore,
157+
devicesToReuse: make(PodReusableDevices),
150158
}
151159
manager.callback = manager.genericDeviceUpdateCallback
152160

@@ -350,54 +358,39 @@ func (m *ManagerImpl) isVersionCompatibleWithPlugin(versions []string) bool {
350358
return false
351359
}
352360

353-
func (m *ManagerImpl) allocatePodResources(pod *v1.Pod) error {
354-
devicesToReuse := make(map[string]sets.String)
355-
for _, container := range pod.Spec.InitContainers {
356-
if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
357-
return err
358-
}
359-
m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
360-
}
361-
for _, container := range pod.Spec.Containers {
362-
if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
363-
return err
364-
}
365-
m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
366-
}
367-
return nil
368-
}
369-
370361
// Allocate is the call that you can use to allocate a set of devices
371362
// from the registered device plugins.
372363
func (m *ManagerImpl) Allocate(pod *v1.Pod, container *v1.Container) error {
373-
// TODO: This function does not yet do what it is supposed to. The call to
374-
// allocatePodResources() below is still allocating devices to a pod all at
375-
// once. We need to unroll this logic to allow it to allocate devices on a
376-
// container-by-container basis instead. The main challenge will be
377-
// ensuring that we "reuse" devices from init containers when allocating
378-
// devices to app containers (just as the logic inside
379-
// allocatePodResources() currently does). The hard part being that we will
380-
// need to maintain the 'devicesToReuse' present in allocatePodResources()
381-
// across invocations of Allocate().
382-
//
383-
// My initial inclination to solve this with the least coode churn is:
384-
// 1) Create a new type called PodReusableDevices, defined as:
385-
// type PodReusableDevices map[string]map[string]sets.String
386-
// 2) Instantiate a PodReusableDevices map as a new field of the
387-
// devicemanager called devicesToReuse (similar to the local
388-
// devicesToReuse variable currently in allocatePodResources)
389-
// 3) Use devicesToReuse[string(pod.UID) just as devicesToReuse is used
390-
// today, being careful to create / destroy the nested maps where
391-
// appropriate.
392-
393-
err := m.allocatePodResources(pod)
394-
if err != nil {
395-
klog.Errorf("Failed to allocate device plugin resource for pod %s, container %s: %v", string(pod.UID), container.Name, err)
364+
if _, ok := m.devicesToReuse[string(pod.UID)]; !ok {
365+
m.devicesToReuse[string(pod.UID)] = make(map[string]sets.String)
366+
}
367+
// If pod entries to m.devicesToReuse other than the current pod exist, delete them.
368+
for podUID := range m.devicesToReuse {
369+
if podUID != string(pod.UID) {
370+
delete(m.devicesToReuse, podUID)
371+
}
372+
}
373+
// Allocate resources for init containers first as we know the caller always loops
374+
// through init containers before looping through app containers. Should the caller
375+
// ever change those semantics, this logic will need to be amended.
376+
for _, initContainer := range pod.Spec.InitContainers {
377+
if container.Name == initContainer.Name {
378+
if err := m.allocateContainerResources(pod, container, m.devicesToReuse[string(pod.UID)]); err != nil {
379+
return err
380+
}
381+
m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, m.devicesToReuse[string(pod.UID)])
382+
return nil
383+
}
384+
}
385+
if err := m.allocateContainerResources(pod, container, m.devicesToReuse[string(pod.UID)]); err != nil {
396386
return err
397387
}
388+
m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, m.devicesToReuse[string(pod.UID)])
398389
return nil
390+
399391
}
400392

393+
// UpdatePluginResources updates node resources based on devices already allocated to pods.
401394
func (m *ManagerImpl) UpdatePluginResources(node *schedulernodeinfo.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
402395
pod := attrs.Pod
403396

pkg/kubelet/cm/devicemanager/manager_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,7 @@ func getTestManager(tmpDir string, activePods ActivePodsFunc, testRes []TestReso
605605
allocatedDevices: make(map[string]sets.String),
606606
endpoints: make(map[string]endpointInfo),
607607
podDevices: make(podDevices),
608+
devicesToReuse: make(PodReusableDevices),
608609
topologyAffinityStore: topologymanager.NewFakeManager(),
609610
activePods: activePods,
610611
sourcesReady: &sourcesReadyStub{},

0 commit comments

Comments
 (0)